LLVM 23.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-ccmp-bias", cl::init(6),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
94 "supports conditional compare instructions."),
96
// Hidden boolean command-line option "x86-widen-shift", initialized to true.
// Per its description string, it controls whether narrow shifts are replaced
// with wider shifts.
97static cl::opt<bool>
98 WidenShift("x86-widen-shift", cl::init(true),
99 cl::desc("Replace narrow shifts with wider shifts."),
100 cl::Hidden);
101
103 "x86-br-merging-likely-bias", cl::init(0),
104 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
105 "that all conditionals will be executed. For example for merging "
106 "the conditionals (a == b && c > d), if its known that a == b is "
107 "likely, then it is likely that if the conditionals are split "
108 "both sides will be executed, so it may be desirable to increase "
109 "the instruction cost threshold. Set to -1 to never merge likely "
110 "branches."),
111 cl::Hidden);
112
114 "x86-br-merging-unlikely-bias", cl::init(-1),
115 cl::desc(
116 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
117 "that all conditionals will be executed. For example for merging "
118 "the conditionals (a == b && c > d), if its known that a == b is "
119 "unlikely, then it is unlikely that if the conditionals are split "
120 "both sides will be executed, so it may be desirable to decrease "
121 "the instruction cost threshold. Set to -1 to never merge unlikely "
122 "branches."),
123 cl::Hidden);
124
126 "mul-constant-optimization", cl::init(true),
127 cl::desc("Replace 'mul x, Const' with more effective instructions like "
128 "SHIFT, LEA, etc."),
129 cl::Hidden);
130
132 const X86Subtarget &STI)
133 : TargetLowering(TM, STI), Subtarget(STI) {
134 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
135 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
136
137 // Set up the TargetLowering object.
138
139 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143
144 // X86 instruction cache is coherent with its data cache so we can use the
145 // default expansion to a no-op.
147
148 // For 64-bit, since we have so many registers, use the ILP scheduler.
149 // For 32-bit, use the register pressure specific scheduling.
150 // For Atom, always use ILP scheduling.
151 if (Subtarget.isAtom())
153 else if (Subtarget.is64Bit())
155 else
157 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
158 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
159
160 // Bypass expensive divides and use cheaper ones.
161 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
162 if (Subtarget.hasSlowDivide32())
163 addBypassSlowDiv(32, 8);
164 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
165 addBypassSlowDiv(64, 32);
166 }
167
168 if (Subtarget.canUseCMPXCHG16B())
170 else if (Subtarget.canUseCMPXCHG8B())
172 else
174
175 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
176
178
179 // Set up the register classes.
180 addRegisterClass(MVT::i8, &X86::GR8RegClass);
181 addRegisterClass(MVT::i16, &X86::GR16RegClass);
182 addRegisterClass(MVT::i32, &X86::GR32RegClass);
183 if (Subtarget.is64Bit())
184 addRegisterClass(MVT::i64, &X86::GR64RegClass);
185
186 for (MVT VT : MVT::integer_valuetypes())
188
189 // We don't accept any truncstore of integer registers.
190 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
191 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
193 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
194 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
195 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
196
197 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
198
199 // SETOEQ and SETUNE require checking two conditions.
200 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
203 }
204
205 // Integer absolute.
206 if (Subtarget.canUseCMOV()) {
207 setOperationAction(ISD::ABS , MVT::i16 , Custom);
208 setOperationAction(ISD::ABS , MVT::i32 , Custom);
209 if (Subtarget.is64Bit())
210 setOperationAction(ISD::ABS , MVT::i64 , Custom);
211 }
212
213 // Absolute difference.
214 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
215 setOperationAction(Op , MVT::i8 , Custom);
216 setOperationAction(Op , MVT::i16 , Custom);
217 setOperationAction(Op , MVT::i32 , Custom);
218 if (Subtarget.is64Bit())
219 setOperationAction(Op , MVT::i64 , Custom);
220 }
221
222 // Signed saturation subtraction.
226 if (Subtarget.is64Bit())
228
229 // Funnel shifts.
230 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
231 // For slow shld targets we only lower for code size.
232 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
233
234 setOperationAction(ShiftOp , MVT::i8 , Custom);
235 setOperationAction(ShiftOp , MVT::i16 , Custom);
236 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
237 if (Subtarget.is64Bit())
238 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
239 }
240
241 if (!Subtarget.useSoftFloat()) {
242 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
243 // operation.
248 // We have an algorithm for SSE2, and we turn this into a 64-bit
249 // FILD or VCVTUSI2SS/SD for other targets.
252 // We have an algorithm for SSE2->double, and we turn this into a
253 // 64-bit FILD followed by conditional FADD for other targets.
256
257 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
258 // this operation.
261 // SSE has no i16 to fp conversion, only i32. We promote in the handler
262 // to allow f80 to use i16 and f64 to use i16 with sse1 only
265 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
268 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
269 // are Legal, f80 is custom lowered.
272
273 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
274 // this operation.
276 // FIXME: This doesn't generate invalid exception when it should. PR44019.
282 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
283 // are Legal, f80 is custom lowered.
286
287 // Handle FP_TO_UINT by promoting the destination to a larger signed
288 // conversion.
290 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // FIXME: This doesn't generate invalid exception when it should. PR44019.
299
304
305 if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
308 }
309 }
310
311 if (Subtarget.hasSSE2()) {
312 // Custom lowering for saturating float to int conversions.
313 // We handle promotion to larger result types manually.
314 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
317 }
318 if (Subtarget.is64Bit()) {
321 }
322 }
323 if (Subtarget.hasAVX10_2()) {
324 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v32i8}) {
327 }
332 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
333 MVT::v4i64}) {
336 }
337 if (Subtarget.is64Bit()) {
340 }
341 }
342
343 // Handle address space casts between mixed sized pointers.
346
347 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
348 if (!Subtarget.hasSSE2()) {
351 if (Subtarget.is64Bit()) {
353 // Without SSE, i64->f64 goes through memory.
355 }
356 } else if (!Subtarget.is64Bit())
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
384 }
385 if (Subtarget.is64Bit())
390
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
402 }
403
404 // Promote the i8 variants and force them on up to i32 which has a shorter
405 // encoding.
406 setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
408 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
409 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
410 // promote that too.
411 setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
413
414 if (!Subtarget.hasBMI()) {
417 if (Subtarget.is64Bit()) {
420 }
421 }
422
423 if (Subtarget.hasLZCNT()) {
424 // When promoting the i8 variants, force them to i32 for a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
428 } else {
429 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
430 if (VT == MVT::i64 && !Subtarget.is64Bit())
431 continue;
434 }
435 }
436
439 // Special handling for half-precision floating point conversions.
440 // If we don't have F16C support, then lower half float conversions
441 // into library calls.
443 Op, MVT::f32,
444 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
445 // There's never any support for operations beyond MVT::f32.
446 setOperationAction(Op, MVT::f64, Expand);
447 setOperationAction(Op, MVT::f80, Expand);
448 setOperationAction(Op, MVT::f128, Expand);
449 }
450
451 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
454 }
455
456 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
459 setTruncStoreAction(VT, MVT::f16, Expand);
460 setTruncStoreAction(VT, MVT::bf16, Expand);
461
464 }
465
469 if (Subtarget.is64Bit())
471 if (Subtarget.hasPOPCNT()) {
472 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
473 // popcntw is longer to encode than popcntl and also has a false dependency
474 // on the dest that popcntl hasn't had since Cannon Lake.
475 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
476 } else {
481 }
482
484
485 if (!Subtarget.hasMOVBE())
487
488 // X86 wants to expand cmov itself.
489 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
494 }
495 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
497 continue;
500 }
501
503
504 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
507
509 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
510 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
514
515 // Darwin ABI issue.
516 for (auto VT : { MVT::i32, MVT::i64 }) {
517 if (VT == MVT::i64 && !Subtarget.is64Bit())
518 continue;
525 }
526
527 // 64-bit shl, sra, srl (iff 32-bit x86)
528 for (auto VT : { MVT::i32, MVT::i64 }) {
529 if (VT == MVT::i64 && !Subtarget.is64Bit())
530 continue;
534 }
535
536 if (Subtarget.hasSSEPrefetch())
538
540
541 // Expand certain atomics
542 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
550 }
551
552 if (!Subtarget.is64Bit())
554
555 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
556 // All CPUs supporting AVX will atomically load/store aligned 128-bit
557 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
560 }
561
562 if (Subtarget.canUseCMPXCHG16B())
564
565 // FIXME - use subtarget debug flags
566 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
567 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
568 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
570 }
571
574
577
578 setOperationAction(ISD::TRAP, MVT::Other, Legal);
580 if (Subtarget.isTargetPS())
582 else
584
585 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
587 setOperationAction(ISD::VAEND , MVT::Other, Expand);
588 bool Is64Bit = Subtarget.is64Bit();
589 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
590 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
591
594
596
597 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
600
602
603 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
604 setOperationAction(ISD::FABS, VT, Action);
605 setOperationAction(ISD::FNEG, VT, Action);
607 setOperationAction(ISD::FREM, VT, Action);
608 setOperationAction(ISD::FMA, VT, Action);
609 setOperationAction(ISD::FMINNUM, VT, Action);
610 setOperationAction(ISD::FMAXNUM, VT, Action);
615 setOperationAction(ISD::FSIN, VT, Action);
616 setOperationAction(ISD::FCOS, VT, Action);
617 setOperationAction(ISD::FSINCOS, VT, Action);
618 setOperationAction(ISD::FTAN, VT, Action);
619 setOperationAction(ISD::FSQRT, VT, Action);
620 setOperationAction(ISD::FPOW, VT, Action);
621 setOperationAction(ISD::FPOWI, VT, Action);
622 setOperationAction(ISD::FLOG, VT, Action);
623 setOperationAction(ISD::FLOG2, VT, Action);
624 setOperationAction(ISD::FLOG10, VT, Action);
625 setOperationAction(ISD::FEXP, VT, Action);
626 setOperationAction(ISD::FEXP2, VT, Action);
627 setOperationAction(ISD::FEXP10, VT, Action);
628 setOperationAction(ISD::FCEIL, VT, Action);
629 setOperationAction(ISD::FFLOOR, VT, Action);
631 setOperationAction(ISD::FRINT, VT, Action);
632 setOperationAction(ISD::BR_CC, VT, Action);
633 setOperationAction(ISD::SETCC, VT, Action);
636 setOperationAction(ISD::FROUND, VT, Action);
638 setOperationAction(ISD::FTRUNC, VT, Action);
639 setOperationAction(ISD::FLDEXP, VT, Action);
641 };
642
643 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
644 // f16, f32 and f64 use SSE.
645 // Set up the FP register classes.
646 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
647 : &X86::FR16RegClass);
648 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
649 : &X86::FR32RegClass);
650 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
651 : &X86::FR64RegClass);
652
653 // Disable f32->f64 extload as we can only generate this in one instruction
654 // under optsize. So it's easier to pattern match (fpext (load)) for that
655 // case instead of needing to emit 2 instructions for extload in the
656 // non-optsize case.
657 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
658
659 for (auto VT : { MVT::f32, MVT::f64 }) {
660 // Use ANDPD to simulate FABS.
662
663 // Use XORP to simulate FNEG.
665
666 // Use ANDPD and ORPD to simulate FCOPYSIGN.
668
669 // These might be better off as horizontal vector ops.
672
673 // We don't support sin/cos/fmod
677 }
678
679 // Half type will be promoted by default.
680 setF16Action(MVT::f16, Promote);
691
721
726
731
732 // Lower this to MOVMSK plus an AND.
735
736 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
737 (UseX87 || Is64Bit)) {
738 // Use SSE for f32, x87 for f64.
739 // Set up the FP register classes.
740 addRegisterClass(MVT::f32, &X86::FR32RegClass);
741 if (UseX87)
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743
744 // Use ANDPS to simulate FABS.
746
747 // Use XORP to simulate FNEG.
749
750 if (UseX87)
752
753 // Use ANDPS and ORPS to simulate FCOPYSIGN.
754 if (UseX87)
757
758 // We don't support sin/cos/fmod
762
763 if (UseX87) {
764 // Always expand sin/cos functions even though x87 has an instruction.
768 }
769 } else if (UseX87) {
770 // f32 and f64 in x87.
771 // Set up the FP register classes.
772 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
773 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
774
775 for (auto VT : { MVT::f32, MVT::f64 }) {
778
779 // Always expand sin/cos functions even though x87 has an instruction.
783 }
784 }
785
786 // Expand FP32 immediates into loads from the stack, save special cases.
787 if (isTypeLegal(MVT::f32)) {
788 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
789 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
790 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
791 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
792 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
793 } else // SSE immediates.
794 addLegalFPImmediate(APFloat(+0.0f)); // xorps
795 }
796 // Expand FP64 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f64)) {
798 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
799 addLegalFPImmediate(APFloat(+0.0)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0)); // xorpd
805 }
806 // Support fp16 0 immediate.
807 if (isTypeLegal(MVT::f16))
808 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
809
810 // Handle constrained floating-point operations of scalar.
823
824 // We don't support FMA.
827
828 // f80 always uses X87.
829 if (UseX87) {
830 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
833 {
835 addLegalFPImmediate(TmpFlt); // FLD0
836 TmpFlt.changeSign();
837 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
838
839 bool ignored;
840 APFloat TmpFlt2(+1.0);
842 &ignored);
843 addLegalFPImmediate(TmpFlt2); // FLD1
844 TmpFlt2.changeSign();
845 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
846 }
847
848 // Always expand sin/cos functions even though x87 has an instruction.
849 // clang-format off
861 // clang-format on
862
874
875 // Handle constrained floating-point operations of scalar.
882 if (isTypeLegal(MVT::f16)) {
885 } else {
887 }
888 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
889 // as Custom.
891 }
892
893 // f128 uses xmm registers, but most operations require libcalls.
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
895 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
896 : &X86::VR128RegClass);
897
898 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
899
910
914
915 // clang-format off
923 // clang-format on
924 // No STRICT_FSINCOS
927
930 // We need to custom handle any FP_ROUND with an f128 input, but
931 // LegalizeDAG uses the result type to know when to run a custom handler.
932 // So we have to list all legal floating point result types here.
933 if (isTypeLegal(MVT::f32)) {
936 }
937 if (isTypeLegal(MVT::f64)) {
940 }
941 if (isTypeLegal(MVT::f80)) {
945 }
946
948
949 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
951 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
952 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
953 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
954 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
955 }
956
957 // Always use a library call for pow.
958 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
959 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
960 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
961 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
962
971
972 // Some FP actions are always expanded for vector types.
973 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
976 // clang-format off
990 // clang-format on
991 }
992
993 // First set operation action for all vector types to either promote
994 // (for widening) or expand (for scalarization). Then we will selectively
995 // turn on ones that can be effectively codegen'd.
1035 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1036 setTruncStoreAction(InnerVT, VT, Expand);
1037
1038 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1039 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1040
1041 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1042 // types, we have to deal with them whether we ask for Expansion or not.
1043 // Setting Expand causes its own optimisation problems though, so leave
1044 // them legal.
1045 if (VT.getVectorElementType() == MVT::i1)
1046 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1047
1048 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1049 // split/scalarized right now.
1050 if (VT.getVectorElementType() == MVT::f16 ||
1051 VT.getVectorElementType() == MVT::bf16)
1052 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1053 }
1054 }
1055
1056 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1057 // with -msoft-float, disable use of MMX as well.
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1059 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1060 // No operations on x86mmx supported, everything uses intrinsics.
1061 }
1062
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1064 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1065 : &X86::VR128RegClass);
1066
1071
1072 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1073 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1081
1082 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1083 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1085
1091 }
1092
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1094 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096
1097 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1098 // registers cannot be used even for integer operations.
1099 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1100 : &X86::VR128RegClass);
1101 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1108 : &X86::VR128RegClass);
1109
1110 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1115 }
1116
1117 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1119 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1120
1121 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1122 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1123 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1124 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1126 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1128 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1130 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1133
1134 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1136 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1137
1138 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1140 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1142
1143 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1144 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1145
1146 setOperationAction(ISD::AND, MVT::i128, Custom);
1147 setOperationAction(ISD::OR, MVT::i128, Custom);
1148 setOperationAction(ISD::XOR, MVT::i128, Custom);
1150
1151 if (Subtarget.hasPCLMUL()) {
1152 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1155 }
1159 }
1160
1161 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1162 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1163 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1164 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1165 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1169 }
1170
1171 // SSE2 can use basic vector unrolling.
1172 // SSE41 can use PHMINPOS to perform v16i8/v8i16 minmax reductions.
1173 // Fallback to ReplaceNodeResults for vXi64 reductions on 32-bit targets.
1174 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::i64}) {
1179 }
1180
1191
1196
1197 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1203
1204 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1205 // setcc all the way to isel and prefer SETGT in some isel patterns.
1208 }
1209
1210 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1211 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1216
1217 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1223 }
1224
1225 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1229
1230 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1231 continue;
1232
1235 }
1236 setF16Action(MVT::v8f16, Expand);
1237 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1238 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1239 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1240 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1241 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1242 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1244
1245 // Custom lower v2i64 and v2f64 selects.
1252
1259
1260 // Custom legalize these to avoid over promotion or custom promotion.
1261 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1266 }
1267
1272
1275
1278
1279 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1284
1289
1290 // We want to legalize this to an f64 load rather than an i64 load on
1291 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1292 // store.
1293 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1294 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1295 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1296 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1297 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1299
1300 // Add 32-bit vector stores to help vectorization opportunities.
1301 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1303
1307 if (!Subtarget.hasAVX512())
1309
1313
1315
1332
1333 // In the customized shift lowering, the legal v4i32/v2i64 cases
1334 // in AVX2 will be recognized.
1335 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1339 if (VT == MVT::v2i64) continue;
1344 }
1345
1351 }
1352
1353 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1358
1359 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1361 }
1362
1363 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1364 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
1365 }
1366
1367 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1368 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1369 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1370 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1371
1372 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1375 }
1377
1378 // These might be better off as horizontal vector ops.
1383 }
1384
1385 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1386 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1389 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1393 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1399
1401 }
1402
1403 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1404 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1405 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1406 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1407 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1408 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1409 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1410 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1411
1415
1416 // FIXME: Do we need to handle scalar-to-vector here?
1417 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1418 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1419
1420 // We directly match byte blends in the backend as they match the VSELECT
1421 // condition form.
1423
1424 // SSE41 brings specific instructions for doing vector sign extend even in
1425 // cases where we don't have SRA.
1426 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1429 }
1430
1431 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1432 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1433 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1434 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1435 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1436 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1437 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1438 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1439 }
1440
1441 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1442 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1443 // do the pre and post work in the vector domain.
1446 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1447 // so that DAG combine doesn't try to turn it into uint_to_fp.
1450 }
1451 }
1452
1453 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1455 }
1456
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1458 for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1459 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1462 }
1463
1464 // XOP can efficiently perform BITREVERSE with VPPERM.
1465 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1467 }
1468
1469 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1470 bool HasInt256 = Subtarget.hasInt256();
1471
1472 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1473 : &X86::VR256RegClass);
1474 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1475 : &X86::VR256RegClass);
1476 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1477 : &X86::VR256RegClass);
1478 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1479 : &X86::VR256RegClass);
1480 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1481 : &X86::VR256RegClass);
1482 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1483 : &X86::VR256RegClass);
1484 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1485 : &X86::VR256RegClass);
1486
1487 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1500
1502
1506
1512 }
1513
1514 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1515 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1516
1517 setOperationAction(ISD::AND, MVT::i256, Custom);
1518 setOperationAction(ISD::OR, MVT::i256, Custom);
1519 setOperationAction(ISD::XOR, MVT::i256, Custom);
1522
1523 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1524 // even though v8i16 is a legal type.
1525 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1526 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1527 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1528 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1532
1539
1551
1552 if (!Subtarget.hasAVX512())
1554
1555 // In the customized shift lowering, the legal v8i32/v4i64 cases
1556 // in AVX2 will be recognized.
1557 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1570 if (VT == MVT::v4i64) continue;
1575 }
1576
1577 // These types need custom splitting if their input is a 128-bit vector.
1582
1586 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1587 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1590
1591 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1595 }
1596
1601
1602 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1607
1608 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1609 // setcc all the way to isel and prefer SETGT in some isel patterns.
1612 }
1613
1614 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1615 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1620
1621 if (Subtarget.hasAnyFMA()) {
1622 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1623 MVT::v2f64, MVT::v4f64 }) {
1626 }
1627 }
1628
1629 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1630 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1632 }
1633
1634 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1635 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1636 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1637 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1638
1639 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1640 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1641 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1644 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1645 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1646 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1647
1648 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1649 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1650
1651 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1652 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1653 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1654 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1655 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1656
1657 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1658 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1659 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1660 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1661 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1662 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1663 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1664 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1669
1670 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1671 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1672 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1673 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1674 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1675 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1676 }
1677
1678 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1681 }
1682
1683 if (HasInt256) {
1684 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1685 // when we have a 256bit-wide blend with immediate.
1688
1689 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1690 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1691 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1692 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1693 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1694 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1695 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1696 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1697 }
1698 }
1699
1700 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1701 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1702 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1704 }
1705
1706 // Extract subvector is special because the value type
1707 // (result) is 128-bit but the source is 256-bit wide.
1708 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1709 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1711 }
1712
1713 // Custom lower several nodes for 256-bit types.
1714 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1715 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1725 }
1726 setF16Action(MVT::v16f16, Expand);
1727 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1728 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1730 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1731 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1732 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1733 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1734
1735 // Only PCLMUL required as we always unroll clmul vectors.
1736 if (Subtarget.hasPCLMUL()) {
1737 for (auto VT : {MVT::v8i32, MVT::v4i64}) {
1740 }
1741 }
1742
1743 if (HasInt256) {
1745
1746 // Custom legalize 2x32 to get a little better code.
1749
1750 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1751 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1753 }
1754
1755 if (Subtarget.hasGFNI()) {
1756 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1757 setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
1758 }
1759 }
1760
1761 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1762 Subtarget.hasF16C()) {
1763 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1766 }
1767 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1770 }
1771 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1772 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1773 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1774 }
1775 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1776 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1777 }
1778
1779 // This block controls legalization of the mask vector sizes that are
1780 // available with AVX512. 512-bit vectors are in a separate block controlled
1781 // by useAVX512Regs.
1782 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1783 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1784 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1785 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1786 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1787 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1788
1792
1793 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1794 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1795 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1796 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1797 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1798 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1799 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1800 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1808
1809 // There is no byte sized k-register load or store without AVX512DQ.
1810 if (!Subtarget.hasDQI()) {
1811 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1812 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1813 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1814 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1815
1820 }
1821
1822 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1823 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1827 }
1828
1829 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1831
1832 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1836
1843 }
1844
1845 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1847 }
1848 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1849 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1852 }
1853 }
1854
1855 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1856 // elements. 512-bits can be disabled based on prefer-vector-width and
1857 // required-vector-width function attributes.
1858 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1859 bool HasBWI = Subtarget.hasBWI();
1860
1861 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1862 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1863 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1864 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1865 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1866 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1867 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1868
1869 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1870 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1871 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1872 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1873 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1874 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1875 if (HasBWI)
1876 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1877 }
1878
1879 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1890 }
1891 setOperationAction(ISD::LRINT, MVT::v16f32,
1892 Subtarget.hasDQI() ? Legal : Custom);
1893 setOperationAction(ISD::LRINT, MVT::v8f64,
1894 Subtarget.hasDQI() ? Legal : Custom);
1895 if (Subtarget.hasDQI())
1896 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1897
1898 setOperationAction(ISD::AND, MVT::i512, Custom);
1899 setOperationAction(ISD::OR, MVT::i512, Custom);
1900 setOperationAction(ISD::XOR, MVT::i512, Custom);
1901 setOperationAction(ISD::ADD, MVT::i512, Custom);
1902 setOperationAction(ISD::SUB, MVT::i512, Custom);
1903 setOperationAction(ISD::SRL, MVT::i512, Custom);
1904 setOperationAction(ISD::SHL, MVT::i512, Custom);
1905 setOperationAction(ISD::SRA, MVT::i512, Custom);
1906 setOperationAction(ISD::FSHR, MVT::i512, Custom);
1907 setOperationAction(ISD::FSHL, MVT::i512, Custom);
1908 setOperationAction(ISD::FSHR, MVT::i256, Custom);
1909 setOperationAction(ISD::FSHL, MVT::i256, Custom);
1912
1913 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1918 }
1919
1920 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1925 }
1926
1933
1945
1946 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1947 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1948 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1949 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1950 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1951 if (HasBWI)
1952 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1953
1954 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1955 // to 512-bit rather than use the AVX2 instructions so that we can use
1956 // k-masks.
1957 if (!Subtarget.hasVLX()) {
1958 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1959 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1962 }
1963 }
1964
1966 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1967 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1977
1978 if (HasBWI) {
1979 // Extends from v64i1 masks to 512-bit vectors.
1983 }
1984
1985 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1998
2000 }
2001
2002 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2005 }
2006
2007 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
2010 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
2011
2012 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
2013 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
2014 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
2015 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
2016
2017 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
2018 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
2019 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
2020 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
2021 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
2022 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
2023 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
2024 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
2025
2026 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
2027 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
2028
2029 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2046
2047 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
2048 // setcc all the way to isel and prefer SETGT in some isel patterns.
2051 }
2052
2053 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
2054 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
2059
2060 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2069 }
2070
2071 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2072 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2073 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2075 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2076 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2077 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2078 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2083 }
2084
2085 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2086 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2087 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2088 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2089 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2090 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2091
2092 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2096 setOperationAction(Opc, MVT::v8i64, Custom);
2097
2098 if (Subtarget.hasDQI())
2099 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2100
2101 if (Subtarget.hasCDI()) {
2102 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2103 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2105 }
2106 } // Subtarget.hasCDI()
2107
2108 if (Subtarget.hasVPOPCNTDQ()) {
2109 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2112 }
2113
2114 // Extract subvector is special because the value type
2115 // (result) is 256-bit but the source is 512-bit wide.
2116 // 128-bit was made Legal under AVX1.
2117 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2118 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2120
2121 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2122 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2132 }
2133 setF16Action(MVT::v32f16, Expand);
2138 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2139 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2140 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2141
2142 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2147 }
2148 if (HasBWI) {
2149 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2152 }
2153 } else {
2154 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2155 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2156 }
2157
2158 if (Subtarget.hasVBMI2()) {
2159 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2162 }
2163
2164 setOperationAction(ISD::ROTL, MVT::v32i16, Legal);
2165 setOperationAction(ISD::ROTR, MVT::v32i16, Legal);
2166 }
2167
2168 // Only PCLMUL required as we always unroll clmul vectors.
2169 if (Subtarget.hasPCLMUL()) {
2170 for (auto VT : {MVT::v16i32, MVT::v8i64}) {
2173 }
2174 }
2175
2176 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2177 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2179
2180 if (Subtarget.hasGFNI()) {
2181 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
2182 setOperationAction(ISD::CTTZ, MVT::v64i8, Custom);
2183 }
2184 }// useAVX512Regs
2185
2186 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2187 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2188 MVT::v4i64}) {
2191 }
2192
2193 setOperationAction(ISD::ROTL, MVT::v16i16, Legal);
2194 setOperationAction(ISD::ROTR, MVT::v16i16, Legal);
2195 setOperationAction(ISD::ROTL, MVT::v8i16, Legal);
2196 setOperationAction(ISD::ROTR, MVT::v8i16, Legal);
2197 }
2198
2199 // This block controls legalization for operations that don't have
2200 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2201 // narrower widths.
2202 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2203 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2204 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2205 MVT::v16f32, MVT::v8f64})
2207
2208 // These operations are handled on non-VLX by artificially widening in
2209 // isel patterns.
2213
2214 if (Subtarget.hasDQI()) {
2215 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2216 // v2f32 UINT_TO_FP is already custom under SSE2.
2219 "Unexpected operation action!");
2220 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2225 }
2226
2227 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2233 }
2234
2235 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2238 }
2239
2240 // Custom legalize 2x32 to get a little better code.
2243
2244 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2245 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2247
2248 if (Subtarget.hasDQI()) {
2252 setOperationAction(Opc, MVT::v2i64, Custom);
2253 setOperationAction(Opc, MVT::v4i64, Custom);
2254 }
2255 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2256 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2257 }
2258
2259 if (Subtarget.hasCDI()) {
2260 for (auto VT : {MVT::i256, MVT::i512}) {
2261 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2262 continue;
2267 }
2268 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2270 }
2271 } // Subtarget.hasCDI()
2272
2273 if (Subtarget.hasVPOPCNTDQ()) {
2274 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
2277 }
2278
2279 // We can try to convert vectors to different sizes to leverage legal
2280 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2281 // then specialize to Legal below.
2282 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2283 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2284 MVT::v16i16, MVT::v8i8})
2286
2287 // Legal vpcompress depends on various AVX512 extensions.
2288 // Legal in AVX512F
2289 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2291
2292 // Legal in AVX512F + AVX512VL
2293 if (Subtarget.hasVLX())
2294 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2295 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2297
2298 // Legal in AVX512F + AVX512VBMI2
2299 if (Subtarget.hasVBMI2())
2300 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2302
2303 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2304 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2305 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2307 }
2308
2309 // This block controls legalization of v32i1/v64i1 which are available with
2310 // AVX512BW.
2311 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2312 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2313 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2314
2315 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2326 }
2327
2328 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2330
2331 // Extends from v32i1 masks to 256-bit vectors.
2335
2336 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2337 MVT::v16f16, MVT::v8f16}) {
2338 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2339 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2340 }
2341
2342 // These operations are handled on non-VLX by artificially widening in
2343 // isel patterns.
2344 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2345
2346 if (Subtarget.hasBITALG()) {
2347 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2349 }
2350 }
2351
2352 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2353 auto setGroup = [&] (MVT VT) {
2364
2377
2379
2382
2388
2394
2398 };
2399
2400 // AVX512_FP16 scalar operations
2401 setGroup(MVT::f16);
2419
2422
2423 if (Subtarget.useAVX512Regs()) {
2424 setGroup(MVT::v32f16);
2430 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2437
2442 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2444 MVT::v32i16);
2445 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2447 MVT::v32i16);
2448 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2450 MVT::v32i16);
2451 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2453 MVT::v32i16);
2454
2458
2459 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2460 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2461
2466 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2467 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2468 }
2469
2474
2475 if (Subtarget.hasVLX()) {
2476 setGroup(MVT::v8f16);
2477 setGroup(MVT::v16f16);
2478
2489
2496
2497 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2500
2504
2505 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2506 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2507 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2508 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2509
2510 // Need to custom widen these to prevent scalarization.
2511 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2512 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2513
2518
2523 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2524 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2525 }
2526 }
2527
2528 if (!Subtarget.useSoftFloat() &&
2529 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2530 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2531 : &X86::VR128RegClass);
2532 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2533 : &X86::VR256RegClass);
2534 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2535 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2536 // Set the operation action Custom to do the customization later.
2539 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2540 setF16Action(VT, Expand);
2541 if (!Subtarget.hasBF16())
2547 }
2548 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2549 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2550 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2551 }
2552 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2553 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2555 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2556 }
2557
2558 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2559 Subtarget.useAVX512Regs()) {
2560 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2561 setF16Action(MVT::v32bf16, Expand);
2562 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2563 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2564 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2566 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2570 }
2571
2572 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2573 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2574 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2575 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2576 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2577 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2578 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2579 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2580 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2581 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2584 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2596 }
2597 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2600 }
2601 }
2602
2603 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2604 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2605 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2606 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2607 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2608 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2609
2610 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2611 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2612 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2613 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2614 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2615
2616 if (Subtarget.hasBWI()) {
2617 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2618 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2619 }
2620
2621 if (Subtarget.hasFP16()) {
2622 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2631 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2640 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2645 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2650 }
2651 }
2652
2653 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2654 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2655 }
2656
2657 // We want to custom lower some of our intrinsics.
2661 if (!Subtarget.is64Bit()) {
2663 }
2664
2665 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2666 // handle type legalization for these operations here.
2667 //
2668 // FIXME: We really should do custom legalization for addition and
2669 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2670 // than generic legalization for 64-bit multiplication-with-overflow, though.
2671 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2672 if (VT == MVT::i64 && !Subtarget.is64Bit())
2673 continue;
2674 // Add/Sub/Mul with overflow operations are custom lowered.
2681
2682 // Support carry in as value rather than glue.
2688 }
2689
2690 // Combine sin / cos into _sincos_stret if it is available.
2693
2694 if (Subtarget.isTargetWin64()) {
2695 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2696 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2697 setOperationAction(ISD::SREM, MVT::i128, Custom);
2698 setOperationAction(ISD::UREM, MVT::i128, Custom);
2707 }
2708
2709 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2710 // is. We should promote the value to 64-bits to solve this.
2711 // This is what the CRT headers do - `fmodf` is an inline header
2712 // function casting to f64 and calling `fmod`.
2713 if (Subtarget.is32Bit() &&
2714 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2715 // clang-format off
2716 for (ISD::NodeType Op :
2734 // TODO: Add ISD::STRICT_FMODF too once implemented.
2735 ISD::FMODF})
2736 if (isOperationExpandOrLibCall(Op, MVT::f32))
2737 setOperationAction(Op, MVT::f32, Promote);
2738 // clang-format on
2739
2740 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2741 // it, but it's just a wrapper around ldexp.
2742 if (Subtarget.isOSWindows()) {
2744 if (isOperationExpand(Op, MVT::f32))
2745 setOperationAction(Op, MVT::f32, Promote);
2746 }
2747
2748 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
2749 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
2750 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
2751
2752 setOperationPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
2753 setOperationPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
2754 setOperationPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
2755
2756 // We have target-specific dag combine patterns for the following nodes:
2767 ISD::SHL,
2768 ISD::SRA,
2769 ISD::SRL,
2770 ISD::OR,
2771 ISD::AND,
2777 ISD::ADD,
2780 ISD::FADD,
2781 ISD::FSUB,
2782 ISD::FNEG,
2783 ISD::FMA,
2787 ISD::SUB,
2788 ISD::LOAD,
2789 ISD::LRINT,
2791 ISD::MLOAD,
2792 ISD::STORE,
2809 ISD::SETCC,
2810 ISD::MUL,
2811 ISD::XOR,
2819 ISD::ROTL,
2820 ISD::ROTR,
2821 ISD::FSHL,
2822 ISD::FSHR,
2826
2827 computeRegisterProperties(Subtarget.getRegisterInfo());
2828
2829 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2831 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2833 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2835
2836 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2837 // that needs to be benchmarked and balanced with the potential use of vector
2838 // load/store types (PR33329, PR33914).
2841
2842 // Default loop alignment, which can be overridden by -align-loops.
2844
2845 // An out-of-order CPU can speculatively execute past a predictable branch,
2846 // but a conditional move could be stalled by an expensive earlier operation.
2847 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2848 EnableExtLdPromotion = true;
2850
2852
2853 // Default to having -disable-strictnode-mutation on
2854 IsStrictFPEnabled = true;
2855}
2856
2857// This has so far only been implemented for 64-bit MachO.
2859 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2860}
2861
2863 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2864 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2865}
2866
2868 const SDLoc &DL) const {
2869 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2870 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2871 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2872 return SDValue(Node, 0);
2873}
2874
2877 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2878 !Subtarget.hasBWI())
2879 return TypeSplitVector;
2880
2881 // Since v8f16 is legal, widen anything over v4f16.
2882 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2883 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2884 VT.getVectorElementType() == MVT::f16)
2885 return TypeSplitVector;
2886
2887 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2888 VT.getVectorElementType() != MVT::i1)
2889 return TypeWidenVector;
2890
2892}
2893
2895 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2896 const LibcallLoweringInfo *libcallLowering) const {
2897 return X86::createFastISel(funcInfo, libInfo, libcallLowering);
2898}
2899
2900//===----------------------------------------------------------------------===//
2901// Other Lowering Hooks
2902//===----------------------------------------------------------------------===//
2903
2905 bool AssumeSingleUse, bool IgnoreAlignment) {
2906 if (!AssumeSingleUse && !Op.hasOneUse())
2907 return false;
2908 if (!ISD::isNormalLoad(Op.getNode()))
2909 return false;
2910
2911 // If this is an unaligned vector, make sure the target supports folding it.
2912 auto *Ld = cast<LoadSDNode>(Op.getNode());
2913 if (!IgnoreAlignment && !Subtarget.hasAVX() &&
2914 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2915 Ld->getAlign() < Align(16))
2916 return false;
2917
2918 // TODO: If this is a non-temporal load and the target has an instruction
2919 // for it, it should not be folded. See "useNonTemporalLoad()".
2920
2921 return true;
2922}
2923
2925 const X86Subtarget &Subtarget,
2926 bool AssumeSingleUse) {
2927 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2928 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2929 return false;
2930
2931 // We can not replace a wide volatile load with a broadcast-from-memory,
2932 // because that would narrow the load, which isn't legal for volatiles.
2933 auto *Ld = cast<LoadSDNode>(Op.getNode());
2934 return !Ld->isVolatile() ||
2935 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2936}
2937
2939 if (!Op.hasOneUse())
2940 return false;
2941 // Peek through (oneuse) bitcast users
2942 SDNode *User = *Op->user_begin();
2943 while (User->getOpcode() == ISD::BITCAST) {
2944 if (!User->hasOneUse())
2945 return false;
2946 User = *User->user_begin();
2947 }
2948 return ISD::isNormalStore(User);
2949}
2950
2952 if (Op.hasOneUse()) {
2953 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2954 return (ISD::ZERO_EXTEND == Opcode);
2955 }
2956 return false;
2957}
2958
2959// Return true if it's cheap to bitcast this to a vector type.
2961 const X86Subtarget &Subtarget) {
2962 if (peekThroughBitcasts(Op).getValueType().isVector())
2963 return true;
2965 return true;
2966
2967 EVT VT = Op.getValueType();
2968 unsigned Opcode = Op.getOpcode();
2969 if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
2970 DAG.getTargetLoweringInfo().getOperationAction(Opcode, VT) ==
2972 // Check for larger than legal scalar integer ops that might have been
2973 // custom lowered to vector instruction.
2974 switch (Opcode) {
2975 case ISD::BITREVERSE:
2976 return true;
2977 case ISD::SHL:
2978 case ISD::SRL:
2979 case ISD::SRA:
2980 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
2981 case ISD::AND:
2982 case ISD::OR:
2983 case ISD::XOR:
2984 case ISD::ADD:
2985 case ISD::SUB:
2986 case ISD::FSHL:
2987 case ISD::FSHR:
2988 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
2989 mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
2990 case ISD::SELECT:
2991 return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
2992 mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
2993 }
2994 }
2995 return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
2996 /*IgnoreAlignment=*/true);
2997}
2998
2999static bool isLogicOp(unsigned Opcode) {
3000 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
3001 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
3002}
3003
3004static bool isTargetShuffle(unsigned Opcode) {
3005 switch(Opcode) {
3006 default: return false;
3007 case X86ISD::BLENDI:
3008 case X86ISD::PSHUFB:
3009 case X86ISD::PSHUFD:
3010 case X86ISD::PSHUFHW:
3011 case X86ISD::PSHUFLW:
3012 case X86ISD::SHUFP:
3013 case X86ISD::INSERTPS:
3014 case X86ISD::EXTRQI:
3015 case X86ISD::INSERTQI:
3016 case X86ISD::VALIGN:
3017 case X86ISD::PALIGNR:
3018 case X86ISD::VSHLDQ:
3019 case X86ISD::VSRLDQ:
3020 case X86ISD::MOVLHPS:
3021 case X86ISD::MOVHLPS:
3022 case X86ISD::MOVSHDUP:
3023 case X86ISD::MOVSLDUP:
3024 case X86ISD::MOVDDUP:
3025 case X86ISD::MOVSS:
3026 case X86ISD::MOVSD:
3027 case X86ISD::MOVSH:
3028 case X86ISD::UNPCKL:
3029 case X86ISD::UNPCKH:
3030 case X86ISD::VBROADCAST:
3031 case X86ISD::VPERMILPI:
3032 case X86ISD::VPERMILPV:
3033 case X86ISD::VPERM2X128:
3034 case X86ISD::SHUF128:
3035 case X86ISD::VPERMIL2:
3036 case X86ISD::VPERMI:
3037 case X86ISD::VPPERM:
3038 case X86ISD::VPERMV:
3039 case X86ISD::VPERMV3:
3040 case X86ISD::VZEXT_MOVL:
3041 case X86ISD::COMPRESS:
3042 case X86ISD::EXPAND:
3043 return true;
3044 }
3045}
3046
3047static bool isTargetShuffleVariableMask(unsigned Opcode) {
3048 switch (Opcode) {
3049 default: return false;
3050 // Target Shuffles.
3051 case X86ISD::PSHUFB:
3052 case X86ISD::VPERMILPV:
3053 case X86ISD::VPERMIL2:
3054 case X86ISD::VPPERM:
3055 case X86ISD::VPERMV:
3056 case X86ISD::VPERMV3:
3057 return true;
3058 // 'Faux' Target Shuffles.
3059 case ISD::OR:
3060 case ISD::AND:
3061 case X86ISD::ANDNP:
3062 return true;
3063 }
3064}
3065
3068 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3070 int ReturnAddrIndex = FuncInfo->getRAIndex();
3071
3072 if (ReturnAddrIndex == 0) {
3073 // Set up a frame object for the return address.
3074 unsigned SlotSize = RegInfo->getSlotSize();
3075 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
3076 -(int64_t)SlotSize,
3077 false);
3078 FuncInfo->setRAIndex(ReturnAddrIndex);
3079 }
3080
3081 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3082}
3083
3085 bool HasSymbolicDisplacement) {
3086 // Offset should fit into 32 bit immediate field.
3087 if (!isInt<32>(Offset))
3088 return false;
3089
3090 // If we don't have a symbolic displacement - we don't have any extra
3091 // restrictions.
3092 if (!HasSymbolicDisplacement)
3093 return true;
3094
3095 // We can fold large offsets in the large code model because we always use
3096 // 64-bit offsets.
3097 if (CM == CodeModel::Large)
3098 return true;
3099
3100 // For kernel code model we know that all objects reside in the negative half
3101 // of the 32-bit address space. We may not accept negative offsets, since they
3102 // may be just off, and we may accept pretty large positive ones.
3103 if (CM == CodeModel::Kernel)
3104 return Offset >= 0;
3105
3106 // For other non-large code models we assume that latest small object is 16MB
3107 // before end of 31 bits boundary. We may also accept pretty large negative
3108 // constants knowing that all objects are in the positive half of address
3109 // space.
3110 return Offset < 16 * 1024 * 1024;
3111}
3112
3113/// Return true if the condition is an signed comparison operation.
3114static bool isX86CCSigned(X86::CondCode X86CC) {
3115 switch (X86CC) {
3116 default:
3117 llvm_unreachable("Invalid integer condition!");
3118 case X86::COND_E:
3119 case X86::COND_NE:
3120 case X86::COND_B:
3121 case X86::COND_A:
3122 case X86::COND_BE:
3123 case X86::COND_AE:
3124 return false;
3125 case X86::COND_G:
3126 case X86::COND_GE:
3127 case X86::COND_L:
3128 case X86::COND_LE:
3129 return true;
3130 }
3131}
3132
3134 switch (SetCCOpcode) {
3135 // clang-format off
3136 default: llvm_unreachable("Invalid integer condition!");
3137 case ISD::SETEQ: return X86::COND_E;
3138 case ISD::SETGT: return X86::COND_G;
3139 case ISD::SETGE: return X86::COND_GE;
3140 case ISD::SETLT: return X86::COND_L;
3141 case ISD::SETLE: return X86::COND_LE;
3142 case ISD::SETNE: return X86::COND_NE;
3143 case ISD::SETULT: return X86::COND_B;
3144 case ISD::SETUGT: return X86::COND_A;
3145 case ISD::SETULE: return X86::COND_BE;
3146 case ISD::SETUGE: return X86::COND_AE;
3147 // clang-format on
3148 }
3149}
3150
3151/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
3152/// condition code, returning the condition code and the LHS/RHS of the
3153/// comparison to make.
3155 bool isFP, SDValue &LHS, SDValue &RHS,
3156 SelectionDAG &DAG) {
3157 if (!isFP) {
3159 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
3160 // X > -1 -> X == 0, jump !sign.
3161 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3162 return X86::COND_NS;
3163 }
3164 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3165 // X < 0 -> X == 0, jump on sign.
3166 return X86::COND_S;
3167 }
3168 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3169 // X >= 0 -> X == 0, jump on !sign.
3170 return X86::COND_NS;
3171 }
3172 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3173 // X < 1 -> X <= 0
3174 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3175 return X86::COND_LE;
3176 }
3177 }
3178
3179 return TranslateIntegerX86CC(SetCCOpcode);
3180 }
3181
3182 // First determine if it is required or is profitable to flip the operands.
3183
3184 // If LHS is a foldable load, but RHS is not, flip the condition.
3185 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3186 !ISD::isNON_EXTLoad(RHS.getNode())) {
3187 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3188 std::swap(LHS, RHS);
3189 }
3190
3191 switch (SetCCOpcode) {
3192 default: break;
3193 case ISD::SETOLT:
3194 case ISD::SETOLE:
3195 case ISD::SETUGT:
3196 case ISD::SETUGE:
3197 std::swap(LHS, RHS);
3198 break;
3199 }
3200
3201 // On a floating point condition, the flags are set as follows:
3202 // ZF PF CF op
3203 // 0 | 0 | 0 | X > Y
3204 // 0 | 0 | 1 | X < Y
3205 // 1 | 0 | 0 | X == Y
3206 // 1 | 1 | 1 | unordered
3207 switch (SetCCOpcode) {
3208 // clang-format off
3209 default: llvm_unreachable("Condcode should be pre-legalized away");
3210 case ISD::SETUEQ:
3211 case ISD::SETEQ: return X86::COND_E;
3212 case ISD::SETOLT: // flipped
3213 case ISD::SETOGT:
3214 case ISD::SETGT: return X86::COND_A;
3215 case ISD::SETOLE: // flipped
3216 case ISD::SETOGE:
3217 case ISD::SETGE: return X86::COND_AE;
3218 case ISD::SETUGT: // flipped
3219 case ISD::SETULT:
3220 case ISD::SETLT: return X86::COND_B;
3221 case ISD::SETUGE: // flipped
3222 case ISD::SETULE:
3223 case ISD::SETLE: return X86::COND_BE;
3224 case ISD::SETONE:
3225 case ISD::SETNE: return X86::COND_NE;
3226 case ISD::SETUO: return X86::COND_P;
3227 case ISD::SETO: return X86::COND_NP;
3228 case ISD::SETOEQ:
3229 case ISD::SETUNE: return X86::COND_INVALID;
3230 // clang-format on
3231 }
3232}
3233
3234/// Is there a floating point cmov for the specific X86 condition code?
3235/// Current x86 isa includes the following FP cmov instructions:
3236/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3237static bool hasFPCMov(unsigned X86CC) {
3238 switch (X86CC) {
3239 default:
3240 return false;
3241 case X86::COND_B:
3242 case X86::COND_BE:
3243 case X86::COND_E:
3244 case X86::COND_P:
3245 case X86::COND_A:
3246 case X86::COND_AE:
3247 case X86::COND_NE:
3248 case X86::COND_NP:
3249 return true;
3250 }
3251}
3252
3253static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3254 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3255 VT.is512BitVector();
3256}
3257
3260 MachineFunction &MF, unsigned Intrinsic) const {
3261 IntrinsicInfo Info;
3263 Info.offset = 0;
3264
3266 if (!IntrData) {
3267 switch (Intrinsic) {
3268 case Intrinsic::x86_aesenc128kl:
3269 case Intrinsic::x86_aesdec128kl:
3270 Info.opc = ISD::INTRINSIC_W_CHAIN;
3271 Info.ptrVal = I.getArgOperand(1);
3272 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3273 Info.align = Align(1);
3274 Info.flags |= MachineMemOperand::MOLoad;
3275 Infos.push_back(Info);
3276 return;
3277 case Intrinsic::x86_aesenc256kl:
3278 case Intrinsic::x86_aesdec256kl:
3279 Info.opc = ISD::INTRINSIC_W_CHAIN;
3280 Info.ptrVal = I.getArgOperand(1);
3281 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3282 Info.align = Align(1);
3283 Info.flags |= MachineMemOperand::MOLoad;
3284 Infos.push_back(Info);
3285 return;
3286 case Intrinsic::x86_aesencwide128kl:
3287 case Intrinsic::x86_aesdecwide128kl:
3288 Info.opc = ISD::INTRINSIC_W_CHAIN;
3289 Info.ptrVal = I.getArgOperand(0);
3290 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3291 Info.align = Align(1);
3292 Info.flags |= MachineMemOperand::MOLoad;
3293 Infos.push_back(Info);
3294 return;
3295 case Intrinsic::x86_aesencwide256kl:
3296 case Intrinsic::x86_aesdecwide256kl:
3297 Info.opc = ISD::INTRINSIC_W_CHAIN;
3298 Info.ptrVal = I.getArgOperand(0);
3299 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3300 Info.align = Align(1);
3301 Info.flags |= MachineMemOperand::MOLoad;
3302 Infos.push_back(Info);
3303 return;
3304 case Intrinsic::x86_cmpccxadd32:
3305 case Intrinsic::x86_cmpccxadd64:
3306 case Intrinsic::x86_atomic_bts:
3307 case Intrinsic::x86_atomic_btc:
3308 case Intrinsic::x86_atomic_btr: {
3309 Info.opc = ISD::INTRINSIC_W_CHAIN;
3310 Info.ptrVal = I.getArgOperand(0);
3311 unsigned Size = I.getType()->getScalarSizeInBits();
3312 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3313 Info.align = Align(Size);
3316 Infos.push_back(Info);
3317 return;
3318 }
3319 case Intrinsic::x86_atomic_bts_rm:
3320 case Intrinsic::x86_atomic_btc_rm:
3321 case Intrinsic::x86_atomic_btr_rm: {
3322 Info.opc = ISD::INTRINSIC_W_CHAIN;
3323 Info.ptrVal = I.getArgOperand(0);
3324 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3325 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3326 Info.align = Align(Size);
3329 Infos.push_back(Info);
3330 return;
3331 }
3332 case Intrinsic::x86_aadd32:
3333 case Intrinsic::x86_aadd64:
3334 case Intrinsic::x86_aand32:
3335 case Intrinsic::x86_aand64:
3336 case Intrinsic::x86_aor32:
3337 case Intrinsic::x86_aor64:
3338 case Intrinsic::x86_axor32:
3339 case Intrinsic::x86_axor64:
3340 case Intrinsic::x86_atomic_add_cc:
3341 case Intrinsic::x86_atomic_sub_cc:
3342 case Intrinsic::x86_atomic_or_cc:
3343 case Intrinsic::x86_atomic_and_cc:
3344 case Intrinsic::x86_atomic_xor_cc: {
3345 Info.opc = ISD::INTRINSIC_W_CHAIN;
3346 Info.ptrVal = I.getArgOperand(0);
3347 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3348 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3349 Info.align = Align(Size);
3352 Infos.push_back(Info);
3353 return;
3354 }
3355 }
3356 return;
3357 }
3358
3359 switch (IntrData->Type) {
3362 case TRUNCATE_TO_MEM_VI32: {
3363 Info.opc = ISD::INTRINSIC_VOID;
3364 Info.ptrVal = I.getArgOperand(0);
3365 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3367 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3368 ScalarVT = MVT::i8;
3369 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3370 ScalarVT = MVT::i16;
3371 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3372 ScalarVT = MVT::i32;
3373
3374 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3375 Info.align = Align(1);
3376 Info.flags |= MachineMemOperand::MOStore;
3377 Infos.push_back(Info);
3378 return;
3379 }
3380 case GATHER:
3381 case GATHER_AVX2: {
3382 Info.opc = ISD::INTRINSIC_W_CHAIN;
3383 Info.ptrVal = nullptr;
3384 MVT DataVT = MVT::getVT(I.getType());
3385 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3386 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3387 IndexVT.getVectorNumElements());
3388 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3389 Info.align = Align(1);
3390 Info.flags |= MachineMemOperand::MOLoad;
3391 Infos.push_back(Info);
3392 return;
3393 }
3394 case SCATTER: {
3395 Info.opc = ISD::INTRINSIC_VOID;
3396 Info.ptrVal = nullptr;
3397 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3398 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3399 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3400 IndexVT.getVectorNumElements());
3401 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3402 Info.align = Align(1);
3403 Info.flags |= MachineMemOperand::MOStore;
3404 Infos.push_back(Info);
3405 return;
3406 }
3407 default:
3408 return;
3409 }
3410}
3411
3412/// Returns true if the target can instruction select the
3413/// specified FP immediate natively. If false, the legalizer will
3414/// materialize the FP immediate as a load from a constant pool.
3416 bool ForCodeSize) const {
3417 for (const APFloat &FPImm : LegalFPImmediates)
3418 if (Imm.bitwiseIsEqual(FPImm))
3419 return true;
3420 return false;
3421}
3422
3424 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3425 std::optional<unsigned> ByteOffset) const {
3426 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3427
3428 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3429 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3430 N = *N->user_begin();
3431 return N;
3432 };
3433
3434 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3435 // relocation target a movq or addq instruction: don't let the load shrink.
3436 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3437 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3438 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3439 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3440
3441 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3442 // those uses are extracted directly into a store, then the extract + store
3443 // can be store-folded, or (4) any use will be used by legal full width
3444 // instruction. Then, it's probably not worth splitting the load.
3445 EVT VT = Load->getValueType(0);
3446 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3447 !SDValue(Load, 0).hasOneUse()) {
3448 bool FullWidthUse = false;
3449 bool AllExtractStores = true;
3450 for (SDUse &Use : Load->uses()) {
3451 // Skip uses of the chain value. Result 0 of the node is the load value.
3452 if (Use.getResNo() != 0)
3453 continue;
3454
3455 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3456
3457 // If this use is an extract + store, it's probably not worth splitting.
3458 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3459 all_of(User->uses(), [&](const SDUse &U) {
3460 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3461 return Inner->getOpcode() == ISD::STORE;
3462 }))
3463 continue;
3464
3465 AllExtractStores = false;
3466
3467 // If any use is a full width legal/target bin op, then assume it's legal
3468 // and won't split.
3469 if (isBinOp(User->getOpcode()) &&
3470 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3471 User->getOpcode() > ISD::BUILTIN_OP_END))
3472 FullWidthUse = true;
3473 }
3474
3475 if (AllExtractStores)
3476 return false;
3477
3478 // If we have a user that uses the full vector width, then this use is
3479 // only worth splitting if the offset isn't 0 (to avoid an
3480 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3481 if (FullWidthUse)
3482 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3483 }
3484
3485 return true;
3486}
3487
3488/// Returns true if it is beneficial to convert a load of a constant
3489/// to just the constant itself.
3491 Type *Ty) const {
3492 assert(Ty->isIntegerTy());
3493
3494 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3495 if (BitSize == 0 || BitSize > 64)
3496 return false;
3497 return true;
3498}
3499
3501 // If we are using XMM registers in the ABI and the condition of the select is
3502 // a floating-point compare and we have blendv or conditional move, then it is
3503 // cheaper to select instead of doing a cross-register move and creating a
3504 // load that depends on the compare result.
3505 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3506 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3507}
3508
3510 // TODO: It might be a win to ease or lift this restriction, but the generic
3511 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3512 if (VT.isVector() && Subtarget.hasAVX512())
3513 return false;
3514
3515 return true;
3516}
3517
3519 SDValue C) const {
3520 // TODO: We handle scalars using custom code, but generic combining could make
3521 // that unnecessary.
3522 APInt MulC;
3523 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3524 return false;
3525
3526 if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
3527 // Check whether a vXi8 multiply can be decomposed into two shifts
3528 // (decomposing 2^m ± 2^n as 2^(a+b) ± 2^b). Similar to
3529 // DAGCombiner::visitMUL, consider the constant `2` decomposable as
3530 // (2^0 + 1).
3531 APInt ShiftedMulC = MulC.abs();
3532 unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
3533 ShiftedMulC.lshrInPlace(TZeros);
3534 if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
3535 return true;
3536 }
3537
3538 // Find the type this will be legalized to. Otherwise we might prematurely
3539 // convert this to shl+add/sub and then still have to type legalize those ops.
3540 // Another choice would be to defer the decision for illegal types until
3541 // after type legalization. But constant splat vectors of i64 can't make it
3542 // through type legalization on 32-bit targets so we would need to special
3543 // case vXi64.
3544 while (getTypeAction(Context, VT) != TypeLegal)
3545 VT = getTypeToTransformTo(Context, VT);
3546
3547 // If vector multiply is legal, assume that's faster than shl + add/sub.
3548 // Multiply is a complex op with higher latency and lower throughput in
3549 // most implementations, sub-vXi32 vector multiplies are always fast,
3550 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3551 // is always going to be slow.
3552 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3553 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3554 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3555 return false;
3556
3557 // shl+add, shl+sub, shl+add+neg
3558 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3559 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3560}
3561
3563 unsigned Index) const {
3565 return false;
3566
3567 // Mask vectors support all subregister combinations and operations that
3568 // extract half of vector.
3569 if (ResVT.getVectorElementType() == MVT::i1)
3570 return Index == 0 ||
3571 ((ResVT.getSizeInBits() * 2 == SrcVT.getSizeInBits()) &&
3572 (Index == ResVT.getVectorNumElements()));
3573
3574 return (Index % ResVT.getVectorNumElements()) == 0;
3575}
3576
3578 unsigned Opc = VecOp.getOpcode();
3579
3580 // Assume target opcodes can't be scalarized.
3581 // TODO - do we have any exceptions?
3582 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3583 return false;
3584
3585 // If the vector op is not supported, try to convert to scalar.
3586 EVT VecVT = VecOp.getValueType();
3588 return true;
3589
3590 // If the vector op is supported, but the scalar op is not, the transform may
3591 // not be worthwhile.
3592 EVT ScalarVT = VecVT.getScalarType();
3593 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3594}
3595
3597 bool) const {
3598 // TODO: Allow vectors?
3599 if (VT.isVector())
3600 return false;
3601 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3602}
3603
3605 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3606 // i32/i64 or can rely on BSF passthrough value.
3607 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3608 Subtarget.hasBitScanPassThrough() ||
3609 (!Ty->isVectorTy() &&
3610 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3611}
3612
3614 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3615 // passthrough value.
3616 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3617 Subtarget.hasBitScanPassThrough();
3618}
3619
3621 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3622 // expensive than a straight movsd. On the other hand, it's important to
3623 // shrink long double fp constant since fldt is very slow.
3624 return !Subtarget.hasSSE2() || VT == MVT::f80;
3625}
3626
3628 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3629 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3630}
3631
3633 const SelectionDAG &DAG,
3634 const MachineMemOperand &MMO) const {
3635 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3636 BitcastVT.getVectorElementType() == MVT::i1)
3637 return false;
3638
3639 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3640 return false;
3641
3642 if (LoadVT.isVector() && BitcastVT.isVector()) {
3643 // If both types are legal vectors, it's always ok to convert them.
3644 // Don't convert to an illegal type.
3645 if (isTypeLegal(LoadVT))
3646 return isTypeLegal(BitcastVT);
3647 }
3648
3649 // If we have a large vector type (even if illegal), don't bitcast to large
3650 // (illegal) scalar types. Better to load fewer vectors and extract.
3651 if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
3652 BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
3653 return false;
3654
3655 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3656}
3657
3659 const MachineFunction &MF) const {
3660 // Do not merge to float value size (128 bits) if the noimplicitfloat
3661 // attribute is set.
3662 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3663
3664 if (NoFloat) {
3665 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3666 return (MemVT.getSizeInBits() <= MaxIntSize);
3667 }
3668 // Make sure we don't merge greater than our preferred vector
3669 // width.
3670 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3671 return false;
3672
3673 return true;
3674}
3675
3677 return Subtarget.hasFastLZCNT();
3678}
3679
3681 const Instruction &AndI) const {
3682 return true;
3683}
3684
3686 // Scalar integer and-not compares are efficiently handled by NOT+TEST (or
3687 // BMI ANDN).
3688 return Y.getValueType().isScalarInteger();
3689}
3690
3692 EVT VT = Y.getValueType();
3693
3694 if (!VT.isVector()) {
3695 if (!Subtarget.hasBMI())
3696 return false;
3697
3698 // There are only 32-bit and 64-bit forms for 'andn'.
3699 if (VT != MVT::i32 && VT != MVT::i64)
3700 return false;
3701 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3702 }
3703
3704 // Vector.
3705 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3706 return false;
3707
3708 if (VT == MVT::v4i32)
3709 return true;
3710
3711 return Subtarget.hasSSE2();
3712}
3713
3715 return X.getValueType().isScalarInteger(); // 'bt'
3716}
3717
3721 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3722 SelectionDAG &DAG) const {
3723 // Does baseline recommend not to perform the fold by default?
3725 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3726 return false;
3727 // For scalars this transform is always beneficial.
3728 if (X.getValueType().isScalarInteger())
3729 return true;
3730 // If all the shift amounts are identical, then transform is beneficial even
3731 // with rudimentary SSE2 shifts.
3732 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3733 return true;
3734 // If we have AVX2 with its powerful shift operations, then it's also good.
3735 if (Subtarget.hasAVX2())
3736 return true;
3737 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3738 return NewShiftOpcode == ISD::SHL;
3739}
3740
3742 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3743 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3744 if (!VT.isInteger())
3745 return ShiftOpc;
3746
3747 bool PreferRotate = false;
3748 if (VT.isVector()) {
3749 // For vectors, if we have rotate instruction support, then it's definitely
3750 // best. Otherwise it's not clear what is best, so just don't make changes.
3751 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3752 VT.getScalarType() == MVT::i64);
3753 } else {
3754 // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
3755 // rotate unless we have a zext mask+shr.
3756 PreferRotate = Subtarget.hasBMI2();
3757 if (!PreferRotate) {
3758 unsigned MaskBits =
3759 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3760 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3761 }
3762 }
3763
3764 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3765 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3766
3767 if (PreferRotate && MayTransformRotate)
3768 return ISD::ROTL;
3769
3770 // If vector we don't really get much benefit swapping around constants.
3771 // Maybe we could check if the DAG has the flipped node already in the
3772 // future.
3773 if (VT.isVector())
3774 return ShiftOpc;
3775
3776 // See if it is beneficial to swap the shift type.
3777 if (ShiftOpc == ISD::SHL) {
3778 // If the current setup has imm64 mask, then inverse will have
3779 // at least imm32 mask (or be zext i32 -> i64).
3780 if (VT == MVT::i64)
3781 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3782 : ShiftOpc;
3783
3784 // We can only benefit if req at least 7-bit for the mask. We
3785 // don't want to replace shl of 1,2,3 as they can be implemented
3786 // with lea/add.
3787 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3788 }
3789
3790 if (VT == MVT::i64)
3791 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3792 // extremely efficient.
3793 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3794
3795 // Keep small shifts as shl so we can generate add/lea.
3796 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3797 }
3798
3799 // We prefer rotate for vectors or if we won't get a zext mask with SRL
3800 // (PreferRotate will be set in the latter case).
3801 if (PreferRotate || !MayTransformRotate || VT.isVector())
3802 return ShiftOpc;
3803
3804 // Non-vector type and we have a zext mask with SRL.
3805 return ISD::SRL;
3806}
3807
3810 const Value *Lhs,
3811 const Value *Rhs) const {
3812 using namespace llvm::PatternMatch;
3813 int BaseCost = BrMergingBaseCostThresh.getValue();
3814 // With CCMP, branches can be merged in a more efficient way.
3815 if (BaseCost >= 0 && Subtarget.hasCCMP())
3816 BaseCost += BrMergingCcmpBias;
3817 // a == b && a == c is a fast pattern on x86.
3818 if (BaseCost >= 0 && Opc == Instruction::And &&
3821 BaseCost += 1;
3822
3823 // For OR conditions with EQ comparisons, prefer splitting into branches
3824 // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
3825 // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
3826 // comparisons (SLT, SGT) that can be optimized.
3827 if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
3830 return {-1, -1, -1};
3831
3832 return {BaseCost, BrMergingLikelyBias.getValue(),
3833 BrMergingUnlikelyBias.getValue()};
3834}
3835
3837 return N->getOpcode() != ISD::FP_EXTEND;
3838}
3839
3841 const SDNode *N) const {
3842 assert(((N->getOpcode() == ISD::SHL &&
3843 N->getOperand(0).getOpcode() == ISD::SRL) ||
3844 (N->getOpcode() == ISD::SRL &&
3845 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3846 "Expected shift-shift mask");
3847 // TODO: Should we always create i64 masks? Or only folded immediates?
3848 EVT VT = N->getValueType(0);
3849 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3850 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3851 // Only fold if the shift values are equal - so it folds to AND.
3852 // TODO - we should fold if either is a non-uniform vector but we don't do
3853 // the fold for non-splats yet.
3854 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3855 }
3857}
3858
3860 EVT VT = Y.getValueType();
3861
3862 // For vectors, we don't have a preference, but we probably want a mask.
3863 if (VT.isVector())
3864 return false;
3865
3866 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3867 return VT.getScalarSizeInBits() <= MaxWidth;
3868}
3869
3872 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3874 !Subtarget.isOSWindows())
3877 ExpansionFactor);
3878}
3879
3881 // Any legal vector type can be splatted more efficiently than
3882 // loading/spilling from memory.
3883 return isTypeLegal(VT);
3884}
3885
3887 MVT VT = MVT::getIntegerVT(NumBits);
3888 if (isTypeLegal(VT))
3889 return VT;
3890
3891 // PMOVMSKB can handle this.
3892 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3893 return MVT::v16i8;
3894
3895 // VPMOVMSKB can handle this.
3896 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3897 return MVT::v32i8;
3898
3899 // TODO: Allow 64-bit type for 32-bit target.
3900 // TODO: 512-bit types should be allowed, but make sure that those
3901 // cases are handled in combineVectorSizedSetCCEquality().
3902
3904}
3905
3906/// Val is the undef sentinel value or equal to the specified value.
3907static bool isUndefOrEqual(int Val, int CmpVal) {
3908 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3909}
3910
3911/// Return true if every element in Mask is the undef sentinel value or equal to
3912/// the specified value.
3913static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3914 return llvm::all_of(Mask, [CmpVal](int M) {
3915 return (M == SM_SentinelUndef) || (M == CmpVal);
3916 });
3917}
3918
3919/// Return true if every element in Mask, beginning from position Pos and ending
3920/// in Pos+Size is the undef sentinel value or equal to the specified value.
3921static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3922 unsigned Size) {
3923 return llvm::all_of(Mask.slice(Pos, Size),
3924 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3925}
3926
3927/// Val is either the undef or zero sentinel value.
3928static bool isUndefOrZero(int Val) {
3929 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3930}
3931
3932/// Return true if every element in Mask, beginning from position Pos and ending
3933/// in Pos+Size is the undef sentinel value.
3934static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3935 return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef));
3936}
3937
3938/// Return true if the mask creates a vector whose lower half is undefined.
3940 unsigned NumElts = Mask.size();
3941 return isUndefInRange(Mask, 0, NumElts / 2);
3942}
3943
3944/// Return true if the mask creates a vector whose upper half is undefined.
3946 unsigned NumElts = Mask.size();
3947 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3948}
3949
/// Return true if \p Val lies in the half-open interval [Low, Hi), i.e.
/// Low <= Val < Hi. \p Low itself is accepted, \p Hi itself is rejected.
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
3954
3955/// Return true if the value of any element in Mask falls within the specified
3956/// range (L, H].
3957static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3958 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3959}
3960
3961/// Return true if the value of any element in Mask is the zero sentinel value.
3962static bool isAnyZero(ArrayRef<int> Mask) {
3963 return llvm::any_of(Mask, equal_to(SM_SentinelZero));
3964}
3965
3966/// Return true if Val is undef or if its value falls within the
3967/// specified range (L, H].
3968static bool isUndefOrInRange(int Val, int Low, int Hi) {
3969 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3970}
3971
3972/// Return true if every element in Mask is undef or if its value
3973/// falls within the specified range (L, H].
3974static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3975 return llvm::all_of(
3976 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3977}
3978
3979/// Return true if Val is undef, zero or if its value falls within the
3980/// specified range (L, H].
3981static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3982 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3983}
3984
3985/// Return true if every element in Mask is undef, zero or if its value
3986/// falls within the specified range (L, H].
3987static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3988 return llvm::all_of(
3989 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3990}
3991
3992/// Return true if every element in Mask, is an in-place blend/select mask or is
3993/// undef.
3994[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3995 unsigned NumElts = Mask.size();
3996 for (auto [I, M] : enumerate(Mask))
3997 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3998 return false;
3999 return true;
4000}
4001
4002/// Return true if every element in Mask, beginning
4003/// from position Pos and ending in Pos + Size, falls within the specified
4004/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
4005static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4006 unsigned Size, int Low, int Step = 1) {
4007 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4008 if (!isUndefOrEqual(Mask[i], Low))
4009 return false;
4010 return true;
4011}
4012
4013/// Return true if every element in Mask, beginning
4014/// from position Pos and ending in Pos+Size, matches the corresponding value
4015/// of the sequence (Low, Low+Step, ...), or is undef or is zero.
4017 unsigned Size, int Low,
4018 int Step = 1) {
4019 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4020 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4021 return false;
4022 return true;
4023}
4024
4025/// Return true if every element in Mask, beginning
4026/// from position Pos and ending in Pos+Size is undef or is zero.
4027static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4028 unsigned Size) {
4029 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
4030}
4031
4032/// Return true if every element of a single input is referenced by the shuffle
4033/// mask. i.e. it just permutes them all.
4035 unsigned NumElts = Mask.size();
4036 APInt DemandedElts = APInt::getZero(NumElts);
4037 for (int M : Mask)
4038 if (isInRange(M, 0, NumElts))
4039 DemandedElts.setBit(M);
4040 return DemandedElts.isAllOnes();
4041}
4042
4043/// Helper function to test whether a shuffle mask could be
4044/// simplified by widening the elements being shuffled.
4045///
4046/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4047/// leaves it in an unspecified state.
4048///
4049/// NOTE: This must handle normal vector shuffle masks and *target* vector
4050/// shuffle masks. The latter have the special property of a '-2' representing
4051/// a zero-ed lane of a vector.
4053 SmallVectorImpl<int> &WidenedMask) {
4054 WidenedMask.assign(Mask.size() / 2, 0);
4055 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4056 int M0 = Mask[i];
4057 int M1 = Mask[i + 1];
4058
4059 // If both elements are undef, its trivial.
4060 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4061 WidenedMask[i / 2] = SM_SentinelUndef;
4062 continue;
4063 }
4064
4065 // Check for an undef mask and a mask value properly aligned to fit with
4066 // a pair of values. If we find such a case, use the non-undef mask's value.
4067 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4068 WidenedMask[i / 2] = M1 / 2;
4069 continue;
4070 }
4071 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4072 WidenedMask[i / 2] = M0 / 2;
4073 continue;
4074 }
4075
4076 // When zeroing, we need to spread the zeroing across both lanes to widen.
4077 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4078 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4080 WidenedMask[i / 2] = SM_SentinelZero;
4081 continue;
4082 }
4083 return false;
4084 }
4085
4086 // Finally check if the two mask values are adjacent and aligned with
4087 // a pair.
4088 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4089 WidenedMask[i / 2] = M0 / 2;
4090 continue;
4091 }
4092
4093 // Otherwise we can't safely widen the elements used in this shuffle.
4094 return false;
4095 }
4096 assert(WidenedMask.size() == Mask.size() / 2 &&
4097 "Incorrect size of mask after widening the elements!");
4098
4099 return true;
4100}
4101
4103 const APInt &Zeroable,
4104 bool V2IsZero,
4105 SmallVectorImpl<int> &WidenedMask) {
4106 // Create an alternative mask with info about zeroable elements.
4107 // Here we do not set undef elements as zeroable.
4108 SmallVector<int, 64> ZeroableMask(Mask);
4109 if (V2IsZero) {
4110 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
4111 for (int i = 0, Size = Mask.size(); i != Size; ++i)
4112 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
4113 ZeroableMask[i] = SM_SentinelZero;
4114 }
4115 return canWidenShuffleElements(ZeroableMask, WidenedMask);
4116}
4117
4119 SmallVector<int, 32> WidenedMask;
4120 return canWidenShuffleElements(Mask, WidenedMask);
4121}
4122
4123// Attempt to narrow/widen shuffle mask until it matches the target number of
4124// elements.
4125static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
4126 SmallVectorImpl<int> &ScaledMask) {
4127 unsigned NumSrcElts = Mask.size();
4128 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4129 "Illegal shuffle scale factor");
4130
4131 // Narrowing is guaranteed to work.
4132 if (NumDstElts >= NumSrcElts) {
4133 int Scale = NumDstElts / NumSrcElts;
4134 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
4135 return true;
4136 }
4137
4138 // We have to repeat the widening until we reach the target size, but we can
4139 // split out the first widening as it sets up ScaledMask for us.
4140 if (canWidenShuffleElements(Mask, ScaledMask)) {
4141 while (ScaledMask.size() > NumDstElts) {
4142 SmallVector<int, 16> WidenedMask;
4143 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
4144 return false;
4145 ScaledMask = std::move(WidenedMask);
4146 }
4147 return true;
4148 }
4149
4150 return false;
4151}
4152
4153static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
4154 SmallVector<int, 32> ScaledMask;
4155 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
4156}
4157
4158// Helper to grow the shuffle mask for a larger value type.
4159// NOTE: This is different to scaleShuffleElements which is a same size type.
4160static void growShuffleMask(ArrayRef<int> SrcMask,
4161 SmallVectorImpl<int> &DstMask,
4162 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
4163 assert(DstMask.empty() && "Expected an empty shuffle mas");
4164 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
4165 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4166 unsigned NumSrcElts = SrcMask.size();
4167 DstMask.assign(SrcMask.begin(), SrcMask.end());
4168 for (int &M : DstMask) {
4169 if (M < 0)
4170 continue;
4171 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4172 }
4173 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
4174}
4175
4176/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4178 return isNullConstant(Elt) || isNullFPConstant(Elt);
4179}
4180
4181// Build a vector of constants.
4182// Use an UNDEF node for any negative value when IsMask is set.
4183// Split 64-bit constants in the 32-bit mode.
4185 const SDLoc &dl, bool IsMask = false) {
4186
4188 bool Split = false;
4189
4190 MVT ConstVecVT = VT;
4191 unsigned NumElts = VT.getVectorNumElements();
4192 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4193 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4194 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4195 Split = true;
4196 }
4197
4198 MVT EltVT = ConstVecVT.getVectorElementType();
4199 for (unsigned i = 0; i < NumElts; ++i) {
4200 bool IsUndef = Values[i] < 0 && IsMask;
4201 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4202 DAG.getConstant(Values[i], dl, EltVT);
4203 Ops.push_back(OpNode);
4204 if (Split)
4205 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4206 DAG.getConstant(0, dl, EltVT));
4207 }
4208 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4209 if (Split)
4210 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4211 return ConstsNode;
4212}
4213
4214static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4215 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4216 assert(Bits.size() == Undefs.getBitWidth() &&
4217 "Unequal constant and undef arrays");
4219 bool Split = false;
4220
4221 MVT ConstVecVT = VT;
4222 unsigned NumElts = VT.getVectorNumElements();
4223 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4224 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4225 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4226 Split = true;
4227 }
4228
4229 MVT EltVT = ConstVecVT.getVectorElementType();
4230 MVT EltIntVT = EltVT.changeTypeToInteger();
4231 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4232 if (Undefs[i]) {
4233 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4234 continue;
4235 }
4236 const APInt &V = Bits[i];
4237 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4238 if (Split) {
4239 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4240 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4241 } else {
4242 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4243 }
4244 }
4245
4246 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4247 return DAG.getBitcast(VT, ConstsNode);
4248}
4249
4251 SelectionDAG &DAG, const SDLoc &dl) {
4252 APInt Undefs = APInt::getZero(Bits.size());
4253 return getConstVector(Bits, Undefs, VT, DAG, dl);
4254}
4255
4256/// Returns a vector of specified type with all zero elements.
/// The zero is first built in one of several types chosen below and is then
/// bitcast to \p VT, so the returned value always has type \p VT.
4257static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4258 SelectionDAG &DAG, const SDLoc &dl) {
4259 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4260 VT.getVectorElementType() == MVT::i1) &&
4261 "Unexpected vector type");
4262
4263 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4264 // type. This ensures they get CSE'd. But if the integer type is not
4265 // available, use a floating-point +0.0 instead.
4266 SDValue Vec;
4267 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4268 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
 // Without SSE2 a 128-bit zero is always built as a v4f32 +0.0 splat.
4269 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4270 } else if (VT.isFloatingPoint() &&
 // NOTE(review): the rest of this condition (listing line 4271) is absent
 // from this view; presumably it consults TLI, which is otherwise unused
 // in this function. Confirm against the original file.
4272 Vec = DAG.getConstantFP(+0.0, dl, VT);
4273 } else if (VT.getVectorElementType() == MVT::i1) {
 // Mask (i1) vectors are zeroed directly in their own type.
4274 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4275 "Unexpected vector type");
4276 Vec = DAG.getConstant(0, dl, VT);
4277 } else {
 // Everything else is built as a vector of i32 zeros of the same total size.
4278 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4279 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4280 }
4281 return DAG.getBitcast(VT, Vec);
4282}
4283
4284// Helper to determine if the ops are all the extracted subvectors come from a
4285// single source. If we allow commute they don't have to be in order (Lo/Hi).
4286static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4287 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4288 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4289 LHS.getValueType() != RHS.getValueType() ||
4290 LHS.getOperand(0) != RHS.getOperand(0))
4291 return SDValue();
4292
4293 SDValue Src = LHS.getOperand(0);
4294 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4295 return SDValue();
4296
4297 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4298 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4299 RHS.getConstantOperandAPInt(1) == NumElts) ||
4300 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4301 LHS.getConstantOperandAPInt(1) == NumElts))
4302 return Src;
4303
4304 return SDValue();
4305}
4306
4307static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4308 const SDLoc &dl, unsigned vectorWidth) {
4309 EVT VT = Vec.getValueType();
4310 EVT ElVT = VT.getVectorElementType();
4311 unsigned ResultNumElts =
4312 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4313 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4314
4315 assert(ResultVT.getSizeInBits() == vectorWidth &&
4316 "Illegal subvector extraction");
4317
4318 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4319 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4320 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4321
4322 // This is the index of the first element of the vectorWidth-bit chunk
4323 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4324 IdxVal &= ~(ElemsPerChunk - 1);
4325
4326 // If the input is a buildvector just emit a smaller one.
4327 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4328 return DAG.getBuildVector(ResultVT, dl,
4329 Vec->ops().slice(IdxVal, ElemsPerChunk));
4330
4331 // Check if we're extracting the upper undef of a widening pattern.
4332 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4333 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4334 isNullConstant(Vec.getOperand(2)))
4335 return DAG.getUNDEF(ResultVT);
4336
4337 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4338}
4339
4340/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4341/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4342/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4343/// instructions or a simple subregister reference. Idx is an index in the
4344/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4345/// lowering EXTRACT_VECTOR_ELT operations easier.
4346static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4347 SelectionDAG &DAG, const SDLoc &dl) {
4349 Vec.getValueType().is512BitVector()) &&
4350 "Unexpected vector size!");
4351 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4352}
4353
4354/// Generate a DAG to grab 256-bits from a 512-bit vector.
4355static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4356 SelectionDAG &DAG, const SDLoc &dl) {
4357 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4358 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4359}
4360
4361static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4362 SelectionDAG &DAG, const SDLoc &dl,
4363 unsigned vectorWidth) {
4364 assert((vectorWidth == 128 || vectorWidth == 256) &&
4365 "Unsupported vector width");
4366 // Inserting UNDEF is Result
4367 if (Vec.isUndef())
4368 return Result;
4369
4370 // Insert the relevant vectorWidth bits.
4371 EVT VT = Vec.getValueType();
4372 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4373 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4374
4375 // This is the index of the first element of the vectorWidth-bit chunk
4376 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4377 IdxVal &= ~(ElemsPerChunk - 1);
4378 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4379}
4380
4381/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4382/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4383/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4384/// simple superregister reference. Idx is an index in the 128 bits
4385/// we want. It need not be aligned to a 128-bit boundary. That makes
4386/// lowering INSERT_VECTOR_ELT operations easier.
4387static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4388 SelectionDAG &DAG, const SDLoc &dl) {
4389 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4390 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4391}
4392
4393/// Widen a vector to a larger size with the same scalar type, with the new
4394/// elements either zero or undef.
/// \p VT is the widened result type. \p Vec is inserted at element 0 of a
/// zero vector (when \p ZeroNewElements is set) or of an undef vector.
4395static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4396 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4397 const SDLoc &dl) {
4398 EVT VecVT = Vec.getValueType();
 // NOTE(review): the opening of this assert (listing line 4399) is absent
 // from this view; only its trailing scalar-type check and message remain.
 // Confirm the full condition against the original file.
4400 VecVT.getScalarType() == VT.getScalarType() &&
4401 "Unsupported vector widening type");
4402 // If the upper 128-bits of a build vector are already undef/zero, then try to
4403 // widen from the lower 128-bits.
4404 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4405 unsigned NumSrcElts = VecVT.getVectorNumElements();
 // Operands forming the upper half of the build vector.
4406 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
 // Zero upper operands only count as droppable when the new elements are
 // going to be zero anyway.
4407 if (all_of(Hi, [&](SDValue V) {
4408 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4409 }))
4410 Vec = extract128BitVector(Vec, 0, DAG, dl);
4411 }
4412 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4413 : DAG.getUNDEF(VT);
4414 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4415}
4416
4417/// Widen a vector to a larger size with the same scalar type, with the new
4418/// elements either zero or undef.
4419static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4420 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4421 const SDLoc &dl, unsigned WideSizeInBits) {
4422 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4423 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4424 "Unsupported vector widening type");
4425 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4426 MVT SVT = Vec.getSimpleValueType().getScalarType();
4427 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4428 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4429}
4430
4431/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4432/// and bitcast with integer types.
4433static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4434 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4435 unsigned NumElts = VT.getVectorNumElements();
4436 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4437 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4438 return VT;
4439}
4440
4441/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4442/// bitcast with integer types.
4443static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4444 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4445 const SDLoc &dl) {
4446 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4447 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4448}
4449
4450// Helper function to collect subvector ops that are concatenated together,
4451// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4452// The subvectors in Ops are guaranteed to be the same type.
4454 SelectionDAG &DAG) {
4455 assert(Ops.empty() && "Expected an empty ops vector");
4456
4457 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4458 Ops.append(N->op_begin(), N->op_end());
4459 return true;
4460 }
4461
4462 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4463 SDValue Src = N->getOperand(0);
4464 SDValue Sub = N->getOperand(1);
4465 const APInt &Idx = N->getConstantOperandAPInt(2);
4466 EVT VT = Src.getValueType();
4467 EVT SubVT = Sub.getValueType();
4468
4469 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4470 // insert_subvector(undef, x, lo)
4471 if (Idx == 0 && Src.isUndef()) {
4472 Ops.push_back(Sub);
4473 Ops.push_back(DAG.getUNDEF(SubVT));
4474 return true;
4475 }
4476 if (Idx == (VT.getVectorNumElements() / 2)) {
4477 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4478 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4479 Src.getOperand(1).getValueType() == SubVT &&
4480 isNullConstant(Src.getOperand(2))) {
4481 // Attempt to recurse into inner (matching) concats.
4482 SDValue Lo = Src.getOperand(1);
4483 SDValue Hi = Sub;
4484 SmallVector<SDValue, 2> LoOps, HiOps;
4485 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4486 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4487 LoOps.size() == HiOps.size()) {
4488 Ops.append(LoOps);
4489 Ops.append(HiOps);
4490 return true;
4491 }
4492 Ops.push_back(Lo);
4493 Ops.push_back(Hi);
4494 return true;
4495 }
4496 // insert_subvector(x, extract_subvector(x, lo), hi)
4497 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4498 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4499 Ops.append(2, Sub);
4500 return true;
4501 }
4502 // insert_subvector(undef, x, hi)
4503 if (Src.isUndef()) {
4504 Ops.push_back(DAG.getUNDEF(SubVT));
4505 Ops.push_back(Sub);
4506 return true;
4507 }
4508 }
4509 }
4510 }
4511
4512 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4513 EVT VT = N->getValueType(0);
4514 SDValue Src = N->getOperand(0);
4515 uint64_t Idx = N->getConstantOperandVal(1);
4516
4517 // Collect all the subvectors from the source vector and slice off the
4518 // extraction.
4520 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4521 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4522 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4523 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4524 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4525 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4526 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4527 return true;
4528 }
4529 }
4530
4531 assert(Ops.empty() && "Expected an empty ops vector");
4532 return false;
4533}
4534
4535// Helper to check if \p V can be split into subvectors and the upper subvectors
4536// are all undef. In which case return the lower subvector.
4538 SelectionDAG &DAG) {
4539 SmallVector<SDValue> SubOps;
4540 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4541 return SDValue();
4542
4543 unsigned NumSubOps = SubOps.size();
4544 unsigned HalfNumSubOps = NumSubOps / 2;
4545 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4546
4547 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4548 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4549 return SDValue();
4550
4551 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4552 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4553 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4554}
4555
4556// Helper to check if we can access all the constituent subvectors without any
4557// extract ops.
4560 return collectConcatOps(V.getNode(), Ops, DAG);
4561}
4562
4563static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4564 const SDLoc &dl) {
4565 EVT VT = Op.getValueType();
4566 unsigned NumElems = VT.getVectorNumElements();
4567 unsigned SizeInBits = VT.getSizeInBits();
4568 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4569 "Can't split odd sized vector");
4570
4572 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4573 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4574 unsigned HalfOps = SubOps.size() / 2;
4575 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4576 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4577 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4578 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4579 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4580 return std::make_pair(Lo, Hi);
4581 }
4582
4583 // If this is a splat value (with no-undefs) then use the lower subvector,
4584 // which should be a free extraction.
4585 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4586 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4587 return std::make_pair(Lo, Lo);
4588
4589 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4590 return std::make_pair(Lo, Hi);
4591}
4592
4593/// Break an operation into 2 half sized ops and then concatenate the results.
4595 unsigned NumOps = Op.getNumOperands();
4596 EVT VT = Op.getValueType();
4597
4598 // Extract the LHS Lo/Hi vectors
4601 for (unsigned I = 0; I != NumOps; ++I) {
4602 SDValue SrcOp = Op.getOperand(I);
4603 if (!SrcOp.getValueType().isVector()) {
4604 LoOps[I] = HiOps[I] = SrcOp;
4605 continue;
4606 }
4607 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4608 }
4609
4610 EVT LoVT, HiVT;
4611 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4612 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4613 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4614 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4615}
4616
4617/// Break an unary integer operation into 2 half sized ops and then
4618/// concatenate the result back.
4620 const SDLoc &dl) {
4621 // Make sure we only try to split 256/512-bit types to avoid creating
4622 // narrow vectors.
4623 [[maybe_unused]] EVT VT = Op.getValueType();
4624 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4625 Op.getOperand(0).getValueType().is512BitVector()) &&
4626 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4627 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4628 VT.getVectorNumElements() &&
4629 "Unexpected VTs!");
4630 return splitVectorOp(Op, DAG, dl);
4631}
4632
4633/// Break a binary integer operation into 2 half sized ops and then
4634/// concatenate the result back.
4636 const SDLoc &dl) {
4637 // Assert that all the types match.
4638 [[maybe_unused]] EVT VT = Op.getValueType();
4639 assert(Op.getOperand(0).getValueType() == VT &&
4640 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4641 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4642 return splitVectorOp(Op, DAG, dl);
4643}
4644
4645// Helper for splitting operands of an operation to legal target size and
4646// apply a function on each part.
4647// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4648// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4649// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4650// The argument Builder is a function that will be applied on each split part:
4651// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4652template <typename F>
4654 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4655 F Builder, bool CheckBWI = true,
4656 bool AllowAVX512 = true) {
4657 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4658 unsigned NumSubs = 1;
4659 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4660 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4661 if (VT.getSizeInBits() > 512) {
4662 NumSubs = VT.getSizeInBits() / 512;
4663 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4664 }
4665 } else if (Subtarget.hasAVX2()) {
4666 if (VT.getSizeInBits() > 256) {
4667 NumSubs = VT.getSizeInBits() / 256;
4668 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4669 }
4670 } else {
4671 if (VT.getSizeInBits() > 128) {
4672 NumSubs = VT.getSizeInBits() / 128;
4673 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4674 }
4675 }
4676
4677 if (NumSubs == 1)
4678 return Builder(DAG, DL, Ops);
4679
4681 for (unsigned i = 0; i != NumSubs; ++i) {
4683 for (SDValue Op : Ops) {
4684 EVT OpVT = Op.getValueType();
4685 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4686 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4687 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4688 }
4689 Subs.push_back(Builder(DAG, DL, SubOps));
4690 }
4691 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4692}
4693
4694// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4695// targets.
// Builds node \p Opcode with result type \p VT. When the target lacks VLX and
// \p VT is not 512 bits wide, the vector operands are widened to 512 bits,
// the node is created at that width and the low VT-sized part is extracted.
4696static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
 // NOTE(review): listing line 4697 is absent from this view; the body uses
 // `Ops` and `DAG`, so those parameters are presumably declared there.
 // Confirm against the original file.
4698 const X86Subtarget &Subtarget) {
4699 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4700 MVT SVT = VT.getScalarType();
4701
4702 // If we have a 32/64 splatted constant, splat it to DstTy to
4703 // encourage a foldable broadcast'd operand.
 // Returns an empty SDValue when no such replacement is made.
4704 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4705 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4706 // AVX512 broadcasts 32/64-bit operands.
4707 // TODO: Support float once getAVX512Node is used by fp-ops.
4708 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
 // NOTE(review): the rest of this condition (listing line 4709) is absent
 // from this view.
4710 return SDValue();
4711 // If we're not widening, don't bother if we're not bitcasting.
4712 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4713 return SDValue();
 // NOTE(review): listing line 4714 is absent from this view; presumably it
 // opens the scope closed at 4722 and introduces `BV`, which is used below.
4715 APInt SplatValue, SplatUndef;
4716 unsigned SplatBitSize;
4717 bool HasAnyUndefs;
 // Only a splat without undefs whose value is exactly one element wide is
 // rebuilt as a constant of the destination type.
4718 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4719 HasAnyUndefs, OpEltSizeInBits) &&
4720 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4721 return DAG.getConstant(SplatValue, DL, DstVT);
4722 }
4723 return SDValue();
4724 };
4725
 // Widening is needed only without VLX and only for non-512-bit types.
4726 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4727
4728 MVT DstVT = VT;
4729 if (Widen)
4730 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4731
4732 // Canonicalize src operands.
4733 SmallVector<SDValue> SrcOps(Ops);
4734 for (SDValue &Op : SrcOps) {
4735 MVT OpVT = Op.getSimpleValueType();
4736 // Just pass through scalar operands.
4737 if (!OpVT.isVector())
4738 continue;
4739 assert(OpVT == VT && "Vector type mismatch");
4740
4741 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4742 Op = BroadcastOp;
4743 continue;
4744 }
4745
4746 // Just widen the subvector by inserting into an undef wide vector.
4747 if (Widen)
4748 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4749 }
4750
4751 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4752
4753 // Perform the 512-bit op then extract the bottom subvector.
4754 if (Widen)
4755 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4756 return Res;
4757}
4758
4759/// Insert i1-subvector to i1-vector.
/// Reads the INSERT_SUBVECTOR operands from Op (0: destination vector,
/// 1: subvector, 2: constant index) and rebuilds the insertion from
/// KSHIFTL/KSHIFTR, AND/OR and zero/undef-based INSERT_SUBVECTOR nodes on the
/// widened mask type, extracting back to the original type at the end.
/// NOTE(review): listing line 4760 (return type, function name and the leading
/// parameters; the body uses `Op` and `DAG`) is absent from this text.
4761 const X86Subtarget &Subtarget) {
4762
4763 SDLoc dl(Op);
4764 SDValue Vec = Op.getOperand(0);
4765 SDValue SubVec = Op.getOperand(1);
4766 SDValue Idx = Op.getOperand(2);
4767 unsigned IdxVal = Op.getConstantOperandVal(2);
4768
4769 // Inserting undef is a nop. We can just return the original vector.
4770 if (SubVec.isUndef())
4771 return Vec;
4772
4773 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4774 return Op;
4775
4776 MVT OpVT = Op.getSimpleValueType();
4777 unsigned NumElems = OpVT.getVectorNumElements();
4778 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4779
4780 // Extend to natively supported kshift.
4781 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4782
4783 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4784 // if necessary.
4785 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4786 // May need to promote to a legal type.
4787 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4788 DAG.getConstant(0, dl, WideOpVT),
4789 SubVec, Idx);
4790 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4791 }
4792
4793 MVT SubVecVT = SubVec.getSimpleValueType();
4794 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4795 assert(IdxVal + SubVecNumElems <= NumElems &&
4796 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4797 "Unexpected index value in INSERT_SUBVECTOR");
4798
4799 SDValue Undef = DAG.getUNDEF(WideOpVT);
4800
// Case: insertion at element 0 of a vector that is neither undef nor all
// zeros (both handled above).
4801 if (IdxVal == 0) {
4802 // Zero lower bits of the Vec
4803 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4804 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4805 ZeroIdx);
4806 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4807 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4808 // Merge them together, SubVec should be zero extended.
4809 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4810 DAG.getConstant(0, dl, WideOpVT),
4811 SubVec, ZeroIdx);
4812 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4813 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4814 }
4815
// From here on IdxVal != 0. Place SubVec in the low elements of an undef
// vector of the widened type.
4816 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4817 Undef, SubVec, ZeroIdx);
4818
// Case: destination is undef, so only the subvector's position matters.
4819 if (Vec.isUndef()) {
4820 assert(IdxVal != 0 && "Unexpected index");
4821 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4822 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4823 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4824 }
4825
// NOTE(review): listing line 4826 is absent; it presumably holds the condition
// on Vec that opens the block closed at listing line 4844. The block returns
// the shifted SubVec alone, without OR-ing in any bits of Vec.
4827 assert(IdxVal != 0 && "Unexpected index");
4828 // If upper elements of Vec are known undef, then just shift into place.
4829 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4830 [](SDValue V) { return V.isUndef(); })) {
4831 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4832 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4833 } else {
// Shift the subvector to the top of the wide register and back down to
// IdxVal; NumElems is re-read from the widened type for this computation.
4834 NumElems = WideOpVT.getVectorNumElements();
4835 unsigned ShiftLeft = NumElems - SubVecNumElems;
4836 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4837 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4838 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4839 if (ShiftRight != 0)
4840 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4841 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4842 }
4843 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4844 }
4845
4846 // Simple case when we put subvector in the upper part
4847 if (IdxVal + SubVecNumElems == NumElems) {
4848 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4849 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4850 if (SubVecNumElems * 2 == NumElems) {
4851 // Special case, use legal zero extending insert_subvector. This allows
4852 // isel to optimize when bits are known zero.
4853 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4854 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4855 DAG.getConstant(0, dl, WideOpVT),
4856 Vec, ZeroIdx);
4857 } else {
4858 // Otherwise use explicit shifts to zero the bits.
4859 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4860 Undef, Vec, ZeroIdx);
4861 NumElems = WideOpVT.getVectorNumElements();
4862 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4863 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4864 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4865 }
4866 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4867 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4868 }
4869
4870 // Inserting into the middle is more complicated.
4871
// From here on NumElems is the element count of the widened type.
4872 NumElems = WideOpVT.getVectorNumElements();
4873
4874 // Widen the vector if needed.
4875 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4876
4877 unsigned ShiftLeft = NumElems - SubVecNumElems;
4878 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4879
4880 // Do an optimization for the most frequently used types.
// This path clears the destination bits with an integer constant mask that is
// NumElems bits wide; it is skipped for v64i1 on non-64-bit subtargets.
4881 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4882 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4883 Mask0.flipAllBits();
4884 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4885 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4886 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4887 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4888 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4889 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4890 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4891 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4892
4893 // Reduce to original width if needed.
4894 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4895 }
4896
4897 // Clear the upper bits of the subvector and move it to its insert position.
4898 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4899 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4900 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4901 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4902
4903 // Isolate the bits below the insertion point.
4904 unsigned LowShift = NumElems - IdxVal;
4905 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4906 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4907 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4908 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4909
4910 // Isolate the bits after the last inserted bit.
4911 unsigned HighShift = IdxVal + SubVecNumElems;
4912 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4913 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4914 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4915 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4916
4917 // Now OR all 3 pieces together.
4918 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4919 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4920
4921 // Reduce to original width if needed.
4922 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4923}
4924
// Builds a vector with twice as many elements as V1/V2 (same scalar type) by
// inserting V1 at element 0 and V2 at element SubNumElts of an undef vector.
// V1 and V2 must have the same value type.
// NOTE(review): listing line 4925 (return type, function name and the leading
// parameters; the body uses `V1`, `V2` and `DAG`) is absent from this text.
4926 const SDLoc &dl) {
4927 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4928 EVT SubVT = V1.getValueType();
4929 EVT SubSVT = SubVT.getScalarType();
4930 unsigned SubNumElts = SubVT.getVectorNumElements();
4931 unsigned SubVectorWidth = SubVT.getSizeInBits();
4932 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4933 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4934 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4935}
4936
4937/// Returns a vector of specified type with all bits set.
4938/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4939/// Then bitcast to their original type, ensuring they get CSE'd.
4940static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4941 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4942 "Expected a 128/256/512-bit vector type");
4943 unsigned NumElts = VT.getSizeInBits() / 32;
4944 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4945 return DAG.getBitcast(VT, Vec);
4946}
4947
4948// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
4949static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
4950 switch (Opc) {
4951 case ISD::SHL:
4952 case X86ISD::VSHL:
4953 case X86ISD::VSHLI:
4954 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
4955 case ISD::SRL:
4956 case X86ISD::VSRL:
4957 case X86ISD::VSRLI:
4958 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
4959 case ISD::SRA:
4960 case X86ISD::VSRA:
4961 case X86ISD::VSRAI:
4962 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
4963 }
4964 llvm_unreachable("Unknown target vector shift node");
4965}
4966
4967/// Handle vector element shifts where the shift amount is a constant.
4968/// Takes immediate version of shift as input.
/// Opc must be X86ISD::VSHLI, VSRLI or VSRAI. SrcOp is bitcast to VT first.
/// A zero amount returns the bitcast source; an amount >= the element width
/// yields a zero vector for the logical shifts and is clamped to width - 1
/// for VSRAI.
4969static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
4970 SDValue SrcOp, uint64_t ShiftAmt,
4971 SelectionDAG &DAG) {
4972 assert(
4973 (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
4974 "Unknown target vector shift-by-constant node");
4975
4976 // Bitcast the source vector to the output type, this is mainly necessary for
4977 // vXi8/vXi64 shifts.
4978 SrcOp = DAG.getBitcast(VT, SrcOp);
4979
4980 // Fold this packed shift into its first operand if ShiftAmt is 0.
4981 if (ShiftAmt == 0)
4982 return SrcOp;
4983
4984 // Check for ShiftAmt >= element width
4985 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4986 if (ShiftAmt >= EltSizeInBits) {
4987 if (Opc == X86ISD::VSRAI)
4988 ShiftAmt = EltSizeInBits - 1;
4989 else
4990 return DAG.getConstant(0, dl, VT);
4991 }
4992
4993 // Fold this packed vector shift into a build vector if SrcOp is a
4994 // vector of Constants or UNDEFs.
// NOTE(review): listing line 4995 is absent; it presumably holds the condition
// that opens the block closed at listing line 5014.
4996 unsigned ShiftOpc;
// Translate the X86 immediate-shift opcode into the generic ISD shift used for
// constant folding.
4997 switch (Opc) {
4998 default:
4999 llvm_unreachable("Unknown opcode!");
5000 case X86ISD::VSHLI:
5001 ShiftOpc = ISD::SHL;
5002 break;
5003 case X86ISD::VSRLI:
5004 ShiftOpc = ISD::SRL;
5005 break;
5006 case X86ISD::VSRAI:
5007 ShiftOpc = ISD::SRA;
5008 break;
5009 }
5010
5011 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
5012 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
5013 return C;
5014 }
5015
// No fold applied: emit the target shift with an i8 target-constant amount.
5016 return DAG.getNode(Opc, dl, VT, SrcOp,
5017 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
5018}
5019
5020/// Handle vector element shifts by a splat shift amount
/// ShAmt is a vector whose element ShAmtIdx holds the shift amount. The amount
/// is moved to element 0, zero-extended within the low 64 bits of a 128-bit
/// vector, bitcast to a 128-bit vector with VT's element type and used as the
/// second operand of the returned shift node.
5021static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
5022 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
5023 const X86Subtarget &Subtarget,
5024 SelectionDAG &DAG) {
5025 MVT AmtVT = ShAmt.getSimpleValueType();
5026 assert(AmtVT.isVector() && "Vector shift type mismatch");
5027 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
5028 "Illegal vector splat index");
5029
5030 // Move the splat element to the bottom element.
5031 if (ShAmtIdx != 0) {
5032 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
5033 Mask[0] = ShAmtIdx;
5034 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
5035 }
5036
5037 // Peek through any zext node if we can get back to a 128-bit source.
// NOTE(review): listing line 5040 (the second alternative of the opcode test
// and the closing parenthesis) is absent from this text.
5038 if (AmtVT.getScalarSizeInBits() == 64 &&
5039 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
5041 ShAmt.getOperand(0).getValueType().isSimple() &&
5042 ShAmt.getOperand(0).getValueType().is128BitVector()) {
5043 ShAmt = ShAmt.getOperand(0);
5044 AmtVT = ShAmt.getSimpleValueType();
5045 }
5046
5047 // See if we can mask off the upper elements using the existing source node.
5048 // The shift uses the entire lower 64-bits of the amount vector, so no need to
5049 // do this for vXi64 types.
// IsMasked records that the elements above element 0 are already zeroed, so
// the explicit zero-extension further down can be skipped.
5050 bool IsMasked = false;
5051 if (AmtVT.getScalarSizeInBits() < 64) {
5052 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
5053 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5054 // If the shift amount has come from a scalar, then zero-extend the scalar
5055 // before moving to the vector.
5056 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
5057 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
5058 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
5059 AmtVT = MVT::v4i32;
5060 IsMasked = true;
5061 } else if (ShAmt.getOpcode() == ISD::AND) {
5062 // See if the shift amount is already masked (e.g. for rotation modulo),
5063 // then we can zero-extend it by setting all the other mask elements to
5064 // zero.
5065 SmallVector<SDValue> MaskElts(
5066 AmtVT.getVectorNumElements(),
5067 DAG.getConstant(0, dl, AmtVT.getScalarType()));
5068 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
5069 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
// Only applies when AND-ing the existing mask operand with the element-0 mask
// constant-folds.
5070 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
5071 {ShAmt.getOperand(1), Mask}))) {
5072 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
5073 IsMasked = true;
5074 }
5075 }
5076 }
5077
5078 // Extract if the shift amount vector is larger than 128-bits.
5079 if (AmtVT.getSizeInBits() > 128) {
5080 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
5081 AmtVT = ShAmt.getSimpleValueType();
5082 }
5083
5084 // Zero-extend bottom element to v2i64 vector type, either by extension or
5085 // shuffle masking.
5086 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
5087 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
5088 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
5089 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
5090 } else if (Subtarget.hasSSE41()) {
5091 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
5092 MVT::v2i64, ShAmt);
5093 } else {
// Without SSE4.1: byte-shift the whole 128-bit value left and then right by
// the same amount so that only the bottom element's bits remain.
5094 SDValue ByteShift = DAG.getTargetConstant(
5095 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
5096 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
5097 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5098 ByteShift);
5099 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5100 ByteShift);
5101 }
5102 }
5103
5104 // Change opcode to non-immediate version.
// NOTE(review): listing line 5105 (the statement this comment describes, which
// would update `Opc`) is absent from this text.
5106
5107 // The return type has to be a 128-bit type with the same element
5108 // type as the input type.
5109 MVT EltVT = VT.getVectorElementType();
5110 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
5111
5112 ShAmt = DAG.getBitcast(ShVT, ShAmt);
5113 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
5114}
5115
// Builds an extension of In to VT. Inputs wider than 128 bits are first
// narrowed to the low subvector that is actually needed; when the element
// counts of VT and the (possibly narrowed) input differ, the *_VECTOR_INREG
// form of the extension opcode is used instead of the plain extension.
5116static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
5117 SDValue In, SelectionDAG &DAG) {
5118 EVT InVT = In.getValueType();
5119 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
5120
5121 // Canonicalize Opcode to general extension version.
// NOTE(review): listing lines 5124, 5128 and 5132 are absent; each presumably
// holds a second case label sharing the assignment that follows it.
5122 switch (Opcode) {
5123 case ISD::ANY_EXTEND:
5125 Opcode = ISD::ANY_EXTEND;
5126 break;
5127 case ISD::SIGN_EXTEND:
5129 Opcode = ISD::SIGN_EXTEND;
5130 break;
5131 case ISD::ZERO_EXTEND:
5133 Opcode = ISD::ZERO_EXTEND;
5134 break;
5135 default:
5136 llvm_unreachable("Unknown extension opcode");
5137 }
5138
5139 // For 256-bit vectors, we only need the lower (128-bit) input half.
5140 // For 512-bit vectors, we only need the lower input half or quarter.
5141 if (InVT.getSizeInBits() > 128) {
5142 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
5143 "Expected VTs to be the same size!");
// Scale is the element-width ratio; the needed input is VT's size divided by
// Scale, but never less than 128 bits.
5144 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5145 In = extractSubVector(In, 0, DAG, DL,
5146 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
5147 InVT = In.getValueType();
5148 }
5149
5150 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
5151 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
5152
5153 return DAG.getNode(Opcode, DL, VT, In);
5154}
5155
5156// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
// The AND(RHS,~MASK) half is emitted as X86ISD::ANDNP with Mask as the first
// operand and RHS as the second.
// NOTE(review): listing line 5157 (return type, function name and the leading
// parameters; the body uses `DL`, `VT`, `LHS` and `RHS`) is absent from this
// text.
5158 SDValue Mask, SelectionDAG &DAG) {
5159 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
5160 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
5161 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
5162}
5163
// Appends to Mask (which must be empty) one shuffle index per element of VT,
// computed independently for each 128-bit lane. Lo selects the low (true) or
// high (false) half of each lane; with Unary set, every index refers to the
// first input, otherwise odd result elements are offset by NumElts.
// NOTE(review): listing line 5164 (return type, function name and the leading
// parameters; the body uses `VT` and `Mask`) is absent from this text.
5165 bool Lo, bool Unary) {
5166 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
5167 "Illegal vector type to unpack");
5168 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5169 int NumElts = VT.getVectorNumElements();
5170 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5171 for (int i = 0; i < NumElts; ++i) {
// Index of the first element of the 128-bit lane that contains element i.
5172 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
// Two consecutive result elements map to the same in-lane source position.
5173 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5174 Pos += (Unary ? 0 : NumElts * (i % 2));
5175 Pos += (Lo ? 0 : NumEltsInLane / 2);
5176 Mask.push_back(Pos);
5177 }
5178}
5179
5180/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
5181/// imposed by AVX and specific to the unary pattern. Example:
5182/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
5183/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
/// Mask must be empty on entry; one index per element of VT is appended.
/// NOTE(review): listing line 5184 (return type, function name and the leading
/// parameters; the body uses `VT` and `Mask`) is absent from this text.
5185 bool Lo) {
5186 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5187 int NumElts = VT.getVectorNumElements();
5188 for (int i = 0; i < NumElts; ++i) {
// Each source element is repeated twice; Hi starts at the middle element.
5189 int Pos = i / 2;
5190 Pos += (Lo ? 0 : NumElts / 2);
5191 Mask.push_back(Pos);
5192 }
5193}
5194
5195// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
// When folding, the result is a BUILD_VECTOR whose elements are taken directly
// from the operands of V1/V2 according to Mask; negative mask entries and
// entries that select from an undef input stay undef.
5196static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
5197 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
// NOTE(review): listing lines 5198-5199 are absent; they presumably hold the
// condition that opens the folding block closed at listing line 5211.
5200 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
5201 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
5202 int M = Mask[I];
5203 if (M < 0)
5204 continue;
// Indices below NumElts select from V1, the rest from V2.
5205 SDValue V = (M < NumElts) ? V1 : V2;
5206 if (V.isUndef())
5207 continue;
5208 Ops[I] = V.getOperand(M % NumElts);
5209 }
5210 return DAG.getBuildVector(VT, dl, Ops);
5211 }
5212
5213 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5214}
5215
5216/// Returns a vector_shuffle node for an unpackl operation.
/// The mask is the binary (non-unary) low-half unpack mask for VT.
5217static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5218 SDValue V1, SDValue V2) {
// NOTE(review): listing line 5219 (presumably the declaration of `Mask`) is
// absent from this text.
5220 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5221 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5222}
5223
5224/// Returns a vector_shuffle node for an unpackh operation.
/// The mask is the binary (non-unary) high-half unpack mask for VT.
5225static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5226 SDValue V1, SDValue V2) {
// NOTE(review): listing line 5227 (presumably the declaration of `Mask`) is
// absent from this text.
5228 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5229 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5230}
5231
5232/// Returns a node that packs the LHS + RHS nodes together at half width.
5233/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
5234/// TODO: Add subvector splitting if/when we have a need for it.
5235static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5236 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
5237 bool PackHiHalf = false) {
5238 MVT OpVT = LHS.getSimpleValueType();
5239 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5240 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
5241 assert(OpVT == RHS.getSimpleValueType() &&
5242 VT.getSizeInBits() == OpVT.getSizeInBits() &&
5243 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
5244 "Unexpected PACK operand types");
5245 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
5246 "Unexpected PACK result type");
5247
5248 // Rely on vector shuffles for vXi64 -> vXi32 packing.
5249 if (EltSizeInBits == 32) {
5250 SmallVector<int> PackMask;
5251 int Offset = PackHiHalf ? 1 : 0;
5252 int NumElts = VT.getVectorNumElements();
5253 for (int I = 0; I != NumElts; I += 4) {
5254 PackMask.push_back(I + Offset);
5255 PackMask.push_back(I + Offset + 2);
5256 PackMask.push_back(I + Offset + NumElts);
5257 PackMask.push_back(I + Offset + NumElts + 2);
5258 }
5259 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
5260 DAG.getBitcast(VT, RHS), PackMask);
5261 }
5262
5263 // See if we already have sufficient leading bits for PACKSS/PACKUS.
5264 if (!PackHiHalf) {
5265 if (UsePackUS &&
5266 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
5267 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
5268 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5269
5270 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
5271 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
5272 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5273 }
5274
5275 // Fallback to sign/zero extending the requested half and pack.
5276 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
5277 if (UsePackUS) {
5278 if (PackHiHalf) {
5279 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
5280 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
5281 } else {
5282 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
5283 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
5284 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
5285 };
5286 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5287 };
5288
5289 if (!PackHiHalf) {
5290 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
5291 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
5292 }
5293 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
5294 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
5295 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5296}
5297
5298/// Return a vector_shuffle of the specified vector of zero or undef vector.
5299/// This produces a shuffle where the low element of V2 is swizzled into the
5300/// zero/undef vector, landing at element Idx.
5301/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
/// IsZero selects a zero vector (true) or an undef vector (false) as the first
/// shuffle input.
/// NOTE(review): listing line 5302 (return type, function name and the leading
/// parameters; the body uses `V2` and `Idx`) is absent from this text.
5303 bool IsZero,
5304 const X86Subtarget &Subtarget,
5305 SelectionDAG &DAG) {
5306 MVT VT = V2.getSimpleValueType();
5307 SDValue V1 = IsZero
5308 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5309 int NumElems = VT.getVectorNumElements();
5310 SmallVector<int, 16> MaskVec(NumElems);
5311 for (int i = 0; i != NumElems; ++i)
5312 // If this is the insertion idx, put the low elt of V2 here.
5313 MaskVec[i] = (i == Idx) ? NumElems : i;
5314 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5315}
5316
// Looks through a single X86ISD::Wrapper / X86ISD::WrapperRIP node on Ptr and
// returns the underlying ConstantPoolSDNode, or null if the (unwrapped) node
// is not a constant-pool node.
// NOTE(review): listing line 5317 (return type, function name and the `Ptr`
// parameter) is absent from this text.
5318 if (Ptr.getOpcode() == X86ISD::Wrapper ||
5319 Ptr.getOpcode() == X86ISD::WrapperRIP)
5320 Ptr = Ptr.getOperand(0);
5321 return dyn_cast<ConstantPoolSDNode>(Ptr);
5322}
5323
5324// TODO: Add support for non-zero offsets.
// Returns the constant value of the constant-pool node `CNode`, or null when
// there is no such node, it is a machine constant-pool entry, or its offset is
// non-zero.
// NOTE(review): listing lines 5325-5326 (the signature and the statement that
// defines `CNode`) are absent from this text.
5327 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5328 return nullptr;
5329 return CNode->getConstVal();
5330}
5331
// Returns the constant addressed by Load's base pointer, or null when Load is
// null or is not a normal load (per ISD::isNormalLoad).
// NOTE(review): listing line 5332 (return type, function name and the `Load`
// parameter) is absent from this text.
5333 if (!Load || !ISD::isNormalLoad(Load))
5334 return nullptr;
5335 return getTargetConstantFromBasePtr(Load->getBasePtr());
5336}
5337
5342
// Forwards to getTargetConstantFromNode for a load node; LD must be non-null.
// NOTE(review): listing line 5344 (the function name and the `LD` parameter)
// is absent from this text.
5343const Constant *
5345 assert(LD && "Unexpected null LoadSDNode");
5346 return getTargetConstantFromNode(LD);
5347}
5348
// Returns true when N is an ISD::VSELECT on a subtarget with AVX512 whose
// condition vector has i1 elements and whose operand 2 is an all-zeros build
// vector. Operands 0 and 2 of N are read before the opcode is checked.
// NOTE(review): listing line 5349 (return type, function name and the `N`
// parameter) is absent from this text.
5350 // Do not fold (vselect not(C), X, 0s) to (vselect C, 0s, X)
5351 SDValue Cond = N->getOperand(0);
5352 SDValue RHS = N->getOperand(2);
5353 EVT CondVT = Cond.getValueType();
5354 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
5355 CondVT.getVectorElementType() == MVT::i1 &&
5356 ISD::isBuildVectorAllZeros(RHS.getNode());
5357}
5358
5359// Extract raw constant bits from constant pools.
5360static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5361 APInt &UndefElts,
5362 SmallVectorImpl<APInt> &EltBits,
5363 bool AllowWholeUndefs = true,
5364 bool AllowPartialUndefs = false) {
5365 assert(EltBits.empty() && "Expected an empty EltBits vector");
5366
5368
5369 EVT VT = Op.getValueType();
5370 unsigned SizeInBits = VT.getSizeInBits();
5371 unsigned NumElts = SizeInBits / EltSizeInBits;
5372
5373 // Can't split constant.
5374 if ((SizeInBits % EltSizeInBits) != 0)
5375 return false;
5376
5377 // Bitcast a source array of element bits to the target size.
5378 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5379 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5380 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5381 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5382 "Constant bit sizes don't match");
5383
5384 // Don't split if we don't allow undef bits.
5385 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5386 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5387 return false;
5388
5389 // If we're already the right size, don't bother bitcasting.
5390 if (NumSrcElts == NumElts) {
5391 UndefElts = UndefSrcElts;
5392 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5393 return true;
5394 }
5395
5396 // Extract all the undef/constant element data and pack into single bitsets.
5397 APInt UndefBits(SizeInBits, 0);
5398 APInt MaskBits(SizeInBits, 0);
5399
5400 for (unsigned i = 0; i != NumSrcElts; ++i) {
5401 unsigned BitOffset = i * SrcEltSizeInBits;
5402 if (UndefSrcElts[i])
5403 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5404 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5405 }
5406
5407 // Split the undef/constant single bitset data into the target elements.
5408 UndefElts = APInt(NumElts, 0);
5409 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5410
5411 for (unsigned i = 0; i != NumElts; ++i) {
5412 unsigned BitOffset = i * EltSizeInBits;
5413 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5414
5415 // Only treat an element as UNDEF if all bits are UNDEF.
5416 if (UndefEltBits.isAllOnes()) {
5417 if (!AllowWholeUndefs)
5418 return false;
5419 UndefElts.setBit(i);
5420 continue;
5421 }
5422
5423 // If only some bits are UNDEF then treat them as zero (or bail if not
5424 // supported).
5425 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5426 return false;
5427
5428 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5429 }
5430 return true;
5431 };
5432
5433 // Collect constant bits and insert into mask/undef bit masks.
5434 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5435 unsigned UndefBitIndex) {
5436 if (!Cst)
5437 return false;
5438 if (isa<UndefValue>(Cst)) {
5439 Undefs.setBit(UndefBitIndex);
5440 return true;
5441 }
5442 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5443 Mask = APInt::getSplat(CInt->getType()->getPrimitiveSizeInBits(),
5444 CInt->getValue());
5445 return true;
5446 }
5447 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5448 Mask = APInt::getSplat(CFP->getType()->getPrimitiveSizeInBits(),
5449 CFP->getValueAPF().bitcastToAPInt());
5450 return true;
5451 }
5452 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5453 Type *Ty = CDS->getType();
5454 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5455 Type *EltTy = CDS->getElementType();
5456 bool IsInteger = EltTy->isIntegerTy();
5457 bool IsFP =
5458 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5459 if (!IsInteger && !IsFP)
5460 return false;
5461 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5462 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5463 if (IsInteger)
5464 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5465 else
5466 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5467 I * EltBits);
5468 return true;
5469 }
5470 return false;
5471 };
5472
5473 // Handle UNDEFs.
5474 if (Op.isUndef()) {
5475 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5476 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5477 return CastBitData(UndefSrcElts, SrcEltBits);
5478 }
5479
5480 // Extract scalar constant bits.
5481 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5482 APInt UndefSrcElts = APInt::getZero(1);
5483 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5484 return CastBitData(UndefSrcElts, SrcEltBits);
5485 }
5486 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5487 APInt UndefSrcElts = APInt::getZero(1);
5488 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5489 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5490 return CastBitData(UndefSrcElts, SrcEltBits);
5491 }
5492
5493 // Extract constant bits from build vector.
5494 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5495 BitVector Undefs;
5496 SmallVector<APInt> SrcEltBits;
5497 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5498 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5499 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5500 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5501 if (Undefs[I])
5502 UndefSrcElts.setBit(I);
5503 return CastBitData(UndefSrcElts, SrcEltBits);
5504 }
5505 }
5506
5507 // Extract constant bits from constant pool vector.
5508 if (auto *Cst = getTargetConstantFromNode(Op)) {
5509 Type *CstTy = Cst->getType();
5510 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5511 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5512 return false;
5513
5514 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5515 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5516 if ((SizeInBits % SrcEltSizeInBits) != 0)
5517 return false;
5518
5519 APInt UndefSrcElts(NumSrcElts, 0);
5520 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5521 for (unsigned i = 0; i != NumSrcElts; ++i)
5522 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5523 UndefSrcElts, i))
5524 return false;
5525
5526 return CastBitData(UndefSrcElts, SrcEltBits);
5527 }
5528
5529 // Extract constant bits from a broadcasted constant pool scalar.
5530 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5531 EltSizeInBits <= VT.getScalarSizeInBits()) {
5532 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5533 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5534 return false;
5535
5536 SDValue Ptr = MemIntr->getBasePtr();
5537 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5538 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5539 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5540
5541 APInt UndefSrcElts(NumSrcElts, 0);
5542 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5543 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5544 if (UndefSrcElts[0])
5545 UndefSrcElts.setBits(0, NumSrcElts);
5546 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5547 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5548 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5549 return CastBitData(UndefSrcElts, SrcEltBits);
5550 }
5551 }
5552 }
5553
5554 // Extract constant bits from a subvector broadcast.
5555 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5556 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5557 SDValue Ptr = MemIntr->getBasePtr();
5558 // The source constant may be larger than the subvector broadcast,
5559 // ensure we extract the correct subvector constants.
5560 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5561 Type *CstTy = Cst->getType();
5562 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5563 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5564 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5565 (SizeInBits % SubVecSizeInBits) != 0)
5566 return false;
5567 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5568 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5569 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5570 APInt UndefSubElts(NumSubElts, 0);
5571 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5572 APInt(CstEltSizeInBits, 0));
5573 for (unsigned i = 0; i != NumSubElts; ++i) {
5574 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5575 UndefSubElts, i))
5576 return false;
5577 for (unsigned j = 1; j != NumSubVecs; ++j)
5578 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5579 }
5580 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5581 UndefSubElts);
5582 return CastBitData(UndefSubElts, SubEltBits);
5583 }
5584 }
5585
5586 // Extract a rematerialized scalar constant insertion.
5587 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5588 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5589 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5590 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5591 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5592
5593 APInt UndefSrcElts(NumSrcElts, 0);
5594 SmallVector<APInt, 64> SrcEltBits;
5595 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5596 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5597 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5598 return CastBitData(UndefSrcElts, SrcEltBits);
5599 }
5600
5601 // Insert constant bits from a base and sub vector sources.
5602 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5603 // If bitcasts to larger elements we might lose track of undefs - don't
5604 // allow any to be safe.
5605 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5606 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5607
5608 APInt UndefSrcElts, UndefSubElts;
5609 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5610 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5611 UndefSubElts, EltSubBits,
5612 AllowWholeUndefs && AllowUndefs,
5613 AllowPartialUndefs && AllowUndefs) &&
5614 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5615 UndefSrcElts, EltSrcBits,
5616 AllowWholeUndefs && AllowUndefs,
5617 AllowPartialUndefs && AllowUndefs)) {
5618 unsigned BaseIdx = Op.getConstantOperandVal(2);
5619 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5620 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5621 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5622 return CastBitData(UndefSrcElts, EltSrcBits);
5623 }
5624 }
5625
5626 // Extract constant bits from a subvector's source.
5627 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5628 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5629 EltBits, AllowWholeUndefs,
5630 AllowPartialUndefs)) {
5631 EVT SrcVT = Op.getOperand(0).getValueType();
5632 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5633 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5634 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5635 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5636 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5637 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5638 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5639
5640 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5641 if ((BaseIdx + NumSubElts) != NumSrcElts)
5642 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5643 if (BaseIdx != 0)
5644 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5645 return true;
5646 }
5647
5648 // Extract constant bits from shuffle node sources.
5649 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5650 // TODO - support shuffle through bitcasts.
5651 if (EltSizeInBits != VT.getScalarSizeInBits())
5652 return false;
5653
5654 ArrayRef<int> Mask = SVN->getMask();
5655 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5656 llvm::any_of(Mask, [](int M) { return M < 0; }))
5657 return false;
5658
5659 APInt UndefElts0, UndefElts1;
5660 SmallVector<APInt, 32> EltBits0, EltBits1;
5661 if (isAnyInRange(Mask, 0, NumElts) &&
5662 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5663 UndefElts0, EltBits0, AllowWholeUndefs,
5664 AllowPartialUndefs))
5665 return false;
5666 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5667 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5668 UndefElts1, EltBits1, AllowWholeUndefs,
5669 AllowPartialUndefs))
5670 return false;
5671
5672 UndefElts = APInt::getZero(NumElts);
5673 for (int i = 0; i != (int)NumElts; ++i) {
5674 int M = Mask[i];
5675 if (M < 0) {
5676 UndefElts.setBit(i);
5677 EltBits.push_back(APInt::getZero(EltSizeInBits));
5678 } else if (M < (int)NumElts) {
5679 if (UndefElts0[M])
5680 UndefElts.setBit(i);
5681 EltBits.push_back(EltBits0[M]);
5682 } else {
5683 if (UndefElts1[M - NumElts])
5684 UndefElts.setBit(i);
5685 EltBits.push_back(EltBits1[M - NumElts]);
5686 }
5687 }
5688 return true;
5689 }
5690
5691 return false;
5692}
5693
5694namespace llvm {
5695namespace X86 {
5696bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5697 APInt UndefElts;
5698 SmallVector<APInt, 16> EltBits;
5700 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5701 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5702 int SplatIndex = -1;
5703 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5704 if (UndefElts[i])
5705 continue;
5706 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5707 SplatIndex = -1;
5708 break;
5709 }
5710 SplatIndex = i;
5711 }
5712 if (0 <= SplatIndex) {
5713 SplatVal = EltBits[SplatIndex];
5714 return true;
5715 }
5716 }
5717
5718 return false;
5719}
5720
5721int getRoundingModeX86(unsigned RM) {
5722 switch (static_cast<::llvm::RoundingMode>(RM)) {
5723 // clang-format off
5724 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5725 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5726 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5727 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5728 default: return X86::rmInvalid;
5729 // clang-format on
5730 }
5731}
5732
5733} // namespace X86
5734} // namespace llvm
5735
5737 unsigned MaskEltSizeInBits,
5739 APInt &UndefElts) {
5740 // Extract the raw target constant bits.
5741 SmallVector<APInt, 64> EltBits;
5742 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5743 EltBits, /* AllowWholeUndefs */ true,
5744 /* AllowPartialUndefs */ false))
5745 return false;
5746
5747 // Insert the extracted elements into the mask.
5748 for (const APInt &Elt : EltBits)
5749 RawMask.push_back(Elt.getZExtValue());
5750
5751 return true;
5752}
5753
5754static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5755 bool AllowUndefs) {
5756 APInt UndefElts;
5757 SmallVector<APInt, 64> EltBits;
5758 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5759 /*AllowWholeUndefs*/ AllowUndefs,
5760 /*AllowPartialUndefs*/ false))
5761 return false;
5762
5763 bool IsPow2OrUndef = true;
5764 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5765 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5766 return IsPow2OrUndef;
5767}
5768
5769// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5771 // TODO: don't always ignore oneuse constraints.
5772 V = peekThroughBitcasts(V);
5773 EVT VT = V.getValueType();
5774
5775 // Match not(xor X, -1) -> X.
5776 if (V.getOpcode() == ISD::XOR &&
5777 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5778 isAllOnesConstant(V.getOperand(1))))
5779 return V.getOperand(0);
5780
5781 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5782 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5783 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5784 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5785 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5786 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5787 V.getOperand(1));
5788 }
5789 }
5790
5791 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5792 if (V.getOpcode() == X86ISD::PCMPGT &&
5793 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5794 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5795 V.getOperand(0).hasOneUse()) {
5796 APInt UndefElts;
5797 SmallVector<APInt> EltBits;
5798 if (getTargetConstantBitsFromNode(V.getOperand(0),
5799 V.getScalarValueSizeInBits(), UndefElts,
5800 EltBits) &&
5801 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5802 // Don't fold min_signed_value -> (min_signed_value - 1)
5803 bool MinSigned = false;
5804 for (APInt &Elt : EltBits) {
5805 MinSigned |= Elt.isMinSignedValue();
5806 Elt -= 1;
5807 }
5808 if (!MinSigned) {
5809 SDLoc DL(V);
5810 MVT VT = V.getSimpleValueType();
5811 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5812 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5813 }
5814 }
5815 }
5816
5817 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5819 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5820 for (SDValue &CatOp : CatOps) {
5821 SDValue NotCat = IsNOT(CatOp, DAG);
5822 if (!NotCat)
5823 return SDValue();
5824 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5825 }
5826 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5827 }
5828
5829 // Match not(or(not(X),not(Y))) -> and(X, Y).
5830 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5831 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5832 // TODO: Handle cases with single NOT operand -> ANDNP
5833 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5834 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5835 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5836 DAG.getBitcast(VT, Op1));
5837 }
5838
5839 return SDValue();
5840}
5841
5842/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5843/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5844/// Note: This ignores saturation, so inputs must be checked first.
5846 bool Unary, unsigned NumStages = 1) {
5847 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5848 unsigned NumElts = VT.getVectorNumElements();
5849 unsigned NumLanes = VT.getSizeInBits() / 128;
5850 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5851 unsigned Offset = Unary ? 0 : NumElts;
5852 unsigned Repetitions = 1u << (NumStages - 1);
5853 unsigned Increment = 1u << NumStages;
5854 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5855
5856 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5857 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5858 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5859 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5860 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5861 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5862 }
5863 }
5864}
5865
5866// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5867static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5868 APInt &DemandedLHS, APInt &DemandedRHS) {
5869 int NumLanes = VT.getSizeInBits() / 128;
5870 int NumElts = DemandedElts.getBitWidth();
5871 int NumInnerElts = NumElts / 2;
5872 int NumEltsPerLane = NumElts / NumLanes;
5873 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5874
5875 DemandedLHS = APInt::getZero(NumInnerElts);
5876 DemandedRHS = APInt::getZero(NumInnerElts);
5877
5878 // Map DemandedElts to the packed operands.
5879 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5880 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5881 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5882 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5883 if (DemandedElts[OuterIdx])
5884 DemandedLHS.setBit(InnerIdx);
5885 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5886 DemandedRHS.setBit(InnerIdx);
5887 }
5888 }
5889}
5890
5891// Split the demanded elts of a HADD/HSUB node between its operands.
5892static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5893 APInt &DemandedLHS, APInt &DemandedRHS) {
5895 DemandedLHS, DemandedRHS);
5896 DemandedLHS |= DemandedLHS << 1;
5897 DemandedRHS |= DemandedRHS << 1;
5898}
5899
5900/// Calculates the shuffle mask corresponding to the target-specific opcode.
5901/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5902/// operands in \p Ops, and returns true.
5903/// Sets \p IsUnary to true if only one source is used. Note that this will set
5904/// IsUnary for shuffles which use a single input multiple times, and in those
5905/// cases it will adjust the mask to only have indices within that single input.
5906/// It is an error to call this with non-empty Mask/Ops vectors.
5907static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5909 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5910 if (!isTargetShuffle(N.getOpcode()))
5911 return false;
5912
5913 MVT VT = N.getSimpleValueType();
5914 unsigned NumElems = VT.getVectorNumElements();
5915 unsigned MaskEltSize = VT.getScalarSizeInBits();
5917 APInt RawUndefs;
5918 uint64_t ImmN;
5919
5920 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5921 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5922
5923 IsUnary = false;
5924 bool IsFakeUnary = false;
5925 switch (N.getOpcode()) {
5926 case X86ISD::BLENDI:
5927 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5928 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5929 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5930 DecodeBLENDMask(NumElems, ImmN, Mask);
5931 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5932 break;
5933 case X86ISD::SHUFP:
5934 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5935 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5936 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5937 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5938 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5939 break;
5940 case X86ISD::INSERTPS:
5941 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5942 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5943 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5944 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5945 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5946 break;
5947 case X86ISD::EXTRQI:
5948 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5949 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5950 isa<ConstantSDNode>(N.getOperand(2))) {
5951 int BitLen = N.getConstantOperandVal(1);
5952 int BitIdx = N.getConstantOperandVal(2);
5953 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5954 IsUnary = true;
5955 }
5956 break;
5957 case X86ISD::INSERTQI:
5958 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5959 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5960 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5961 isa<ConstantSDNode>(N.getOperand(3))) {
5962 int BitLen = N.getConstantOperandVal(2);
5963 int BitIdx = N.getConstantOperandVal(3);
5964 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5965 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5966 }
5967 break;
5968 case X86ISD::UNPCKH:
5969 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5970 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5971 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5972 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5973 break;
5974 case X86ISD::UNPCKL:
5975 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5976 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5977 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5978 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5979 break;
5980 case X86ISD::MOVHLPS:
5981 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5982 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5983 DecodeMOVHLPSMask(NumElems, Mask);
5984 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5985 break;
5986 case X86ISD::MOVLHPS:
5987 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5988 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5989 DecodeMOVLHPSMask(NumElems, Mask);
5990 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5991 break;
5992 case X86ISD::VALIGN:
5993 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5994 "Only 32-bit and 64-bit elements are supported!");
5995 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5996 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5997 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5998 DecodeVALIGNMask(NumElems, ImmN, Mask);
5999 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6000 Ops.push_back(N.getOperand(1));
6001 Ops.push_back(N.getOperand(0));
6002 break;
6003 case X86ISD::PALIGNR:
6004 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6005 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6006 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6007 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6008 DecodePALIGNRMask(NumElems, ImmN, Mask);
6009 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6010 Ops.push_back(N.getOperand(1));
6011 Ops.push_back(N.getOperand(0));
6012 break;
6013 case X86ISD::VSHLDQ:
6014 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6015 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6016 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6017 DecodePSLLDQMask(NumElems, ImmN, Mask);
6018 IsUnary = true;
6019 break;
6020 case X86ISD::VSRLDQ:
6021 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6022 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6023 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6024 DecodePSRLDQMask(NumElems, ImmN, Mask);
6025 IsUnary = true;
6026 break;
6027 case X86ISD::PSHUFD:
6028 case X86ISD::VPERMILPI:
6029 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6030 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6031 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
6032 IsUnary = true;
6033 break;
6034 case X86ISD::PSHUFHW:
6035 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6036 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6037 DecodePSHUFHWMask(NumElems, ImmN, Mask);
6038 IsUnary = true;
6039 break;
6040 case X86ISD::PSHUFLW:
6041 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6042 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6043 DecodePSHUFLWMask(NumElems, ImmN, Mask);
6044 IsUnary = true;
6045 break;
6046 case X86ISD::VZEXT_MOVL:
6047 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6048 DecodeZeroMoveLowMask(NumElems, Mask);
6049 IsUnary = true;
6050 break;
6051 case X86ISD::VBROADCAST:
6052 // We only decode broadcasts of same-sized vectors, peeking through to
6053 // extracted subvectors is likely to cause hasOneUse issues with
6054 // SimplifyDemandedBits etc.
6055 if (N.getOperand(0).getValueType() == VT) {
6056 DecodeVectorBroadcast(NumElems, Mask);
6057 IsUnary = true;
6058 break;
6059 }
6060 return false;
6061 case X86ISD::VPERMILPV: {
6062 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6063 IsUnary = true;
6064 SDValue MaskNode = N.getOperand(1);
6065 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6066 RawUndefs)) {
6067 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6068 break;
6069 }
6070 return false;
6071 }
6072 case X86ISD::PSHUFB: {
6073 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6074 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6075 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6076 IsUnary = true;
6077 SDValue MaskNode = N.getOperand(1);
6078 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6079 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6080 break;
6081 }
6082 return false;
6083 }
6084 case X86ISD::VPERMI:
6085 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6086 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6087 DecodeVPERMMask(NumElems, ImmN, Mask);
6088 IsUnary = true;
6089 break;
6090 case X86ISD::MOVSS:
6091 case X86ISD::MOVSD:
6092 case X86ISD::MOVSH:
6093 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6094 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6095 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6096 break;
6097 case X86ISD::VPERM2X128:
6098 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6099 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6100 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6101 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
6102 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6103 break;
6104 case X86ISD::SHUF128:
6105 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6106 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6107 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6108 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
6109 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6110 break;
6111 case X86ISD::MOVSLDUP:
6112 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6113 DecodeMOVSLDUPMask(NumElems, Mask);
6114 IsUnary = true;
6115 break;
6116 case X86ISD::MOVSHDUP:
6117 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6118 DecodeMOVSHDUPMask(NumElems, Mask);
6119 IsUnary = true;
6120 break;
6121 case X86ISD::MOVDDUP:
6122 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6123 DecodeMOVDDUPMask(NumElems, Mask);
6124 IsUnary = true;
6125 break;
6126 case X86ISD::VPERMIL2: {
6127 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6128 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6129 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6130 SDValue MaskNode = N.getOperand(2);
6131 SDValue CtrlNode = N.getOperand(3);
6132 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6133 unsigned CtrlImm = CtrlOp->getZExtValue();
6134 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6135 RawUndefs)) {
6136 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6137 Mask);
6138 break;
6139 }
6140 }
6141 return false;
6142 }
6143 case X86ISD::VPPERM: {
6144 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6145 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6146 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6147 SDValue MaskNode = N.getOperand(2);
6148 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6149 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6150 break;
6151 }
6152 return false;
6153 }
6154 case X86ISD::VPERMV: {
6155 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6156 IsUnary = true;
6157 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6158 Ops.push_back(N.getOperand(1));
6159 SDValue MaskNode = N.getOperand(0);
6160 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6161 RawUndefs)) {
6162 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6163 break;
6164 }
6165 return false;
6166 }
6167 case X86ISD::VPERMV3: {
6168 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6169 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
6170 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
6171 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6172 Ops.push_back(N.getOperand(0));
6173 Ops.push_back(N.getOperand(2));
6174 SDValue MaskNode = N.getOperand(1);
6175 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6176 RawUndefs)) {
6177 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6178 break;
6179 }
6180 return false;
6181 }
6182 case X86ISD::COMPRESS: {
6183 SDValue CmpVec = N.getOperand(0);
6184 SDValue PassThru = N.getOperand(1);
6185 SDValue CmpMask = N.getOperand(2);
6186 APInt UndefElts;
6187 SmallVector<APInt> EltBits;
6188 if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
6189 return false;
6190 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6191 "Illegal compression mask");
6192 for (unsigned I = 0; I != NumElems; ++I) {
6193 if (!EltBits[I].isZero())
6194 Mask.push_back(I);
6195 }
6196 while (Mask.size() != NumElems) {
6197 Mask.push_back(NumElems + Mask.size());
6198 }
6199 Ops.push_back(CmpVec);
6200 Ops.push_back(PassThru);
6201 return true;
6202 }
6203 case X86ISD::EXPAND: {
6204 SDValue ExpVec = N.getOperand(0);
6205 SDValue PassThru = N.getOperand(1);
6206 SDValue ExpMask = N.getOperand(2);
6207 APInt UndefElts;
6208 SmallVector<APInt> EltBits;
6209 if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
6210 return false;
6211 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6212 "Illegal expansion mask");
6213 unsigned ExpIndex = 0;
6214 for (unsigned I = 0; I != NumElems; ++I) {
6215 if (EltBits[I].isZero())
6216 Mask.push_back(I + NumElems);
6217 else
6218 Mask.push_back(ExpIndex++);
6219 }
6220 Ops.push_back(ExpVec);
6221 Ops.push_back(PassThru);
6222 return true;
6223 }
6224 default:
6225 llvm_unreachable("unknown target shuffle node");
6226 }
6227
6228 // Empty mask indicates the decode failed.
6229 if (Mask.empty())
6230 return false;
6231
6232 // Check if we're getting a shuffle mask with zero'd elements.
6233 if (!AllowSentinelZero && isAnyZero(Mask))
6234 return false;
6235
6236 // If we have a fake unary shuffle, the shuffle mask is spread across two
6237 // inputs that are actually the same node. Re-map the mask to always point
6238 // into the first input.
6239 if (IsFakeUnary)
6240 for (int &M : Mask)
6241 if (M >= (int)Mask.size())
6242 M -= Mask.size();
6243
6244 // If we didn't already add operands in the opcode-specific code, default to
6245 // adding 1 or 2 operands starting at 0.
6246 if (Ops.empty()) {
6247 Ops.push_back(N.getOperand(0));
6248 if (!IsUnary || IsFakeUnary)
6249 Ops.push_back(N.getOperand(1));
6250 }
6251
6252 return true;
6253}
6254
6255// Wrapper for getTargetShuffleMask with InUnary;
6256static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
6258 SmallVectorImpl<int> &Mask) {
6259 bool IsUnary;
6260 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
6261}
6262
6263/// Compute whether each element of a shuffle is zeroable.
6264///
6265/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6266/// Either it is an undef element in the shuffle mask, the element of the input
6267/// referenced is undef, or the element of the input referenced is known to be
6268/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6269/// as many lanes with this technique as possible to simplify the remaining
6270/// shuffle.
6272 SDValue V1, SDValue V2,
6273 APInt &KnownUndef, APInt &KnownZero) {
6274 int Size = Mask.size();
6275 KnownUndef = KnownZero = APInt::getZero(Size);
6276
6277 V1 = peekThroughBitcasts(V1);
6278 V2 = peekThroughBitcasts(V2);
6279
6280 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6281 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6282
6283 int VectorSizeInBits = V1.getValueSizeInBits();
6284 int ScalarSizeInBits = VectorSizeInBits / Size;
6285 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6286
6287 for (int i = 0; i < Size; ++i) {
6288 int M = Mask[i];
6289 // Handle the easy cases.
6290 if (M < 0) {
6291 KnownUndef.setBit(i);
6292 continue;
6293 }
6294 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6295 KnownZero.setBit(i);
6296 continue;
6297 }
6298
6299 // Determine shuffle input and normalize the mask.
6300 SDValue V = M < Size ? V1 : V2;
6301 M %= Size;
6302
6303 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6304 if (V.getOpcode() != ISD::BUILD_VECTOR)
6305 continue;
6306
6307 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
6308 // the (larger) source element must be UNDEF/ZERO.
6309 if ((Size % V.getNumOperands()) == 0) {
6310 int Scale = Size / V->getNumOperands();
6311 SDValue Op = V.getOperand(M / Scale);
6312 if (Op.isUndef())
6313 KnownUndef.setBit(i);
6314 if (X86::isZeroNode(Op))
6315 KnownZero.setBit(i);
6316 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6317 APInt Val = Cst->getAPIntValue();
6318 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6319 if (Val == 0)
6320 KnownZero.setBit(i);
6321 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6322 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6323 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6324 if (Val == 0)
6325 KnownZero.setBit(i);
6326 }
6327 continue;
6328 }
6329
6330 // If the BUILD_VECTOR has more elements then all the (smaller) source
6331 // elements must be UNDEF or ZERO.
6332 if ((V.getNumOperands() % Size) == 0) {
6333 int Scale = V->getNumOperands() / Size;
6334 bool AllUndef = true;
6335 bool AllZero = true;
6336 for (int j = 0; j < Scale; ++j) {
6337 SDValue Op = V.getOperand((M * Scale) + j);
6338 AllUndef &= Op.isUndef();
6339 AllZero &= X86::isZeroNode(Op);
6340 }
6341 if (AllUndef)
6342 KnownUndef.setBit(i);
6343 if (AllZero)
6344 KnownZero.setBit(i);
6345 continue;
6346 }
6347 }
6348}
6349
6350 /// Decode a target shuffle mask and inputs and see if any values are
6351 /// known to be undef or zero from their inputs.
6352 /// Returns true if the target shuffle mask was decoded.
/// On success KnownUndef and KnownZero are reset to one bit per decoded mask
/// element; bit i is set when mask element i is known to be undef / zero.
/// Both early 'return false' paths leave KnownUndef and KnownZero untouched.
6353 /// FIXME: Merge this with computeZeroableShuffleElements?
6356 APInt &KnownUndef, APInt &KnownZero) {
6357 bool IsUnary;
6358 if (!isTargetShuffle(N.getOpcode()))
6359 return false;
6360
6361 MVT VT = N.getSimpleValueType();
6362 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
6363 return false;
6364
6365 int Size = Mask.size();
6366 SDValue V1 = Ops[0];
// A unary shuffle uses its single input for both sources.
6367 SDValue V2 = IsUnary ? V1 : Ops[1];
6368 KnownUndef = KnownZero = APInt::getZero(Size);
6369
6370 V1 = peekThroughBitcasts(V1);
6371 V2 = peekThroughBitcasts(V2);
6372
6373 assert((VT.getSizeInBits() % Size) == 0 &&
6374 "Illegal split of shuffle value type");
// Width of one mask element, which may differ from VT's scalar width.
6375 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
6376
6377 // Extract known constant input data.
// IsSrcConstant[k] records whether the extraction succeeded for input k
// (0 = V1, 1 = V2); UndefSrcElts/SrcEltBits are only consulted when it did.
6378 APInt UndefSrcElts[2];
6379 SmallVector<APInt, 32> SrcEltBits[2];
6380 bool IsSrcConstant[2] = {
6381 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6382 SrcEltBits[0], /*AllowWholeUndefs*/ true,
6383 /*AllowPartialUndefs*/ false),
6384 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6385 SrcEltBits[1], /*AllowWholeUndefs*/ true,
6386 /*AllowPartialUndefs*/ false)};
6387
6388 for (int i = 0; i < Size; ++i) {
6389 int M = Mask[i];
6390
6391 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6392 if (M < 0) {
6393 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
6394 if (SM_SentinelUndef == M)
6395 KnownUndef.setBit(i);
6396 if (SM_SentinelZero == M)
6397 KnownZero.setBit(i);
6398 continue;
6399 }
6400
6401 // Determine shuffle input and normalize the mask.
// SrcIdx selects the constant-data slot (0 for V1, 1 for V2); after the
// modulo, M is the element index within that input.
6402 unsigned SrcIdx = M / Size;
6403 SDValue V = M < Size ? V1 : V2;
6404 M %= Size;
6405
6406 // We are referencing an UNDEF input.
6407 if (V.isUndef()) {
6408 KnownUndef.setBit(i);
6409 continue;
6410 }
6411
6412 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6413 // TODO: We currently only set UNDEF for integer types - floats use the same
6414 // registers as vectors and many of the scalar folded loads rely on the
6415 // SCALAR_TO_VECTOR pattern.
6416 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6417 (Size % V.getValueType().getVectorNumElements()) == 0) {
// Scale = number of mask elements covering one element of V; Idx is the
// element of V that mask index M falls into.
6418 int Scale = Size / V.getValueType().getVectorNumElements();
6419 int Idx = M / Scale;
6420 if (Idx != 0 && !VT.isFloatingPoint())
6421 KnownUndef.setBit(i);
6422 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6423 KnownZero.setBit(i);
6424 continue;
6425 }
6426
6427 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6428 // base vectors.
6429 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6430 SDValue Vec = V.getOperand(0);
6431 int NumVecElts = Vec.getValueType().getVectorNumElements();
6432 if (Vec.isUndef() && Size == NumVecElts) {
6433 int Idx = V.getConstantOperandVal(2);
6434 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
// Elements outside [Idx, Idx + NumSubElts) come from the undef base.
6435 if (M < Idx || (Idx + NumSubElts) <= M)
6436 KnownUndef.setBit(i);
6437 }
6438 continue;
6439 }
6440
6441 // Attempt to extract from the source's constant bits.
6442 if (IsSrcConstant[SrcIdx]) {
6443 if (UndefSrcElts[SrcIdx][M])
6444 KnownUndef.setBit(i);
6445 else if (SrcEltBits[SrcIdx][M] == 0)
6446 KnownZero.setBit(i);
6447 }
6448 }
6449
6450 assert(VT.getVectorNumElements() == (unsigned)Size &&
6451 "Different mask size from vector size!");
6452 return true;
6453 }
6454
6455 // Replace target shuffle mask elements with known undef/zero sentinels.
// KnownUndef and KnownZero must each have exactly one bit per mask element.
// A known-undef element always becomes SM_SentinelUndef; a known-zero element
// becomes SM_SentinelZero only when ResolveKnownZeros is set. Undef takes
// priority when both bits are set for the same element.
6457 const APInt &KnownUndef,
6458 const APInt &KnownZero,
6459 bool ResolveKnownZeros= true) {
6460 unsigned NumElts = Mask.size();
6461 assert(KnownUndef.getBitWidth() == NumElts &&
6462 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6463
6464 for (unsigned i = 0; i != NumElts; ++i) {
6465 if (KnownUndef[i])
6466 Mask[i] = SM_SentinelUndef;
6467 else if (ResolveKnownZeros && KnownZero[i])
6468 Mask[i] = SM_SentinelZero;
6469 }
6470 }
6471
6472 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
// KnownUndef and KnownZero are overwritten with Mask.size()-bit values: bit i
// is set in KnownUndef when Mask[i] is SM_SentinelUndef and in KnownZero when
// Mask[i] is SM_SentinelZero. The mask itself is only read.
6474 APInt &KnownUndef,
6475 APInt &KnownZero) {
6476 unsigned NumElts = Mask.size();
6477 KnownUndef = KnownZero = APInt::getZero(NumElts);
6478
6479 for (unsigned i = 0; i != NumElts; ++i) {
6480 int M = Mask[i];
6481 if (SM_SentinelUndef == M)
6482 KnownUndef.setBit(i);
6483 if (SM_SentinelZero == M)
6484 KnownZero.setBit(i);
6485 }
6486 }
6487
6488 // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
// Returns false if constant bits cannot be extracted from Cond. Otherwise Mask
// is resized to one entry per condition element: entry i is either i (take
// the element from the first selected operand) or i + NumElts (take it from
// the second).
// With IsBLENDV only the sign bit of each condition element decides (a
// non-negative element picks the second operand); otherwise an all-zero
// element picks the second operand.
6490 SDValue Cond, bool IsBLENDV = false) {
6491 EVT CondVT = Cond.getValueType();
6492 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6493 unsigned NumElts = CondVT.getVectorNumElements();
6494
6495 APInt UndefElts;
6496 SmallVector<APInt, 32> EltBits;
6497 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6498 /*AllowWholeUndefs*/ true,
6499 /*AllowPartialUndefs*/ false))
6500 return false;
6501
6502 Mask.resize(NumElts, SM_SentinelUndef);
6503
6504 for (int i = 0; i != (int)NumElts; ++i) {
// Default to the first operand, then redirect to the second if needed.
6505 Mask[i] = i;
6506 // Arbitrarily choose from the 2nd operand if the select condition element
6507 // is undef.
6508 // TODO: Can we do better by matching patterns such as even/odd?
6509 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6510 (IsBLENDV && EltBits[i].isNonNegative()))
6511 Mask[i] += NumElts;
6512 }
6513
6514 return true;
6515 }
6516
6517 // Forward declaration (for getFauxShuffleMask recursive check).
// getFauxShuffleMask calls this overload on operands of the node it is
// decoding (passing Depth + 1) before the definition appears in the file.
6518 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6521 const SelectionDAG &DAG, unsigned Depth,
6522 bool ResolveKnownElts);
6523
6524 // Attempt to decode ops that could be represented as a shuffle mask.
6525 // The decoded shuffle mask may contain a different number of elements to the
6526 // destination value type.
// Mask and Ops are cleared on entry. Returns true when N was decoded, with the
// shuffle mask in Mask and the shuffle inputs in Ops. When false is returned
// the contents of Mask/Ops are not meaningful (some cases append entries
// before bailing out).
// DemandedElts has one bit per element of N's value type; several cases use it
// to ignore elements the caller does not need.
6527 // TODO: Merge into getTargetShuffleInputs()
6528 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6531 const SelectionDAG &DAG, unsigned Depth,
6532 bool ResolveKnownElts) {
6533 Mask.clear();
6534 Ops.clear();
6535
6536 MVT VT = N.getSimpleValueType();
6537 unsigned NumElts = VT.getVectorNumElements();
6538 unsigned NumSizeInBits = VT.getSizeInBits();
6539 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// Only byte-granular element and vector sizes are handled.
6540 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6541 return false;
6542 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6543 unsigned NumSizeInBytes = NumSizeInBits / 8;
6544 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6545
6546 unsigned Opcode = N.getOpcode();
6547 switch (Opcode) {
6548 case ISD::VECTOR_SHUFFLE: {
6549 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6550 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6551 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6552 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6553 Ops.push_back(N.getOperand(0));
6554 Ops.push_back(N.getOperand(1));
6555 return true;
6556 }
6557 return false;
6558 }
6559 case ISD::AND:
6560 case X86ISD::ANDNP: {
6561 // Attempt to decode as a per-byte mask.
6562 APInt UndefElts;
6563 SmallVector<APInt, 32> EltBits;
6564 SDValue N0 = N.getOperand(0);
6565 SDValue N1 = N.getOperand(1);
6566 bool IsAndN = (X86ISD::ANDNP == Opcode);
// The constant mask is operand 1 for AND and operand 0 for ANDNP; ZeroMask
// is the constant byte value that forces the corresponding result byte to
// zero (0x00 for AND, 0xFF for ANDNP).
6567 uint64_t ZeroMask = IsAndN ? 255 : 0;
6568 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6569 /*AllowWholeUndefs*/ false,
6570 /*AllowPartialUndefs*/ false))
6571 return false;
6572 // We can't assume an undef src element gives an undef dst - the other src
6573 // might be zero.
6574 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6575 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6576 const APInt &ByteBits = EltBits[i];
// Any byte that is neither all-zeros nor all-ones cannot be a shuffle.
6577 if (ByteBits != 0 && ByteBits != 255)
6578 return false;
6579 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6580 }
6581 Ops.push_back(IsAndN ? N1 : N0);
6582 return true;
6583 }
6584 case ISD::OR: {
6585 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6586 // is a valid shuffle index.
6587 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6588 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6589 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6590 return false;
6591
6592 SmallVector<int, 64> SrcMask0, SrcMask1;
6593 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6596 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6597 Depth + 1, true) ||
6598 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6599 Depth + 1, true))
6600 return false;
6601
// Bring both decoded masks to the finer of the two granularities.
6602 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6603 SmallVector<int, 64> Mask0, Mask1;
6604 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6605 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6606 for (int i = 0; i != (int)MaskSize; ++i) {
6607 // NOTE: Don't handle demanded SM_SentinelUndef, as we can end up in
6608 // infinite loops converting between OR and BLEND shuffles due to
6609 // canWidenShuffleElements merging away undef elements, meaning we
6610 // fail to recognise the OR as the undef element isn't known zero.
6611 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6612 Mask.push_back(SM_SentinelZero);
6613 else if (Mask1[i] == SM_SentinelZero)
6614 Mask.push_back(i);
6615 else if (Mask0[i] == SM_SentinelZero)
6616 Mask.push_back(i + MaskSize);
6617 else if (MaskSize == NumElts && !DemandedElts[i])
6618 Mask.push_back(SM_SentinelUndef);
6619 else
6620 return false;
6621 }
// The resulting blend is expressed over the original OR operands.
6622 Ops.push_back(N.getOperand(0));
6623 Ops.push_back(N.getOperand(1));
6624 return true;
6625 }
6626 case ISD::CONCAT_VECTORS: {
6627 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6628 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6629 if (NumBitsPerElt == 64) {
// Operand I becomes shuffle input I; its NumSubElts elements are
// referenced at the start of that input's NumElts-wide index range.
6630 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6631 for (unsigned M = 0; M != NumSubElts; ++M)
6632 Mask.push_back((I * NumElts) + M);
6633 Ops.push_back(N.getOperand(I));
6634 }
6635 return true;
6636 }
6637 return false;
6638 }
6639 case ISD::INSERT_SUBVECTOR: {
6640 SDValue Src = N.getOperand(0);
6641 SDValue Sub = N.getOperand(1);
6642 EVT SubVT = Sub.getValueType();
6643 unsigned NumSubElts = SubVT.getVectorNumElements();
6644 uint64_t InsertIdx = N.getConstantOperandVal(2);
6645 // Subvector isn't demanded - just return the base vector.
6646 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6647 Mask.resize(NumElts);
6648 std::iota(Mask.begin(), Mask.end(), 0);
6649 Ops.push_back(Src);
6650 return true;
6651 }
6652 // Handle CONCAT(SUB0, SUB1).
6653 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6654 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6655 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6656 Src.getOperand(0).isUndef() &&
6657 Src.getOperand(1).getValueType() == SubVT &&
6658 Src.getConstantOperandVal(2) == 0 &&
6659 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6660 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
// Lower half reads input 0 (SUB0), upper half reads input 1 (SUB1).
6661 Mask.resize(NumElts);
6662 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6663 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6664 Ops.push_back(Src.getOperand(1));
6665 Ops.push_back(Sub);
6666 return true;
6667 }
6668 // Handle INSERT_SUBVECTOR(UNDEF, SUB, IDX) iff IDX != 0
6669 if (InsertIdx != 0 && Src.isUndef() &&
6671 Mask.assign(NumElts, SM_SentinelUndef);
6672 std::iota(Mask.begin() + InsertIdx, Mask.begin() + InsertIdx + NumSubElts,
6673 0);
6674 Ops.push_back(Sub);
6675 return true;
6676 }
6677 if (!N->isOnlyUserOf(Sub.getNode()))
6678 return false;
6679
6680 SmallVector<int, 64> SubMask;
6681 SmallVector<SDValue, 2> SubInputs;
6683 EVT SubSrcVT = SubSrc.getValueType();
6684 if (!SubSrcVT.isVector())
6685 return false;
6686
6687 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6688 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6689 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6690 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6691 SDValue SubSrcSrc = SubSrc.getOperand(0);
6692 unsigned NumSubSrcSrcElts =
6693 SubSrcSrc.getValueType().getVectorNumElements();
// Work at the finer element granularity of the two vector types and
// rescale the insert/extract indices and subvector length to match.
6694 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6695 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6696 "Subvector valuetype mismatch");
6697 InsertIdx *= (MaxElts / NumElts);
6698 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6699 NumSubElts *= (MaxElts / NumElts);
6700 bool SrcIsUndef = Src.isUndef();
6701 for (int i = 0; i != (int)MaxElts; ++i)
6702 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
// With an undef base the extracted source is the only input (index
// base 0); otherwise it is the second input (index base MaxElts).
6703 for (int i = 0; i != (int)NumSubElts; ++i)
6704 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6705 if (!SrcIsUndef)
6706 Ops.push_back(Src);
6707 Ops.push_back(SubSrcSrc);
6708 return true;
6709 }
6710
6711 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6712 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6713 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6714 Depth + 1, ResolveKnownElts))
6715 return false;
6716
6717 // Subvector shuffle inputs must not be larger than the subvector.
6718 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6719 return SubVT.getFixedSizeInBits() <
6720 SubInput.getValueSizeInBits().getFixedValue();
6721 }))
6722 return false;
6723
6724 if (SubMask.size() != NumSubElts) {
6725 assert(((SubMask.size() % NumSubElts) == 0 ||
6726 (NumSubElts % SubMask.size()) == 0) &&
6727 "Illegal submask scale");
6728 if ((NumSubElts % SubMask.size()) == 0) {
// The sub-shuffle mask is coarser: split its elements to match.
6729 int Scale = NumSubElts / SubMask.size();
6730 SmallVector<int, 64> ScaledSubMask;
6731 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6732 SubMask = ScaledSubMask;
6733 } else {
// The sub-shuffle mask is finer: rescale the outer mask parameters.
6734 int Scale = SubMask.size() / NumSubElts;
6735 NumSubElts = SubMask.size();
6736 NumElts *= Scale;
6737 InsertIdx *= Scale;
6738 }
6739 }
// Input 0 is the base vector, followed by the sub-shuffle's inputs.
6740 Ops.push_back(Src);
6741 Ops.append(SubInputs.begin(), SubInputs.end());
6742 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6743 Mask.append(NumElts, SM_SentinelZero);
6744 else
6745 for (int i = 0; i != (int)NumElts; ++i)
6746 Mask.push_back(i);
6747 for (int i = 0; i != (int)NumSubElts; ++i) {
6748 int M = SubMask[i];
6749 if (0 <= M) {
// Remap from the sub-shuffle's input numbering (NumSubElts per input)
// to the outer numbering (NumElts per input, shifted past the base).
6750 int InputIdx = M / NumSubElts;
6751 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6752 }
6753 Mask[i + InsertIdx] = M;
6754 }
6755 return true;
6756 }
6757 case X86ISD::PINSRB:
6758 case X86ISD::PINSRW:
6761 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6762 // vector, for matching src/dst vector types.
6763 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6764
6765 unsigned DstIdx = 0;
6766 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6767 // Check we have an in-range constant insertion index.
6768 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6769 N.getConstantOperandAPInt(2).uge(NumElts))
6770 return false;
6771 DstIdx = N.getConstantOperandVal(2);
6772
6773 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6774 if (X86::isZeroNode(Scl)) {
6775 Ops.push_back(N.getOperand(0));
6776 for (unsigned i = 0; i != NumElts; ++i)
6777 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6778 return true;
6779 }
6780 }
6781
6782 // Peek through trunc/aext/zext/bitcast.
6783 // TODO: aext shouldn't require SM_SentinelZero padding.
6784 // TODO: handle shift of scalars.
// MinBitsPerElt tracks the narrowest scalar width seen along the chain.
6785 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6786 while (Scl.getOpcode() == ISD::TRUNCATE ||
6787 Scl.getOpcode() == ISD::ANY_EXTEND ||
6788 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6789 (Scl.getOpcode() == ISD::BITCAST &&
6792 Scl = Scl.getOperand(0);
6793 MinBitsPerElt =
6794 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6795 }
6796 if ((MinBitsPerElt % 8) != 0)
6797 return false;
6798
6799 // Attempt to find the source vector the scalar was extracted from.
6800 SDValue SrcExtract;
6801 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6802 Scl.getOpcode() == X86ISD::PEXTRW ||
6803 Scl.getOpcode() == X86ISD::PEXTRB) &&
6804 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6805 SrcExtract = Scl;
6806 }
6807 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6808 return false;
6809
6810 SDValue SrcVec = SrcExtract.getOperand(0);
6811 EVT SrcVT = SrcVec.getValueType();
6812 if (!SrcVT.getScalarType().isByteSized())
6813 return false;
// Byte offsets of the extracted element in the source vector and of the
// insertion slot in the result.
6814 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6815 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6816 unsigned DstByte = DstIdx * NumBytesPerElt;
6817 MinBitsPerElt =
6818 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6819
6820 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6821 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6822 Ops.push_back(SrcVec);
6823 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6824 } else {
// Input 0 is the extract source, input 1 the vector inserted into; the
// identity mask therefore indexes input 1.
6825 Ops.push_back(SrcVec);
6826 Ops.push_back(N.getOperand(0));
6827 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6828 Mask.push_back(NumSizeInBytes + i);
6829 }
6830
// Copy the surviving low bytes of the scalar and zero the remainder of the
// destination element.
6831 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6832 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6833 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6834 Mask[DstByte + i] = SrcByte + i;
6835 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6836 Mask[DstByte + i] = SM_SentinelZero;
6837 return true;
6838 }
6839 case X86ISD::PACKSS:
6840 case X86ISD::PACKUS: {
6841 SDValue N0 = N.getOperand(0);
6842 SDValue N1 = N.getOperand(1);
6843 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6844 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6845 "Unexpected input value type");
6846
6847 APInt EltsLHS, EltsRHS;
6848 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6849
6850 // If we know input saturation won't happen (or we don't care for particular
6851 // lanes), we can treat this as a truncation shuffle.
6852 bool Offset0 = false, Offset1 = false;
6853 if (Opcode == X86ISD::PACKSS) {
6854 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6855 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6856 (!(N1.isUndef() || EltsRHS.isZero()) &&
6857 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6858 return false;
6859 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6860 // PACKSS then it was likely being used for sign-extension for a
6861 // truncation, so just peek through and adjust the mask accordingly.
6862 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6863 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6864 Offset0 = true;
6865 N0 = N0.getOperand(0);
6866 }
6867 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6868 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6869 Offset1 = true;
6870 N1 = N1.getOperand(0);
6871 }
6872 } else {
// PACKUS: the upper half of every demanded source element must be zero.
6873 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6874 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6875 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6876 (!(N1.isUndef() || EltsRHS.isZero()) &&
6877 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6878 return false;
6879 }
6880
6881 bool IsUnary = (N0 == N1);
6882
6883 Ops.push_back(N0);
6884 if (!IsUnary)
6885 Ops.push_back(N1);
6886
6887 createPackShuffleMask(VT, Mask, IsUnary);
6888
// Where a shift was peeked through, step each index referring to that
// input up by one element so it reads the upper half of the source element.
6889 if (Offset0 || Offset1) {
6890 for (int &M : Mask)
6891 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6892 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6893 ++M;
6894 }
6895 return true;
6896 }
6897 case ISD::VSELECT:
6898 case X86ISD::BLENDV: {
// A decodable condition turns the select into a blend of operands 1 and 2.
6899 SDValue Cond = N.getOperand(0);
6900 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6901 Ops.push_back(N.getOperand(1));
6902 Ops.push_back(N.getOperand(2));
6903 return true;
6904 }
6905 return false;
6906 }
6907 case X86ISD::VTRUNC: {
6908 SDValue Src = N.getOperand(0);
6909 EVT SrcVT = Src.getValueType();
// Only handle truncations whose source is as wide as the result vector.
6910 if (SrcVT.getSizeInBits() != NumSizeInBits)
6911 return false;
6912 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6913 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6914 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6915 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
// Take the first narrow element of each source element, then zero-fill
// the remaining result elements.
6916 for (unsigned i = 0; i != NumSrcElts; ++i)
6917 Mask.push_back(i * Scale);
6918 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6919 Ops.push_back(Src);
6920 return true;
6921 }
6922 case ISD::SHL:
6923 case ISD::SRL: {
6924 APInt UndefElts;
6925 SmallVector<APInt, 32> EltBits;
6926 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6927 UndefElts, EltBits,
6928 /*AllowWholeUndefs*/ true,
6929 /*AllowPartialUndefs*/ false))
6930 return false;
6931
6932 // We can only decode 'whole byte' bit shifts as shuffles.
6933 for (unsigned I = 0; I != NumElts; ++I)
6934 if (DemandedElts[I] && !UndefElts[I] &&
6935 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6936 return false;
6937
// Byte-level mask; elements that are not demanded or have an undef shift
// amount stay undef.
6938 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6939 Ops.push_back(N.getOperand(0));
6940
6941 for (unsigned I = 0; I != NumElts; ++I) {
6942 if (!DemandedElts[I] || UndefElts[I])
6943 continue;
6944 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6945 unsigned Lo = I * NumBytesPerElt;
6946 unsigned Hi = Lo + NumBytesPerElt;
6947 // Clear mask to all zeros and insert the shifted byte indices.
6948 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6949 if (ISD::SHL == Opcode)
6950 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6951 else
6952 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6953 Lo + ByteShift);
6954 }
6955 return true;
6956 }
6957 case X86ISD::VSHLI:
6958 case X86ISD::VSRLI: {
6959 uint64_t ShiftVal = N.getConstantOperandVal(1);
6960 // Out of range bit shifts are guaranteed to be zero.
6961 if (NumBitsPerElt <= ShiftVal) {
6962 Mask.append(NumElts, SM_SentinelZero);
6963 return true;
6964 }
6965
6966 // We can only decode 'whole byte' bit shifts as shuffles.
// 'break' leaves the switch and reaches the final 'return false'.
6967 if ((ShiftVal % 8) != 0)
6968 break;
6969
6970 uint64_t ByteShift = ShiftVal / 8;
6971 Ops.push_back(N.getOperand(0));
6972
6973 // Clear mask to all zeros and insert the shifted byte indices.
6974 Mask.append(NumSizeInBytes, SM_SentinelZero);
6975
6976 if (X86ISD::VSHLI == Opcode) {
6977 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6978 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6979 Mask[i + j] = i + j - ByteShift;
6980 } else {
6981 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6982 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6983 Mask[i + j - ByteShift] = i + j;
6984 }
6985 return true;
6986 }
6987 case ISD::ROTL:
6988 case ISD::ROTR: {
6989 APInt UndefElts;
6990 SmallVector<APInt, 32> EltBits;
6991 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6992 UndefElts, EltBits,
6993 /*AllowWholeUndefs*/ true,
6994 /*AllowPartialUndefs*/ false))
6995 return false;
6996
6997 // We can only decode 'whole byte' bit rotates as shuffles.
6998 for (unsigned I = 0; I != NumElts; ++I)
6999 if (DemandedElts[I] && !UndefElts[I] &&
7000 (EltBits[I].urem(NumBitsPerElt) % 8) != 0)
7001 return false;
7002
7003 Ops.push_back(N.getOperand(0));
7004 for (unsigned I = 0; I != NumElts; ++I) {
7005 if (!DemandedElts[I] || UndefElts[I]) {
7006 Mask.append(NumBytesPerElt, SM_SentinelUndef);
7007 continue;
7008 }
// Offset is the source byte (within the element) that lands in result
// byte 0; a left rotate by K bytes reads from NumBytesPerElt - K.
7009 int Offset = EltBits[I].urem(NumBitsPerElt) / 8;
7010 Offset = (ISD::ROTL == Opcode ? NumBytesPerElt - Offset : Offset);
7011 int BaseIdx = I * NumBytesPerElt;
7012 for (int J = 0; J != (int)NumBytesPerElt; ++J) {
7013 Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt));
7014 }
7015 }
7016 return true;
7017 }
7018 case X86ISD::VROTLI:
7019 case X86ISD::VROTRI: {
7020 // We can only decode 'whole byte' bit rotates as shuffles.
7021 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7022 if ((RotateVal % 8) != 0)
7023 return false;
7024 Ops.push_back(N.getOperand(0));
// Same byte-offset scheme as the ROTL/ROTR case, with one uniform amount.
7025 int Offset = RotateVal / 8;
7026 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7027 for (int i = 0; i != (int)NumElts; ++i) {
7028 int BaseIdx = i * NumBytesPerElt;
7029 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7030 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7031 }
7032 }
7033 return true;
7034 }
7035 case X86ISD::VBROADCAST: {
7036 SDValue Src = N.getOperand(0);
// A scalar source is only accepted when it is element 0 extracted from a
// vector with the same scalar type; that vector then becomes the input.
7037 if (!Src.getSimpleValueType().isVector()) {
7038 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7039 !isNullConstant(Src.getOperand(1)) ||
7040 Src.getOperand(0).getValueType().getScalarType() !=
7041 VT.getScalarType())
7042 return false;
7043 Src = Src.getOperand(0);
7044 }
// Every result element reads element 0 of the source.
7045 Ops.push_back(Src);
7046 Mask.append(NumElts, 0);
7047 return true;
7048 }
7050 SDValue Src = N.getOperand(0);
7051 EVT SrcVT = Src.getValueType();
7052 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7053
7054 // Extended source must be a simple vector.
7055 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7056 (NumBitsPerSrcElt % 8) != 0)
7057 return false;
7058
7059 // We can only handle all-signbits extensions.
7060 APInt DemandedSrcElts =
7061 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
7062 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
7063 return false;
7064
7065 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
// The mask is at source-element granularity: each result element is made
// of Scale copies of source element I.
7066 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
7067 for (unsigned I = 0; I != NumElts; ++I)
7068 Mask.append(Scale, I);
7069 Ops.push_back(Src);
7070 return true;
7071 }
7072 case ISD::ZERO_EXTEND:
7073 case ISD::ANY_EXTEND:
7076 SDValue Src = N.getOperand(0);
7077 EVT SrcVT = Src.getValueType();
7078
7079 // Extended source must be a simple vector.
7080 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7081 (SrcVT.getScalarSizeInBits() % 8) != 0)
7082 return false;
7083
7084 bool IsAnyExtend =
7085 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7086 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7087 IsAnyExtend, Mask);
7088 Ops.push_back(Src);
7089 return true;
7090 }
7091 }
7092
// Reached for opcodes with no case above and for cases that 'break'.
7093 return false;
7094 }
7095
7096 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Mask indices address the inputs in consecutive ranges of Mask.size()
/// entries each. On return Inputs holds only the inputs that are still
/// referenced (one copy of each) and Mask has been renumbered to match.
7098 SmallVectorImpl<int> &Mask) {
7099 int MaskWidth = Mask.size();
7100 SmallVector<SDValue, 16> UsedInputs;
7101 for (int i = 0, e = Inputs.size(); i < e; ++i) {
// [lo, hi) is the index range input i currently occupies; it is derived
// from the number of inputs kept so far because earlier removals have
// already shifted the mask down.
7102 int lo = UsedInputs.size() * MaskWidth;
7103 int hi = lo + MaskWidth;
7104
7105 // Strip UNDEF input usage.
7106 if (Inputs[i].isUndef())
7107 for (int &M : Mask)
7108 if ((lo <= M) && (M < hi))
7109 M = SM_SentinelUndef;
7110
7111 // Check for unused inputs.
7112 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
// Drop the input: every later index moves down by one input width.
7113 for (int &M : Mask)
7114 if (lo <= M)
7115 M -= MaskWidth;
7116 continue;
7117 }
7118
7119 // Check for repeated inputs.
7120 bool IsRepeat = false;
7121 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7122 if (peekThroughBitcasts(UsedInputs[j]) != peekThroughBitcasts(Inputs[i]))
7123 continue;
// Redirect references to this input to the earlier copy (input j) and
// shift indices of later inputs down by one input width.
7124 for (int &M : Mask)
7125 if (lo <= M)
7126 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7127 IsRepeat = true;
7128 break;
7129 }
7130 if (IsRepeat)
7131 continue;
7132
7133 UsedInputs.push_back(Inputs[i]);
7134 }
7135 Inputs = std::move(UsedInputs);
7136 }
7137
7138 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7139 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7140 /// Returns true if the target shuffle mask was decoded.
/// If that fails, getFauxShuffleMask is tried instead; in that case KnownUndef
/// and KnownZero are derived from the sentinel values in the decoded mask.
/// Returns false for values that are not simple vector types.
7141 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7144 APInt &KnownUndef, APInt &KnownZero,
7145 const SelectionDAG &DAG, unsigned Depth,
7146 bool ResolveKnownElts) {
7148 return false; // Limit search depth.
7149
7150 EVT VT = Op.getValueType();
7151 if (!VT.isSimple() || !VT.isVector())
7152 return false;
7153
// Real target shuffle: the known undef/zero elements are computed from the
// inputs and only folded into Mask as sentinels when requested.
7154 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7155 if (ResolveKnownElts)
7156 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7157 return true;
7158 }
// Shuffle-like op: the decoded mask already carries sentinels, so the known
// undef/zero bitmasks are read back out of it.
7159 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7160 ResolveKnownElts)) {
7161 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7162 return true;
7163 }
7164 return false;
7165 }
7166
// Convenience overload for callers that do not need the known undef/zero
// bitmasks: they are computed into locals and discarded.
7167 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7170 const SelectionDAG &DAG, unsigned Depth,
7171 bool ResolveKnownElts) {
7172 APInt KnownUndef, KnownZero;
7173 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7174 KnownZero, DAG, Depth, ResolveKnownElts);
7175 }
7176
7179 const SelectionDAG &DAG, unsigned Depth = 0,
7180 bool ResolveKnownElts = true) {
// Convenience overload that demands every element of Op.
7181 EVT VT = Op.getValueType();
// Reject non-simple and non-vector types before querying the element count.
7182 if (!VT.isSimple() || !VT.isVector())
7183 return false;
7184
7185 unsigned NumElts = Op.getValueType().getVectorNumElements();
7186 APInt DemandedElts = APInt::getAllOnes(NumElts);
7187 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
7188 ResolveKnownElts);
7189 }
7190
7191 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Opcode must be X86ISD::VBROADCAST_LOAD or X86ISD::SUBV_BROADCAST_LOAD.
// The new node produces VT plus a chain, uses Mem's chain as its input chain
// and addresses Mem's base pointer plus Offset with memory type MemVT.
// Returns an empty SDValue if Mem is null or is not a simple, temporal read.
7192 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7193 EVT MemVT, MemSDNode *Mem, unsigned Offset,
7194 SelectionDAG &DAG) {
7195 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
7196 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
7197 "Unknown broadcast load type");
7198
7199 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
7200 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
7201 return SDValue();
7202
7203 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
7205 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7206 SDValue Ops[] = {Mem->getChain(), Ptr};
7207 SDValue BcstLd = DAG.getMemIntrinsicNode(
7208 Opcode, DL, Tys, Ops, MemVT,
7210 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
// Tie the new node's chain result (value 1) to the original node's chain.
7211 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
7212 return BcstLd;
7213 }
7214
7215 /// Returns the scalar element that will make up the i'th
7216 /// element of the result of the vector shuffle.
/// Walks through shuffles, subvector insert/extract/concat nodes and
/// same-element-count bitcasts, incrementing Depth at each step. Returns an
/// empty SDValue when the element cannot be determined or the depth limit is
/// reached; may return a newly created UNDEF or zero constant.
7217 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7218 SelectionDAG &DAG, unsigned Depth) {
7220 return SDValue(); // Limit search depth.
7221
7222 EVT VT = Op.getValueType();
7223 unsigned Opcode = Op.getOpcode();
7224 unsigned NumElems = VT.getVectorNumElements();
7225
7226 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7227 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
7228 int Elt = SV->getMaskElt(Index);
7229
7230 if (Elt < 0)
7231 return DAG.getUNDEF(VT.getVectorElementType());
7232
7233 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
7234 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7235 }
7236
7237 // Recurse into target specific vector shuffles to find scalars.
7238 if (isTargetShuffle(Opcode)) {
7239 MVT ShufVT = VT.getSimpleVT();
7240 MVT ShufSVT = ShufVT.getVectorElementType();
// Shadows the outer NumElems with a signed copy for the comparisons below.
7241 int NumElems = (int)ShufVT.getVectorNumElements();
7242 SmallVector<int, 16> ShuffleMask;
7244 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
7245 return SDValue();
7246
7247 int Elt = ShuffleMask[Index];
// Sentinel mask values map directly to a zero constant or UNDEF.
7248 if (Elt == SM_SentinelZero)
7249 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
7250 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
7251 if (Elt == SM_SentinelUndef)
7252 return DAG.getUNDEF(ShufSVT);
7253
7254 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
7255 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7256 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7257 }
7258
7259 // Recurse into insert_subvector base/sub vector to find scalars.
7260 if (Opcode == ISD::INSERT_SUBVECTOR) {
7261 SDValue Vec = Op.getOperand(0);
7262 SDValue Sub = Op.getOperand(1);
7263 uint64_t SubIdx = Op.getConstantOperandVal(2);
7264 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
7265
// Inside the inserted range the element comes from Sub, else from Vec.
7266 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7267 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
7268 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
7269 }
7270
7271 // Recurse into concat_vectors sub vector to find scalars.
7272 if (Opcode == ISD::CONCAT_VECTORS) {
7273 EVT SubVT = Op.getOperand(0).getValueType();
7274 unsigned NumSubElts = SubVT.getVectorNumElements();
7275 uint64_t SubIdx = Index / NumSubElts;
7276 uint64_t SubElt = Index % NumSubElts;
7277 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
7278 }
7279
7280 // Recurse into extract_subvector src vector to find scalars.
7281 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
7282 SDValue Src = Op.getOperand(0);
7283 uint64_t SrcIdx = Op.getConstantOperandVal(1);
7284 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
7285 }
7286
7287 // We only peek through bitcasts of the same vector width.
// The check below requires the source to be a vector with the same number
// of elements, so Index still names the same element.
7288 if (Opcode == ISD::BITCAST) {
7289 SDValue Src = Op.getOperand(0);
7290 EVT SrcVT = Src.getValueType();
7291 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
7292 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
7293 return SDValue();
7294 }
7295
7296 // Actual nodes that may contain scalar elements
7297
7298 // For insert_vector_elt - either return the index matching scalar or recurse
7299 // into the base vector.
// Only handled for a constant insertion index.
7300 if (Opcode == ISD::INSERT_VECTOR_ELT &&
7301 isa<ConstantSDNode>(Op.getOperand(2))) {
7302 if (Op.getConstantOperandAPInt(2) == Index)
7303 return Op.getOperand(1);
7304 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
7305 }
7306
// scalar_to_vector defines only element 0; every other element is UNDEF.
7307 if (Opcode == ISD::SCALAR_TO_VECTOR)
7308 return (Index == 0) ? Op.getOperand(0)
7309 : DAG.getUNDEF(VT.getVectorElementType());
7310
7311 if (Opcode == ISD::BUILD_VECTOR)
7312 return Op.getOperand(Index);
7313
// Unknown producer: the element cannot be determined.
7314 return SDValue();
7315 }
7316
7317// Use PINSRB/PINSRW/PINSRD to create a build vector.
//
// Walks the operands of Op in index order and emits an INSERT_VECTOR_ELT for
// every element whose bit is set in NonZeroMask; elements whose bit is clear
// are skipped. The returned SDValue is left default-constructed when no bit
// of NonZeroMask is set. NumNonZero is not read by this body.
//
// NOTE(review): listing line 7318 (the start of the signature, carrying the
// function name and the Op/DL parameters used below) is absent from this
// extract; call sites in this file spell it LowerBuildVectorAsInsert(Op, DL, ...).
7319 const APInt &NonZeroMask,
7320 unsigned NumNonZero, unsigned NumZero,
7321 SelectionDAG &DAG,
7322 const X86Subtarget &Subtarget) {
7323 MVT VT = Op.getSimpleValueType();
7324 unsigned NumElts = VT.getVectorNumElements();
// Only v8i16 (with SSE2) and v16i8/v4i32 (with SSE4.1) are accepted.
7325 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7326 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7327 "Illegal vector insertion");
7328
7329 SDValue V;
// True until the first element selected by NonZeroMask has been handled.
7330 bool First = true;
7331
7332 for (unsigned i = 0; i < NumElts; ++i) {
7333 bool IsNonZero = NonZeroMask[i];
7334 if (!IsNonZero)
7335 continue;
7336
7337 // If the build vector contains zeros or our first insertion is not the
7338 // first index then insert into zero vector to break any register
7339 // dependency else use SCALAR_TO_VECTOR.
7340 if (First) {
7341 First = false;
7342 if (NumZero || 0 != i)
7343 V = getZeroVector(VT, Subtarget, DAG, DL);
7344 else {
7345 assert(0 == i && "Expected insertion into zero-index");
// Element 0 seeds the vector: extend/truncate the scalar to i32, wrap it in
// a v4i32 SCALAR_TO_VECTOR and bitcast to VT. No INSERT_VECTOR_ELT is
// emitted for this element.
7346 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7347 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7348 V = DAG.getBitcast(VT, V);
7349 continue;
7350 }
7351 }
7352 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
7353 DAG.getVectorIdxConstant(i, DL));
7354 }
7355
7356 return V;
7357}
7358
7359/// Custom lower build_vector of v16i8.
///
/// Returns an empty SDValue when more than 8 elements are non-zero and SSE4.1
/// is unavailable. With SSE4.1 the work is delegated to
/// LowerBuildVectorAsInsert. Otherwise adjacent byte pairs are merged into
/// 16-bit values, inserted into a v8i16 vector, and the result is bitcast to
/// v16i8.
///
/// NOTE(review): listing line 7360 (the start of the signature, carrying the
/// function name and the Op/DL parameters used below) is absent from this
/// extract.
7361 const APInt &NonZeroMask,
7362 unsigned NumNonZero, unsigned NumZero,
7363 SelectionDAG &DAG,
7364 const X86Subtarget &Subtarget) {
7365 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7366 return SDValue();
7367
7368 // SSE4.1 - use PINSRB to insert each byte directly.
7369 if (Subtarget.hasSSE41())
7370 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
7371 DAG, Subtarget);
7372
7373 SDValue V;
7374
7375 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7376 // If both the lowest 16-bits are non-zero, then convert to MOVD.
// The condition requires at least one set mask bit in bytes 0-1 and at least
// one in bytes 2-3.
7377 if (!NonZeroMask.extractBits(2, 0).isZero() &&
7378 !NonZeroMask.extractBits(2, 2).isZero()) {
// Assemble bytes 0-3 into one i32: each present byte is zero-extended and
// byte I is shifted left by I * 8 bits before being OR-ed in.
7379 for (unsigned I = 0; I != 4; ++I) {
7380 if (!NonZeroMask[I])
7381 continue;
7382 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
7383 if (I != 0)
7384 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
7385 DAG.getConstant(I * 8, DL, MVT::i8));
7386 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
7387 }
7388 assert(V && "Failed to fold v16i8 vector to zero");
7389 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7390 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
7391 V = DAG.getBitcast(MVT::v8i16, V);
7392 }
// Process the remaining bytes in pairs; bytes 0-3 are skipped when the block
// above already produced V.
7393 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
7394 bool ThisIsNonZero = NonZeroMask[i];
7395 bool NextIsNonZero = NonZeroMask[i + 1];
7396 if (!ThisIsNonZero && !NextIsNonZero)
7397 continue;
7398
7399 SDValue Elt;
7400 if (ThisIsNonZero) {
7401 if (NumZero || NextIsNonZero)
7402 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7403 else
7404 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7405 }
7406
7407 if (NextIsNonZero) {
7408 SDValue NextElt = Op.getOperand(i + 1);
7409 if (i == 0 && NumZero)
7410 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
7411 else
7412 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
// The odd byte occupies bits 8 and up of the merged value.
7413 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
7414 DAG.getConstant(8, DL, MVT::i8));
7415 if (ThisIsNonZero)
7416 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
7417 else
7418 Elt = NextElt;
7419 }
7420
7421 // If our first insertion is not the first index or zeros are needed, then
7422 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
7423 // elements undefined).
7424 if (!V) {
7425 if (i != 0 || NumZero)
7426 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
7427 else {
7428 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
7429 V = DAG.getBitcast(MVT::v8i16, V);
7430 continue;
7431 }
7432 }
// Narrow the merged pair to i16 and insert it as element i / 2 of the v8i16
// vector.
7433 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7434 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
7435 DAG.getVectorIdxConstant(i / 2, DL));
7436 }
7437
7438 return DAG.getBitcast(MVT::v16i8, V);
7439}
7440
7441/// Custom lower build_vector of v8i16.
///
/// Returns an empty SDValue when more than 4 elements are non-zero and SSE4.1
/// is unavailable; otherwise forwards all arguments to
/// LowerBuildVectorAsInsert.
///
/// NOTE(review): listing line 7442 (the start of the signature, carrying the
/// function name and the Op/DL parameters used below) is absent from this
/// extract.
7443 const APInt &NonZeroMask,
7444 unsigned NumNonZero, unsigned NumZero,
7445 SelectionDAG &DAG,
7446 const X86Subtarget &Subtarget) {
7447 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7448 return SDValue();
7449
7450 // Use PINSRW to insert each 16-bit element directly.
7451 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
7452 Subtarget);
7453}
7454
7455/// Custom lower build_vector of v4i32 or v4f32.
///
/// Tries, in order: a MOVDDUP of a two-element pair splat, a vector shuffle
/// that blends one source vector with zero/undef, and finally an INSERTPS
/// (SSE4.1 only). Returns an empty SDValue when none of these applies.
///
/// NOTE(review): listing line 7456 (the start of the signature, carrying the
/// function name and the Op/DL parameters used below) is absent from this
/// extract.
7457 SelectionDAG &DAG,
7458 const X86Subtarget &Subtarget) {
7459 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7460 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7461 // Because we're creating a less complicated build vector here, we may enable
7462 // further folding of the MOVDDUP via shuffle transforms.
7463 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7464 Op.getOperand(0) == Op.getOperand(2) &&
7465 Op.getOperand(1) == Op.getOperand(3) &&
7466 Op.getOperand(0) != Op.getOperand(1)) {
7467 MVT VT = Op.getSimpleValueType();
7468 MVT EltVT = VT.getVectorElementType();
7469 // Create a new build vector with the first 2 elements followed by undef
7470 // padding, bitcast to v2f64, duplicate, and bitcast back.
7471 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7472 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7473 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7474 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7475 return DAG.getBitcast(VT, Dup);
7476 }
7477
7478 // Find all zeroable elements.
// Undefs records undef operands only; Zeroable records operands that are
// either undef or zero.
7479 std::bitset<4> Zeroable, Undefs;
7480 for (int i = 0; i < 4; ++i) {
7481 SDValue Elt = Op.getOperand(i);
7482 Undefs[i] = Elt.isUndef();
7483 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7484 }
7485 assert(Zeroable.size() - Zeroable.count() > 1 &&
7486 "We expect at least two non-zero elements!");
7487
7488 // We only know how to deal with build_vector nodes where elements are either
7489 // zeroable or extract_vector_elt with constant index.
7490 SDValue FirstNonZero;
7491 unsigned FirstNonZeroIdx;
7492 for (unsigned i = 0; i < 4; ++i) {
7493 if (Zeroable[i])
7494 continue;
7495 SDValue Elt = Op.getOperand(i);
// NOTE(review): listing line 7497, which completes the condition started on
// the next line, is absent from this extract.
7496 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7498 return SDValue();
7499 // Make sure that this node is extracting from a 128-bit vector.
7500 MVT VT = Elt.getOperand(0).getSimpleValueType();
7501 if (!VT.is128BitVector())
7502 return SDValue();
7503 if (!FirstNonZero.getNode()) {
7504 FirstNonZero = Elt;
7505 FirstNonZeroIdx = i;
7506 }
7507 }
7508
7509 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7510 SDValue V1 = FirstNonZero.getOperand(0);
7511 MVT VT = V1.getSimpleValueType();
7512
7513 // See if this build_vector can be lowered as a blend with zero.
// The loop stops at the first non-zeroable element that is not "element
// EltIdx of V1"; EltIdx == 4 afterwards means every element matched.
7514 SDValue Elt;
7515 unsigned EltMaskIdx, EltIdx;
7516 int Mask[4];
7517 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7518 if (Zeroable[EltIdx]) {
7519 // The zero vector will be on the right hand side.
7520 Mask[EltIdx] = EltIdx+4;
7521 continue;
7522 }
7523
7524 Elt = Op->getOperand(EltIdx);
7525 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
7526 EltMaskIdx = Elt.getConstantOperandVal(1);
7527 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7528 break;
7529 Mask[EltIdx] = EltIdx;
7530 }
7531
7532 if (EltIdx == 4) {
7533 // Let the shuffle legalizer deal with blend operations.
// When every zeroable element is actually undef, an undef second operand is
// used instead of a zero vector.
7534 SDValue VZeroOrUndef = (Zeroable == Undefs)
7535 ? DAG.getUNDEF(VT)
7536 : getZeroVector(VT, Subtarget, DAG, DL);
7537 if (V1.getSimpleValueType() != VT)
7538 V1 = DAG.getBitcast(VT, V1);
7539 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7540 }
7541
7542 // See if we can lower this build_vector to an INSERTPS.
7543 if (!Subtarget.hasSSE41())
7544 return SDValue();
7545
// Elt is the element that broke the blend match above; its source vector
// becomes the inserted operand V2.
7546 SDValue V2 = Elt.getOperand(0);
7547 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7548 V1 = SDValue();
7549
// Every remaining non-zeroable element must be "element i of V1"; V1 is taken
// from the first such element if it was cleared above.
7550 bool CanFold = true;
7551 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7552 if (Zeroable[i])
7553 continue;
7554
7555 SDValue Current = Op->getOperand(i);
7556 SDValue SrcVector = Current->getOperand(0);
7557 if (!V1.getNode())
7558 V1 = SrcVector;
7559 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7560 }
7561
7562 if (!CanFold)
7563 return SDValue();
7564
7565 assert(V1.getNode() && "Expected at least two non-zero elements!");
7566 if (V1.getSimpleValueType() != MVT::v4f32)
7567 V1 = DAG.getBitcast(MVT::v4f32, V1);
7568 if (V2.getSimpleValueType() != MVT::v4f32)
7569 V2 = DAG.getBitcast(MVT::v4f32, V2);
7570
7571 // Ok, we can emit an INSERTPS instruction.
7572 unsigned ZMask = Zeroable.to_ulong();
7573
// Immediate as assembled here: bits 7:6 = EltMaskIdx (element read from V2),
// bits 5:4 = EltIdx (destination element), bits 3:0 = zero mask.
7574 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7575 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7576 SDValue Result =
7577 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7578 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7579 return DAG.getBitcast(VT, Result);
7580}
7581
7582/// Return a vector logical shift node.
7583static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7584 SelectionDAG &DAG, const TargetLowering &TLI,
7585 const SDLoc &dl) {
7586 assert(VT.is128BitVector() && "Unknown type for VShift");
7587 MVT ShVT = MVT::v16i8;
7588 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7589 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7590 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7591 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7592 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7593}
7594
// NOTE(review): listing line 7595 (the start of this signature, carrying the
// function name and the parameters referred to below as VT and dl) is absent
// from this extract, as are lines 7601, 7615, 7617 and 7628 inside the body.
//
// From the visible code: for a simple, normal i32/f32 load whose address is a
// frame index (optionally with a constant offset), this emits a load of a
// whole vector of NumElems such elements from the aligned start address and a
// shuffle that splats the originally addressed element. Every other case
// returns an empty SDValue.
7596 SelectionDAG &DAG) {
7597
7598 // Check if the scalar load can be widened into a vector load. And if
7599 // the address is "base + cst" see if the cst can be "absorbed" into
7600 // the shuffle mask.
// NOTE(review): listing line 7601 is absent; it presumably opens the scope
// closed on line 7664 and introduces LD, which the following lines use.
7602 SDValue Ptr = LD->getBasePtr();
7603 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7604 return SDValue();
7605 EVT PVT = LD->getValueType(0);
7606 if (PVT != MVT::i32 && PVT != MVT::f32)
7607 return SDValue();
7608
7609 int FI = -1;
7610 int64_t Offset = 0;
7611 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7612 FI = FINode->getIndex();
7613 Offset = 0;
// NOTE(review): listing lines 7615 and 7617 are absent from this branch
// (the rest of the condition and, presumably, the assignment of Offset).
7614 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7616 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7618 Ptr = Ptr.getOperand(0);
7619 } else {
7620 return SDValue();
7621 }
7622
7623 // FIXME: 256-bit vector instructions don't require a strict alignment,
7624 // improve this code to support it better.
7625 Align RequiredAlign(VT.getSizeInBits() / 8);
7626 SDValue Chain = LD->getChain();
7627 // Make sure the stack object alignment is at least 16 or 32.
// NOTE(review): listing line 7628 is absent; it presumably declares MFI,
// which is used below.
7629 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7630 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7631 if (MFI.isFixedObjectIndex(FI)) {
7632 // Can't change the alignment. FIXME: It's possible to compute
7633 // the exact stack offset and reference FI + adjust offset instead.
7634 // If someone *really* cares about this. That's the way to implement it.
7635 return SDValue();
7636 } else {
7637 MFI.setObjectAlignment(FI, RequiredAlign);
7638 }
7639 }
7640
7641 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7642 // Ptr + (Offset & ~15).
7643 if (Offset < 0)
7644 return SDValue();
7645 if ((Offset % RequiredAlign.value()) & 3)
7646 return SDValue();
// Round the offset down to a multiple of the required alignment.
7647 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7648 if (StartOffset) {
7649 SDLoc DL(Ptr);
7650 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7651 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7652 }
7653
// Index, in 4-byte elements, of the original scalar within the widened load.
7654 int EltNo = (Offset - StartOffset) >> 2;
7655 unsigned NumElems = VT.getVectorNumElements();
7656
7657 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7658 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7659 LD->getPointerInfo().getWithOffset(StartOffset));
7660
// Every lane of the mask selects element EltNo, i.e. a splat.
7661 SmallVector<int, 8> Mask(NumElems, EltNo);
7662
7663 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7664 }
7665
7666 return SDValue();
7667}
7668
7669// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
//
// On success returns true with Ld set to the underlying simple, non-extending
// load and ByteOffset set to the byte distance accumulated on the way down
// (constant right shifts by a multiple of 8 bits add Amt / 8; constant-index
// element extracts add Idx * element size in bytes). Returns false otherwise.
//
// NOTE(review): listing lines 7683 and 7694 (case labels of the switch below)
// are absent from this extract.
7670static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7671 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7672 auto *BaseLd = cast<LoadSDNode>(Elt);
7673 if (!BaseLd->isSimple())
7674 return false;
7675 Ld = BaseLd;
7676 ByteOffset = 0;
7677 return true;
7678 }
7679
7680 switch (Elt.getOpcode()) {
// These opcodes are looked through without changing the byte offset.
7681 case ISD::BITCAST:
7682 case ISD::TRUNCATE:
7684 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7685 case ISD::SRL:
// Only whole-byte constant shift amounts are handled.
7686 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7687 uint64_t Amt = AmtC->getZExtValue();
7688 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7689 ByteOffset += Amt / 8;
7690 return true;
7691 }
7692 }
7693 break;
// Constant-index case: the source and result scalar sizes must match and be a
// whole number of bytes.
7695 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7696 SDValue Src = Elt.getOperand(0);
7697 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7698 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7699 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7700 findEltLoadSrc(Src, Ld, ByteOffset)) {
7701 uint64_t Idx = IdxC->getZExtValue();
7702 ByteOffset += Idx * (SrcSizeInBits / 8);
7703 return true;
7704 }
7705 }
7706 break;
7707 }
7708
7709 return false;
7710}
7711
7712/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7713/// elements can be replaced by a single large load which has the same value as
7714/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7715///
7716/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// The strategies tried below, in order, are: a plain wide load (optionally
/// followed by a shuffle that clears zero elements), loading only the lower
/// half when the upper half is undef, X86ISD::VZEXT_LOAD, a broadcast of a
/// repeated sub-pattern, and matching the loads in reverse order. An empty
/// SDValue is returned when none applies.
///
/// NOTE(review): listing line 7717 (the start of the signature, carrying the
/// function name and the VT/Elts parameters used below) is absent from this
/// extract, as are lines 7722, 7747, 7933, 7994 and 8014 inside the body.
7718 const SDLoc &DL, SelectionDAG &DAG,
7719 const X86Subtarget &Subtarget,
7720 bool IsAfterLegalize,
7721 unsigned Depth = 0) {
// NOTE(review): listing line 7722 (the condition guarding the return below)
// is absent from this extract.
7723 return SDValue(); // Limit search depth.
7724 if ((VT.getScalarSizeInBits() % 8) != 0)
7725 return SDValue();
7726
7727 unsigned NumElems = Elts.size();
7728
7729 int LastLoadedElt = -1;
7730 APInt LoadMask = APInt::getZero(NumElems);
7731 APInt ZeroMask = APInt::getZero(NumElems);
7732 APInt UndefMask = APInt::getZero(NumElems);
7733
7734 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7735 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7736
7737 // For each element in the initializer, see if we've found a load, zero or an
7738 // undef.
7739 for (unsigned i = 0; i < NumElems; ++i) {
7740 SDValue Elt = peekThroughBitcasts(Elts[i]);
7741 if (!Elt.getNode())
7742 return SDValue();
7743 if (Elt.isUndef()) {
7744 UndefMask.setBit(i);
7745 continue;
7746 }
// NOTE(review): listing line 7747 (the condition opening the zero-element
// branch below) is absent from this extract.
7748 ZeroMask.setBit(i);
7749 continue;
7750 }
7751
7752 // Each loaded element must be the correct fractional portion of the
7753 // requested vector load.
7754 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7755 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7756 return SDValue();
7757
7758 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7759 return SDValue();
// The element must lie entirely inside the load it was derived from.
7760 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7761 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7762 return SDValue();
7763
7764 LoadMask.setBit(i);
7765 LastLoadedElt = i;
7766 }
7767 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7768 NumElems &&
7769 "Incomplete element masks");
7770
7771 // Handle Special Cases - all undef or undef/zero.
7772 if (UndefMask.popcount() == NumElems)
7773 return DAG.getUNDEF(VT);
7774 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7775 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7776 : DAG.getConstantFP(0.0, DL, VT);
7777
7778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7779 int FirstLoadedElt = LoadMask.countr_zero();
7780 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7781 EVT EltBaseVT = EltBase.getValueType();
7782 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7783 "Register/Memory size mismatch");
7784 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7785 assert(LDBase && "Did not find base load for merging consecutive loads");
7786 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7787 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
// Size of the span from the first to the last loaded element, inclusive.
7788 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7789 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7790 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7791
7792 // TODO: Support offsetting the base load.
7793 if (ByteOffsets[FirstLoadedElt] != 0)
7794 return SDValue();
7795
7796 // Check to see if the element's load is consecutive to the base load
7797 // or offset from a previous (already checked) load.
7798 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7799 LoadSDNode *Ld = Loads[EltIdx];
7800 int64_t ByteOffset = ByteOffsets[EltIdx];
// An element taken at a non-zero, element-aligned offset within a load is
// accepted if the element that many positions earlier is offset 0 of the same
// load.
7801 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7802 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7803 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7804 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7805 }
7806 int Stride = EltIdx - FirstLoadedElt;
7807 if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
7808 return true;
7809 // Try again using the memory load size (we might have broken a large load
7810 // into smaller elements), ensure the stride is the full memory load size
7811 // apart and a whole number of elements fit in each memory load.
7812 unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7813 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7814 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7815 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7816 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
7817 Stride / Scale);
7818 }
7819 return false;
7820 };
7821
7822 // Consecutive loads can contain UNDEFS but not ZERO elements.
7823 // Consecutive loads with UNDEFs and ZEROs elements require
7824 // an additional shuffle stage to clear the ZERO elements.
7825 bool IsConsecutiveLoad = true;
7826 bool IsConsecutiveLoadWithZeros = true;
7827 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7828 if (LoadMask[i]) {
7829 if (!CheckConsecutiveLoad(LDBase, i)) {
7830 IsConsecutiveLoad = false;
7831 IsConsecutiveLoadWithZeros = false;
7832 break;
7833 }
7834 } else if (ZeroMask[i]) {
7835 IsConsecutiveLoad = false;
7836 }
7837 }
7838
// Emits a load of type VT from LDBase's address, reusing its chain, pointer
// info, base alignment and memory-operand flags, and orders the new load with
// respect to every load recorded in Loads.
7839 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7840 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7841 assert(LDBase->isSimple() &&
7842 "Cannot merge volatile or atomic loads.");
7843 SDValue NewLd =
7844 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7845 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7846 for (auto *LD : Loads)
7847 if (LD)
7848 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7849 return NewLd;
7850 };
7851
7852 // Check if the base load is entirely dereferenceable.
7853 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7854 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7855
7856 // LOAD - all consecutive load/undefs (must start/end with a load or be
7857 // entirely dereferenceable). If we have found an entire vector of loads and
7858 // undefs, then return a large load of the entire vector width starting at the
7859 // base pointer. If the vector contains zeros, then attempt to shuffle those
7860 // elements.
7861 if (FirstLoadedElt == 0 &&
7862 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7863 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7864 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7865 return SDValue();
7866
7867 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7868 // will lower to regular temporal loads and use the cache.
7869 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7870 VT.is256BitVector() && !Subtarget.hasInt256())
7871 return SDValue();
7872
7873 if (NumElems == 1)
7874 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7875
7876 if (!ZeroMask)
7877 return CreateLoad(VT, LDBase);
7878
7879 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7880 // vector and a zero vector to clear out the zero elements.
7881 if (!IsAfterLegalize && VT.isVector()) {
7882 unsigned NumMaskElts = VT.getVectorNumElements();
7883 if ((NumMaskElts % NumElems) == 0) {
// Each entry of Elts covers Scale lanes of VT. Undef entries keep mask value
// -1, zero entries select the matching lane of the second (zero) operand, and
// loaded entries select the lane of the loaded vector.
7884 unsigned Scale = NumMaskElts / NumElems;
7885 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7886 for (unsigned i = 0; i < NumElems; ++i) {
7887 if (UndefMask[i])
7888 continue;
7889 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7890 for (unsigned j = 0; j != Scale; ++j)
7891 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7892 }
7893 SDValue V = CreateLoad(VT, LDBase);
7894 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7895 : DAG.getConstantFP(0.0, DL, VT);
7896 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7897 }
7898 }
7899 }
7900
7901 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7902 if (VT.is256BitVector() || VT.is512BitVector()) {
7903 unsigned HalfNumElems = NumElems / 2;
7904 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7905 EVT HalfVT =
7906 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7907 SDValue HalfLD =
7908 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7909 DAG, Subtarget, IsAfterLegalize, Depth + 1);
7910 if (HalfLD)
7911 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7912 HalfLD, DAG.getVectorIdxConstant(0, DL));
7913 }
7914 }
7915
7916 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7917 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7918 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7919 LoadSizeInBits == 64) &&
7920 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7921 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7922 : MVT::getIntegerVT(LoadSizeInBits);
7923 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7924 // Allow v4f32 on SSE1 only targets.
7925 // FIXME: Add more isel patterns so we can just use VT directly.
7926 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7927 VecVT = MVT::v4f32;
7928 if (TLI.isTypeLegal(VecVT)) {
7929 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7930 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
// NOTE(review): listing line 7933 (the remaining arguments of the call
// below) is absent from this extract.
7931 SDValue ResNode = DAG.getMemIntrinsicNode(
7932 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7934 for (auto *LD : Loads)
7935 if (LD)
7936 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7937 return DAG.getBitcast(VT, ResNode);
7938 }
7939 }
7940
7941 // BROADCAST - match the smallest possible repetition pattern, load that
7942 // scalar/subvector element and then broadcast to the entire vector.
7943 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7944 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
// Candidate repetition lengths are tried in increasing powers of two.
7945 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7946 unsigned RepeatSize = SubElems * BaseSizeInBits;
7947 unsigned ScalarSize = std::min(RepeatSize, 64u);
7948 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7949 continue;
7950
7951 // Don't attempt a 1:N subvector broadcast - it should be caught by
7952 // combineConcatVectorOps, else will cause infinite loops.
7953 if (RepeatSize > ScalarSize && SubElems == 1)
7954 continue;
7955
7956 bool Match = true;
7957 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
// Fold every loaded element onto its position modulo SubElems; all elements
// landing on the same position must be the same value.
7958 for (unsigned i = 0; i != NumElems && Match; ++i) {
7959 if (!LoadMask[i])
7960 continue;
7961 SDValue Elt = peekThroughBitcasts(Elts[i]);
7962 if (RepeatedLoads[i % SubElems].isUndef())
7963 RepeatedLoads[i % SubElems] = Elt;
7964 else
7965 Match &= (RepeatedLoads[i % SubElems] == Elt);
7966 }
7967
7968 // We must have loads at both ends of the repetition.
7969 Match &= !RepeatedLoads.front().isUndef();
7970 Match &= !RepeatedLoads.back().isUndef();
7971 if (!Match)
7972 continue;
7973
7974 EVT RepeatVT =
7975 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7976 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7977 : EVT::getFloatingPointVT(ScalarSize);
7978 if (RepeatSize > ScalarSize)
7979 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7980 RepeatSize / ScalarSize);
7981 EVT BroadcastVT =
7982 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7983 VT.getSizeInBits() / ScalarSize);
7984 if (TLI.isTypeLegal(BroadcastVT)) {
7985 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7986 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
7987 Depth + 1)) {
7988 SDValue Broadcast = RepeatLoad;
7989 if (RepeatSize > ScalarSize) {
// Subvector repeat: double the value by self-concatenation until it is as
// wide as VT.
7990 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7991 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7992 } else {
// NOTE(review): listing line 7994 (the callee name of the condition below)
// is absent from this extract.
7993 if (!Subtarget.hasAVX2() &&
7995 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7996 Subtarget,
7997 /*AssumeSingleUse=*/true))
7998 return SDValue();
7999 Broadcast =
8000 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8001 }
8002 return DAG.getBitcast(VT, Broadcast);
8003 }
8004 }
8005 }
8006 }
8007
8008 // REVERSE - attempt to match the loads in reverse and then shuffle back.
8009 // TODO: Do this for any permute or mismatching element counts.
// Only attempted at Depth == 0, so the recursive call below cannot itself try
// another reversal.
8010 if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
8011 TLI.isTypeLegal(VT) && VT.isVector() &&
8012 NumElems == VT.getVectorNumElements()) {
8013 SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
// NOTE(review): listing line 8014 (the start of the statement that defines
// RevLd from the recursive call below) is absent from this extract.
8015 VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
8016 SmallVector<int, 16> ReverseMask(NumElems);
// Filling through reverse iterators yields the mask NumElems-1, ..., 1, 0.
8017 std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
8018 return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
8019 }
8020 }
8021
8022 return SDValue();
8023}
8024
8025// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
8026// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8027// are consecutive, non-overlapping, and in the right order.
//
// Resolves every element of Op to a scalar with getShuffleScalarElt and hands
// the resulting list to EltsFromConsecutiveLoads; returns an empty SDValue as
// soon as one element cannot be resolved.
//
// NOTE(review): listing lines 8028 (the start of the signature, carrying the
// function name and the VT/Op/DL parameters used below) and 8032 (presumably
// the declaration of Elts) are absent from this extract.
8029 SelectionDAG &DAG,
8030 const X86Subtarget &Subtarget,
8031 bool IsAfterLegalize) {
8033 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8034 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8035 Elts.push_back(Elt);
8036 continue;
8037 }
8038 return SDValue();
8039 }
8040 assert(Elts.size() == VT.getVectorNumElements());
8041 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8042 IsAfterLegalize);
8043}
8044
// NOTE(review): listing line 8045 (the start of this signature, carrying the
// function name and the VT/Bits parameters used below) is absent from this
// extract.
//
// Builds an IR constant vector with one element per entry of Bits. Entry I
// becomes an undef of the scalar type when Undefs[I] is set; otherwise it is
// the bit pattern Bits[I] interpreted as VT's scalar type.
8046 const APInt &Undefs, LLVMContext &C) {
8047 unsigned ScalarSize = VT.getScalarSizeInBits();
8048 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
8049
// Floating-point scalars are built from the raw bits using the IEEE half,
// single or double semantics chosen by ScalarSize; everything else becomes an
// integer constant of type Ty.
8050 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8051 if (VT.isFloatingPoint()) {
8052 if (ScalarSize == 16)
8053 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8054 if (ScalarSize == 32)
8055 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8056 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8057 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8058 }
8059 return Constant::getIntegerValue(Ty, Val);
8060 };
8061
8062 SmallVector<Constant *, 32> ConstantVec;
8063 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
8064 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
8065 : getConstantScalar(Bits[I]));
8066
8067 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8068}
8069
8070static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8071 unsigned SplatBitSize, LLVMContext &C) {
8072 unsigned ScalarSize = VT.getScalarSizeInBits();
8073
8074 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8075 if (VT.isFloatingPoint()) {
8076 if (ScalarSize == 16)
8077 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8078 if (ScalarSize == 32)
8079 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8080 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8081 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8082 }
8083 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8084 };
8085
8086 if (ScalarSize == SplatBitSize)
8087 return getConstantScalar(SplatValue);
8088
8089 unsigned NumElm = SplatBitSize / ScalarSize;
8090 SmallVector<Constant *, 32> ConstantVec;
8091 for (unsigned I = 0; I != NumElm; ++I) {
8092 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
8093 ConstantVec.push_back(getConstantScalar(Val));
8094 }
8095 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8096}
8097
// NOTE(review): listing line 8098 (the signature, declaring the node N used
// below) is absent from this extract; the recursive call names the function
// isFoldableUseOfShuffle.
//
// Inspects the users of N and reports whether N may be folded into one of
// them, as decided by the per-opcode rules below. Returns false when no user
// produces a decision.
8099 for (auto *U : N->users()) {
8100 unsigned Opc = U->getOpcode();
8101 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8102 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8103 return false;
8104 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8105 return false;
8106 if (isTargetShuffle(Opc))
8107 return true;
// The answer for a bitcast user is returned directly, so users that follow it
// in the iteration are not examined.
8108 if (Opc == ISD::BITCAST) // Ignore bitcasts
8109 return isFoldableUseOfShuffle(U);
8110 if (N->hasOneUse()) {
8111 // TODO, there may be some general way to know if a SDNode can
8112 // be folded. We now only know whether an MI is foldable.
// A VPDPBUSD user is accepted only when N is its operand 2.
8113 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
8114 return false;
8115 return true;
8116 }
8117 }
8118 return false;
8119}
8120
8121// If the node has a single use by a VSELECT then AVX512 targets may be able to
8122// fold as a predicated instruction.
8123static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
8124 unsigned SizeInBits = V.getValueSizeInBits();
8125 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
8126 (SizeInBits >= 128 && Subtarget.hasVLX())) {
8127 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
8128 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
8129 return true;
8130 }
8131 }
8132 return false;
8133}
8134
8135/// Attempt to use the vbroadcast instruction to generate a splat value
8136/// from a splat BUILD_VECTOR which uses:
8137/// a. A single scalar load, or a constant.
8138/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8139///
8140/// The VBROADCAST node is returned when a pattern is found,
8141/// or SDValue() otherwise.
///
/// Depending on the matched pattern the result may also be a VBROADCASTM,
/// a VBROADCAST_LOAD or a SUBV_BROADCAST_LOAD node (possibly bitcast to the
/// build vector's type).
8143 const SDLoc &dl,
8144 const X86Subtarget &Subtarget,
8145 SelectionDAG &DAG) {
8146 // VBROADCAST requires AVX.
8147 // TODO: Splats could be generated for non-AVX CPUs using SSE
8148 // instructions, but there's less potential gain for only 128-bit vectors.
8149 if (!Subtarget.hasAVX())
8150 return SDValue();
8151
8152 MVT VT = BVOp->getSimpleValueType(0);
8153 unsigned NumElts = VT.getVectorNumElements();
8154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8155 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8156 "Unsupported vector type for broadcast.");
8157
8158 // See if the build vector is a repeating sequence of scalars (inc. splat).
// Ld is only set for a true splat, i.e. a repeated sequence of length 1.
8159 SDValue Ld;
8160 BitVector UndefElements;
8161 SmallVector<SDValue, 16> Sequence;
8162 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8163 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8164 if (Sequence.size() == 1)
8165 Ld = Sequence[0];
8166 }
8167
8168 // Attempt to use VBROADCASTM
8169 // From this pattern:
8170 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8171 // b. t1 = (build_vector t0 t0)
8172 //
8173 // Create (VBROADCASTM v2i1 X)
8174 if (!Sequence.empty() && Subtarget.hasCDI()) {
8175 // If not a splat, are the upper sequence values zeroable?
8176 unsigned SeqLen = Sequence.size();
8177 bool UpperZeroOrUndef =
8178 SeqLen == 1 ||
8179 llvm::all_of(ArrayRef(Sequence).drop_front(),
8180 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
8181 SDValue Op0 = Sequence[0];
8182 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8183 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8184 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
// BOperand is the value under the bitcast (looking through a zero-extend).
8185 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8186 ? Op0.getOperand(0)
8187 : Op0.getOperand(0).getOperand(0);
8188 MVT MaskVT = BOperand.getSimpleValueType();
// EltType is the integer type spanning one whole repeated sequence.
8189 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8190 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8191 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8192 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
// Without VLX, build the broadcast at 512 bits; the low VT-sized part is
// extracted below.
8193 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8194 unsigned Scale = 512 / VT.getSizeInBits();
8195 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8196 }
8197 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8198 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8199 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8200 return DAG.getBitcast(VT, Bcst);
8201 }
8202 }
8203 }
8204
8205 unsigned NumUndefElts = UndefElements.count();
// Not a usable scalar splat (no splat value, or at most one defined element):
// try a repeated-constant broadcast first.
8206 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8207 APInt SplatValue, Undef;
8208 unsigned SplatBitSize;
8209 bool HasUndef;
8210 // Check if this is a repeated constant pattern suitable for broadcasting.
8211 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8212 SplatBitSize > VT.getScalarSizeInBits() &&
8213 SplatBitSize < VT.getSizeInBits()) {
8214 // Avoid replacing with broadcast when it's a use of a shuffle
8215 // instruction to preserve the present custom lowering of shuffles.
8216 if (isFoldableUseOfShuffle(BVOp))
8217 return SDValue();
8218 // replace BUILD_VECTOR with broadcast of the repeated constants.
8219 LLVMContext *Ctx = DAG.getContext();
8220 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8221 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8222 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8223 // Load the constant scalar/subvector and broadcast it.
8224 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8225 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8226 SDValue CP = DAG.getConstantPool(C, PVT);
8227 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8228
8229 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8230 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8231 SDValue Ops[] = {DAG.getEntryNode(), CP};
8232 MachinePointerInfo MPI =
8234 SDValue Brdcst =
8235 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8236 MPI, Alignment, MachineMemOperand::MOLoad);
8237 return DAG.getBitcast(VT, Brdcst);
8238 }
8239 if (SplatBitSize > 64) {
8240 // Load the vector of constants and broadcast it.
8241 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8242 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8243 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8244 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8245 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8246 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8247 SDValue Ops[] = {DAG.getEntryNode(), VCP};
8248 MachinePointerInfo MPI =
8250 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
8251 Ops, VVT, MPI, Alignment,
8253 }
8254 }
8255
8256 // If we are moving a scalar into a vector (Ld must be set and all elements
8257 // but 1 are undef) and that operation is not obviously supported by
8258 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8259 // That's better than general shuffling and may eliminate a load to GPR and
8260 // move from scalar to vector register.
8261 if (!Ld || NumElts - NumUndefElts != 1)
8262 return SDValue();
8263 unsigned ScalarSize = Ld.getValueSizeInBits();
// Give up when the single defined element is element 0 and is 32 or 64 bits
// wide; only the remaining cases keep trying to form a broadcast.
8264 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8265 return SDValue();
8266 }
8267
// From here on Ld is the splatted scalar.
8268 bool ConstSplatVal =
8269 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8270 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8271
8272 // TODO: Handle broadcasts of non-constant sequences.
8273
8274 // Make sure that all of the users of a non-constant load are from the
8275 // BUILD_VECTOR node.
8276 // FIXME: Is the use count needed for non-constant, non-load case?
8277 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8278 return SDValue();
8279
8280 unsigned ScalarSize = Ld.getValueSizeInBits();
8281 bool IsGE256 = (VT.getSizeInBits() >= 256);
8282
8283 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8284 // instruction to save 8 or more bytes of constant pool data.
8285 // TODO: If multiple splats are generated to load the same constant,
8286 // it may be detrimental to overall size. There needs to be a way to detect
8287 // that condition to know if this is truly a size win.
8288 bool OptForSize = DAG.shouldOptForSize();
8289
8290 // Handle broadcasting a single constant scalar from the constant pool
8291 // into a vector.
8292 // On Sandybridge (no AVX2), it is still better to load a constant vector
8293 // from the constant pool and not to broadcast it from a scalar.
8294 // But override that restriction when optimizing for size.
8295 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8296 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8297 EVT CVT = Ld.getValueType();
8298 assert(!CVT.isVector() && "Must not broadcast a vector type");
8299
8300 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
8301 // For size optimization, also splat v2f64 and v2i64, and for size opt
8302 // with AVX2, also splat i8 and i16.
8303 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8304 if (ScalarSize == 32 ||
8305 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
8306 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
8307 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8308 const Constant *C = nullptr;
8310 C = CI->getConstantIntValue();
8312 C = CF->getConstantFPValue();
8313
8314 assert(C && "Invalid constant type");
8315
8316 SDValue CP =
8318 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8319
8320 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8321 SDValue Ops[] = {DAG.getEntryNode(), CP};
8322 MachinePointerInfo MPI =
8324 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8325 MPI, Alignment, MachineMemOperand::MOLoad);
8326 }
8327 }
8328
8329 // Handle AVX2 in-register broadcasts.
8330 if (!IsLoad && Subtarget.hasInt256() &&
8331 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8332 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8333
8334 // The scalar source must be a normal load.
8335 if (!IsLoad)
8336 return SDValue();
8337
8338 // Make sure the non-chain result is only used by this build vector.
8339 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8340 return SDValue();
8341
// 32-bit loads, and 64-bit loads into >= 256-bit vectors or with VLX, become
// a broadcast load; the original load's chain users are redirected to it.
8342 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8343 (Subtarget.hasVLX() && ScalarSize == 64)) {
8344 auto *LN = cast<LoadSDNode>(Ld);
8345 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8346 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8347 SDValue BCast =
8348 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8349 LN->getMemoryVT(), LN->getMemOperand());
8350 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8351 return BCast;
8352 }
8353
8354 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
8355 // double since there is no vbroadcastsd xmm
8356 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8357 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8358 auto *LN = cast<LoadSDNode>(Ld);
8359 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8360 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8361 SDValue BCast =
8362 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8363 LN->getMemoryVT(), LN->getMemOperand());
8364 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8365 return BCast;
8366 }
8367
// Remaining case: a loaded 16-bit scalar with FP16 and a >= 256-bit vector
// uses an in-register VBROADCAST of the loaded value.
8368 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8369 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8370
8371 // Unsupported broadcast.
8372 return SDValue();
8373}
8374
8375/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8376/// underlying vector and index.
8377///
8378/// Modifies \p ExtractedFromVec to the real vector and returns the real
8379/// index.
8380static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8381 SDValue ExtIdx) {
8382 int Idx = ExtIdx->getAsZExtVal();
8383 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8384 return Idx;
8385
8386 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8387 // lowered this:
8388 // (extract_vector_elt (v8f32 %1), Constant<6>)
8389 // to:
8390 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8391 // (extract_subvector (v8f32 %0), Constant<4>),
8392 // undef)
8393 // Constant<0>)
8394 // In this case the vector is the extract_subvector expression and the index
8395 // is 2, as specified by the shuffle.
8396 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8397 SDValue ShuffleVec = SVOp->getOperand(0);
8398 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8399 assert(ShuffleVecVT.getVectorElementType() ==
8400 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8401
8402 int ShuffleIdx = SVOp->getMaskElt(Idx);
8403 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8404 ExtractedFromVec = ShuffleVec;
8405 return ShuffleIdx;
8406 }
8407 return Idx;
8408}
8409
8411 SelectionDAG &DAG) {
// Builds the vector as a shuffle of at most two source vectors (of the same
// type as the result) that the operands were extracted from, followed by
// INSERT_VECTOR_ELT nodes for the operands that are not such extracts.
// Returns an empty SDValue when the pattern does not apply.
8412 MVT VT = Op.getSimpleValueType();
8413
8414 // Skip if insert_vec_elt is not supported.
8415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8417 return SDValue();
8418
8419 unsigned NumElems = Op.getNumOperands();
8420 SDValue VecIn1;
8421 SDValue VecIn2;
// Operand positions that must be inserted after the shuffle.
8422 SmallVector<unsigned, 4> InsertIndices;
// Shuffle mask; -1 entries are left for undef/poison and inserted operands.
8423 SmallVector<int, 8> Mask(NumElems, -1);
8424
8425 for (unsigned i = 0; i != NumElems; ++i) {
8426 unsigned Opc = Op.getOperand(i).getOpcode();
8427
8428 if (Opc == ISD::POISON || Opc == ISD::UNDEF)
8429 continue;
8430
8432 // Quit if more than 1 elements need inserting.
// NOTE(review): the size check runs before the push_back below, so two
// indices can be recorded before this bails out - confirm this is intended.
8433 if (InsertIndices.size() > 1)
8434 return SDValue();
8435
8436 InsertIndices.push_back(i);
8437 continue;
8438 }
8439
8440 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8441 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8442
8443 // Quit if non-constant index.
8444 if (!isa<ConstantSDNode>(ExtIdx))
8445 return SDValue();
8446 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8447
8448 // Quit if extracted from vector of different type.
8449 if (ExtractedFromVec.getValueType() != VT)
8450 return SDValue();
8451
// Record the source as the first or second shuffle input.
8452 if (!VecIn1.getNode())
8453 VecIn1 = ExtractedFromVec;
8454 else if (VecIn1 != ExtractedFromVec) {
8455 if (!VecIn2.getNode())
8456 VecIn2 = ExtractedFromVec;
8457 else if (VecIn2 != ExtractedFromVec)
8458 // Quit if more than 2 vectors to shuffle
8459 return SDValue();
8460 }
8461
// Elements of the second input are addressed after the NumElems elements of
// the first input in the shuffle mask.
8462 if (ExtractedFromVec == VecIn1)
8463 Mask[i] = Idx;
8464 else if (ExtractedFromVec == VecIn2)
8465 Mask[i] = Idx + NumElems;
8466 }
8467
// Nothing was extracted from a vector, so there is nothing to shuffle.
8468 if (!VecIn1.getNode())
8469 return SDValue();
8470
8471 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
8472 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8473
// Patch in the operands that did not come from an extract.
8474 for (unsigned Idx : InsertIndices)
8475 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8476 DAG.getVectorIdxConstant(Idx, DL));
8477
8478 return NV;
8479}
8480
8481// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
// Each operand is bitcast to f16 (when the subtarget has FP16) or i16, the
// vector is rebuilt with that element type, and the result is bitcast back to
// the original type.
8483 const X86Subtarget &Subtarget) {
8484 MVT VT = Op.getSimpleValueType();
8485 MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
8486 MVT IVT = VT.changeVectorElementType(SVT);
8488 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8489 NewOps.push_back(DAG.getBitcast(SVT, Op.getOperand(I)));
// NOTE(review): the new BUILD_VECTOR is created with a default-constructed
// SDLoc rather than a location taken from Op - confirm this is intended.
8490 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8491 return DAG.getBitcast(VT, Res);
8492}
8493
8494// Lower BUILD_VECTOR operation for vXi1 types.
// All-zeros and all-ones vectors are returned unchanged. Otherwise the code
// tries, in order: a scalar select for splats, a vXi8 build vector compared
// against zero, and finally a constant immediate with the non-constant
// elements inserted one by one.
8496 SelectionDAG &DAG,
8497 const X86Subtarget &Subtarget) {
8498
8499 MVT VT = Op.getSimpleValueType();
8500 assert((VT.getVectorElementType() == MVT::i1) &&
8501 "Unexpected type in LowerBUILD_VECTORvXi1!");
8502 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8503 ISD::isBuildVectorAllOnes(Op.getNode()))
8504 return Op;
8505
// Scan the operands: gather the low bit of each constant into Immediate,
// remember the positions of non-constant operands, and track whether every
// non-undef operand is the same value (a splat).
8506 uint64_t Immediate = 0;
8507 SmallVector<unsigned, 16> NonConstIdx;
8508 bool IsSplat = true;
8509 bool HasConstElts = false;
8510 int SplatIdx = -1;
8511 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8512 SDValue In = Op.getOperand(idx);
8513 if (In.isUndef())
8514 continue;
8515 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8516 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8517 HasConstElts = true;
8518 } else {
8519 NonConstIdx.push_back(idx);
8520 }
8521 if (SplatIdx < 0)
8522 SplatIdx = idx;
8523 else if (In != Op.getOperand(SplatIdx))
8524 IsSplat = false;
8525 }
8526
8527 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8528 if (IsSplat) {
8529 // The build_vector allows the scalar element to be larger than the vector
8530 // element type. We need to mask it to use as a condition unless we know
8531 // the upper bits are zero.
8532 // FIXME: Use computeKnownBits instead of checking specific opcode?
8533 SDValue Cond = Op.getOperand(SplatIdx);
8534 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8535 if (Cond.getOpcode() != ISD::SETCC)
8536 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8537 DAG.getConstant(1, dl, MVT::i8));
8538
8539 // Perform the select in the scalar domain so we can use cmov.
// v64i1 on a non-64-bit target: select a 32-bit value and concatenate two
// copies of it as v32i1 halves.
8540 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8541 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8542 DAG.getAllOnesConstant(dl, MVT::i32),
8543 DAG.getConstant(0, dl, MVT::i32));
8544 Select = DAG.getBitcast(MVT::v32i1, Select);
8545 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8546 } else {
// Use an integer at least 8 bits wide; mask types narrower than 8 bits go
// through v8i1 and the low VT-sized subvector is extracted.
8547 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8548 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8549 DAG.getAllOnesConstant(dl, ImmVT),
8550 DAG.getConstant(0, dl, ImmVT));
8551 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8552 Select = DAG.getBitcast(VecVT, Select);
8553 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8554 DAG.getVectorIdxConstant(0, dl));
8555 }
8556 }
8557
8558 // See if we can cheaply generate a vXi8 vector and convert to vXi1.
8559 MVT OpVT = Op.getOperand(0).getSimpleValueType();
8560 if (NonConstIdx.size() > 1 && OpVT == MVT::i8) {
8561 // On pre-BWI targets, we must extend to vXi32 instead.
8562 MVT ByteVT = VT.changeVectorElementType(MVT::i8);
8563 MVT WideSVT = Subtarget.hasBWI() ? MVT::i8 : MVT::i32;
// Byte vectors narrower than 128 bits are built as v16i8 and extended to i32
// (for v4i8) or i64 elements.
8564 if (ByteVT.getSizeInBits() < 128) {
8565 WideSVT = ByteVT == MVT::v4i8 ? MVT::i32 : MVT::i64;
8566 ByteVT = MVT::v16i8;
8567 }
8568 MVT WideVT = VT.changeVectorElementType(WideSVT);
8569 if (DAG.getTargetLoweringInfo().isTypeLegal(ByteVT) &&
8570 DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
// Pad the operand list with poison up to ByteVT's element count, then keep
// only bit 0 of each extended element and compare it against zero.
8571 SmallVector<SDValue, 16> Elts(Op->op_values());
8572 Elts.append(ByteVT.getVectorNumElements() - Elts.size(),
8573 DAG.getPOISON(OpVT));
8574 SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Elts);
8575 SDValue WideBV =
8576 getEXTEND_VECTOR_INREG(ISD::ANY_EXTEND, dl, WideVT, ByteBV, DAG);
8577 WideBV = DAG.getNode(ISD::AND, dl, WideVT, WideBV,
8578 DAG.getConstant(1, dl, WideVT));
8579 return DAG.getSetCC(dl, VT, WideBV, DAG.getConstant(0, dl, WideVT),
8580 ISD::SETNE);
8581 }
8582 }
8583
8584 // insert elements one by one
// Start from the constant bits (if any) materialised as an integer immediate
// bitcast to a mask vector, otherwise from an undef vector.
8585 SDValue DstVec;
8586 if (HasConstElts) {
8587 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8588 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8589 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8590 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8591 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8592 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8593 } else {
8594 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8595 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8596 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8597 DstVec = DAG.getBitcast(VecVT, Imm);
8598 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8599 DAG.getVectorIdxConstant(0, dl));
8600 }
8601 } else
8602 DstVec = DAG.getUNDEF(VT);
8603
// Insert each non-constant operand at its original position.
8604 for (unsigned InsertIdx : NonConstIdx) {
8605 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8606 Op.getOperand(InsertIdx),
8607 DAG.getVectorIdxConstant(InsertIdx, dl));
8608 }
8609 return DstVec;
8610}
8611
8612[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8613 switch (Opcode) {
8614 case X86ISD::PACKSS:
8615 case X86ISD::PACKUS:
8616 case X86ISD::FHADD:
8617 case X86ISD::FHSUB:
8618 case X86ISD::HADD:
8619 case X86ISD::HSUB:
8620 case X86ISD::HADDS:
8621 case X86ISD::HSUBS:
8622 return true;
8623 }
8624 return false;
8625}
8626
8627/// This is a helper function of LowerToHorizontalOp().
8628/// This function checks that the build_vector \p N in input implements a
8629/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8630/// may not match the layout of an x86 256-bit horizontal instruction.
8631/// In other words, if this returns true, then some extraction/insertion will
8632/// be required to produce a valid horizontal instruction.
8633///
8634/// Parameter \p Opcode defines the kind of horizontal operation to match.
8635/// For example, if \p Opcode is equal to ISD::ADD, then this function
8636/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8637/// is equal to ISD::SUB, then this function checks if this is a horizontal
8638/// arithmetic sub.
8639///
8640/// This function only analyzes elements of \p N whose indices are
8641/// in range [BaseIdx, LastIdx).
8642///
/// On return, \p V0 and \p V1 hold the source vectors found for the first and
/// second half of the analyzed range; each is left as UNDEF when no element
/// in its half identified a source.
///
8643/// TODO: This function was originally used to match both real and fake partial
8644/// horizontal operations, but the index-matching logic is incorrect for that.
8645/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8646/// code because it is only used for partial h-op matching now?
8647static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8648 const SDLoc &DL, SelectionDAG &DAG,
8649 unsigned BaseIdx, unsigned LastIdx,
8650 SDValue &V0, SDValue &V1) {
8651 EVT VT = N->getValueType(0);
8652 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8653 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8654 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8655 "Invalid Vector in input!");
8656
// Only ADD and FADD may have their two extract operands swapped.
8657 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8658 bool CanFold = true;
// Index that the lower extract of the next element is expected to use; it
// advances by 2 per element and restarts at BaseIdx at the midpoint.
8659 unsigned ExpectedVExtractIdx = BaseIdx;
8660 unsigned NumElts = LastIdx - BaseIdx;
8661 V0 = DAG.getUNDEF(VT);
8662 V1 = DAG.getUNDEF(VT);
8663
8664 // Check if N implements a horizontal binop.
8665 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8666 SDValue Op = N->getOperand(i + BaseIdx);
8667
8668 // Skip UNDEFs.
8669 if (Op->isUndef()) {
8670 // Update the expected vector extract index.
8671 if (i * 2 == NumElts)
8672 ExpectedVExtractIdx = BaseIdx;
8673 ExpectedVExtractIdx += 2;
8674 continue;
8675 }
8676
// Each defined element must be a single-use node of the requested opcode.
8677 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8678
8679 if (!CanFold)
8680 break;
8681
8682 SDValue Op0 = Op.getOperand(0);
8683 SDValue Op1 = Op.getOperand(1);
8684
8685 // Try to match the following pattern:
8686 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8687 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8689 Op0.getOperand(0) == Op1.getOperand(0) &&
8692 if (!CanFold)
8693 break;
8694
8695 unsigned I0 = Op0.getConstantOperandVal(1);
8696 unsigned I1 = Op1.getConstantOperandVal(1);
8697
// The first half of the range reads from V0 and the second half from V1; the
// first defined element of each half fixes that source, which must have the
// same type as the result.
8698 if (i * 2 < NumElts) {
8699 if (V0.isUndef()) {
8700 V0 = Op0.getOperand(0);
8701 if (V0.getValueType() != VT)
8702 return false;
8703 }
8704 } else {
8705 if (V1.isUndef()) {
8706 V1 = Op0.getOperand(0);
8707 if (V1.getValueType() != VT)
8708 return false;
8709 }
8710 if (i * 2 == NumElts)
8711 ExpectedVExtractIdx = BaseIdx;
8712 }
8713
// The two extract indices must be the expected adjacent pair, reading from
// the source vector chosen for this half.
8714 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8715 if (I0 == ExpectedVExtractIdx)
8716 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8717 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8718 // Try to match the following dag sequence:
8719 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8720 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8721 } else
8722 CanFold = false;
8723
8724 ExpectedVExtractIdx += 2;
8725 }
8726
8727 return CanFold;
8728}
8729
8730/// Emit a sequence of two 128-bit horizontal add/sub followed by
8731/// a concat_vector.
8732///
8733/// This is a helper function of LowerToHorizontalOp().
8734/// This function expects two 256-bit vectors called V0 and V1.
8735/// At first, each vector is split into two separate 128-bit vectors.
8736/// Then, the resulting 128-bit vectors are used to implement two
8737/// horizontal binary operations.
8738///
8739/// The kind of horizontal binary operation is defined by \p X86Opcode.
8740///
8741/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8742/// the two new horizontal binop.
8743/// When Mode is set, the first horizontal binop dag node would take as input
8744/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8745/// horizontal binop dag node would take as input the lower 128-bit of V1
8746/// and the upper 128-bit of V1.
8747/// Example:
8748/// HADD V0_LO, V0_HI
8749/// HADD V1_LO, V1_HI
8750///
8751/// Otherwise, the first horizontal binop dag node takes as input the lower
8752/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8753/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8754/// Example:
8755/// HADD V0_LO, V1_LO
8756/// HADD V0_HI, V1_HI
8757///
8758/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8759/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8760/// the upper 128-bits of the result.
8761static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8762 const SDLoc &DL, SelectionDAG &DAG,
8763 unsigned X86Opcode, bool Mode,
8764 bool isUndefLO, bool isUndefHI) {
8765 MVT VT = V0.getSimpleValueType();
8766 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8767 "Invalid nodes in input!");
8768
8769 unsigned NumElts = VT.getVectorNumElements();
8770 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8771 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8772 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8773 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8774 MVT NewVT = V0_LO.getSimpleValueType();
8775
8776 SDValue LO = DAG.getUNDEF(NewVT);
8777 SDValue HI = DAG.getUNDEF(NewVT);
8778
8779 if (Mode) {
8780 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8781 if (!isUndefLO && !V0->isUndef())
8782 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8783 if (!isUndefHI && !V1->isUndef())
8784 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8785 } else {
8786 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8787 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8788 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8789
8790 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8791 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8792 }
8793
8794 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8795}
8796
8797/// Returns true iff \p BV builds a vector with the result equivalent to
8798/// the result of ADDSUB/SUBADD operation.
8799/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8800/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8801/// \p Opnd0 and \p Opnd1.
///
/// Also written: \p NumExtracts (number of non-undef elements matched),
/// \p IsSubAdd (true when even elements use FADD, i.e. the SUBADD form) and
/// \p HasAllowContract (true only if every matched FADD/FSUB node carries the
/// allow-contract flag). \p NumExtracts and \p HasAllowContract are written
/// even when false is returned.
8803 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8804 SDValue &Opnd0, SDValue &Opnd1,
8805 unsigned &NumExtracts, bool &IsSubAdd,
8806 bool &HasAllowContract) {
8807 using namespace SDPatternMatch;
8808
// Only floating-point vectors on SSE3-capable subtargets are considered.
8809 MVT VT = BV->getSimpleValueType(0);
8810 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8811 return false;
8812
8813 unsigned NumElts = VT.getVectorNumElements();
8814 SDValue InVec0 = DAG.getUNDEF(VT);
8815 SDValue InVec1 = DAG.getUNDEF(VT);
8816
8817 NumExtracts = 0;
8818 HasAllowContract = NumElts != 0;
8819
8820 // Odd-numbered elements in the input build vector are obtained from
8821 // adding/subtracting two integer/float elements.
8822 // Even-numbered elements in the input build vector are obtained from
8823 // subtracting/adding two integer/float elements.
// Only FADD and FSUB are accepted below. Opc[0]/Opc[1] record the opcode seen
// for even/odd elements; all elements of one parity must agree.
8824 unsigned Opc[2] = {0, 0};
8825 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8826 SDValue Op = BV->getOperand(i);
8827
8828 // Skip 'undef' values.
8829 unsigned Opcode = Op.getOpcode();
8830 if (Opcode == ISD::UNDEF)
8831 continue;
8832
8833 // Early exit if we found an unexpected opcode.
8834 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8835 return false;
8836
8837 SDValue Op0 = Op.getOperand(0);
8838 SDValue Op1 = Op.getOperand(1);
8839
8840 // Try to match the following pattern:
8841 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8842 // Early exit if we cannot match that sequence.
8843 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8844 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8845 return false;
8846
8847 // We found a valid add/sub node, make sure its the same opcode as previous
8848 // elements for this parity.
8849 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8850 return false;
8851 Opc[i % 2] = Opcode;
8852
8853 // Update InVec0 and InVec1.
// The first matched element fixes the two source vectors.
8854 if (InVec0.isUndef())
8855 InVec0 = Op0.getOperand(0);
8856 if (InVec1.isUndef())
8857 InVec1 = Op1.getOperand(0);
8858
8859 // Make sure that operands in input to each add/sub node always
8860 // come from a same pair of vectors.
8861 if (InVec0 != Op0.getOperand(0)) {
8862 if (Opcode == ISD::FSUB)
8863 return false;
8864
8865 // FADD is commutable. Try to commute the operands
8866 // and then test again.
8867 std::swap(Op0, Op1);
8868 if (InVec0 != Op0.getOperand(0))
8869 return false;
8870 }
8871
8872 if (InVec1 != Op1.getOperand(0))
8873 return false;
8874
8875 // Increment the number of extractions done.
8876 ++NumExtracts;
8877 HasAllowContract &= Op->getFlags().hasAllowContract();
8878 }
8879
8880 // Ensure we have found an opcode for both parities and that they are
8881 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8882 // inputs are undef.
8883 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8884 InVec0.isUndef() || InVec1.isUndef())
8885 return false;
8886
// Even elements adding (and therefore odd elements subtracting) is SUBADD.
8887 IsSubAdd = Opc[0] == ISD::FADD;
8888
8889 Opnd0 = InVec0;
8890 Opnd1 = InVec1;
8891 return true;
8892}
8893
8894/// Returns true if is possible to fold MUL and an idiom that has already been
8895/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8896/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8897/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8898///
8899/// Prior to calling this function it should be known that there is some
8900/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8901/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8902/// before replacement of such SDNode with ADDSUB operation. Thus the number
8903/// of \p Opnd0 uses is expected to be equal to 2.
8904/// For example, this function may be called for the following IR:
8905/// %AB = fmul fast <2 x double> %A, %B
8906/// %Sub = fsub fast <2 x double> %AB, %C
8907/// %Add = fadd fast <2 x double> %AB, %C
8908/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8909/// <2 x i32> <i32 0, i32 3>
8910/// There is a def for %Addsub here, which potentially can be replaced by
8911/// X86ISD::ADDSUB operation:
8912/// %Addsub = X86ISD::ADDSUB %AB, %C
8913/// and such ADDSUB can further be replaced with FMADDSUB:
8914/// %Addsub = FMADDSUB %A, %B, %C.
8915///
8916/// The main reason why this method is called before the replacement of the
8917/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8918/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8919/// FMADDSUB is.
8920static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8921 SelectionDAG &DAG, SDValue &Opnd0,
8922 SDValue &Opnd1, SDValue &Opnd2,
8923 unsigned ExpectedUses,
8924 bool AllowSubAddOrAddSubContract) {
8925 if (Opnd0.getOpcode() != ISD::FMUL ||
8926 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8927 return false;
8928
8929 // FIXME: These checks must match the similar ones in
8930 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8931 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8932 // or MUL + ADDSUB to FMADDSUB.
8933 bool AllowFusion =
8934 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8935 if (!AllowFusion)
8936 return false;
8937
8938 Opnd2 = Opnd1;
8939 Opnd1 = Opnd0.getOperand(1);
8940 Opnd0 = Opnd0.getOperand(0);
8941
8942 return true;
8943}
8944
8945/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8946/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8947/// X86ISD::FMSUBADD node.
///
/// Returns an empty SDValue when the build_vector does not match, or when it
/// matches the SUBADD form without being fusable into FMSUBADD.
8949 const SDLoc &DL,
8950 const X86Subtarget &Subtarget,
8951 SelectionDAG &DAG) {
8952 SDValue Opnd0, Opnd1;
8953 unsigned NumExtracts;
8954 bool IsSubAdd;
8955 bool HasAllowContract;
8956 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8957 HasAllowContract))
8958 return SDValue();
8959
8960 MVT VT = BV->getSimpleValueType(0);
8961
8962 // Try to generate X86ISD::FMADDSUB node here.
// NumExtracts is passed as the number of uses expected of the multiply.
8963 SDValue Opnd2;
8964 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8965 HasAllowContract)) {
8966 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8967 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8968 }
8969
8970 // We only support ADDSUB.
8971 if (IsSubAdd)
8972 return SDValue();
8973
8974 // There are no known X86 targets with 512-bit ADDSUB instructions!
8975 // Convert to blend(fsub,fadd).
8976 if (VT.is512BitVector()) {
// Even result elements come from Sub (first shuffle input) and odd result
// elements from Add (second shuffle input, hence the + E offset).
8977 SmallVector<int> Mask;
8978 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8979 Mask.push_back(I);
8980 Mask.push_back(I + E + 1);
8981 }
8982 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8983 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8984 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8985 }
8986
8987 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8988}
8989
/// Check whether every non-undef operand of build_vector \p BV is a one-use
/// ADD/SUB/FADD/FSUB of two adjacent elements extracted from a single source
/// vector, laid out the way an x86 horizontal op produces its result.
///
/// \param HOpcode Out. Set to the matching X86ISD horizontal opcode, or left
///        as ISD::DELETED_NODE if no defined element was seen.
/// \param V0 Out. Source vector feeding the low 64 bits of each 128-bit chunk
///        (undef if none was seen).
/// \param V1 Out. Source vector feeding the high 64 bits of each 128-bit chunk
///        (undef if none was seen).
/// \returns true on a match. On a false return the outputs hold whatever had
///          been assigned before the mismatch was detected.
// NOTE(review): listing line 8990 is absent from this extract. It presumably
// carries `static bool isHopBuildVector(` plus the `BV` and `DAG` parameters
// that the body and the visible call site use. Confirm against upstream.
8991 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8992 // Initialize outputs to known values.
8993 MVT VT = BV->getSimpleValueType(0);
8994 HOpcode = ISD::DELETED_NODE;
8995 V0 = DAG.getUNDEF(VT);
8996 V1 = DAG.getUNDEF(VT);
8997
8998 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8999 // half of the result is calculated independently from the 128-bit halves of
9000 // the inputs, so that makes the index-checking logic below more complicated.
9001 unsigned NumElts = VT.getVectorNumElements();
// GenericOpcode remembers the scalar opcode of the first defined element;
// DELETED_NODE is used as the "nothing seen yet" sentinel for both opcodes.
9002 unsigned GenericOpcode = ISD::DELETED_NODE;
9003 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9004 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9005 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
// i walks the 128-bit chunks of the result, j the elements within a chunk.
9006 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9007 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9008 // Ignore undef elements.
9009 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9010 if (Op.isUndef())
9011 continue;
9012
9013 // If there's an opcode mismatch, we're done.
9014 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9015 return false;
9016
9017 // Initialize horizontal opcode.
9018 if (HOpcode == ISD::DELETED_NODE) {
9019 GenericOpcode = Op.getOpcode();
9020 switch (GenericOpcode) {
9021 // clang-format off
9022 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9023 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9024 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9025 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9026 default: return false;
9027 // clang-format on
9028 }
9029 }
9030
// Both operands of the scalar op must be extracts from the same vector, and
// the scalar op must have a single use.
9031 SDValue Op0 = Op.getOperand(0);
9032 SDValue Op1 = Op.getOperand(1);
9033 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): listing line 9034 is absent here. By symmetry with the Op0
// test above it presumably checks Op1's opcode. Confirm against upstream.
9035 Op0.getOperand(0) != Op1.getOperand(0) ||
// NOTE(review): listing line 9036 is absent here. Since Op0's index is read
// as a constant further down, it presumably requires Op0.getOperand(1) to be
// a ConstantSDNode. Confirm against upstream.
9037 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9038 return false;
9039
9040 // The source vector is chosen based on which 64-bit half of the
9041 // destination vector is being calculated.
9042 if (j < NumEltsIn64Bits) {
9043 if (V0.isUndef())
9044 V0 = Op0.getOperand(0);
9045 } else {
9046 if (V1.isUndef())
9047 V1 = Op0.getOperand(0);
9048 }
9049
// Every element in the same 64-bit half must read from the same source.
9050 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9051 if (SourceVec != Op0.getOperand(0))
9052 return false;
9053
9054 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9055 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9056 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
// The pair consumed by result element j starts at twice its position within
// the 64-bit half, offset by the start of the current 128-bit chunk.
9057 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9058 (j % NumEltsIn64Bits) * 2;
9059 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9060 continue;
9061
9062 // If this is not a commutative op, this does not match.
9063 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9064 return false;
9065
9066 // Addition is commutative, so try swapping the extract indexes.
9067 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9068 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9069 continue;
9070
9071 // Extract indexes do not match horizontal requirement.
9072 return false;
9073 }
9074 }
9075 // We matched. Opcode and operands are returned by reference as arguments.
9076 return true;
9077}
9078
/// Build the horizontal-op node \p HOpcode for build_vector \p BV from the two
/// source vectors \p V0 and \p V1, first resizing each source to BV's width.
/// For a 256-bit result whose upper half is entirely undef, the op is emitted
/// at 128 bits and inserted into an undef 256-bit vector.
// NOTE(review): listing line 9079 is absent from this extract. It presumably
// carries the return type, the function name and the leading `BV` parameter.
// Confirm against upstream.
9080 const SDLoc &DL, SelectionDAG &DAG,
9081 unsigned HOpcode, SDValue V0, SDValue V1) {
9082 // If either input vector is not the same size as the build vector,
9083 // extract/insert the low bits to the correct size.
9084 // This is free (examples: zmm --> xmm, xmm --> ymm).
9085 MVT VT = BV->getSimpleValueType(0);
9086 unsigned Width = VT.getSizeInBits();
9087 if (V0.getValueSizeInBits() > Width)
9088 V0 = extractSubVector(V0, 0, DAG, DL, Width);
9089 else if (V0.getValueSizeInBits() < Width)
9090 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
9091
9092 if (V1.getValueSizeInBits() > Width)
9093 V1 = extractSubVector(V1, 0, DAG, DL, Width);
9094 else if (V1.getValueSizeInBits() < Width)
9095 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
9096
// DemandedElts has one bit per result element, cleared for undef operands.
9097 unsigned NumElts = VT.getVectorNumElements();
9098 APInt DemandedElts = APInt::getAllOnes(NumElts);
9099 for (unsigned i = 0; i != NumElts; ++i)
9100 if (BV->getOperand(i).isUndef())
9101 DemandedElts.clearBit(i);
9102
9103 // If we don't need the upper xmm, then perform as a xmm hop.
9104 unsigned HalfNumElts = NumElts / 2;
// Shifting out the low half leaves only the upper-half bits; zero means no
// upper element is defined.
9105 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9106 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9107 V0 = extractSubVector(V0, 0, DAG, DL, 128);
9108 V1 = extractSubVector(V1, 0, DAG, DL, 128);
9109 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
9110 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
9111 }
9112
9113 return DAG.getNode(HOpcode, DL, VT, V0, V1);
9114}
9115
9116/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
///
/// Two strategies are tried in order: a direct match against the natively
/// supported horizontal-op types, then, for 256-bit vectors on AVX, a split
/// into partial horizontal ops handled by ExpandHorizontalBinOp. Returns an
/// empty SDValue if neither applies.
// NOTE(review): listing line 9117 is absent from this extract. It presumably
// carries the return type, the function name and the `BV` and `DL` parameters
// used by the body. Confirm against upstream.
9118 const X86Subtarget &Subtarget,
9119 SelectionDAG &DAG) {
9120 // We need at least 2 non-undef elements to make this worthwhile by default.
9121 unsigned NumNonUndefs =
9122 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9123 if (NumNonUndefs < 2)
9124 return SDValue();
9125
9126 // There are 4 sets of horizontal math operations distinguished by type:
9127 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9128 // subtarget feature. Try to match those "native" patterns first.
9129 MVT VT = BV->getSimpleValueType(0);
9130 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9131 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9132 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9133 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9134 unsigned HOpcode;
9135 SDValue V0, V1;
9136 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9137 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
9138 }
9139
9140 // Try harder to match 256-bit ops by using extract/concat.
9141 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9142 return SDValue();
9143
9144 // Count the number of UNDEF operands in the build_vector in input.
9145 unsigned NumElts = VT.getVectorNumElements();
9146 unsigned Half = NumElts / 2;
9147 unsigned NumUndefsLO = 0;
9148 unsigned NumUndefsHI = 0;
9149 for (unsigned i = 0, e = Half; i != e; ++i)
9150 if (BV->getOperand(i)->isUndef())
9151 NumUndefsLO++;
9152
9153 for (unsigned i = Half, e = NumElts; i != e; ++i)
9154 if (BV->getOperand(i)->isUndef())
9155 NumUndefsHI++;
9156
9157 SDValue InVec0, InVec1;
// Integer 256-bit types: match the low and high halves separately. The two
// halves are compatible when their sources agree or one side is undef.
// isHorizontalBinOpPart is defined elsewhere; it presumably sets the two
// trailing SDValue arguments to the matched source vectors.
9158 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9159 SDValue InVec2, InVec3;
9160 unsigned X86Opcode;
9161 bool CanFold = true;
9162
9163 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
9164 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
9165 InVec3) &&
9166 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9167 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9168 X86Opcode = X86ISD::HADD;
9169 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
9170 InVec1) &&
9171 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
9172 InVec3) &&
9173 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9174 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9175 X86Opcode = X86ISD::HSUB;
9176 else
9177 CanFold = false;
9178
9179 if (CanFold) {
9180 // Do not try to expand this build_vector into a pair of horizontal
9181 // add/sub if we can emit a pair of scalar add/sub.
// That is: bail out when either half has exactly one defined element.
9182 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9183 return SDValue();
9184
9185 // Convert this build_vector into a pair of horizontal binops followed by
9186 // a concat vector. We must adjust the outputs from the partial horizontal
9187 // matching calls above to account for undefined vector halves.
9188 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9189 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9190 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9191 bool isUndefLO = NumUndefsLO == Half;
9192 bool isUndefHI = NumUndefsHI == Half;
9193 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9194 isUndefHI);
9195 }
9196 }
9197
// Remaining 256-bit types (and integer types that did not fold above): match
// the whole vector at once, trying each of the four scalar opcodes in turn.
9198 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9199 VT == MVT::v16i16) {
9200 unsigned X86Opcode;
9201 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
9202 InVec1))
9203 X86Opcode = X86ISD::HADD;
9204 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
9205 InVec1))
9206 X86Opcode = X86ISD::HSUB;
9207 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
9208 InVec1))
9209 X86Opcode = X86ISD::FHADD;
9210 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
9211 InVec1))
9212 X86Opcode = X86ISD::FHSUB;
9213 else
9214 return SDValue();
9215
9216 // Don't try to expand this build_vector into a pair of horizontal add/sub
9217 // if we can simply emit a pair of scalar add/sub.
9218 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9219 return SDValue();
9220
9221 // Convert this build_vector into two horizontal add/sub followed by
9222 // a concat vector.
9223 bool isUndefLO = NumUndefsLO == Half;
9224 bool isUndefHI = NumUndefsHI == Half;
9225 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9226 isUndefLO, isUndefHI);
9227 }
9228
9229 return SDValue();
9230}
9231
// Forward declaration: the build_vector bit-op lowering below calls LowerShift
// before its definition, which lives elsewhere in this file.
9232static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9233 SelectionDAG &DAG);
9234
9235/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9236/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9237/// just apply the bit operation to the vectors.
9238/// NOTE: It's not in our interest to start making a general purpose vectorizer
9239/// from this, but enough scalar bit operations are created from the later
9240/// legalization + scalarization stages to need basic support.
///
/// Handles SHL/SRL/SRA (uniform constant shift amount only) and AND/XOR/OR.
/// Returns an empty SDValue when the pattern does not apply.
// NOTE(review): listing line 9241 is absent from this extract. It presumably
// carries the return type, the function name and the `Op` and `DL` parameters
// used by the body. Confirm against upstream.
9242 const X86Subtarget &Subtarget,
9243 SelectionDAG &DAG) {
9244 MVT VT = Op->getSimpleValueType(0);
9245 unsigned NumElems = VT.getVectorNumElements();
9246 unsigned ElemSize = VT.getScalarSizeInBits();
9247 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9248
9249 // Check that all elements have the same opcode.
9250 // TODO: Should we allow UNDEFS and if so how many?
9251 unsigned Opcode = Op->getOperand(0).getOpcode();
9252 for (unsigned i = 1; i < NumElems; ++i)
9253 if (Opcode != Op->getOperand(i).getOpcode())
9254 return SDValue();
9255
9256 // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
9257 bool IsShift = false;
9258 switch (Opcode) {
9259 default:
9260 return SDValue();
9261 case ISD::SHL:
9262 case ISD::SRL:
9263 case ISD::SRA:
9264 IsShift = true;
9265 break;
9266 case ISD::AND:
9267 case ISD::XOR:
9268 case ISD::OR:
9269 // Don't do this if the buildvector is a splat - we'd replace one
9270 // constant with an entire vector.
9271 if (Op->getSplatValue())
9272 return SDValue();
9273 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9274 return SDValue();
9275 break;
9276 }
9277
9278 // Collect elements.
// LHSElts/RHSElts gather operand 0 / operand 1 of every scalar op, and
// RHSAllConst records whether every operand 1 is a ConstantSDNode.
9279 bool RHSAllConst = true;
9280 SmallVector<SDValue, 4> LHSElts, RHSElts;
9281 for (SDValue Elt : Op->ops()) {
9282 SDValue LHS = Elt.getOperand(0);
9283 SDValue RHS = Elt.getOperand(1);
9284 RHSAllConst &= isa<ConstantSDNode>(RHS);
9285 LHSElts.push_back(LHS);
9286 RHSElts.push_back(RHS);
9287 }
9288
9289 // Canonicalize shift amounts.
9290 if (IsShift) {
9291 // We expect the canonicalized RHS operand to be the constant.
9292 // TODO: Permit non-constant XOP/AVX2 cases?
9293 if (!RHSAllConst)
9294 return SDValue();
9295
9296 // Extend shift amounts.
// A shift amount whose width differs from the element width is zero-extended
// or truncated to the vector's scalar type.
9297 for (SDValue &Op1 : RHSElts)
9298 if (Op1.getValueSizeInBits() != ElemSize)
9299 Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType());
9300
9301 // Limit to shifts by uniform immediates.
9302 // TODO: Only accept vXi8/vXi64 special cases?
9303 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9304 if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9305 return SDValue();
9306 }
9307 assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
9308 [ElemSize](SDValue V) {
9309 return V.getValueSizeInBits() == ElemSize;
9310 }) &&
9311 "Element size mismatch");
9312
9313 // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
9314 // a load or RHS must be constant.
// EltsFromConsecutiveLoads is defined elsewhere; an empty result is treated
// below as "could not be formed from loads".
9315 SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
9316 /*IsAfterLegalize=*/true);
9317 SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
9318 /*IsAfterLegalize=*/true);
9319 if (!LHS && !RHS && !RHSAllConst)
9320 return SDValue();
9321
// Fall back to a plain build_vector for whichever side is not a load.
9322 if (!LHS)
9323 LHS = DAG.getBuildVector(VT, DL, LHSElts);
9324 if (!RHS)
9325 RHS = DAG.getBuildVector(VT, DL, RHSElts);
9326 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9327
9328 if (!IsShift)
9329 return Res;
9330
9331 // Immediately lower the shift to ensure the constant build vector doesn't
9332 // get converted to a constant pool before the shift is lowered.
9333 return LowerShift(Res, Subtarget, DAG);
9334}
9335
// Forward declaration: the blend lowering below calls isShuffleFoldableLoad
// before its definition, which lives elsewhere in this file.
9336static bool isShuffleFoldableLoad(SDValue);
9337
9338/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
9339/// representing a blend.
///
/// Only v4f64 build_vectors made of exactly two distinct operands, none of
/// them constant or undef, are handled. Returns an empty SDValue otherwise.
// NOTE(review): listing line 9340 is absent from this extract. It presumably
// carries the return type, the function name and the `BVOp` and `DL`
// parameters used by the body. Confirm against upstream.
9341 X86Subtarget const &Subtarget,
9342 SelectionDAG &DAG) {
9343 MVT VT = BVOp->getSimpleValueType(0u);
9344
9345 if (VT != MVT::v4f64)
9346 return SDValue();
9347
9348 // Collect unique operands.
9349 auto UniqueOps = SmallSet<SDValue, 16u>();
9350 for (SDValue Op : BVOp->ops()) {
9351 if (isIntOrFPConstant(Op) || Op.isUndef())
9352 return SDValue();
9353 UniqueOps.insert(Op);
9354 }
9355
9356 // Candidate BUILD_VECTOR must have 2 unique operands.
9357 if (UniqueOps.size() != 2u)
9358 return SDValue();
9359
// Op0 is whatever sits in lane 0; erasing it leaves the other distinct
// operand as the only member of the set.
9360 SDValue Op0 = BVOp->getOperand(0u);
9361 UniqueOps.erase(Op0);
9362 SDValue Op1 = *UniqueOps.begin();
9363
// Only proceed on AVX2, or when isShuffleFoldableLoad accepts either operand.
9364 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
9365 isShuffleFoldableLoad(Op1)) {
9366 // Create shuffle mask.
// Lane I keeps index I when it holds Op0, and uses I + NumElems (the same
// lane of the second shuffle input) when it holds Op1.
9367 auto const NumElems = VT.getVectorNumElements();
9368 SmallVector<int, 16u> Mask(NumElems);
9369 for (auto I = 0u; I < NumElems; ++I) {
9370 SDValue Op = BVOp->getOperand(I);
9371 Mask[I] = Op == Op0 ? I : I + NumElems;
9372 }
9373 // Create shuffle of splats.
9374 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
9375 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
9376 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
9377 }
9378
9379 return SDValue();
9380}
9381
9382/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
///
/// Applies to i8/i16/i32 element types whose double-width integer type is
/// legal. Adjacent operand pairs are replaced by one double-width scalar, and
/// the resulting half-length build_vector is bitcast back to the original
/// type. Returns an empty SDValue if any pair cannot be merged.
// NOTE(review): listing line 9383 is absent from this extract. It presumably
// carries the return type, the function name and the `BVOp` and `DL`
// parameters used by the body. Confirm against upstream.
9384 X86Subtarget const &Subtarget,
9385 SelectionDAG &DAG) {
9386 using namespace SDPatternMatch;
9387 MVT VT = BVOp->getSimpleValueType(0);
9388 MVT SVT = VT.getScalarType();
9389 unsigned NumElts = VT.getVectorNumElements();
9390 unsigned EltBits = SVT.getSizeInBits();
9391
9392 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9393 return SDValue();
9394
9395 unsigned WideBits = 2 * EltBits;
9396 MVT WideSVT = MVT::getIntegerVT(WideBits);
9397 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
9398 if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
9399 return SDValue();
9400
// NOTE(review): listing line 9401 is absent here. It presumably declares the
// `WideOps` container of SDValues that is appended to and returned below.
// Confirm against upstream.
9402 for (unsigned I = 0; I != NumElts; I += 2) {
9403 SDValue Op0 = BVOp->getOperand(I + 0);
9404 SDValue Op1 = BVOp->getOperand(I + 1);
9405
// A fully undef pair becomes a single undef wide scalar.
9406 if (Op0.isUndef() && Op1.isUndef()) {
9407 WideOps.push_back(DAG.getUNDEF(WideSVT));
9408 continue;
9409 }
9410
9411 // TODO: Constant repacking?
9412
9413 // Merge scalars that have been split from the same source.
// Pattern: Op0 = trunc(X) and Op1 = trunc(Y >> EltBits), with X at least as
// wide as the merged scalar.
9414 SDValue X, Y;
9415 if (sd_match(Op0, m_Trunc(m_Value(X))) &&
9416 sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
// NOTE(review): listing line 9417 is absent here. Given the "same source"
// comment above, it presumably ties X and Y together. Confirm against
// upstream before relying on this condition as shown.
9418 X.getValueType().bitsGE(WideSVT)) {
9419 if (X.getValueType().bitsGT(WideSVT))
9420 X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
9421 WideOps.push_back(X);
9422 continue;
9423 }
9424
9425 return SDValue();
9426 }
9427
9428 assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
9429 return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
9430}
9431
9432/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9433/// functionality to do this, so it's all zeros, all ones, or some derivation
9434/// that is cheap to calculate.
///
/// Returns \p Op unchanged for the directly matchable cases, a node from
/// getOnesVector() for other all-ones types, and an empty SDValue otherwise.
// NOTE(review): listing line 9435 is absent from this extract. It presumably
// carries the return type, the function name and the `Op` and `DL` parameters
// used by the body. Confirm against upstream.
9436 SelectionDAG &DAG,
9437 const X86Subtarget &Subtarget) {
9438 MVT VT = Op.getSimpleValueType();
9439
9440 // Vectors containing all zeros can be matched by pxor and xorps.
9441 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9442 return Op;
9443
9444 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9445 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9446 // vpcmpeqd on 256-bit vectors.
9447 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
// The i32-element vector types are returned as-is; every other type goes
// through getOnesVector (defined elsewhere).
9448 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9449 return Op;
9450
9451 return getOnesVector(VT, DAG, DL);
9452 }
9453
9454 return SDValue();
9455}
9456
9457/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9458/// from a vector of source values and a vector of extraction indices.
9459/// The vectors might be manipulated to match the type of the permute op.
///
/// \param VT Result type of the permute.
/// \param SrcVec Vector being permuted; may be narrower or wider than \p VT.
/// \param IndicesVec Per-element indices; must have at least as many elements
///        as \p VT (asserted).
/// \returns the permuted value of type \p VT, or an empty SDValue when no
///          suitable instruction is available for the type/subtarget.
9460static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9461 const SDLoc &DL, SelectionDAG &DAG,
9462 const X86Subtarget &Subtarget) {
// ShuffleVT is the type the final node is built in; some cases below switch
// it to a byte or FP type and the indices are rescaled to match.
9463 MVT ShuffleVT = VT;
9464 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9465 unsigned NumElts = VT.getVectorNumElements();
9466 unsigned SizeInBits = VT.getSizeInBits();
9467
9468 // Adjust IndicesVec to match VT size.
9469 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9470 "Illegal variable permute mask size");
9471 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
9472 // Narrow/widen the indices vector to the correct size.
9473 if (IndicesVec.getValueSizeInBits() > SizeInBits)
9474 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9475 NumElts * VT.getScalarSizeInBits());
9476 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
9477 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
9478 SDLoc(IndicesVec), SizeInBits);
9479 // Zero-extend the index elements within the vector.
9480 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9481 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
9482 IndicesVT, IndicesVec);
9483 }
9484 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9485
9486 // Handle SrcVec that don't match VT type.
9487 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9488 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9489 // Handle larger SrcVec by treating it as a larger permute.
// Recurse at the wider type and keep only the low SizeInBits of the result.
9490 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9491 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9492 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9493 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9494 Subtarget, DAG, SDLoc(IndicesVec));
9495 SDValue NewSrcVec =
9496 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9497 if (NewSrcVec)
9498 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9499 return SDValue();
9500 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9501 // Widen smaller SrcVec to match VT.
9502 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9503 } else
9504 return SDValue();
9505 }
9506
// Helper: rewrite element indices so they address Scale-times-narrower
// elements. Each index is multiplied by Scale in every sub-lane and a
// per-sub-lane offset 0..Scale-1 is added.
9507 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9508 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9509 EVT SrcVT = Idx.getValueType();
9510 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9511 uint64_t IndexScale = 0;
9512 uint64_t IndexOffset = 0;
9513
9514 // If we're scaling a smaller permute op, then we need to repeat the
9515 // indices, scaling and offsetting them as well.
9516 // e.g. v4i32 -> v16i8 (Scale = 4)
9517 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9518 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9519 for (uint64_t i = 0; i != Scale; ++i) {
9520 IndexScale |= Scale << (i * NumDstBits);
9521 IndexOffset |= i << (i * NumDstBits);
9522 }
9523
9524 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9525 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9526 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9527 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9528 return Idx;
9529 };
9530
// Pick an opcode (and possibly a different ShuffleVT) per type and subtarget.
// Several cases return a complete expansion directly instead. Opcode == 0
// after the switch means "unsupported".
9531 unsigned Opcode = 0;
9532 switch (VT.SimpleTy) {
9533 default:
9534 break;
9535 case MVT::v16i8:
9536 if (Subtarget.hasSSSE3())
9537 Opcode = X86ISD::PSHUFB;
9538 break;
9539 case MVT::v8i16:
9540 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9541 Opcode = X86ISD::VPERMV;
9542 else if (Subtarget.hasSSSE3()) {
9543 Opcode = X86ISD::PSHUFB;
9544 ShuffleVT = MVT::v16i8;
9545 }
9546 break;
9547 case MVT::v4f32:
9548 case MVT::v4i32:
9549 if (Subtarget.hasAVX()) {
9550 Opcode = X86ISD::VPERMILPV;
9551 ShuffleVT = MVT::v4f32;
9552 } else if (Subtarget.hasSSSE3()) {
9553 Opcode = X86ISD::PSHUFB;
9554 ShuffleVT = MVT::v16i8;
9555 }
9556 break;
9557 case MVT::v2f64:
9558 case MVT::v2i64:
9559 if (Subtarget.hasAVX()) {
9560 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
// Adding the vector to itself doubles each index, moving bit 0 into bit 1.
9561 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9562 Opcode = X86ISD::VPERMILPV;
9563 ShuffleVT = MVT::v2f64;
9564 } else if (Subtarget.hasSSE41()) {
9565 // SSE41 can compare v2i64 - select between indices 0 and 1.
9566 return DAG.getSelectCC(
9567 DL, IndicesVec,
9568 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9569 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9570 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
// NOTE(review): listing line 9571 is absent here. It presumably supplies the
// comparison condition code and closes the getSelectCC call. Confirm against
// upstream.
9572 }
9573 break;
9574 case MVT::v32i8:
9575 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9576 Opcode = X86ISD::VPERMV;
9577 else if (Subtarget.hasXOP()) {
// XOP: run VPPERM on each 128-bit half of the indices, giving both source
// halves as inputs each time.
9578 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9579 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9580 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9581 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9582 return DAG.getNode(
// NOTE(review): listing line 9583 is absent here. It presumably names the
// opcode, location and result type that combine the two VPPERM halves below.
// Confirm against upstream.
9584 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9585 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9586 } else if (Subtarget.hasAVX()) {
// AVX: duplicate each source half across the vector, permute both copies,
// then choose per element according to the index value.
9587 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9588 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9589 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9590 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9591 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
// NOTE(review): listing line 9592 is absent here. It presumably declares the
// lambda's `Ops` parameter (indexed as Ops[0..2] below) and opens its body.
// Confirm against upstream.
9593 // Permute Lo and Hi and then select based on index range.
9594 // This works as SHUFB uses bits[3:0] to permute elements and we don't
9595 // care about the bit[7] as it's just an index vector.
9596 SDValue Idx = Ops[2];
9597 EVT VT = Idx.getValueType();
9598 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9599 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9600 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
// NOTE(review): listing line 9601 is absent here. It presumably supplies the
// condition code comparing Idx against 15 and closes the call. Confirm
// against upstream.
9602 };
9603 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9604 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9605 PSHUFBBuilder);
9606 }
9607 break;
9608 case MVT::v16i16:
9609 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9610 Opcode = X86ISD::VPERMV;
9611 else if (Subtarget.hasAVX()) {
9612 // Scale to v32i8 and perform as v32i8.
9613 IndicesVec = ScaleIndices(IndicesVec, 2);
9614 return DAG.getBitcast(
// NOTE(review): listing line 9615 is absent here. Judging by the argument
// list that follows, it presumably passes the bitcast's target type and
// begins a recursive createVariablePermute call. Confirm against upstream.
9616 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9617 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9618 }
9619 break;
9620 case MVT::v8f32:
9621 case MVT::v8i32:
9622 if (Subtarget.hasAVX2())
9623 Opcode = X86ISD::VPERMV;
9624 else if (Subtarget.hasAVX()) {
// AVX1: splat each 128-bit half of the source across the vector first.
9625 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9626 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9627 {0, 1, 2, 3, 0, 1, 2, 3});
9628 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9629 {4, 5, 6, 7, 4, 5, 6, 7});
9630 if (Subtarget.hasXOP())
9631 return DAG.getBitcast(
9632 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9633 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9634 // Permute Lo and Hi and then select based on index range.
9635 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9636 SDValue Res = DAG.getSelectCC(
9637 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9638 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9639 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
// NOTE(review): listing line 9640 is absent here. It presumably supplies the
// condition code comparing the indices against 3 and closes the call.
// Confirm against upstream.
9641 return DAG.getBitcast(VT, Res);
9642 }
9643 break;
9644 case MVT::v4i64:
9645 case MVT::v4f64:
9646 if (Subtarget.hasAVX512()) {
9647 if (!Subtarget.hasVLX()) {
// Without VLX, widen source and indices to 8 elements, permute at that
// width, and take the low 256 bits of the result.
9648 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9649 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9650 SDLoc(SrcVec));
9651 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9652 DAG, SDLoc(IndicesVec));
9653 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9654 DAG, Subtarget);
9655 return extract256BitVector(Res, 0, DAG, DL);
9656 }
9657 Opcode = X86ISD::VPERMV;
9658 } else if (Subtarget.hasAVX()) {
9659 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9660 SDValue LoLo =
9661 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9662 SDValue HiHi =
9663 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9664 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
// The indices are doubled here, which is why the range test below compares
// against 2.
9665 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9666 if (Subtarget.hasXOP())
9667 return DAG.getBitcast(
9668 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9669 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9670 // Permute Lo and Hi and then select based on index range.
9671 // This works as VPERMILPD only uses index bit[1] to permute elements.
9672 SDValue Res = DAG.getSelectCC(
9673 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9674 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9675 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
// NOTE(review): listing line 9676 is absent here. It presumably supplies the
// condition code comparing the doubled indices against 2 and closes the call.
// Confirm against upstream.
9677 return DAG.getBitcast(VT, Res);
9678 }
9679 break;
9680 case MVT::v64i8:
9681 if (Subtarget.hasVBMI())
9682 Opcode = X86ISD::VPERMV;
9683 break;
9684 case MVT::v32i16:
9685 if (Subtarget.hasBWI())
9686 Opcode = X86ISD::VPERMV;
9687 break;
9688 case MVT::v16f32:
9689 case MVT::v16i32:
9690 case MVT::v8f64:
9691 case MVT::v8i64:
9692 if (Subtarget.hasAVX512())
9693 Opcode = X86ISD::VPERMV;
9694 break;
9695 }
9696 if (!Opcode)
9697 return SDValue();
9698
9699 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9700 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9701 "Illegal variable permute shuffle type");
9702
// When the shuffle works on narrower elements than VT, rescale the indices.
9703 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9704 if (Scale > 1)
9705 IndicesVec = ScaleIndices(IndicesVec, Scale);
9706
9707 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9708 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9709
// Operand order differs by opcode: VPERMV is built as (indices, source),
// the other opcodes as (source, indices).
9710 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9711 SDValue Res = Opcode == X86ISD::VPERMV
9712 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9713 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9714 return DAG.getBitcast(VT, Res);
9715}
9716
9717// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9718// reasoned to be a permutation of a vector by indices in a non-constant vector.
9719// (build_vector (extract_elt V, (extract_elt I, 0)),
9720// (extract_elt V, (extract_elt I, 1)),
9721// ...
9722// ->
9723// (vpermv I, V)
9724//
// Returns an empty SDValue as soon as any operand breaks the pattern;
// otherwise the result of createVariablePermute(), which may itself be empty.
//
9725// TODO: Handle undefs
9726// TODO: Utilize pshufb and zero mask blending to support more efficient
9727// construction of vectors with constant-0 elements.
9728static SDValue
// NOTE(review): listing line 9729 is absent from this extract. It presumably
// carries the function name and the `V` and `DL` parameters used by the body.
// Confirm against upstream.
9730 SelectionDAG &DAG,
9731 const X86Subtarget &Subtarget) {
// Both start empty and are bound by the first operand processed in the loop.
9732 SDValue SrcVec, IndicesVec;
9733
9734 // Check for a match of the permute source vector and permute index elements.
9735 // This is done by checking that the i-th build_vector operand is of the form:
9736 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9737 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9738 SDValue Op = peekThroughOneUseFreeze(V.getOperand(Idx));
9739 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9740 return SDValue();
9741
9742 // If this is the first extract encountered in V, set the source vector,
9743 // otherwise verify the extract is from the previously defined source
9744 // vector.
9745 if (!SrcVec)
9746 SrcVec = Op.getOperand(0);
9747 else if (SrcVec != Op.getOperand(0))
9748 return SDValue();
9749 SDValue ExtractedIndex = Op->getOperand(1);
9750 // Peek through extends.
9751 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9752 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9753 ExtractedIndex = ExtractedIndex.getOperand(0);
9754 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9755 return SDValue();
9756
9757 // If this is the first extract from the index vector candidate, set the
9758 // indices vector, otherwise verify the extract is from the previously
9759 // defined indices vector.
9760 if (!IndicesVec)
9761 IndicesVec = ExtractedIndex.getOperand(0);
9762 else if (IndicesVec != ExtractedIndex.getOperand(0))
9763 return SDValue();
9764
// Operand Idx must read lane Idx of the indices vector, as a constant.
9765 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9766 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9767 return SDValue();
9768 }
9769
9770 MVT VT = V.getSimpleValueType();
9771 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9772}
9773
9774SDValue
9775X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9776 SDLoc dl(Op);
9777
9778 MVT VT = Op.getSimpleValueType();
9779 MVT EltVT = VT.getVectorElementType();
9780 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9781 unsigned NumElems = Op.getNumOperands();
9782
9783 // Generate vectors for predicate vectors.
9784 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9785 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9786
9787 if (VT.getVectorElementType() == MVT::bf16 &&
9788 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9789 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9790
9791 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9792 return VectorCst;
9793
9794 unsigned EVTBits = EltVT.getSizeInBits();
9795 APInt UndefMask = APInt::getZero(NumElems);
9796 APInt FrozenUndefMask = APInt::getZero(NumElems);
9797 APInt ZeroMask = APInt::getZero(NumElems);
9798 APInt NonZeroMask = APInt::getZero(NumElems);
9799 bool IsAllConstants = true;
9800 bool OneUseFrozenUndefs = true;
9801 SmallSet<SDValue, 8> Values;
9802 unsigned NumConstants = NumElems;
9803 for (unsigned i = 0; i < NumElems; ++i) {
9804 SDValue Elt = Op.getOperand(i);
9805 if (Elt.isUndef()) {
9806 UndefMask.setBit(i);
9807 continue;
9808 }
9809 if (ISD::isFreezeUndef(Elt.getNode())) {
9810 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9811 FrozenUndefMask.setBit(i);
9812 continue;
9813 }
9814 Values.insert(Elt);
9815 if (!isIntOrFPConstant(Elt)) {
9816 IsAllConstants = false;
9817 NumConstants--;
9818 }
9819 if (X86::isZeroNode(Elt)) {
9820 ZeroMask.setBit(i);
9821 } else {
9822 NonZeroMask.setBit(i);
9823 }
9824 }
9825
9826 // All undef vector. Return an UNDEF.
9827 if (UndefMask.isAllOnes())
9828 return DAG.getUNDEF(VT);
9829
9830 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9831 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9832 return DAG.getFreeze(DAG.getUNDEF(VT));
9833
9834 // All undef/freeze(undef)/zero vector. Return a zero vector.
9835 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9836 return getZeroVector(VT, Subtarget, DAG, dl);
9837
9838 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9839 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9840 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9841 // and blend the FREEZE-UNDEF operands back in.
9842 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9843 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9844 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9845 SmallVector<int, 16> BlendMask(NumElems, -1);
9846 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9847 for (unsigned i = 0; i < NumElems; ++i) {
9848 if (UndefMask[i]) {
9849 BlendMask[i] = -1;
9850 continue;
9851 }
9852 BlendMask[i] = i;
9853 if (!FrozenUndefMask[i])
9854 Elts[i] = Op.getOperand(i);
9855 else
9856 BlendMask[i] += NumElems;
9857 }
9858 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9859 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9860 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9861 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9862 }
9863
9864 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9865
9866 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9867 // be better off lowering to a smaller build vector and padding with
9868 // undef/zero.
9869 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9871 unsigned UpperElems = NumElems / 2;
9872 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9873 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9874 if (NumUpperUndefsOrZeros >= UpperElems) {
9875 if (VT.is512BitVector() &&
9876 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9877 UpperElems = NumElems - (NumElems / 4);
9878 // If freeze(undef) is in any upper elements, force to zero.
9879 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9880 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9881 SDValue NewBV =
9882 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9883 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9884 }
9885 }
9886
9887 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9888 return AddSub;
9889 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9890 return HorizontalOp;
9891 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9892 return Broadcast;
9893 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9894 return BitOp;
9895 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9896 return Blend;
9897 if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
9898 return WideBV;
9899
9900 unsigned NumZero = ZeroMask.popcount();
9901 unsigned NumNonZero = NonZeroMask.popcount();
9902
9903 // If we are inserting one variable into a vector of non-zero constants, try
9904 // to avoid loading each constant element as a scalar. Load the constants as a
9905 // vector and then insert the variable scalar element. If insertion is not
9906 // supported, fall back to a shuffle to get the scalar blended with the
9907 // constants. Insertion into a zero vector is handled as a special-case
9908 // somewhere below here.
9909 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9910 FrozenUndefMask.isZero() &&
9913 // Create an all-constant vector. The variable element in the old
9914 // build vector is replaced by undef in the constant vector. Save the
9915 // variable scalar element and its index for use in the insertelement.
9916 LLVMContext &Context = *DAG.getContext();
9917 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9918 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9919 SDValue VarElt;
9920 SDValue InsIndex;
9921 for (unsigned i = 0; i != NumElems; ++i) {
9922 SDValue Elt = Op.getOperand(i);
9923 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9924 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9925 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9926 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9927 else if (!Elt.isUndef()) {
9928 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9929 "Expected one variable element in this vector");
9930 VarElt = Elt;
9931 InsIndex = DAG.getVectorIdxConstant(i, dl);
9932 }
9933 }
9934 Constant *CV = ConstantVector::get(ConstVecOps);
9935 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9936
9937 // The constants we just created may not be legal (eg, floating point). We
9938 // must lower the vector right here because we can not guarantee that we'll
9939 // legalize it before loading it. This is also why we could not just create
9940 // a new build vector here. If the build vector contains illegal constants,
9941 // it could get split back up into a series of insert elements.
9942 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9943 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9944 MachineFunction &MF = DAG.getMachineFunction();
9945 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9946 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9947 unsigned InsertC = InsIndex->getAsZExtVal();
9948 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9949 if (InsertC < NumEltsInLow128Bits)
9950 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9951
9952 // There's no good way to insert into the high elements of a >128-bit
9953 // vector, so use shuffles to avoid an extract/insert sequence.
9954 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9955 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9956 SmallVector<int, 8> ShuffleMask;
9957 unsigned NumElts = VT.getVectorNumElements();
9958 for (unsigned i = 0; i != NumElts; ++i)
9959 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9960 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9961 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9962 }
9963
9964 // Special case for single non-zero, non-undef, element.
9965 if (NumNonZero == 1) {
9966 unsigned Idx = NonZeroMask.countr_zero();
9967 SDValue Item = Op.getOperand(Idx);
9968
9969 // If we have a constant or non-constant insertion into the low element of
9970 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9971 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9972 // depending on what the source datatype is.
9973 if (Idx == 0) {
9974 if (NumZero == 0)
9975 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9976
9977 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9978 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9979 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9980 assert((VT.is128BitVector() || VT.is256BitVector() ||
9981 VT.is512BitVector()) &&
9982 "Expected an SSE value type!");
9983 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9984 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9985 // zero vector.
9986 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9987 }
9988
9989 // We can't directly insert an i8 or i16 into a vector, so zero extend
9990 // it to i32 first.
9991 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9992 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9993 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9994 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9995 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9996 return DAG.getBitcast(VT, Item);
9997 }
9998 }
9999
10000 // Is it a vector logical left shift?
10001 if (NumElems == 2 && Idx == 1 &&
10002 X86::isZeroNode(Op.getOperand(0)) &&
10003 !X86::isZeroNode(Op.getOperand(1))) {
10004 unsigned NumBits = VT.getSizeInBits();
10005 return getVShift(true, VT,
10007 VT, Op.getOperand(1)),
10008 NumBits/2, DAG, *this, dl);
10009 }
10010
10011 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10012 return SDValue();
10013
10014 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10015 // is a non-constant being inserted into an element other than the low one,
10016 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10017 // movd/movss) to move this into the low element, then shuffle it into
10018 // place.
10019 if (EVTBits == 32) {
10020 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10021 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10022 }
10023 }
10024
10025 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10026 if (Values.size() == 1) {
10027 if (EVTBits == 32) {
10028 // Instead of a shuffle like this:
10029 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10030 // Check if it's possible to issue this instead.
10031 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
10032 unsigned Idx = NonZeroMask.countr_zero();
10033 SDValue Item = Op.getOperand(Idx);
10034 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10035 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10036 }
10037 return SDValue();
10038 }
10039
10040 // A vector full of immediates; various special cases are already
10041 // handled, so this is best done with a single constant-pool load.
10042 if (IsAllConstants)
10043 return SDValue();
10044
10045 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
10046 return V;
10047
10048 // See if we can use a vector load to get all of the elements.
10049 {
10050 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
10051 if (SDValue LD =
10052 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10053 return LD;
10054 }
10055
10056 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10057 // build_vector and broadcast it.
10058 // TODO: We could probably generalize this more.
10059 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10060 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10061 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10062 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10063 // Make sure all the even/odd operands match.
10064 for (unsigned i = 2; i != NumElems; ++i)
10065 if (Ops[i % 2] != Op.getOperand(i))
10066 return false;
10067 return true;
10068 };
10069 if (CanSplat(Op, NumElems, Ops)) {
10070 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10071 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10072 // Create a new build vector and cast to v2i64/v2f64.
10073 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10074 DAG.getBuildVector(NarrowVT, dl, Ops));
10075 // Broadcast from v2i64/v2f64 and cast to final VT.
10076 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10077 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10078 NewBV));
10079 }
10080 }
10081
10082 // For AVX-length vectors, build the individual 128-bit pieces and use
10083 // shuffles to put them in place.
10084 if (VT.getSizeInBits() > 128) {
10085 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10086
10087 // Build both the lower and upper subvector.
10088 SDValue Lower =
10089 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10091 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10092
10093 // Recreate the wider vector with the lower and upper part.
10094 return concatSubVectors(Lower, Upper, DAG, dl);
10095 }
10096
10097 // Let legalizer expand 2-wide build_vectors.
10098 if (EVTBits == 64) {
10099 if (NumNonZero == 1) {
10100 // One half is zero or undef.
10101 unsigned Idx = NonZeroMask.countr_zero();
10102 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10103 Op.getOperand(Idx));
10104 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10105 }
10106 return SDValue();
10107 }
10108
10109 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10110 if (EVTBits == 8 && NumElems == 16)
10111 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
10112 NumZero, DAG, Subtarget))
10113 return V;
10114
10115 if (EltVT == MVT::i16 && NumElems == 8)
10116 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
10117 NumZero, DAG, Subtarget))
10118 return V;
10119
10120 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10121 if (EVTBits == 32 && NumElems == 4)
10122 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
10123 return V;
10124
10125 // If element VT is == 32 bits, turn it into a number of shuffles.
10126 if (NumElems == 4 && NumZero > 0) {
10127 SmallVector<SDValue, 8> Ops(NumElems);
10128 for (unsigned i = 0; i < 4; ++i) {
10129 bool isZero = !NonZeroMask[i];
10130 if (isZero)
10131 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10132 else
10133 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10134 }
10135
10136 for (unsigned i = 0; i < 2; ++i) {
10137 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10138 default: llvm_unreachable("Unexpected NonZero count");
10139 case 0:
10140 Ops[i] = Ops[i*2]; // Must be a zero vector.
10141 break;
10142 case 1:
10143 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10144 break;
10145 case 2:
10146 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10147 break;
10148 case 3:
10149 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10150 break;
10151 }
10152 }
10153
10154 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10155 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10156 int MaskVec[] = {
10157 Reverse1 ? 1 : 0,
10158 Reverse1 ? 0 : 1,
10159 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10160 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10161 };
10162 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10163 }
10164
10165 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10166
10167 // Check for a build vector from mostly shuffle plus few inserting.
10168 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
10169 return Sh;
10170
10171 // For SSE 4.1, use insertps to put the high elements into the low element.
10172 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10174 if (!Op.getOperand(0).isUndef())
10175 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10176 else
10177 Result = DAG.getUNDEF(VT);
10178
10179 for (unsigned i = 1; i < NumElems; ++i) {
10180 if (Op.getOperand(i).isUndef()) continue;
10181 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10182 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
10183 }
10184 return Result;
10185 }
10186
10187 // Otherwise, expand into a number of unpckl*, start by extending each of
10188 // our (non-undef) elements to the full vector width with the element in the
10189 // bottom slot of the vector (which generates no code for SSE).
10190 SmallVector<SDValue, 8> Ops(NumElems);
10191 for (unsigned i = 0; i < NumElems; ++i) {
10192 if (!Op.getOperand(i).isUndef())
10193 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10194 else
10195 Ops[i] = DAG.getUNDEF(VT);
10196 }
10197
10198 // Next, we iteratively mix elements, e.g. for v4f32:
10199 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10200 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10201 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10202 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10203 // Generate scaled UNPCKL shuffle mask.
10204 SmallVector<int, 16> Mask;
10205 for(unsigned i = 0; i != Scale; ++i)
10206 Mask.push_back(i);
10207 for (unsigned i = 0; i != Scale; ++i)
10208 Mask.push_back(NumElems+i);
10209 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10210
10211 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10212 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10213 }
10214 return Ops[0];
10215}
10216
10217// 256-bit AVX can use the vinsertf128 instruction
10218// to create 256-bit vectors from two other 128-bit ones.
10219// TODO: Detect subvector broadcast here instead of DAG combine?
10221 SelectionDAG &DAG,
10222 const X86Subtarget &Subtarget) {
10223 MVT ResVT = Op.getSimpleValueType();
10224 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
10225 "Value type must be 256-/512-bit wide");
10226
10227 unsigned NumOperands = Op.getNumOperands();
10228 unsigned NumFreezeUndef = 0;
10229 unsigned NumZero = 0;
10230 unsigned NumNonZero = 0;
10231 unsigned NonZeros = 0;
10232 SmallSet<SDValue, 4> Undefs;
10233 for (unsigned i = 0; i != NumOperands; ++i) {
10234 SDValue SubVec = Op.getOperand(i);
10235 if (SubVec.isUndef())
10236 continue;
10237 if (ISD::isFreezeUndef(SubVec.getNode())) {
10238 // If the freeze(undef) has multiple uses then we must fold to zero.
10239 if (SubVec.hasOneUse()) {
10240 ++NumFreezeUndef;
10241 } else {
10242 ++NumZero;
10243 Undefs.insert(SubVec);
10244 }
10245 }
10246 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10247 ++NumZero;
10248 else {
10249 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10250 NonZeros |= 1 << i;
10251 ++NumNonZero;
10252 }
10253 }
10254
10255 // If we have more than 2 non-zeros, build each half separately.
10256 if (NumNonZero > 2) {
10257 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10258 ArrayRef<SDUse> Ops = Op->ops();
10259 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10260 Ops.slice(0, NumOperands/2));
10261 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10262 Ops.slice(NumOperands/2));
10263 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10264 }
10265
10266 // Otherwise, build it up through insert_subvectors.
10267 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10268 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
10269 : DAG.getUNDEF(ResVT));
10270
10271 // Replace Undef operands with ZeroVector.
10272 for (SDValue U : Undefs)
10274 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
10275
10276 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10277 unsigned NumSubElems = SubVT.getVectorNumElements();
10278 for (unsigned i = 0; i != NumOperands; ++i) {
10279 if ((NonZeros & (1 << i)) == 0)
10280 continue;
10281
10282 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
10283 DAG.getVectorIdxConstant(i * NumSubElems, dl));
10284 }
10285
10286 return Vec;
10287}
10288
10289// Returns true if the given node is a type promotion (by concatenating i1
10290// zeros) of the result of a node that already zeros all upper bits of
10291// k-register.
10292// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10294 const X86Subtarget &Subtarget,
10295 SelectionDAG & DAG) {
10296 MVT ResVT = Op.getSimpleValueType();
10297 unsigned NumOperands = Op.getNumOperands();
10298 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10299 "Unexpected number of operands in CONCAT_VECTORS");
10300
10301 uint64_t Zeros = 0;
10302 uint64_t NonZeros = 0;
10303 for (unsigned i = 0; i != NumOperands; ++i) {
10304 SDValue SubVec = Op.getOperand(i);
10305 if (SubVec.isUndef())
10306 continue;
10307 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10308 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10309 Zeros |= (uint64_t)1 << i;
10310 else
10311 NonZeros |= (uint64_t)1 << i;
10312 }
10313
10314 unsigned NumElems = ResVT.getVectorNumElements();
10315
10316 // If we are inserting non-zero vector and there are zeros in LSBs and undef
10317 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
10318 // insert_subvector will give us two kshifts.
10319 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10320 Log2_64(NonZeros) != NumOperands - 1) {
10321 unsigned Idx = Log2_64(NonZeros);
10322 SDValue SubVec = Op.getOperand(Idx);
10323 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10324 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
10325 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
10326 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
10327 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10328 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10329 DAG.getVectorIdxConstant(0, dl));
10330 }
10331
10332 // If there are zero or one non-zeros we can handle this very simply.
10333 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10334 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10335 if (!NonZeros)
10336 return Vec;
10337 unsigned Idx = Log2_64(NonZeros);
10338 SDValue SubVec = Op.getOperand(Idx);
10339 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10340 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10341 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
10342 }
10343
10344 if (NumOperands > 2) {
10345 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10346 ArrayRef<SDUse> Ops = Op->ops();
10347 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10348 Ops.slice(0, NumOperands / 2));
10349 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10350 Ops.slice(NumOperands / 2));
10351 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10352 }
10353
10354 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
10355
10356 if (ResVT.getVectorNumElements() >= 16)
10357 return Op; // The operation is legal with KUNPCK
10358
10359 SDValue Vec =
10360 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
10361 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
10362 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10363 DAG.getVectorIdxConstant(NumElems / 2, dl));
10364}
10365
10367 const X86Subtarget &Subtarget,
10368 SelectionDAG &DAG) {
10369 SDLoc DL(Op);
10370 MVT VT = Op.getSimpleValueType();
10371 if (VT.getVectorElementType() == MVT::i1)
10372 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
10373
10374 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10375 // from two other 128-bit ones.
10376 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10377 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10378 (VT.is512BitVector() &&
10379 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
10380 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
10381}
10382
10383//===----------------------------------------------------------------------===//
10384// Vector shuffle lowering
10385//
10386// This is an experimental code path for lowering vector shuffles on x86. It is
10387// designed to handle arbitrary vector shuffles and blends, gracefully
10388// degrading performance as necessary. It works hard to recognize idiomatic
10389// shuffles and lower them to optimal instruction patterns without leaving
10390// a framework that allows reasonably efficient handling of all vector shuffle
10391// patterns.
10392//===----------------------------------------------------------------------===//
10393
10394/// Checks whether the vector elements referenced by two shuffle masks are
10395/// equivalent.
10396static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10397 int Idx, int ExpectedIdx) {
10398 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10399 ExpectedIdx < MaskSize && "Out of range element index");
10400 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10401 return false;
10402
10403 EVT VT = Op.getValueType();
10404 EVT ExpectedVT = ExpectedOp.getValueType();
10405
10406 // Sources must be vectors and match the mask's element count.
10407 if (!VT.isVector() || !ExpectedVT.isVector() ||
10408 (int)VT.getVectorNumElements() != MaskSize ||
10409 (int)ExpectedVT.getVectorNumElements() != MaskSize)
10410 return false;
10411
10412 // Exact match.
10413 if (Idx == ExpectedIdx && Op == ExpectedOp)
10414 return true;
10415
10416 switch (Op.getOpcode()) {
10417 case ISD::BUILD_VECTOR:
10418 // If the values are build vectors, we can look through them to find
10419 // equivalent inputs that make the shuffles equivalent.
10420 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10421 case ISD::BITCAST: {
10423 EVT SrcVT = Src.getValueType();
10424 if (Op == ExpectedOp && SrcVT.isVector()) {
10425 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
10426 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
10427 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10428 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10429 Idx / Scale, ExpectedIdx / Scale);
10430 }
10431 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
10432 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
10433 for (unsigned I = 0; I != Scale; ++I)
10434 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10435 (Idx * Scale) + I,
10436 (ExpectedIdx * Scale) + I))
10437 return false;
10438 return true;
10439 }
10440 }
10441 break;
10442 }
10443 case ISD::VECTOR_SHUFFLE: {
10444 auto *SVN = cast<ShuffleVectorSDNode>(Op);
10445 return Op == ExpectedOp &&
10446 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10447 }
10448 case X86ISD::VBROADCAST:
10449 case X86ISD::VBROADCAST_LOAD:
10450 return Op == ExpectedOp;
10451 case X86ISD::SUBV_BROADCAST_LOAD:
10452 if (Op == ExpectedOp) {
10453 auto *MemOp = cast<MemSDNode>(Op);
10454 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
10455 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10456 }
10457 break;
10458 case X86ISD::VPERMI: {
10459 if (Op == ExpectedOp) {
10461 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
10462 SDValue Src = Op.getOperand(0);
10463 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
10464 Mask[ExpectedIdx]);
10465 }
10466 break;
10467 }
10468 case X86ISD::HADD:
10469 case X86ISD::HSUB:
10470 case X86ISD::FHADD:
10471 case X86ISD::FHSUB:
10472 case X86ISD::PACKSS:
10473 case X86ISD::PACKUS:
10474 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10475 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10476 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10477 int NumElts = VT.getVectorNumElements();
10478 int NumLanes = VT.getSizeInBits() / 128;
10479 int NumEltsPerLane = NumElts / NumLanes;
10480 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10481 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10482 bool SameElt =
10483 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10484 return SameLane && SameElt;
10485 }
10486 break;
10487 }
10488
10489 return false;
10490}
10491
10492/// Tiny helper function to identify a no-op mask.
10493///
10494/// This is a somewhat boring predicate function. It checks whether the mask
10495/// array input, which is assumed to be a single-input shuffle mask of the kind
10496/// used by the X86 shuffle instructions (not a fully general
10497/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10498/// in-place shuffle are 'no-op's.
10500 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10501 assert(Mask[i] >= -1 && "Out of bound mask element!");
10502 if (Mask[i] >= 0 && Mask[i] != i)
10503 return false;
10504 }
10505 return true;
10506}
10507
10508/// Test whether there are elements crossing LaneSizeInBits lanes in this
10509/// shuffle mask.
10510///
10511/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10512/// and we routinely test for these.
10513static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10514 unsigned ScalarSizeInBits,
10515 ArrayRef<int> Mask) {
10516 assert(LaneSizeInBits && ScalarSizeInBits &&
10517 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10518 "Illegal shuffle lane size");
10519 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10520 int Size = Mask.size();
10521 for (int i = 0; i < Size; ++i)
10522 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10523 return true;
10524 return false;
10525}
10526
10527/// Test whether there are elements crossing 128-bit lanes in this
10528/// shuffle mask.
10530 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10531}
10532
10533/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10534/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10535/// better support 'repeated mask + lane permute' style shuffles.
10536static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10537 unsigned ScalarSizeInBits,
10538 ArrayRef<int> Mask) {
10539 assert(LaneSizeInBits && ScalarSizeInBits &&
10540 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10541 "Illegal shuffle lane size");
10542 int NumElts = Mask.size();
10543 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10544 int NumLanes = NumElts / NumEltsPerLane;
10545 if (NumLanes > 1) {
10546 for (int i = 0; i != NumLanes; ++i) {
10547 int SrcLane = -1;
10548 for (int j = 0; j != NumEltsPerLane; ++j) {
10549 int M = Mask[(i * NumEltsPerLane) + j];
10550 if (M < 0)
10551 continue;
10552 int Lane = (M % NumElts) / NumEltsPerLane;
10553 if (SrcLane >= 0 && SrcLane != Lane)
10554 return true;
10555 SrcLane = Lane;
10556 }
10557 }
10558 }
10559 return false;
10560}
10561
10562/// Test whether a shuffle mask is equivalent within each sub-lane.
10563///
10564/// This checks a shuffle mask to see if it is performing the same
10565/// lane-relative shuffle in each sub-lane. This trivially implies
10566/// that it is also not lane-crossing. It may however involve a blend from the
10567/// same lane of a second vector.
10568///
10569/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10570/// non-trivial to compute in the face of undef lanes. The representation is
10571/// suitable for use with existing 128-bit shuffles as entries from the second
10572/// vector have been remapped to [LaneSize, 2*LaneSize).
10573static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10574 ArrayRef<int> Mask,
10575 SmallVectorImpl<int> &RepeatedMask) {
10576 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10577 RepeatedMask.assign(LaneSize, -1);
10578 int Size = Mask.size();
10579 for (int i = 0; i < Size; ++i) {
10580 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10581 if (Mask[i] < 0)
10582 continue;
10583 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10584 // This entry crosses lanes, so there is no way to model this shuffle.
10585 return false;
10586
10587 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10588 // Adjust second vector indices to start at LaneSize instead of Size.
10589 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10590 : Mask[i] % LaneSize + LaneSize;
10591 if (RepeatedMask[i % LaneSize] < 0)
10592 // This is the first non-undef entry in this slot of a 128-bit lane.
10593 RepeatedMask[i % LaneSize] = LocalM;
10594 else if (RepeatedMask[i % LaneSize] != LocalM)
10595 // Found a mismatch with the repeated mask.
10596 return false;
10597 }
10598 return true;
10599}
10600
10601/// Test whether a shuffle mask is equivalent within each 128-bit lane.
///
/// Thin wrapper around isRepeatedShuffleMask with the lane width fixed at 128
/// bits; \p RepeatedMask is forwarded unchanged and populated by that helper.
// NOTE(review): listing line 10603 (the function name and its leading
// parameters) is absent between the next two lines; the body below uses `VT`
// and `Mask`, which must be declared there. Restore it from the real source.
10602static bool
10604 SmallVectorImpl<int> &RepeatedMask) {
10605 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10606}
10607
// Predicate-only form of the 128-bit repeated-lane test: the repeated mask is
// computed into a local scratch vector and discarded, only the boolean result
// of isRepeatedShuffleMask is returned.
// NOTE(review): listing line 10609 (the function name and parameter list) is
// absent after `static bool`; the body uses `VT` and `Mask`, which must be
// declared there. Restore it from the real source.
10608static bool
10610 SmallVector<int, 32> RepeatedMask;
10611 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10612}
10613
10614/// Test whether a shuffle mask is equivalent within each 256-bit lane.
///
/// Thin wrapper around isRepeatedShuffleMask with the lane width fixed at 256
/// bits; \p RepeatedMask is forwarded unchanged and populated by that helper.
// NOTE(review): listing line 10616 (the function name and its leading
// parameters) is absent between the next two lines; the body below uses `VT`
// and `Mask`, which must be declared there. Restore it from the real source.
10615static bool
10617 SmallVectorImpl<int> &RepeatedMask) {
10618 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10619}
10620
10621/// Test whether a target shuffle mask is equivalent within each sub-lane.
10622/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10623static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10624 unsigned EltSizeInBits,
10625 ArrayRef<int> Mask,
10626 SmallVectorImpl<int> &RepeatedMask) {
10627 int LaneSize = LaneSizeInBits / EltSizeInBits;
10628 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10629 int Size = Mask.size();
10630 for (int i = 0; i < Size; ++i) {
10631 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10632 if (Mask[i] == SM_SentinelUndef)
10633 continue;
10634 if (Mask[i] == SM_SentinelZero) {
10635 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10636 return false;
10637 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10638 continue;
10639 }
10640 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10641 // This entry crosses lanes, so there is no way to model this shuffle.
10642 return false;
10643
10644 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10645 // later vector indices to start at multiples of LaneSize instead of Size.
10646 int LaneM = Mask[i] / Size;
10647 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10648 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10649 // This is the first non-undef entry in this slot of a 128-bit lane.
10650 RepeatedMask[i % LaneSize] = LocalM;
10651 else if (RepeatedMask[i % LaneSize] != LocalM)
10652 // Found a mismatch with the repeated mask.
10653 return false;
10654 }
10655 return true;
10656}
10657
10658/// Test whether a target shuffle mask is equivalent within each sub-lane.
10659/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10660static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10661 ArrayRef<int> Mask,
10662 SmallVectorImpl<int> &RepeatedMask) {
10663 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10664 Mask, RepeatedMask);
10665}
10666
10667/// Checks whether a shuffle mask is equivalent to an explicit list of
10668/// arguments.
10669///
10670/// This is a fast way to test a shuffle mask against a fixed pattern:
10671///
10672/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10673///
10674/// It returns true if the mask is exactly as wide as the argument list, and
10675/// each element of the mask is either -1 (signifying undef) or the value given
10676/// in the argument.
10677static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10678 SDValue V1 = SDValue(),
10679 SDValue V2 = SDValue()) {
10680 int Size = Mask.size();
10681 if (Size != (int)ExpectedMask.size())
10682 return false;
10683
10684 for (int i = 0; i < Size; ++i) {
10685 assert(Mask[i] >= -1 && "Out of bound mask element!");
10686 int MaskIdx = Mask[i];
10687 int ExpectedIdx = ExpectedMask[i];
10688 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10689 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10690 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10691 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10692 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10693 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10694 return false;
10695 }
10696 }
10697 return true;
10698}
10699
10700/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10701///
10702/// The masks must be exactly the same width.
10703///
10704/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10705/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10706///
10707/// SM_SentinelZero is accepted as a valid negative index but must match in
10708/// both, or via a known bits test.
///
/// \p V1 and \p V2 are optional; when supplied (and of the same total size as
/// the shuffle type) they allow differing indices to be accepted if
/// IsElementEquivalent says the referenced elements are equivalent, and allow
/// SM_SentinelZero entries to be proven via DAG.MaskedVectorIsZero.
// NOTE(review): listing line 10709 (return type, function name and the
// leading parameters) is absent here. The body uses `VT` and `Mask`, and the
// callers in this file invoke isTargetShuffleEquivalent(VT, Mask, Expected,
// DAG[, V1, V2]); restore the line from the real source.
10710 ArrayRef<int> ExpectedMask,
10711 const SelectionDAG &DAG,
10712 SDValue V1 = SDValue(),
10713 SDValue V2 = SDValue()) {
10714 int Size = Mask.size();
10715 if (Size != (int)ExpectedMask.size())
10716 return false;
// The expected pattern itself may only contain SM_SentinelZero or indices in
// [0, 2 * Size); undef is not allowed on the expected side.
10717 assert(llvm::all_of(ExpectedMask,
10718 [Size](int M) {
10719 return M == SM_SentinelZero ||
10720 isInRange(M, 0, 2 * Size);
10721 }) &&
10722 "Illegal target shuffle mask");
10723
10724 // Check for out-of-range target shuffle mask indices.
10725 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10726 return false;
10727
10728 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10729 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10730 !V1.getValueType().isVector()))
10731 V1 = SDValue();
10732 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10733 !V2.getValueType().isVector()))
10734 V2 = SDValue();
10735
// Per-input bitsets of the elements that must be proven zero; they are filled
// in by the loop and checked once at the end.
10736 APInt ZeroV1 = APInt::getZero(Size);
10737 APInt ZeroV2 = APInt::getZero(Size);
10738
10739 for (int i = 0; i < Size; ++i) {
10740 int MaskIdx = Mask[i];
10741 int ExpectedIdx = ExpectedMask[i];
10742 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10743 continue;
10744 // If we failed to match an expected SM_SentinelZero then early out.
10745 if (ExpectedIdx < 0)
10746 return false;
10747 if (MaskIdx == SM_SentinelZero) {
10748 // If we need this expected index to be a zero element, then update the
10749 // relevant zero mask and perform the known bits at the end to minimize
10750 // repeated computes.
10751 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10752 if (ExpectedV &&
10753 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10754 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10755 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10756 ZeroMask.setBit(BitIdx);
10757 continue;
10758 }
// No usable source value: fall through to the `return false` below, since
// the non-negative check that follows cannot succeed for SM_SentinelZero.
10759 }
10760 if (MaskIdx >= 0) {
// Both indices are real but differ: rebase each to its own input and accept
// the pair only if the referenced elements are equivalent.
10761 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10762 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10763 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10764 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10765 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10766 continue;
10767 }
10768 return false;
10769 }
// Only query the DAG for an input that actually had zero requirements
// recorded against it.
10770 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10771 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10772}
10773
10774// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10775// instructions.
// Only MVT::v8i32 and MVT::v8f32 are accepted. The mask is compared against
// the binary unpack-lo and unpack-hi patterns generated for MVT::v8i16.
// NOTE(review): listing line 10776 (return type, function name and leading
// parameters) is absent here; the body uses `VT` and `Mask`, which must be
// declared there. Restore it from the real source.
10777 const SelectionDAG &DAG) {
10778 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10779 return false;
10780
10781 SmallVector<int, 8> Unpcklwd;
10782 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10783 /* Unary = */ false);
10784 SmallVector<int, 8> Unpckhwd;
10785 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10786 /* Unary = */ false);
// No V1/V2 are passed, so isTargetShuffleEquivalent is called without source
// values here.
10787 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10788 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10789 return IsUnpackwdMask;
10790}
10791
// Returns true if \p Mask (or its commuted form) matches any of the four
// 128-bit unpack patterns: unary/binary crossed with low/high.
// NOTE(review): listing line 10792 (return type, function name and leading
// parameters) is absent here; the body uses `Mask`, which must be declared
// there. Restore it from the real source.
10793 const SelectionDAG &DAG) {
10794 // Create 128-bit vector type based on mask size.
10795 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10796 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10797
10798 // We can't assume a canonical shuffle mask, so try the commuted version too.
10799 SmallVector<int, 4> CommutedMask(Mask);
// NOTE(review): listing line 10800 is absent here. As shown, CommutedMask is
// an unmodified copy of Mask; presumably the missing statement commutes it.
// Confirm against the real source.
10801
10802 // Match any of unary/binary or low/high.
10803 for (unsigned i = 0; i != 4; ++i) {
10804 SmallVector<int, 16> UnpackMask;
// Bit 1 of i is passed as the Lo flag and bit 0 as the Unary flag.
10805 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10806 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10807 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10808 return true;
10809 }
10810 return false;
10811}
10812
10813/// Return true if a shuffle mask chooses elements identically in its top and
10814/// bottom halves. For example, any splat mask has the same top and bottom
10815/// halves. If an element is undefined in only one half of the mask, the halves
10816/// are not considered identical.
///
/// The comparison is a plain element-wise equality of the raw mask values, so
/// sentinel values are compared like any other entry.
// NOTE(review): listing line 10817 (return type, function name and parameter
// list, including the opening brace) is absent here; the body uses `Mask`,
// which must be declared there. Restore it from the real source.
10818 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10819 unsigned HalfSize = Mask.size() / 2;
10820 for (unsigned i = 0; i != HalfSize; ++i) {
10821 if (Mask[i] != Mask[i + HalfSize])
10822 return false;
10823 }
10824 return true;
10825}
10826
10827/// Get a 4-lane 8-bit shuffle immediate for a mask.
10828///
10829/// This helper function produces an 8-bit shuffle immediate corresponding to
10830/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10831/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10832/// example.
10833///
10834/// NB: We rely heavily on "undef" masks preserving the input lane.
10835static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10836 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10837 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10838 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10839 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10840 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10841
10842 // If the mask only uses one non-undef element, then fully 'splat' it to
10843 // improve later broadcast matching.
10844 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10845 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10846
10847 int FirstElt = Mask[FirstIndex];
10848 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10849 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10850
10851 unsigned Imm = 0;
10852 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10853 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10854 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10855 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10856 return Imm;
10857}
10858
// Wraps the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
// target constant node created at \p DL.
// NOTE(review): listing line 10859 (return type, function name and leading
// parameters) is absent here; the body uses `Mask` and `DL`, which must be
// declared there. Restore it from the real source.
10860 SelectionDAG &DAG) {
10861 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10862}
10863
10864// Canonicalize SHUFPD mask to improve chances of further folding.
10865// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10866static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10867 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10868 "Unexpected SHUFPD mask size");
10869 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10870 "Unexpected SHUFPD mask elements");
10871
10872 // If the mask only uses one non-undef element, then fully 'splat' it to
10873 // improve later broadcast matching.
10874 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10875 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10876 "All undef shuffle mask");
10877
10878 int FirstElt = Mask[FirstIndex];
10879 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10880 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10881 unsigned Imm = 0;
10882 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10883 Imm |= FirstElt << I;
10884 return Imm;
10885 }
10886
10887 // Attempt to keep any undef elements in place to improve chances of the
10888 // shuffle becoming a (commutative) blend.
10889 unsigned Imm = 0;
10890 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10891 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10892
10893 return Imm;
10894}
10895
// Wraps the immediate computed by getSHUFPDImm for \p Mask in an i8 target
// constant node created at \p DL.
// NOTE(review): listing line 10896 (return type, function name and leading
// parameters) is absent here; the body uses `Mask` and `DL`, which must be
// declared there. Restore it from the real source.
10897 SelectionDAG &DAG) {
10898 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10899}
10900
10901// The Shuffle result is as follow:
10902// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10903// Each Zeroable's element correspond to a particular Mask's element.
10904// As described in computeZeroableShuffleElements function.
10905//
10906// The function looks for a sub-mask that the nonzero elements are in
10907// increasing order. If such sub-mask exist. The function returns true.
10908static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10909 ArrayRef<int> Mask, const EVT &VectorType,
10910 bool &IsZeroSideLeft) {
10911 int NextElement = -1;
10912 // Check if the Mask's nonzero elements are in increasing order.
10913 for (int i = 0, e = Mask.size(); i < e; i++) {
10914 // Checks if the mask's zeros elements are built from only zeros.
10915 assert(Mask[i] >= -1 && "Out of bound mask element!");
10916 if (Mask[i] < 0)
10917 return false;
10918 if (Zeroable[i])
10919 continue;
10920 // Find the lowest non zero element
10921 if (NextElement < 0) {
10922 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10923 IsZeroSideLeft = NextElement != 0;
10924 }
10925 // Exit if the mask's non zero elements are not in increasing order.
10926 if (NextElement != Mask[i])
10927 return false;
10928 NextElement++;
10929 }
10930 return true;
10931}
10932
// Forward declaration, so that shuffle lowering code further down can call
// combineConcatVectorOps; \p Depth defaults to 0.
// NOTE(review): listing line 10934 (the parameters between `VT` and
// `Subtarget`) is absent from this declaration. A later call in this file
// passes an operand list and the DAG in that position; restore the line from
// the real source.
10933static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10935 const X86Subtarget &Subtarget,
10936 unsigned Depth = 0);
10937
10938/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// Builds a byte-granular control vector from \p Mask and \p Zeroable and
/// emits X86ISD::PSHUFB on whichever single input the non-zeroable elements
/// refer to. Returns an empty SDValue if both inputs are needed or if any
/// element would have to cross a 128-bit lane.
// NOTE(review): listing line 10939 (return type, function name and leading
// parameters) is absent here; the body uses `DL` and `VT`, which must be
// declared there. Restore it from the real source.
10940 ArrayRef<int> Mask, SDValue V1,
10941 SDValue V2, const APInt &Zeroable,
10942 const X86Subtarget &Subtarget,
10943 SelectionDAG &DAG) {
10944 int Size = Mask.size();
10945 int LaneSize = 128 / VT.getScalarSizeInBits();
10946 const int NumBytes = VT.getSizeInBits() / 8;
10947 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10948
// The required feature depends on the vector width: SSSE3 for 128-bit, AVX2
// for 256-bit and BWI for 512-bit.
10949 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10950 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10951 (Subtarget.hasBWI() && VT.is512BitVector()));
10952
10953 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10954 // Sign bit set in i8 mask means zero element.
10955 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10956
// V tracks the single input actually referenced; it stays null until the
// first non-undef, non-zeroable element is seen.
10957 SDValue V;
10958 for (int i = 0; i < NumBytes; ++i) {
// i is a byte index; i / NumEltBytes is the shuffle element it belongs to.
10959 int M = Mask[i / NumEltBytes];
10960 if (M < 0) {
10961 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10962 continue;
10963 }
10964 if (Zeroable[i / NumEltBytes]) {
10965 PSHUFBMask[i] = ZeroMask;
10966 continue;
10967 }
10968
10969 // We can only use a single input of V1 or V2.
10970 SDValue SrcV = (M >= Size ? V2 : V1);
10971 if (V && V != SrcV)
10972 return SDValue();
10973 V = SrcV;
10974 M %= Size;
10975
10976 // PSHUFB can't cross lanes, ensure this doesn't happen.
10977 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10978 return SDValue();
10979
// Convert the element index into a lane-relative byte index: element offset
// within the lane times the element width, plus the byte within the element.
10980 M = M % LaneSize;
10981 M = M * NumEltBytes + (i % NumEltBytes);
10982 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10983 }
10984 assert(V && "Failed to find a source input");
10985
// The shuffle is performed on the byte-vector view of V and the result is
// bitcast back to VT.
10986 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10987 return DAG.getBitcast(
10988 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10989 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10990}
10991
10992/// Return Mask with the necessary casting or extending
10993/// for \p Mask according to \p MaskVT when lowering masking intrinsics
10994static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10995 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10996 const SDLoc &dl) {
10997 MVT SrcVT = Mask.getSimpleValueType();
10998 assert(SrcVT.isScalarInteger() && "Expected scalar integer mask source!");
10999 assert(MaskVT.bitsLE(SrcVT) && "Unexpected mask size!");
11000 assert(MaskVT.getVectorElementType() == MVT::i1 && "Bool vector expected!");
11001
11002 if (isAllOnesConstant(Mask))
11003 return DAG.getConstant(1, dl, MaskVT);
11004 if (X86::isZeroNode(Mask))
11005 return DAG.getConstant(0, dl, MaskVT);
11006
11007 // Attempt to pre-truncate the mask source (to a minimum of i8).
11008 if (SrcVT.getSizeInBits() > MaskVT.getVectorNumElements()) {
11009 SrcVT = MVT::getIntegerVT(std::max((int)MaskVT.getVectorNumElements(), 8));
11010 Mask = DAG.getNode(ISD::TRUNCATE, dl, SrcVT, Mask);
11011 }
11012
11013 if (SrcVT == MVT::i64 && Subtarget.is32Bit()) {
11014 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
11015 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
11016 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
11017 SDValue Lo, Hi;
11018 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
11019 Lo = DAG.getBitcast(MVT::v32i1, Lo);
11020 Hi = DAG.getBitcast(MVT::v32i1, Hi);
11021 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
11022 }
11023
11024 MVT BitcastVT = MVT::getVectorVT(MVT::i1, SrcVT.getSizeInBits());
11025 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
11026 // are extracted by EXTRACT_SUBVECTOR.
11027 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
11028 DAG.getBitcast(BitcastVT, Mask),
11029 DAG.getVectorIdxConstant(0, dl));
11030}
11031
11032// X86 has dedicated shuffle that can be lowered to VEXPAND
// Returns an X86ISD::EXPAND node when the non-zeroable elements of the mask
// are in increasing order (see isNonZeroElementsInOrder), otherwise an empty
// SDValue.
// NOTE(review): listing line 11033 (return type, function name and leading
// parameters) is absent here; the body uses `DL`, `VT` and `V1`, which must be
// declared there. Restore it from the real source.
11034 SDValue V2, ArrayRef<int> Mask,
11035 const APInt &Zeroable,
11036 const X86Subtarget &Subtarget,
11037 SelectionDAG &DAG) {
11038 bool IsLeftZeroSide = true;
11039 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11040 IsLeftZeroSide))
11041 return SDValue();
// The expand mask has a set bit for every element that is NOT zeroable.
11042 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
// NOTE(review): listing line 11043 is absent here; the expression on the next
// line is presumably the initializer of the `IntegerType` variable used
// below. Confirm against the real source.
11044 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11045 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11046 unsigned NumElts = VT.getVectorNumElements();
11047 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11048 "Unexpected number of vector elements");
11049 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11050 Subtarget, DAG, DL);
11051 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
// IsLeftZeroSide is set by isNonZeroElementsInOrder when the ascending run
// starts at the second input's first index, in which case V2 is expanded.
11052 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11053 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11054}
11055
11056static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11057 unsigned &UnpackOpcode, bool IsUnary,
11058 ArrayRef<int> TargetMask, const SDLoc &DL,
11059 SelectionDAG &DAG,
11060 const X86Subtarget &Subtarget) {
11061 int NumElts = VT.getVectorNumElements();
11062
11063 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11064 for (int i = 0; i != NumElts; i += 2) {
11065 int M1 = TargetMask[i + 0];
11066 int M2 = TargetMask[i + 1];
11067 Undef1 &= (SM_SentinelUndef == M1);
11068 Undef2 &= (SM_SentinelUndef == M2);
11069 Zero1 &= isUndefOrZero(M1);
11070 Zero2 &= isUndefOrZero(M2);
11071 }
11072 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11073 "Zeroable shuffle detected");
11074
11075 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11076 SmallVector<int, 64> Unpckl, Unpckh;
11077 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11078 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
11079 (IsUnary ? V1 : V2))) {
11080 UnpackOpcode = X86ISD::UNPCKL;
11081 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11082 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11083 return true;
11084 }
11085
11086 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11087 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
11088 (IsUnary ? V1 : V2))) {
11089 UnpackOpcode = X86ISD::UNPCKH;
11090 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11091 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11092 return true;
11093 }
11094
11095 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11096 if (IsUnary && (Zero1 || Zero2)) {
11097 // Don't bother if we can blend instead.
11098 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11099 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11100 return false;
11101
11102 bool MatchLo = true, MatchHi = true;
11103 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11104 int M = TargetMask[i];
11105
11106 // Ignore if the input is known to be zero or the index is undef.
11107 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11108 (M == SM_SentinelUndef))
11109 continue;
11110
11111 MatchLo &= (M == Unpckl[i]);
11112 MatchHi &= (M == Unpckh[i]);
11113 }
11114
11115 if (MatchLo || MatchHi) {
11116 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11117 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11118 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11119 return true;
11120 }
11121 }
11122
11123 // If a binary shuffle, commute and try again.
11124 if (!IsUnary) {
11126 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
11127 UnpackOpcode = X86ISD::UNPCKL;
11128 std::swap(V1, V2);
11129 return true;
11130 }
11131
11133 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
11134 UnpackOpcode = X86ISD::UNPCKH;
11135 std::swap(V1, V2);
11136 return true;
11137 }
11138 }
11139
11140 return false;
11141}
11142
11143// X86 has dedicated unpack instructions that can handle specific blend
11144// operations: UNPCKH and UNPCKL.
// Returns an X86ISD::UNPCKL / X86ISD::UNPCKH node if \p Mask matches the
// binary unpack-lo or unpack-hi pattern for VT, otherwise an empty SDValue.
// NOTE(review): listing line 11145 (return type, function name and leading
// parameters) is absent here; the body uses `DL`, `VT` and `V1`, which must be
// declared there. Restore it from the real source.
11146 SDValue V2, ArrayRef<int> Mask,
11147 SelectionDAG &DAG) {
11148 SmallVector<int, 8> Unpckl;
11149 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11150 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11151 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11152
11153 SmallVector<int, 8> Unpckh;
11154 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11155 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11156 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11157
11158 // Commute and try again.
// NOTE(review): listing lines 11159 and 11163 are absent before the two
// checks below. As shown they repeat the comparisons above unchanged;
// presumably each missing statement commutes the corresponding pattern mask.
// Confirm against the real source.
11160 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
// The operands are emitted in swapped order (V2, V1) for the commuted match.
11161 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11162
11164 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11165 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11166
11167 return SDValue();
11168}
11169
11170/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11171/// followed by unpack 256-bit.
///
/// Returns the unpack node on a match, otherwise an empty SDValue. Only V1 is
/// used to build the result: it is permuted and then unpacked with itself.
// NOTE(review): listing line 11172 (return type, function name and leading
// parameters) is absent here; the body uses `DL`, `VT` and `V1`, which must be
// declared there. Restore it from the real source.
11173 SDValue V2, ArrayRef<int> Mask,
11174 SelectionDAG &DAG) {
11175 SmallVector<int, 32> Unpckl, Unpckh;
11176 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11177 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11178
11179 unsigned UnpackOpcode;
11180 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11181 UnpackOpcode = X86ISD::UNPCKL;
11182 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11183 UnpackOpcode = X86ISD::UNPCKH;
11184 else
11185 return SDValue();
11186
11187 // This is a "natural" unpack operation (rather than the 128-bit sectored
11188 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11189 // input in order to use the x86 instruction.
// The rearrangement is done as a v4f64 shuffle with mask {0, 2, 1, 3}, i.e.
// the two middle 64-bit chunks are swapped, then cast back to VT.
11190 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11191 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11192 V1 = DAG.getBitcast(VT, V1);
11193 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11194}
11195
11196// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11197// source into the lower elements and zeroing the upper elements.
11198static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11199 ArrayRef<int> Mask, const APInt &Zeroable,
11200 const X86Subtarget &Subtarget) {
11201 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11202 return false;
11203
11204 unsigned NumElts = Mask.size();
11205 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11206 unsigned MaxScale = 64 / EltSizeInBits;
11207
11208 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11209 unsigned SrcEltBits = EltSizeInBits * Scale;
11210 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11211 continue;
11212 unsigned NumSrcElts = NumElts / Scale;
11213 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11214 continue;
11215 unsigned UpperElts = NumElts - NumSrcElts;
11216 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11217 continue;
11218 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11219 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11220 DstVT = MVT::getIntegerVT(EltSizeInBits);
11221 if ((NumSrcElts * EltSizeInBits) >= 128) {
11222 // ISD::TRUNCATE
11223 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11224 } else {
11225 // X86ISD::VTRUNC
11226 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11227 }
11228 return true;
11229 }
11230
11231 return false;
11232}
11233
11234// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11235// element padding to the final DstVT.
11236static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11237 const X86Subtarget &Subtarget,
11238 SelectionDAG &DAG, bool ZeroUppers) {
11239 MVT SrcVT = Src.getSimpleValueType();
11240 MVT DstSVT = DstVT.getScalarType();
11241 unsigned NumDstElts = DstVT.getVectorNumElements();
11242 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11243 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11244
11245 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11246 return SDValue();
11247
11248 // Perform a direct ISD::TRUNCATE if possible.
11249 if (NumSrcElts == NumDstElts)
11250 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11251
11252 if (NumSrcElts > NumDstElts) {
11253 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11254 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11255 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11256 }
11257
11258 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11259 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11260 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11261 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11262 DstVT.getSizeInBits());
11263 }
11264
11265 // Non-VLX targets must truncate from a 512-bit type, so we need to
11266 // widen, truncate and then possibly extract the original subvector.
11267 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11268 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11269 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11270 }
11271
11272 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11273 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11274 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11275 if (DstVT != TruncVT)
11276 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11277 DstVT.getSizeInBits());
11278 return Trunc;
11279}
11280
11281// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11282//
11283// An example is the following:
11284//
11285// t0: ch = EntryToken
11286// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11287// t25: v4i32 = truncate t2
11288// t41: v8i16 = bitcast t25
11289// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11290// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11291// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11292// t18: v2i64 = bitcast t51
11293//
11294// One can just use a single vpmovdw instruction, without avx512vl we need to
11295// use the zmm variant and extract the lower subvector, padding with zeroes.
11296// TODO: Merge with lowerShuffleAsVTRUNC.
// Only MVT::v16i8 and MVT::v8i16 are handled (asserted below); without AVX512
// an empty SDValue is returned.
// NOTE(review): listing line 11297 (return type, function name and leading
// parameters) is absent here; the body uses `DL`, `VT` and `V1`, which must be
// declared there. Restore it from the real source.
11298 SDValue V2, ArrayRef<int> Mask,
11299 const APInt &Zeroable,
11300 const X86Subtarget &Subtarget,
11301 SelectionDAG &DAG) {
11302 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11303 if (!Subtarget.hasAVX512())
11304 return SDValue();
11305
11306 unsigned NumElts = VT.getVectorNumElements();
11307 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11308 unsigned MaxScale = 64 / EltSizeInBits;
// Scale doubles each iteration (2, 4, ...); the first scale whose pattern
// matches decides the outcome, since every path after the match returns.
11309 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11310 unsigned SrcEltBits = EltSizeInBits * Scale;
11311 unsigned NumSrcElts = NumElts / Scale;
11312 unsigned UpperElts = NumElts - NumSrcElts;
// Require every Scale-th element in the low part and zeroable upper elements.
11313 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11314 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11315 continue;
11316
11317 // Attempt to find a matching source truncation, but as a fall back VLX
11318 // cases can use the VPMOV directly.
11319 SDValue Src = peekThroughBitcasts(V1);
11320 if (Src.getOpcode() == ISD::TRUNCATE &&
11321 Src.getScalarValueSizeInBits() == SrcEltBits) {
// Fold the existing truncate: truncate straight from its wider operand.
11322 Src = Src.getOperand(0);
11323 } else if (Subtarget.hasVLX()) {
11324 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11325 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11326 Src = DAG.getBitcast(SrcVT, Src);
11327 // Don't do this if PACKSS/PACKUS could perform it cheaper.
11328 if (Scale == 2 &&
11329 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
11330 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
11331 return SDValue();
11332 } else
11333 return SDValue();
11334
11335 // VPMOVWB is only available with avx512bw.
11336 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
11337 return SDValue();
11338
// Upper elements only need explicit zeroing when the mask does not leave
// all of them undef.
11339 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11340 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11341 }
11342
11343 return SDValue();
11344}
11345
11346// Attempt to match binary shuffle patterns as a truncate.
// Both inputs are concatenated, optionally shifted right so that the wanted
// sub-element lands in the low bits, and then truncated via
// getAVX512TruncNode. Returns an empty SDValue when no pattern matches.
// NOTE(review): listing line 11347 (return type, function name and leading
// parameters) is absent here; the body uses `DL`, `VT` and `V1`, which must be
// declared there. Restore it from the real source.
11348 SDValue V2, ArrayRef<int> Mask,
11349 const APInt &Zeroable,
11350 const X86Subtarget &Subtarget,
11351 SelectionDAG &DAG) {
11352 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11353 "Unexpected VTRUNC type");
// Requires AVX512; 256-bit types additionally require
// Subtarget.useAVX512Regs().
11354 if (!Subtarget.hasAVX512() ||
11355 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
11356 return SDValue();
11357
11358 unsigned NumElts = VT.getVectorNumElements();
11359 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11360 unsigned MaxScale = 64 / EltSizeInBits;
// Scale doubles each iteration (2, 4, ...).
11361 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11362 // TODO: Support non-BWI VPMOVWB truncations?
11363 unsigned SrcEltBits = EltSizeInBits * Scale;
11364 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11365 continue;
11366
11367 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
11368 // Bail if the V2 elements are undef.
11369 unsigned NumHalfSrcElts = NumElts / Scale;
11370 unsigned NumSrcElts = 2 * NumHalfSrcElts;
// Offset selects which of the Scale narrow sub-elements of each wide source
// element is kept.
11371 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
11372 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
11373 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11374 continue;
11375
11376 // The elements beyond the truncation must be undef/zero.
11377 unsigned UpperElts = NumElts - NumSrcElts;
11378 if (UpperElts > 0 &&
11379 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11380 continue;
11381 bool UndefUppers =
11382 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11383
11384 // As we're using both sources then we need to concat them together
11385 // and truncate from the double-sized src.
11386 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
11387
11388 // For offset truncations, ensure that the concat is cheap.
// A plain CONCAT_VECTORS is only created as a fallback for Offset == 0; for a
// non-zero Offset the candidate is skipped unless combineConcatVectorOps
// produced a value.
11389 SDValue Src =
11390 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
11391 if (!Src) {
11392 if (Offset)
11393 continue;
11394 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11395 }
11396
11397 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11398 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
// Shift each wide element right by Offset narrow elements so the selected
// sub-element ends up in the bits that survive the truncation.
11399 Src = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, SrcVT, Src,
11400 Offset * EltSizeInBits, DAG);
11401 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11402 }
11403 }
11404
11405 return SDValue();
11406}
11407
11408/// Check whether a compaction lowering can be done by dropping even/odd
11409/// elements and compute how many times even/odd elements must be dropped.
11410///
11411/// This handles shuffles which take every Nth element where N is a power of
11412/// two. Example shuffle masks:
11413///
11414/// (even)
11415/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11416/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11417/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11418/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11419/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11420/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11421///
11422/// (odd)
11423/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
11424/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
11425///
11426/// Any of these lanes can of course be undef.
11427///
11428/// This routine only supports N <= 3.
11429/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11430/// for larger N.
11431///
11432/// \returns N above, or the number of times even/odd elements must be dropped
11433/// if there is such a number. Otherwise returns zero.
11434static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
11435 bool IsSingleInput) {
11436 // The modulus for the shuffle vector entries is based on whether this is
11437 // a single input or not.
11438 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11439 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11440 "We should only be called with masks with a power-of-2 size!");
11441
11442 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11443 int Offset = MatchEven ? 0 : 1;
11444
11445 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11446 // and 2^3 simultaneously. This is because we may have ambiguity with
11447 // partially undef inputs.
11448 bool ViableForN[3] = {true, true, true};
11449
11450 for (int i = 0, e = Mask.size(); i < e; ++i) {
11451 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11452 // want.
11453 if (Mask[i] < 0)
11454 continue;
11455
11456 bool IsAnyViable = false;
11457 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11458 if (ViableForN[j]) {
11459 uint64_t N = j + 1;
11460
11461 // The shuffle mask must be equal to (i * 2^N) % M.
11462 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
11463 IsAnyViable = true;
11464 else
11465 ViableForN[j] = false;
11466 }
11467 // Early exit if we exhaust the possible powers of two.
11468 if (!IsAnyViable)
11469 break;
11470 }
11471
11472 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11473 if (ViableForN[j])
11474 return j + 1;
11475
11476 // Return 0 as there is no viable power of two.
11477 return 0;
11478}
11479
11480// X86 has dedicated pack instructions that can handle specific truncation
11481// operations: PACKSS and PACKUS.
11482// Checks for compaction shuffle masks if MaxStages > 1.
11483// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11484static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11485 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11486 const SelectionDAG &DAG,
11487 const X86Subtarget &Subtarget,
11488 unsigned MaxStages = 1) {
11489 unsigned NumElts = VT.getVectorNumElements();
11490 unsigned BitSize = VT.getScalarSizeInBits();
11491 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11492 "Illegal maximum compaction");
11493
11494 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11495 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11496 unsigned NumPackedBits = NumSrcBits - BitSize;
11497 N1 = peekThroughBitcasts(N1);
11498 N2 = peekThroughBitcasts(N2);
11499 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11500 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11501 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11502 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11503 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11504 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11505 return false;
11506 if (Subtarget.hasSSE41() || BitSize == 8) {
11507 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11508 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11509 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11510 V1 = N1;
11511 V2 = N2;
11512 SrcVT = PackVT;
11513 PackOpcode = X86ISD::PACKUS;
11514 return true;
11515 }
11516 }
11517 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11518 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11519 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11520 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11521 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11522 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11523 V1 = N1;
11524 V2 = N2;
11525 SrcVT = PackVT;
11526 PackOpcode = X86ISD::PACKSS;
11527 return true;
11528 }
11529 return false;
11530 };
11531
11532 // Attempt to match against wider and wider compaction patterns.
11533 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11534 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11535 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11536
11537 // Try binary shuffle.
11538 SmallVector<int, 32> BinaryMask;
11539 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11540 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
11541 if (MatchPACK(V1, V2, PackVT))
11542 return true;
11543
11544 // Try unary shuffle.
11545 SmallVector<int, 32> UnaryMask;
11546 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11547 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
11548 if (MatchPACK(V1, V1, PackVT))
11549 return true;
11550 }
11551
11552 return false;
11553}
11554
// NOTE(review): the opening line of this definition (return type, function
// name and leading parameters) is missing from this listing; DL, VT and V1,
// which the body reads, are presumably declared there - confirm against the
// real file.
//
// Emits a chain of pack nodes for Mask when matchShuffleWithPACK accepts it.
// Returns an empty SDValue if the mask does not match or if the AVX512
// bail-out below applies.
11556 SDValue V2, ArrayRef<int> Mask,
11557 const X86Subtarget &Subtarget,
11558 SelectionDAG &DAG) {
11559 MVT PackVT;
11560 unsigned PackOpcode;
11561 unsigned SizeBits = VT.getSizeInBits();
11562 unsigned EltBits = VT.getScalarSizeInBits();
// Allow as many halving stages as it takes to go from 64-bit elements down to
// EltBits.
11563 unsigned MaxStages = Log2_32(64 / EltBits);
11564 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11565 Subtarget, MaxStages))
11566 return SDValue();
11567
// Number of halvings needed to go from the matched source element width to
// EltBits.
11568 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11569 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11570
11571 // Don't lower multi-stage packs on AVX512, truncation is better.
11572 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11573 return SDValue();
11574
11575 // Pack to the largest type possible:
11576 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11577 unsigned MaxPackBits = 16;
11578 if (CurrentEltBits > 16 &&
11579 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11580 MaxPackBits = 32;
11581
11582 // Repeatedly pack down to the target size.
11583 SDValue Res;
11584 for (unsigned i = 0; i != NumStages; ++i) {
11585 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11586 unsigned NumSrcElts = SizeBits / SrcEltBits;
11587 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11588 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11589 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11590 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11591 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11592 DAG.getBitcast(SrcVT, V2));
// Any further stage packs the previous result with itself.
11593 V1 = V2 = Res;
11594 CurrentEltBits /= 2;
11595 }
11596 assert(Res && Res.getValueType() == VT &&
11597 "Failed to lower compaction shuffle");
11598 return Res;
11599}
11600
11601 /// Try to emit a bitmask instruction for a shuffle.
11602 ///
11603 /// This handles cases where we can model a blend exactly as a bitmask due to
11604 /// one of the inputs being zeroable.
///
/// Returns an AND of the single surviving input with a constant vector that is
/// all-ones in the lanes that are kept and zero in the zeroable lanes, or an
/// empty SDValue if the shuffle is not of that form.
// NOTE(review): the opening line of this definition (return type, name and
// leading parameters) is missing from this listing; DL, VT and V1 are
// presumably declared there - confirm against the real file.
11606 SDValue V2, ArrayRef<int> Mask,
11607 const APInt &Zeroable,
11608 const X86Subtarget &Subtarget,
11609 SelectionDAG &DAG) {
11610 MVT MaskVT = VT;
11611 MVT EltVT = VT.getVectorElementType();
11612 SDValue Zero, AllOnes;
11613 // Use f64 if i64 isn't legal.
11614 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11615 EltVT = MVT::f64;
11616 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11617 }
11618
// For FP element types the mask constants are built as FP values and the AND
// itself is done in the equivalent integer vector type (LogicVT).
11619 MVT LogicVT = VT;
11620 if (EltVT.isFloatingPoint()) {
11621 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11622 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11623 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11624 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11625 } else {
11626 Zero = DAG.getConstant(0, DL, EltVT);
11627 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11628 }
11629
// Start from an all-zero mask; every lane that keeps its input is switched to
// all-ones in the loop below.
11630 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11631 SDValue V;
11632 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11633 if (Zeroable[i])
11634 continue;
// A non-zeroable lane must read element i of one of the two inputs.
11635 if (Mask[i] % Size != i)
11636 return SDValue(); // Not a blend.
11637 if (!V)
11638 V = Mask[i] < Size ? V1 : V2;
11639 else if (V != (Mask[i] < Size ? V1 : V2))
11640 return SDValue(); // Can only let one input through the mask.
11641
11642 VMaskOps[i] = AllOnes;
11643 }
11644 if (!V)
11645 return SDValue(); // No non-zeroable elements!
11646
// Build the constant in MaskVT, perform the AND in LogicVT, then cast the
// result back to the shuffle's type.
11647 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11648 VMask = DAG.getBitcast(LogicVT, VMask);
11649 V = DAG.getBitcast(LogicVT, V);
11650 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11651 return DAG.getBitcast(VT, And);
11652}
11653
11654 /// Try to emit a blend instruction for a shuffle using bit math.
11655 ///
11656 /// This is used as a fallback approach when first class blend instructions are
11657 /// unavailable. Currently it is only suitable for integer vectors, but could
11658 /// be generalized for floating point vectors if desirable.
///
/// Returns an empty SDValue if any defined mask element moves data to a
/// different lane position (i.e. the mask is not a pure blend).
// NOTE(review): two lines of this definition are missing from this listing:
// the opening signature line (DL, VT and V1 are presumably declared there) and
// the declaration of MaskOps, which the loop below appends to. Confirm both
// against the real file.
11660 SDValue V2, ArrayRef<int> Mask,
11661 SelectionDAG &DAG) {
11662 assert(VT.isInteger() && "Only supports integer vector types!");
11663 MVT EltVT = VT.getVectorElementType();
11664 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11665 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11667 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11668 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11669 return SDValue(); // Shuffled input!
// All-ones where Mask[i] < Size (which includes undef lanes), zero otherwise.
11670 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11671 }
11672
11673 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11674 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11675}
11676
// NOTE(review): tail of a forward declaration whose first line (return type,
// name and leading parameters) is missing from this listing. It is presumably
// the declaration of getVectorMaskingNode, which is called further down with
// five arguments ending in (..., Subtarget, DAG) - confirm against the real
// file.
11678 SDValue PreservedSrc,
11679 const X86Subtarget &Subtarget,
11680 SelectionDAG &DAG);
11681
// NOTE(review): several lines of this definition are missing from this
// listing: the opening signature lines (VT, V1, V2 and a writable Mask are
// presumably declared there) and the right-hand sides of the V1IsZeroOrUndef /
// V2IsZeroOrUndef initialisers. Confirm against the real file.
//
// Tries to describe the shuffle as a per-element blend of V1 and V2.
//  - BlendMask receives one bit per element; a set bit means the element is
//    taken from V2.
//  - Mask is rewritten in place so every matched element is either Elt (V1) or
//    Elt + NumElts (V2).
//  - ForceV1Zero / ForceV2Zero are set when a zeroable element was satisfied
//    by treating that input as zero.
// Returns false as soon as an element cannot be expressed this way.
11684 const APInt &Zeroable, bool &ForceV1Zero,
11685 bool &ForceV2Zero, uint64_t &BlendMask) {
11686 bool V1IsZeroOrUndef =
11688 bool V2IsZeroOrUndef =
11690
11691 BlendMask = 0;
11692 ForceV1Zero = false, ForceV2Zero = false;
11693 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11694
11695 int NumElts = Mask.size();
11696 int NumLanes = VT.getSizeInBits() / 128;
11697 int NumEltsPerLane = NumElts / NumLanes;
11698 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11699
11700 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11701 // then ensure the blend mask part for that lane just references that input.
11702 bool ForceWholeLaneMasks =
11703 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11704
11705 // Attempt to generate the binary blend mask. If an input is zero then
11706 // we can use any lane.
11707 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11708 // Keep track of the inputs used per lane.
11709 bool LaneV1InUse = false;
11710 bool LaneV2InUse = false;
11711 uint64_t LaneBlendMask = 0;
11712 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11713 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11714 int M = Mask[Elt];
11715 if (M == SM_SentinelUndef)
11716 continue;
// Element comes from V1, either in place or from a V1 element that
// IsElementEquivalent reports as interchangeable with it.
11717 if (M == Elt || (0 <= M && M < NumElts &&
11718 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11719 Mask[Elt] = Elt;
11720 LaneV1InUse = true;
11721 continue;
11722 }
// Same test for V2; these elements set the corresponding blend bit.
11723 if (M == (Elt + NumElts) ||
11724 (NumElts <= M &&
11725 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11726 LaneBlendMask |= 1ull << LaneElt;
11727 Mask[Elt] = Elt + NumElts;
11728 LaneV2InUse = true;
11729 continue;
11730 }
// Otherwise the element is only usable if it is zeroable and one of the
// inputs can stand in as a zero vector; V1 is preferred over V2.
11731 if (Zeroable[Elt]) {
11732 if (V1IsZeroOrUndef) {
11733 ForceV1Zero = true;
11734 Mask[Elt] = Elt;
11735 LaneV1InUse = true;
11736 continue;
11737 }
11738 if (V2IsZeroOrUndef) {
11739 ForceV2Zero = true;
11740 LaneBlendMask |= 1ull << LaneElt;
11741 Mask[Elt] = Elt + NumElts;
11742 LaneV2InUse = true;
11743 continue;
11744 }
11745 }
11746 return false;
11747 }
11748
11749 // If we only used V2 then splat the lane blend mask to avoid any demanded
11750 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11751 // blend mask bit).
11752 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11753 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11754
// Place this lane's bits at the lane's position in the full mask.
11755 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11756 }
11757 return true;
11758}
11759
11760 /// Try to emit a blend instruction for a shuffle.
11761 ///
11762 /// This doesn't do any checks for the availability of instructions for blending
11763 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11764 /// be matched in the backend with the type given. What it does check for is
11765 /// that the shuffle mask is a blend, or convertible into a blend with zero.
///
/// Returns an empty SDValue if matchShuffleAsBlend rejects the mask.
// NOTE(review): two lines of this definition are missing from this listing:
// the opening signature line (DL, VT and V1 are presumably declared there) and
// one statement inside the load-folding commute block near the end of the
// v16i8 case. Confirm both against the real file.
11767 SDValue V2, ArrayRef<int> Original,
11768 const APInt &Zeroable,
11769 const X86Subtarget &Subtarget,
11770 SelectionDAG &DAG) {
11771 uint64_t BlendMask = 0;
11772 bool ForceV1Zero = false, ForceV2Zero = false;
// Work on a private copy because matchShuffleAsBlend receives Mask and the
// caller's mask (Original) is read-only here.
11773 SmallVector<int, 64> Mask(Original);
11774 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11775 BlendMask))
11776 return SDValue();
11777
11778 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11779 if (ForceV1Zero)
11780 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11781 if (ForceV2Zero)
11782 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11783
11784 unsigned NumElts = VT.getVectorNumElements();
11785
// Dispatch on the vector type. The first group of cases falls through a chain
// of feature asserts and ends in a single immediate blend (X86ISD::BLENDI).
11786 switch (VT.SimpleTy) {
11787 case MVT::v4i64:
11788 case MVT::v8i32:
11789 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11790 [[fallthrough]];
11791 case MVT::v4f64:
11792 case MVT::v8f32:
11793 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11794 [[fallthrough]];
11795 case MVT::v2f64:
11796 case MVT::v2i64:
11797 case MVT::v4f32:
11798 case MVT::v4i32:
11799 case MVT::v8i16:
11800 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11801 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11802 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11803 case MVT::v16i16: {
11804 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11805 SmallVector<int, 8> RepeatedMask;
11806 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11807 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11808 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the per-lane repeated mask.
11809 BlendMask = 0;
11810 for (int i = 0; i < 8; ++i)
11811 if (RepeatedMask[i] >= 8)
11812 BlendMask |= 1ull << i;
11813 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11814 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11815 }
11816 // Use PBLENDW for lower/upper lanes and then blend lanes.
11817 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11818 // merge to VSELECT where useful.
11819 uint64_t LoMask = BlendMask & 0xFF;
11820 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
// Only taken when at least one half uses a single input for all its elements
// (an all-zeros or all-ones 8-bit mask).
11821 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11822 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11823 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11824 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11825 DAG.getTargetConstant(HiMask, DL, MVT::i8));
// Take elements 0-7 from Lo and elements 8-15 from Hi.
11826 return DAG.getVectorShuffle(
11827 MVT::v16i16, DL, Lo, Hi,
11828 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11829 }
11830 [[fallthrough]];
11831 }
11832 case MVT::v32i8:
11833 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11834 [[fallthrough]];
11835 case MVT::v16i8: {
11836 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11837
11838 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11839 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11840 Subtarget, DAG))
11841 return Masked;
11842
// With BWI and VLX, use a masking node driven by an integer constant that
// holds the blend mask (at least 8 bits wide).
11843 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11844 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11845 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11846 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11847 }
11848
11849 // If we have VPTERNLOG, we can use that as a bit blend.
11850 if (Subtarget.hasVLX())
11851 if (SDValue BitBlend =
11852 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11853 return BitBlend;
11854
11855 // Scale the blend by the number of bytes per element.
11856 int Scale = VT.getScalarSizeInBits() / 8;
11857
11858 // This form of blend is always done on bytes. Compute the byte vector
11859 // type.
11860 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11861
11862 // x86 allows load folding with blendvb from the 2nd source operand. But
11863 // we are still using LLVM select here (see comment below), so that's V1.
11864 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11865 // allow that load-folding possibility.
11866 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11868 std::swap(V1, V2);
11869 }
11870
11871 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11872 // mix of LLVM's code generator and the x86 backend. We tell the code
11873 // generator that boolean values in the elements of an x86 vector register
11874 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11875 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11876 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11877 // of the element (the remaining are ignored) and 0 in that high bit would
11878 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11879 // the LLVM model for boolean values in vector elements gets the relevant
11880 // bit set, it is set backwards and over constrained relative to x86's
11881 // actual model.
// One i8 select value is emitted per byte, so each mask element contributes
// Scale identical entries (undef for undef mask elements).
11882 SmallVector<SDValue, 32> VSELECTMask;
11883 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11884 for (int j = 0; j < Scale; ++j)
11885 VSELECTMask.push_back(
11886 Mask[i] < 0
11887 ? DAG.getUNDEF(MVT::i8)
11888 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11889
11890 V1 = DAG.getBitcast(BlendVT, V1);
11891 V2 = DAG.getBitcast(BlendVT, V2);
11892 return DAG.getBitcast(
11893 VT,
11894 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11895 V1, V2));
11896 }
11897 case MVT::v16f32:
11898 case MVT::v8f64:
11899 case MVT::v8i64:
11900 case MVT::v16i32:
11901 case MVT::v32i16:
11902 case MVT::v64i8: {
11903 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11904 bool OptForSize = DAG.shouldOptForSize();
11905 if (!OptForSize) {
11906 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11907 Subtarget, DAG))
11908 return Masked;
11909 }
11910
11911 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11912 // masked move.
11913 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11914 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11915 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11916 }
11917 default:
11918 llvm_unreachable("Not a supported integer vector type!");
11919 }
11920}
11921
11922 /// Try to lower as a blend of elements from two inputs followed by
11923 /// a single-input permutation.
11924 ///
11925 /// This matches the pattern where we can blend elements from two inputs and
11926 /// then reduce the shuffle to a single-input permutation.
///
/// Returns an empty SDValue when two mask elements need the same lane position
/// from different inputs, or when ImmBlends is set, the elements are 8 bits
/// wide and the blend mask cannot be widened.
// NOTE(review): the opening line of this definition (return type, name and
// leading parameters) is missing from this listing; DL and VT are presumably
// declared there - confirm against the real file.
11928 SDValue V1, SDValue V2,
11929 ArrayRef<int> Mask,
11930 SelectionDAG &DAG,
11931 bool ImmBlends = false) {
11932 // We build up the blend mask while checking whether a blend is a viable way
11933 // to reduce the shuffle.
11934 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11935 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11936
11937 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11938 if (Mask[i] < 0)
11939 continue;
11940
11941 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11942
// Slot (Mask[i] % Size) of the blend holds the wanted element; it can only be
// claimed by one of the two inputs.
11943 if (BlendMask[Mask[i] % Size] < 0)
11944 BlendMask[Mask[i] % Size] = Mask[i];
11945 else if (BlendMask[Mask[i] % Size] != Mask[i])
11946 return SDValue(); // Can't blend in the needed input!
11947
// The permute then moves that blended slot to output position i.
11948 PermuteMask[i] = Mask[i] % Size;
11949 }
11950
11951 // If only immediate blends, then bail if the blend mask can't be widened to
11952 // i16.
11953 unsigned EltSize = VT.getScalarSizeInBits();
11954 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11955 return SDValue();
11956
11957 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11958 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11959}
11960
11961 /// Try to lower as an unpack of elements from two inputs followed by
11962 /// a single-input permutation.
11963 ///
11964 /// This matches the pattern where we can unpack elements from two inputs and
11965 /// then reduce the shuffle to a single-input (wider) permutation.
///
/// Returns an empty SDValue if the even/odd output positions cannot each be
/// tied to one input, or if the mask does not consistently use only the low or
/// only the high half of each 128-bit lane.
// NOTE(review): the opening line of this definition (return type, name and
// leading parameters) is missing from this listing; DL and VT are presumably
// declared there - confirm against the real file.
11967 SDValue V1, SDValue V2,
11968 ArrayRef<int> Mask,
11969 SelectionDAG &DAG) {
11970 int NumElts = Mask.size();
11971 int NumLanes = VT.getSizeInBits() / 128;
11972 int NumLaneElts = NumElts / NumLanes;
11973 int NumHalfLaneElts = NumLaneElts / 2;
11974
11975 bool MatchLo = true, MatchHi = true;
// Ops[0] feeds the even output positions and Ops[1] the odd ones (see the
// "Elt & 1" indexing below); both start out undef.
11976 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11977
11978 // Determine UNPCKL/UNPCKH type and operand order.
11979 for (int Elt = 0; Elt != NumElts; ++Elt) {
11980 int M = Mask[Elt];
11981 if (M < 0)
11982 continue;
11983
11984 // Normalize the mask value depending on whether it's V1 or V2.
11985 int NormM = M;
11986 SDValue &Op = Ops[Elt & 1];
11987 if (M < NumElts && (Op.isUndef() || Op == V1))
11988 Op = V1;
11989 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11990 Op = V2;
11991 NormM -= NumElts;
11992 } else
11993 return SDValue();
11994
// Find out whether the element sits in the low or the high half of its
// 128-bit lane; scanning stops at the first lane that contains it.
11995 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11996 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11997 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11998 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11999 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
12000 if (MatchLoAnyLane || MatchHiAnyLane) {
12001 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
12002 "Failed to match UNPCKLO/UNPCKHI");
12003 break;
12004 }
12005 }
12006 MatchLo &= MatchLoAnyLane;
12007 MatchHi &= MatchHiAnyLane;
12008 if (!MatchLo && !MatchHi)
12009 return SDValue();
12010 }
12011 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12012
12013 // Element indices have changed after unpacking. Calculate permute mask
12014 // so that they will be put back to the position as dictated by the
12015 // original shuffle mask indices.
12016 SmallVector<int, 32> PermuteMask(NumElts, -1);
12017 for (int Elt = 0; Elt != NumElts; ++Elt) {
12018 int M = Mask[Elt];
12019 if (M < 0)
12020 continue;
12021 int NormM = M;
12022 if (NumElts <= M)
12023 NormM -= NumElts;
12024 bool IsFirstOp = M < NumElts;
// After the unpack, the element at in-half offset k of a lane lands at
// position 2*k (first operand) or 2*k + 1 (second operand) of that lane.
12025 int BaseMaskElt =
12026 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
12027 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
12028 PermuteMask[Elt] = BaseMaskElt;
12029 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
12030 PermuteMask[Elt] = BaseMaskElt + 1;
12031 assert(PermuteMask[Elt] != -1 &&
12032 "Input mask element is defined but failed to assign permute mask");
12033 }
12034
12035 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12036 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12037 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12038}
12039
12040 /// Try to lower a shuffle as a permute of the inputs followed by an
12041 /// UNPCK instruction.
12042 ///
12043 /// This specifically targets cases where we end up with alternating between
12044 /// the two inputs, and so can permute them into something that feeds a single
12045 /// UNPCK instruction. Note that this routine only targets integer vectors
12046 /// because for floating point vectors we have a generalized SHUFPS lowering
12047 /// strategy that handles everything that doesn't *exactly* match an unpack,
12048 /// making this clever lowering unnecessary.
// NOTE(review): lines of this definition are missing from this listing: the
// opening signature line (DL and VT are presumably declared there) and the
// `if (...)` condition that guards the early `return SDValue();` following
// the "shuffling with a zero vector" comment. Confirm against the real file.
12050 SDValue V1, SDValue V2,
12051 ArrayRef<int> Mask,
12052 const X86Subtarget &Subtarget,
12053 SelectionDAG &DAG) {
12054 int Size = Mask.size();
12055 assert(Mask.size() >= 2 && "Single element masks are invalid.");
12056
12057 // This routine only supports 128-bit integer dual input vectors.
12058 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
12059 return SDValue();
12060
// Count how many mask elements read the low / high half of their input. Undef
// (negative) entries are counted in neither: the first lambda tests M >= 0,
// and a negative M % Size never reaches Size / 2 in the second.
12061 int NumLoInputs =
12062 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
12063 int NumHiInputs =
12064 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
12065
// Unpack whichever half is referenced more often (ties pick the low half).
12066 bool UnpackLo = NumLoInputs >= NumHiInputs;
12067
// Attempts the lowering with unpack elements of ScalarSize bits, each covering
// Scale elements of the original mask. Returns an empty SDValue on failure.
12068 auto TryUnpack = [&](int ScalarSize, int Scale) {
12069 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
12070 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
12071
12072 for (int i = 0; i < Size; ++i) {
12073 if (Mask[i] < 0)
12074 continue;
12075
12076 // Each element of the unpack contains Scale elements from this mask.
12077 int UnpackIdx = i / Scale;
12078
12079 // We only handle the case where V1 feeds the first slots of the unpack.
12080 // We rely on canonicalization to ensure this is the case.
12081 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
12082 return SDValue();
12083
12084 // Setup the mask for this input. The indexing is tricky as we have to
12085 // handle the unpack stride.
12086 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
12087 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
12088 Mask[i] % Size;
12089 }
12090
12091 // If we will have to shuffle both inputs to use the unpack, check whether
12092 // we can just unpack first and shuffle the result. If so, skip this unpack.
12093 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
12094 !isNoopShuffleMask(V2Mask))
12095 return SDValue();
12096
12097 // Shuffle the inputs into place.
12098 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12099 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12100
12101 // Cast the inputs to the type we will use to unpack them.
12102 MVT UnpackVT =
12103 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
12104 V1 = DAG.getBitcast(UnpackVT, V1);
12105 V2 = DAG.getBitcast(UnpackVT, V2);
12106
12107 // Unpack the inputs and cast the result back to the desired type.
12108 return DAG.getBitcast(
12109 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12110 UnpackVT, V1, V2));
12111 };
12112
12113 // We try each unpack from the largest to the smallest to try and find one
12114 // that fits this mask.
12115 int OrigScalarSize = VT.getScalarSizeInBits();
12116 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12117 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12118 return Unpack;
12119
12120 // If we're shuffling with a zero vector then we're better off not doing
12121 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
12124 return SDValue();
12125
12126 // If none of the unpack-rooted lowerings worked (or were profitable) try an
12127 // initial unpack.
12128 if (NumLoInputs == 0 || NumHiInputs == 0) {
12129 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12130 "We have to have *some* inputs!");
12131 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
12132
12133 // FIXME: We could consider the total complexity of the permute of each
12134 // possible unpacking. Or at the least we should consider how many
12135 // half-crossings are created.
12136 // FIXME: We could consider commuting the unpacks.
12137
12138 SmallVector<int, 32> PermMask((unsigned)Size, -1);
12139 for (int i = 0; i < Size; ++i) {
12140 if (Mask[i] < 0)
12141 continue;
12142
12143 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
12144
// In the unpacked vector, element k of the chosen half sits at 2*k when it
// came from V1 and at 2*k + 1 when it came from V2.
12145 PermMask[i] =
12146 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
12147 }
12148 return DAG.getVectorShuffle(
12149 VT, DL,
12150 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
12151 V1, V2),
12152 DAG.getUNDEF(VT), PermMask);
12153 }
12154
12155 return SDValue();
12156}
12157
12158 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12159 /// permuting the elements of the result in place.
///
/// Returns an empty SDValue if the required subtarget feature is missing, the
/// mask crosses 128-bit lanes, only one input is referenced, or the per-lane
/// index ranges used from the two inputs overlap.
// NOTE(review): the opening line of this definition (return type and name) is
// missing from this listing - confirm against the real file.
12161 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12162 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12163 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12164 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12165 (VT.is512BitVector() && !Subtarget.hasBWI()))
12166 return SDValue();
12167
12168 // We don't currently support lane crossing permutes.
12169 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12170 return SDValue();
12171
// Scale is the element size in bytes; it converts an element rotation amount
// into the byte immediate of the PALIGNR node built below.
12172 int Scale = VT.getScalarSizeInBits() / 8;
12173 int NumLanes = VT.getSizeInBits() / 128;
12174 int NumElts = VT.getVectorNumElements();
12175 int NumEltsPerLane = NumElts / NumLanes;
12176
12177 // Determine range of mask elts.
// Blend1/Blend2 stay true while every element taken from V1/V2 is already in
// its final position. Range1/Range2 hold the min/max in-lane index referenced
// from V1/V2; the INT_MAX/INT_MIN start values mean "input not referenced".
12178 bool Blend1 = true;
12179 bool Blend2 = true;
12180 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12181 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12182 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12183 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12184 int M = Mask[Lane + Elt];
12185 if (M < 0)
12186 continue;
12187 if (M < NumElts) {
12188 Blend1 &= (M == (Lane + Elt));
12189 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12190 M = M % NumEltsPerLane;
12191 Range1.first = std::min(Range1.first, M);
12192 Range1.second = std::max(Range1.second, M);
12193 } else {
12194 M -= NumElts;
12195 Blend2 &= (M == (Lane + Elt));
12196 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12197 M = M % NumEltsPerLane;
12198 Range2.first = std::min(Range2.first, M);
12199 Range2.second = std::max(Range2.second, M);
12200 }
12201 }
12202 }
12203
12204 // Bail if we don't need both elements.
12205 // TODO - it might be worth doing this for unary shuffles if the permute
12206 // can be widened.
12207 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12208 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12209 return SDValue();
12210
// For vectors wider than 128 bits, bail when either input is used purely in
// place.
12211 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12212 return SDValue();
12213
12214 // Rotate the 2 ops so we can access both ranges, then permute the result.
// RotAmt is the rotation in elements; Ofs is 0 or NumElts and re-bases the
// mask indices according to which input was passed as Lo.
12215 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12216 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12217 SDValue Rotate = DAG.getBitcast(
12218 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12219 DAG.getBitcast(ByteVT, Lo),
12220 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12221 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12222 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12223 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12224 int M = Mask[Lane + Elt];
12225 if (M < 0)
12226 continue;
12227 if (M < NumElts)
12228 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12229 else
12230 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12231 }
12232 }
12233 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12234 };
12235
12236 // Check if the ranges are small enough to rotate from either direction.
12237 if (Range2.second < Range1.first)
12238 return RotateAndPermute(V1, V2, Range1.first, 0);
12239 if (Range1.second < Range2.first)
12240 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12241 return SDValue();
12242}
12243
12245 return isUndefOrEqual(Mask, 0);
12246}
12247
12249 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12250}
12251
12252/// Check if the Mask consists of the same element repeated multiple times.
12254 size_t NumUndefs = 0;
12255 std::optional<int> UniqueElt;
12256 for (int Elt : Mask) {
12257 if (Elt == SM_SentinelUndef) {
12258 NumUndefs++;
12259 continue;
12260 }
12261 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
12262 return false;
12263 UniqueElt = Elt;
12264 }
12265 // Make sure the element is repeated enough times by checking the number of
12266 // undefs is small.
12267 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
12268}
12269
12270/// Generic routine to decompose a shuffle and blend into independent
12271/// blends and permutes.
12272///
12273/// This matches the extremely common pattern for handling combined
12274/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12275/// operations. It will try to pick the best arrangement of shuffles and
12276/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12278 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12279 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12280 int NumElts = Mask.size();
12281 int NumLanes = VT.getSizeInBits() / 128;
12282 int NumEltsPerLane = NumElts / NumLanes;
12283
12284 // Shuffle the input elements into the desired positions in V1 and V2 and
12285 // unpack/blend them together.
12286 bool IsAlternating = true;
12287 bool V1Zero = true, V2Zero = true;
12288 SmallVector<int, 32> V1Mask(NumElts, -1);
12289 SmallVector<int, 32> V2Mask(NumElts, -1);
12290 SmallVector<int, 32> FinalMask(NumElts, -1);
12291 for (int i = 0; i < NumElts; ++i) {
12292 int M = Mask[i];
12293 if (M >= 0 && M < NumElts) {
12294 V1Mask[i] = M;
12295 FinalMask[i] = i;
12296 V1Zero &= Zeroable[i];
12297 IsAlternating &= (i & 1) == 0;
12298 } else if (M >= NumElts) {
12299 V2Mask[i] = M - NumElts;
12300 FinalMask[i] = i + NumElts;
12301 V2Zero &= Zeroable[i];
12302 IsAlternating &= (i & 1) == 1;
12303 }
12304 }
12305
12306 // If we effectively only demand the 0'th element of \p Input, and not only
12307 // as 0'th element, then broadcast said input,
12308 // and change \p InputMask to be a no-op (identity) mask.
12309 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12310 &DAG](SDValue &Input,
12311 MutableArrayRef<int> InputMask) {
12312 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12313 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12314 !X86::mayFoldLoad(Input, Subtarget)))
12315 return;
12316 if (isNoopShuffleMask(InputMask))
12317 return;
12318 assert(isBroadcastShuffleMask(InputMask) &&
12319 "Expected to demand only the 0'th element.");
12320 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
12321 for (auto I : enumerate(InputMask)) {
12322 int &InputMaskElt = I.value();
12323 if (InputMaskElt >= 0)
12324 InputMaskElt = I.index();
12325 }
12326 };
12327
12328 // Currently, we may need to produce one shuffle per input, and blend results.
12329 // It is possible that the shuffle for one of the inputs is already a no-op.
12330 // See if we can simplify non-no-op shuffles into broadcasts,
12331 // which we consider to be strictly better than an arbitrary shuffle.
12332 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
12334 canonicalizeBroadcastableInput(V1, V1Mask);
12335 canonicalizeBroadcastableInput(V2, V2Mask);
12336 }
12337
12338 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12339 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12340 // the shuffle may be able to fold with a load or other benefit. However, when
12341 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12342 // pre-shuffle first is a better strategy.
12343 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12344 // If we don't have blends, see if we can create a cheap unpack.
12345 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
12346 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
12347 is128BitUnpackShuffleMask(V2Mask, DAG)))
12348 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12349 DL, VT, V1, V2, Mask, Subtarget, DAG))
12350 return PermUnpack;
12351
12352 // Only prefer immediate blends to unpack/rotate.
12353 if (SDValue BlendPerm =
12354 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
12355 return BlendPerm;
12356
12357 // If either input vector provides only a single element which is repeated
12358 // multiple times, unpacking from both input vectors would generate worse
12359 // code. e.g. for
12360 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
12361 // it is better to process t4 first to create a vector of t4[0], then unpack
12362 // that vector with t2.
12363 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
12365 if (SDValue UnpackPerm =
12366 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
12367 return UnpackPerm;
12368
12370 DL, VT, V1, V2, Mask, Subtarget, DAG))
12371 return RotatePerm;
12372
12373 // Unpack/rotate failed - try again with variable blends.
12374 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12375 DAG))
12376 return BlendPerm;
12377
12378 if (VT.getScalarSizeInBits() >= 32)
12379 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12380 DL, VT, V1, V2, Mask, Subtarget, DAG))
12381 return PermUnpack;
12382 }
12383
12384 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12385 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12386 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12387 // than half the elements coming from each source.
12388 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12389 V1Mask.assign(NumElts, -1);
12390 V2Mask.assign(NumElts, -1);
12391 FinalMask.assign(NumElts, -1);
12392 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12393 for (int j = 0; j != NumEltsPerLane; ++j) {
12394 int M = Mask[i + j];
12395 if (M >= 0 && M < NumElts) {
12396 V1Mask[i + (j / 2)] = M;
12397 FinalMask[i + j] = i + (j / 2);
12398 } else if (M >= NumElts) {
12399 V2Mask[i + (j / 2)] = M - NumElts;
12400 FinalMask[i + j] = i + (j / 2) + NumElts;
12401 }
12402 }
12403 }
12404
12405 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12406 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12407 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12408}
12409
12410static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12411 const X86Subtarget &Subtarget,
12412 ArrayRef<int> Mask) {
12413 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12414 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12415
12416 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12417 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12418 int MaxSubElts = 64 / EltSizeInBits;
12419 unsigned RotateAmt, NumSubElts;
12420 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
12421 MaxSubElts, NumSubElts, RotateAmt))
12422 return -1;
12423 unsigned NumElts = Mask.size();
12424 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12425 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12426 return RotateAmt;
12427}
12428
12429/// Lower shuffle using X86ISD::VROTLI rotations.
12431 ArrayRef<int> Mask,
12432 const X86Subtarget &Subtarget,
12433 SelectionDAG &DAG) {
12434 // Only XOP + AVX512 targets have bit rotation instructions.
12435 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12436 bool IsLegal =
12437 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12438 if (!IsLegal && Subtarget.hasSSE3())
12439 return SDValue();
12440
12441 MVT RotateVT;
12442 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12443 Subtarget, Mask);
12444 if (RotateAmt < 0)
12445 return SDValue();
12446
12447 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12448 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12449 // widen to vXi16 or more then the existing lowering will be better.
12450 if (!IsLegal) {
12451 if ((RotateAmt % 16) == 0)
12452 return SDValue();
12453 unsigned ShlAmt = RotateAmt;
12454 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12455 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, RotateVT, V1,
12456 ShlAmt, DAG);
12457 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, RotateVT, V1,
12458 SrlAmt, DAG);
12459 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12460 return DAG.getBitcast(VT, Rot);
12461 }
12462
12463 SDValue Rot =
12464 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12465 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12466 return DAG.getBitcast(VT, Rot);
12467}
12468
12469/// Try to match a vector shuffle as an element rotation.
12470///
12471 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12473 ArrayRef<int> Mask) {
12474 int NumElts = Mask.size();
12475
12476 // We need to detect various ways of spelling a rotation:
12477 // [11, 12, 13, 14, 15, 0, 1, 2]
12478 // [-1, 12, 13, 14, -1, -1, 1, -1]
12479 // [-1, -1, -1, -1, -1, -1, 1, 2]
12480 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12481 // [-1, 4, 5, 6, -1, -1, 9, -1]
12482 // [-1, 4, 5, 6, -1, -1, -1, -1]
12483 int Rotation = 0;
12484 SDValue Lo, Hi;
12485 for (int i = 0; i < NumElts; ++i) {
12486 int M = Mask[i];
12487 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12488 "Unexpected mask index.");
12489 if (M < 0)
12490 continue;
12491
12492 // Determine where a rotated vector would have started.
12493 int StartIdx = i - (M % NumElts);
12494 if (StartIdx == 0)
12495 // The identity rotation isn't interesting, stop.
12496 return -1;
12497
12498 // If we found the tail of a vector the rotation must be the missing
12499 // front. If we found the head of a vector, it must be how much of the
12500 // head.
12501 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12502
12503 if (Rotation == 0)
12504 Rotation = CandidateRotation;
12505 else if (Rotation != CandidateRotation)
12506 // The rotations don't match, so we can't match this mask.
12507 return -1;
12508
12509 // Compute which value this mask is pointing at.
12510 SDValue MaskV = M < NumElts ? V1 : V2;
12511
12512 // Compute which of the two target values this index should be assigned
12513 // to. This reflects whether the high elements are remaining or the low
12514 // elements are remaining.
12515 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12516
12517 // Either set up this value if we've not encountered it before, or check
12518 // that it remains consistent.
12519 if (!TargetV)
12520 TargetV = MaskV;
12521 else if (TargetV != MaskV)
12522 // This may be a rotation, but it pulls from the inputs in some
12523 // unsupported interleaving.
12524 return -1;
12525 }
12526
12527 // Check that we successfully analyzed the mask, and normalize the results.
12528 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12529 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12530 if (!Lo)
12531 Lo = Hi;
12532 else if (!Hi)
12533 Hi = Lo;
12534
12535 V1 = Lo;
12536 V2 = Hi;
12537
12538 return Rotation;
12539}
12540
12541/// Try to lower a vector shuffle as a byte rotation.
12542///
12543/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12544/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12545/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12546 /// try to generically lower a vector shuffle through such a pattern. It
12547/// does not check for the profitability of lowering either as PALIGNR or
12548/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12549/// This matches shuffle vectors that look like:
12550///
12551/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12552///
12553/// Essentially it concatenates V1 and V2, shifts right by some number of
12554/// elements, and takes the low elements as the result. Note that while this is
12555/// specified as a *right shift* because x86 is little-endian, it is a *left
12556/// rotate* of the vector lanes.
12558 ArrayRef<int> Mask) {
12559 // Don't accept any shuffles with zero elements.
12560 if (isAnyZero(Mask))
12561 return -1;
12562
12563 // PALIGNR works on 128-bit lanes.
12564 SmallVector<int, 16> RepeatedMask;
12565 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12566 return -1;
12567
12568 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12569 if (Rotation <= 0)
12570 return -1;
12571
12572 // PALIGNR rotates bytes, so we need to scale the
12573 // rotation based on how many bytes are in the vector lane.
12574 int NumElts = RepeatedMask.size();
12575 int Scale = 16 / NumElts;
12576 return Rotation * Scale;
12577}
12578
12580 SDValue V2, ArrayRef<int> Mask,
12581 const X86Subtarget &Subtarget,
12582 SelectionDAG &DAG) {
12583 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12584
12585 SDValue Lo = V1, Hi = V2;
12586 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12587 if (ByteRotation <= 0)
12588 return SDValue();
12589
12590 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12591 // PSLLDQ/PSRLDQ.
12592 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12593 Lo = DAG.getBitcast(ByteVT, Lo);
12594 Hi = DAG.getBitcast(ByteVT, Hi);
12595
12596 // SSSE3 targets can use the palignr instruction.
12597 if (Subtarget.hasSSSE3()) {
12598 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12599 "512-bit PALIGNR requires BWI instructions");
12600 return DAG.getBitcast(
12601 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12602 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12603 }
12604
12605 assert(VT.is128BitVector() &&
12606 "Rotate-based lowering only supports 128-bit lowering!");
12607 assert(Mask.size() <= 16 &&
12608 "Can shuffle at most 16 bytes in a 128-bit vector!");
12609 assert(ByteVT == MVT::v16i8 &&
12610 "SSE2 rotate lowering only needed for v16i8!");
12611
12612 // Default SSE2 implementation
12613 int LoByteShift = 16 - ByteRotation;
12614 int HiByteShift = ByteRotation;
12615
12616 SDValue LoShift =
12617 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12618 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12619 SDValue HiShift =
12620 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12621 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12622 return DAG.getBitcast(VT,
12623 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12624}
12625
12626/// Try to lower a vector shuffle as a dword/qword rotation.
12627///
12628 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12629 /// rotation of the concatenation of two vectors. This routine will
12630 /// try to generically lower a vector shuffle through such a pattern.
12631///
12632/// Essentially it concatenates V1 and V2, shifts right by some number of
12633/// elements, and takes the low elements as the result. Note that while this is
12634/// specified as a *right shift* because x86 is little-endian, it is a *left
12635/// rotate* of the vector lanes.
12637 SDValue V2, ArrayRef<int> Mask,
12638 const APInt &Zeroable,
12639 const X86Subtarget &Subtarget,
12640 SelectionDAG &DAG) {
12641 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12642 "Only 32-bit and 64-bit elements are supported!");
12643
12644 // 128/256-bit vectors are only supported with VLX.
12645 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12646 && "VLX required for 128/256-bit vectors");
12647
12648 SDValue Lo = V1, Hi = V2;
12649 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12650 if (0 < Rotation)
12651 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12652 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12653
12654 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12655 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12656 // TODO: We can probably make this more aggressive and use shift-pairs like
12657 // lowerShuffleAsByteShiftMask.
12658 unsigned NumElts = Mask.size();
12659 unsigned ZeroLo = Zeroable.countr_one();
12660 unsigned ZeroHi = Zeroable.countl_one();
12661 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12662 if (!ZeroLo && !ZeroHi)
12663 return SDValue();
12664
12665 if (ZeroLo) {
12666 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12667 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12668 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12669 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12670 getZeroVector(VT, Subtarget, DAG, DL),
12671 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12672 }
12673
12674 if (ZeroHi) {
12675 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12676 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12677 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12678 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12679 getZeroVector(VT, Subtarget, DAG, DL), Src,
12680 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12681 }
12682
12683 return SDValue();
12684}
12685
12686/// Try to lower a vector shuffle as a byte shift sequence.
12688 SDValue V2, ArrayRef<int> Mask,
12689 const APInt &Zeroable,
12690 const X86Subtarget &Subtarget,
12691 SelectionDAG &DAG) {
12692 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12693 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12694
12695 // We need a shuffle that has zeros at one/both ends and a sequential
12696 // shuffle from one source within.
12697 unsigned ZeroLo = Zeroable.countr_one();
12698 unsigned ZeroHi = Zeroable.countl_one();
12699 if (!ZeroLo && !ZeroHi)
12700 return SDValue();
12701
12702 unsigned NumElts = Mask.size();
12703 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12704 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12705 return SDValue();
12706
12707 unsigned Scale = VT.getScalarSizeInBits() / 8;
12708 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12709 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12710 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12711 return SDValue();
12712
12713 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12714 Res = DAG.getBitcast(MVT::v16i8, Res);
12715
12716 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12717 // inner sequential set of elements, possibly offset:
12718 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12719 // 01234567 --> 4567zzzz --> zzzzz456
12720 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12721 if (ZeroLo == 0) {
12722 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12723 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12724 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12725 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12726 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12727 } else if (ZeroHi == 0) {
12728 unsigned Shift = Mask[ZeroLo] % NumElts;
12729 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12730 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12731 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12732 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12733 } else if (!Subtarget.hasSSSE3()) {
12734 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12735 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12736 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12737 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12738 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12739 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12740 Shift += Mask[ZeroLo] % NumElts;
12741 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12742 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12743 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12744 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12745 } else
12746 return SDValue();
12747
12748 return DAG.getBitcast(VT, Res);
12749}
12750
12751/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12752///
12753/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12754/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12755/// matches elements from one of the input vectors shuffled to the left or
12756/// right with zeroable elements 'shifted in'. It handles both the strictly
12757/// bit-wise element shifts and the byte shift across an entire 128-bit double
12758/// quad word lane.
12759///
12760/// PSHL : (little-endian) left bit shift.
12761/// [ zz, 0, zz, 2 ]
12762/// [ -1, 4, zz, -1 ]
12763/// PSRL : (little-endian) right bit shift.
12764/// [ 1, zz, 3, zz]
12765/// [ -1, -1, 7, zz]
12766/// PSLLDQ : (little-endian) left byte shift
12767/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12768/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12769/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12770/// PSRLDQ : (little-endian) right byte shift
12771/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12772/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12773/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12774static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12775 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12776 int MaskOffset, const APInt &Zeroable,
12777 const X86Subtarget &Subtarget) {
12778 int Size = Mask.size();
12779 unsigned SizeInBits = Size * ScalarSizeInBits;
12780
12781 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12782 for (int i = 0; i < Size; i += Scale)
12783 for (int j = 0; j < Shift; ++j)
12784 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12785 return false;
12786
12787 return true;
12788 };
12789
12790 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12791 for (int i = 0; i != Size; i += Scale) {
12792 unsigned Pos = Left ? i + Shift : i;
12793 unsigned Low = Left ? i : i + Shift;
12794 unsigned Len = Scale - Shift;
12795 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12796 return -1;
12797 }
12798
12799 int ShiftEltBits = ScalarSizeInBits * Scale;
12800 bool ByteShift = ShiftEltBits > 64;
12801 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12802 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12803 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12804
12805 // Normalize the scale for byte shifts to still produce an i64 element
12806 // type.
12807 Scale = ByteShift ? Scale / 2 : Scale;
12808
12809 // We need to round trip through the appropriate type for the shift.
12810 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12811 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12812 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12813 return ShiftAmt;
12814 };
12815
12816 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12817 // keep doubling the size of the integer elements up to that. We can
12818 // then shift the elements of the integer vector by whole multiples of
12819 // their width within the elements of the larger integer vector. Test each
12820 // multiple to see if we can find a match with the moved element indices
12821 // and that the shifted in elements are all zeroable.
12822 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12823 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12824 for (int Shift = 1; Shift != Scale; ++Shift)
12825 for (bool Left : {true, false})
12826 if (CheckZeros(Shift, Scale, Left)) {
12827 int ShiftAmt = MatchShift(Shift, Scale, Left);
12828 if (0 < ShiftAmt)
12829 return ShiftAmt;
12830 }
12831
12832 // no match
12833 return -1;
12834}
12835
12837 SDValue V2, ArrayRef<int> Mask,
12838 const APInt &Zeroable,
12839 const X86Subtarget &Subtarget,
12840 SelectionDAG &DAG, bool BitwiseOnly) {
12841 int Size = Mask.size();
12842 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12843
12844 MVT ShiftVT;
12845 SDValue V = V1;
12846 unsigned Opcode;
12847
12848 // Try to match shuffle against V1 shift.
12849 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12850 Mask, 0, Zeroable, Subtarget);
12851
12852 // If V1 failed, try to match shuffle against V2 shift.
12853 if (ShiftAmt < 0) {
12854 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12855 Mask, Size, Zeroable, Subtarget);
12856 V = V2;
12857 }
12858
12859 if (ShiftAmt < 0)
12860 return SDValue();
12861
12862 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12863 return SDValue();
12864
12865 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12866 "Illegal integer vector type");
12867 V = DAG.getBitcast(ShiftVT, V);
12868 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12869 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12870 return DAG.getBitcast(VT, V);
12871}
12872
12873// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12874// Remainder of lower half result is zero and upper half is all undef.
12875static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12876 ArrayRef<int> Mask, uint64_t &BitLen,
12877 uint64_t &BitIdx, const APInt &Zeroable) {
12878 int Size = Mask.size();
12879 int HalfSize = Size / 2;
12880 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12881 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12882
12883 // Upper half must be undefined.
12884 if (!isUndefUpperHalf(Mask))
12885 return false;
12886
12887 // Determine the extraction length from the part of the
12888 // lower half that isn't zeroable.
12889 int Len = HalfSize;
12890 for (; Len > 0; --Len)
12891 if (!Zeroable[Len - 1])
12892 break;
12893 assert(Len > 0 && "Zeroable shuffle mask");
12894
12895 // Attempt to match first Len sequential elements from the lower half.
12896 SDValue Src;
12897 int Idx = -1;
12898 for (int i = 0; i != Len; ++i) {
12899 int M = Mask[i];
12900 if (M == SM_SentinelUndef)
12901 continue;
12902 SDValue &V = (M < Size ? V1 : V2);
12903 M = M % Size;
12904
12905 // The extracted elements must start at a valid index and all mask
12906 // elements must be in the lower half.
12907 if (i > M || M >= HalfSize)
12908 return false;
12909
12910 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12911 Src = V;
12912 Idx = M - i;
12913 continue;
12914 }
12915 return false;
12916 }
12917
12918 if (!Src || Idx < 0)
12919 return false;
12920
12921 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12922 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12923 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12924 V1 = Src;
12925 return true;
12926}
12927
12928// INSERTQ: Extract lowest Len elements from lower half of second source and
12929// insert over first source, starting at Idx.
12930// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12931static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12932 ArrayRef<int> Mask, uint64_t &BitLen,
12933 uint64_t &BitIdx) {
12934 int Size = Mask.size();
12935 int HalfSize = Size / 2;
12936 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12937
12938 // Upper half must be undefined.
12939 if (!isUndefUpperHalf(Mask))
12940 return false;
12941
12942 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12943 SDValue Base;
12944
12945 // Attempt to match first source from mask before insertion point.
12946 if (isUndefInRange(Mask, 0, Idx)) {
12947 /* EMPTY */
12948 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12949 Base = V1;
12950 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12951 Base = V2;
12952 } else {
12953 continue;
12954 }
12955
12956 // Extend the extraction length looking to match both the insertion of
12957 // the second source and the remaining elements of the first.
12958 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12959 SDValue Insert;
12960 int Len = Hi - Idx;
12961
12962 // Match insertion.
12963 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12964 Insert = V1;
12965 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12966 Insert = V2;
12967 } else {
12968 continue;
12969 }
12970
12971 // Match the remaining elements of the lower half.
12972 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12973 /* EMPTY */
12974 } else if ((!Base || (Base == V1)) &&
12975 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12976 Base = V1;
12977 } else if ((!Base || (Base == V2)) &&
12978 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12979 Size + Hi)) {
12980 Base = V2;
12981 } else {
12982 continue;
12983 }
12984
12985 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12986 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12987 V1 = Base;
12988 V2 = Insert;
12989 return true;
12990 }
12991 }
12992
12993 return false;
12994}
12995
12996/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12998 SDValue V2, ArrayRef<int> Mask,
12999 const APInt &Zeroable, SelectionDAG &DAG) {
13000 uint64_t BitLen, BitIdx;
13001 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13002 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13003 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13004 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13005
13006 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13007 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13008 V2 ? V2 : DAG.getUNDEF(VT),
13009 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13010 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13011
13012 return SDValue();
13013}
13014
13015/// Lower a vector shuffle as an any/signed/zero extension.
13016///
13017 /// Given a specific number of elements, element bit width, and extension
13018 /// stride, produce an extension based on the available
13019 /// features of the subtarget. The extended elements are consecutive and
13020 /// can start from an offset element index in the input; to
13021 /// avoid excess shuffling the offset must either be in the bottom lane
13022 /// or at the start of a higher lane. All extended elements must be from
13023 /// the same lane.
13025 int Scale, int Offset,
13026 unsigned ExtOpc, SDValue InputV,
13027 ArrayRef<int> Mask,
13028 const X86Subtarget &Subtarget,
13029 SelectionDAG &DAG) {
13030 assert(Scale > 1 && "Need a scale to extend.");
13031 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
13032 int EltBits = VT.getScalarSizeInBits();
13033 int NumElements = VT.getVectorNumElements();
13034 int NumEltsPerLane = 128 / EltBits;
13035 int OffsetLane = Offset / NumEltsPerLane;
13036 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13037 "Only 8, 16, and 32 bit elements can be extended.");
13038 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13039 assert(0 <= Offset && "Extension offset must be positive.");
13040 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13041 "Extension offset must be in the first lane or start an upper lane.");
13042
13043 // Check that an index is in same lane as the base offset.
13044 auto SafeOffset = [&](int Idx) {
13045 return OffsetLane == (Idx / NumEltsPerLane);
13046 };
13047
13048 // Shift along an input so that the offset base moves to the first element.
13049 auto ShuffleOffset = [&](SDValue V) {
13050 if (!Offset)
13051 return V;
13052
13053 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13054 for (int i = 0; i * Scale < NumElements; ++i) {
13055 int SrcIdx = i + Offset;
13056 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13057 }
13058 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13059 };
13060
13061 // Found a valid a/zext mask! Try various lowering strategies based on the
13062 // input type and available ISA extensions.
13063 if (Subtarget.hasSSE41()) {
13064 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13065 // PUNPCK will catch this in a later shuffle match.
13066 if (Offset && Scale == 2 && VT.is128BitVector())
13067 return SDValue();
13068 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13069 NumElements / Scale);
13070 InputV = DAG.getBitcast(VT, InputV);
13071 InputV = ShuffleOffset(InputV);
13072 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
13073 return DAG.getBitcast(VT, InputV);
13074 }
13075
13076 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13077 InputV = DAG.getBitcast(VT, InputV);
13078 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
13079
13080 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
13081 if (ExtOpc == ISD::SIGN_EXTEND)
13082 return SDValue();
13083
13084 // For any extends we can cheat for larger element sizes and use shuffle
13085 // instructions that can fold with a load and/or copy.
13086 if (AnyExt && EltBits == 32) {
13087 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13088 -1};
13089 return DAG.getBitcast(
13090 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13091 DAG.getBitcast(MVT::v4i32, InputV),
13092 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13093 }
13094 if (AnyExt && EltBits == 16 && Scale > 2) {
13095 int PSHUFDMask[4] = {Offset / 2, -1,
13096 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13097 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13098 DAG.getBitcast(MVT::v4i32, InputV),
13099 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13100 int PSHUFWMask[4] = {1, -1, -1, -1};
13101 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13102 return DAG.getBitcast(
13103 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13104 DAG.getBitcast(MVT::v8i16, InputV),
13105 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13106 }
13107
13108 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13109 // to 64-bits.
13110 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13111 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13112 assert(VT.is128BitVector() && "Unexpected vector width!");
13113
13114 int LoIdx = Offset * EltBits;
13115 SDValue Lo = DAG.getBitcast(
13116 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13117 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13118 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13119
13120 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13121 return DAG.getBitcast(VT, Lo);
13122
13123 int HiIdx = (Offset + 1) * EltBits;
13124 SDValue Hi = DAG.getBitcast(
13125 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13126 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13127 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13128 return DAG.getBitcast(VT,
13129 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13130 }
13131
13132 // If this would require more than 2 unpack instructions to expand, use
13133 // pshufb when available. We can only use more than 2 unpack instructions
13134 // when zero extending i8 elements which also makes it easier to use pshufb.
13135 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13136 assert(NumElements == 16 && "Unexpected byte vector width!");
13137 SDValue PSHUFBMask[16];
13138 for (int i = 0; i < 16; ++i) {
13139 int Idx = Offset + (i / Scale);
13140 if ((i % Scale == 0 && SafeOffset(Idx))) {
13141 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13142 continue;
13143 }
13144 PSHUFBMask[i] =
13145 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13146 }
13147 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13148 return DAG.getBitcast(
13149 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13150 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13151 }
13152
13153 // If we are extending from an offset, ensure we start on a boundary that
13154 // we can unpack from.
13155 int AlignToUnpack = Offset % (NumElements / Scale);
13156 if (AlignToUnpack) {
13157 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13158 for (int i = AlignToUnpack; i < NumElements; ++i)
13159 ShMask[i - AlignToUnpack] = i;
13160 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13161 Offset -= AlignToUnpack;
13162 }
13163
13164 // Otherwise emit a sequence of unpacks.
13165 do {
13166 unsigned UnpackLoHi = X86ISD::UNPCKL;
13167 if (Offset >= (NumElements / 2)) {
13168 UnpackLoHi = X86ISD::UNPCKH;
13169 Offset -= (NumElements / 2);
13170 }
13171
13172 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13173 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13174 : getZeroVector(InputVT, Subtarget, DAG, DL);
13175 InputV = DAG.getBitcast(InputVT, InputV);
13176 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13177 Scale /= 2;
13178 EltBits *= 2;
13179 NumElements /= 2;
13180 } while (Scale > 1);
13181 return DAG.getBitcast(VT, InputV);
13182}
13183
13184 /// Try to lower a vector shuffle as a zero extension on any microarch.
13185 ///
13186 /// This routine will try to do everything in its power to cleverly lower
13187 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
13188 /// check for the profitability of this lowering, it tries to aggressively
13189 /// match this pattern. It will use all of the micro-architectural details it
13190 /// can to emit an efficient lowering. It handles both blends with all-zero
13191 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13192 /// masking out later).
13193 ///
13194 /// The reason we have dedicated lowering for zext-style shuffles is that they
13195 /// are both incredibly common and often quite performance sensitive.
///
/// Extension scales are tried from the widest (64-bit results) downwards; for
/// 128-bit vectors a final attempt matches a zero-extended copy of the low
/// 64 bits. Returns an empty SDValue if nothing matches.
13197 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13198 const APInt &Zeroable, const X86Subtarget &Subtarget,
13199 SelectionDAG &DAG) {
13200 int Bits = VT.getSizeInBits();
13201 int NumLanes = Bits / 128;
13202 int NumElements = VT.getVectorNumElements();
13203 int NumEltsPerLane = NumElements / NumLanes;
13204 assert(VT.getScalarSizeInBits() <= 32 &&
13205 "Exceeds 32-bit integer zero extension limit");
13206 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13207
13208 // Define a helper function to check a particular ext-scale and lower to it if
13209 // valid.
13210 auto Lower = [&](int Scale) -> SDValue {
13211 SDValue InputV;
13212 bool AnyExt = true;
13213 int Offset = 0;
13214 int Matches = 0;
13215 for (int i = 0; i < NumElements; ++i) {
13216 int M = Mask[i];
13217 if (M < 0)
13218 continue; // Valid anywhere but doesn't tell us anything.
13219 if (i % Scale != 0) {
13220 // Each of the extended elements needs to be zeroable.
13221 if (!Zeroable[i])
13222 return SDValue();
13223
13224 // We no longer are in the anyext case.
13225 AnyExt = false;
13226 continue;
13227 }
13228
13229 // Each of the base elements needs to be consecutive indices into the
13230 // same input vector.
13231 SDValue V = M < NumElements ? V1 : V2;
13232 M = M % NumElements;
13233 if (!InputV) {
// The first defined base element fixes both the input and the offset.
13234 InputV = V;
13235 Offset = M - (i / Scale);
13236 } else if (InputV != V)
13237 return SDValue(); // Flip-flopping inputs.
13238
13239 // Offset must start in the lowest 128-bit lane or at the start of an
13240 // upper lane.
13241 // FIXME: Is it ever worth allowing a negative base offset?
13242 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13243 (Offset % NumEltsPerLane) == 0))
13244 return SDValue();
13245
13246 // If we are offsetting, all referenced entries must come from the same
13247 // lane.
13248 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13249 return SDValue();
13250
13251 if ((M % NumElements) != (Offset + (i / Scale)))
13252 return SDValue(); // Non-consecutive strided elements.
13253 Matches++;
13254 }
13255
13256 // If we fail to find an input, we have a zero-shuffle which should always
13257 // have already been handled.
13258 // FIXME: Maybe handle this here in case during blending we end up with one?
13259 if (!InputV)
13260 return SDValue();
13261
13262 // If we are offsetting, don't extend if we only match a single input, we
13263 // can always do better by using a basic PSHUF or PUNPCK.
13264 if (Offset != 0 && Matches < 2)
13265 return SDValue();
13266
// No defined non-base element was seen, so the padding may be anything.
13267 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
13268 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
13269 InputV, Mask, Subtarget, DAG);
13270 };
13271
13272 // The widest scale possible for extending is to a 64-bit integer.
13273 assert(Bits % 64 == 0 &&
13274 "The number of bits in a vector must be divisible by 64 on x86!");
13275 int NumExtElements = Bits / 64;
13276
13277 // Each iteration, try extending the elements half as much, but into twice as
13278 // many elements.
13279 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13280 assert(NumElements % NumExtElements == 0 &&
13281 "The input vector size must be divisible by the extended size.");
13282 if (SDValue V = Lower(NumElements / NumExtElements))
13283 return V;
13284 }
13285
13286 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13287 if (Bits != 128)
13288 return SDValue();
13289
13290 // Returns one of the source operands if the shuffle can be reduced to a
13291 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13292 auto CanZExtLowHalf = [&]() {
13293 for (int i = NumElements / 2; i != NumElements; ++i)
13294 if (!Zeroable[i])
13295 return SDValue();
13296 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13297 return V1;
13298 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13299 return V2;
13300 return SDValue();
13301 };
13302
13303 if (SDValue V = CanZExtLowHalf()) {
13304 V = DAG.getBitcast(MVT::v2i64, V);
13305 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13306 return DAG.getBitcast(VT, V);
13307 }
13308
13309 // No viable ext lowering found.
13310 return SDValue();
13311}
13312
13313 /// Try to get a scalar value for a specific element of a vector.
13314 ///
13315 /// Looks through bitcasts to a BUILD_VECTOR (any Idx) or SCALAR_TO_VECTOR
/// (Idx 0 only) node to find a scalar operand. On success the scalar is
/// returned bitcast to the element type of V; otherwise an empty SDValue is
/// returned.
13317 SelectionDAG &DAG) {
13318 MVT VT = V.getSimpleValueType();
13319 MVT EltVT = VT.getVectorElementType();
13320 V = peekThroughBitcasts(V);
13321
13322 // If the bitcasts shift the element size, we can't extract an equivalent
13323 // element from it.
13324 MVT NewVT = V.getSimpleValueType();
13325 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13326 return SDValue();
13327
13328 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13329 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13330 // Ensure the scalar operand is the same size as the destination.
13331 // FIXME: Add support for scalar truncation where possible.
13332 SDValue S = V.getOperand(Idx);
13333 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13334 return DAG.getBitcast(EltVT, S);
13335 }
13336
13337 return SDValue();
13338}
13339
13340 /// Helper to test for a load that can be folded with x86 shuffles.
13341 ///
13342 /// This is particularly important because the set of instructions varies
13343 /// significantly based on whether the operand is a load or not.
/// The value must have exactly one use to qualify.
13345 return V.hasOneUse() &&
13347}
13348
13349template<typename T>
13350static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
13351 T EltVT = VT.getScalarType();
13352 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
13353 (EltVT == MVT::f16 && !Subtarget.hasFP16());
13354}
13355
13356template<typename T>
13357static bool isBF16orSoftF16(T VT, const X86Subtarget &Subtarget) {
13358 T EltVT = VT.getScalarType();
13359 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
13360}
13361
13362 /// Try to lower insertion of a single element into a zero vector.
13363 ///
13364 /// This is a common pattern that we have especially efficient patterns to lower
13365 /// across all subtarget feature sets.
///
/// Also handles a limited set of insertions into a non-zeroable V1 when V1 is
/// otherwise used in place. Returns an empty SDValue if the shuffle cannot be
/// lowered this way.
13367 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13368 const APInt &Zeroable, const X86Subtarget &Subtarget,
13369 SelectionDAG &DAG) {
13370 MVT ExtVT = VT;
13371 MVT EltVT = VT.getVectorElementType();
13372 unsigned NumElts = VT.getVectorNumElements();
13373 unsigned EltBits = VT.getScalarSizeInBits();
13374
13375 if (isSoftF16(EltVT, Subtarget))
13376 return SDValue();
13377
// Position of the first mask element that selects from V2.
// NOTE(review): this equals Mask.size() when no element selects from V2, and
// V2Index is used as an index below without a check -- presumably callers
// guarantee that a V2 element exists; confirm.
13378 int V2Index =
13379 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13380 Mask.begin();
13381 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
// V1 is treated as zero only if every element other than V2Index is zeroable.
13382 bool IsV1Zeroable = true;
13383 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13384 if (i != V2Index && !Zeroable[i]) {
13385 IsV1Zeroable = false;
13386 break;
13387 }
13388
13389 // Bail if a non-zero V1 isn't used in place.
13390 if (!IsV1Zeroable) {
13391 SmallVector<int, 8> V1Mask(Mask);
13392 V1Mask[V2Index] = -1;
13393 if (!isNoopShuffleMask(V1Mask))
13394 return SDValue();
13395 }
13396
13397 // Check for a single input from a SCALAR_TO_VECTOR node.
13398 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13399 // all the smarts here sunk into that routine. However, the current
13400 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13401 // vector shuffle lowering is dead.
13402 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13403 DAG);
13404 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13405 // We need to zext the scalar if it is smaller than an i32.
13406 V2S = DAG.getBitcast(EltVT, V2S);
13407 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13408 // Using zext to expand a narrow element won't work for non-zero
13409 // insertions. But we can use a masked constant vector if we're
13410 // inserting V2 into the bottom of V1.
13411 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
13412 return SDValue();
13413
13414 // Zero-extend directly to i32.
13415 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13416 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13417
13418 // If we're inserting into a constant, mask off the inserted index
13419 // and OR with the zero-extended scalar.
13420 if (!IsV1Zeroable) {
13421 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
13422 Bits[V2Index] = APInt::getZero(EltBits);
13423 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
13424 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
13425 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13426 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
13427 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13428 }
13429 }
13430 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13431 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13432 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
13433 // Either not inserting from the low element of the input or the input
13434 // element size is too small to use VZEXT_MOVL to clear the high bits.
13435 return SDValue();
13436 }
13437
13438 if (!IsV1Zeroable) {
13439 // If V1 can't be treated as a zero vector we have fewer options to lower
13440 // this. We can't support integer vectors or non-zero targets cheaply.
13441 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13442 if (!VT.isFloatingPoint() || V2Index != 0)
13443 return SDValue();
13444 if (!VT.is128BitVector())
13445 return SDValue();
13446
13447 // Otherwise, use MOVSD, MOVSS or MOVSH.
13448 unsigned MovOpc = 0;
13449 if (EltVT == MVT::f16)
13450 MovOpc = X86ISD::MOVSH;
13451 else if (EltVT == MVT::f32)
13452 MovOpc = X86ISD::MOVSS;
13453 else if (EltVT == MVT::f64)
13454 MovOpc = X86ISD::MOVSD;
13455 else
13456 llvm_unreachable("Unsupported floating point element type to handle!");
13457 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13458 }
13459
13460 // This lowering only works for the low element with floating point vectors.
13461 if (VT.isFloatingPoint() && V2Index != 0)
13462 return SDValue();
13463
13464 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13465 if (ExtVT != VT)
13466 V2 = DAG.getBitcast(VT, V2);
13467
13468 if (V2Index != 0) {
13469 // If we have 4 or fewer elements we can cheaply shuffle the element into
13470 // the desired position. Otherwise it is more efficient to do a vector
13471 // shift left. We know that we can do a vector shift left because all
13472 // the inputs are zero.
13473 if (VT.isFloatingPoint() || NumElts <= 4) {
// Element 0 of V2 goes to V2Index; all other slots read element 1 of V2.
13474 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13475 V2Shuffle[V2Index] = 0;
13476 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13477 } else {
// Byte-shift the whole vector left by V2Index elements.
13478 V2 = DAG.getBitcast(MVT::v16i8, V2);
13479 V2 = DAG.getNode(
13480 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13481 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
13482 V2 = DAG.getBitcast(VT, V2);
13483 }
13484 }
13485 return V2;
13486}
13487
13488 /// Try to lower broadcast of a single - truncated - integer element,
13489 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13490 ///
13491 /// This assumes we have AVX2.
///
/// \p BroadcastIdx is expressed in elements of \p VT. Returns an empty
/// SDValue if \p V0 does not have wider integer elements or is not a
/// suitable scalar_to_vector/build_vector node.
13493 int BroadcastIdx,
13494 const X86Subtarget &Subtarget,
13495 SelectionDAG &DAG) {
13496 assert(Subtarget.hasAVX2() &&
13497 "We can only lower integer broadcasts with AVX2!");
13498
13499 MVT EltVT = VT.getVectorElementType();
13500 MVT V0VT = V0.getSimpleValueType();
13501
13502 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13503 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13504
13505 MVT V0EltVT = V0VT.getVectorElementType();
13506 if (!V0EltVT.isInteger())
13507 return SDValue();
13508
13509 const unsigned EltSize = EltVT.getSizeInBits();
13510 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13511
13512 // This is only a truncation if the original element type is larger.
13513 if (V0EltSize <= EltSize)
13514 return SDValue();
13515
13516 assert(((V0EltSize % EltSize) == 0) &&
13517 "Scalar type sizes must all be powers of 2 on x86!");
13518
13519 const unsigned V0Opc = V0.getOpcode();
// Scale is the number of VT elements per V0 element; V0BroadcastIdx is the
// V0 element that contains the requested VT element.
13520 const unsigned Scale = V0EltSize / EltSize;
13521 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13522
// Only BUILD_VECTOR (any element) or SCALAR_TO_VECTOR (element 0) is handled.
13523 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13524 V0Opc != ISD::BUILD_VECTOR)
13525 return SDValue();
13526
13527 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13528
13529 // If we're extracting non-least-significant bits, shift so we can truncate.
13530 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13531 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13532 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13533 if (const int OffsetIdx = BroadcastIdx % Scale)
13534 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13535 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13536
13537 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13538 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13539}
13540
13541 /// Test whether this can be lowered with a single SHUFPS instruction.
13542 ///
13543 /// This is used to disable more specialized lowerings when the shufps lowering
13544 /// will happen to be efficient.
///
/// The mask must have exactly 4 elements, each in the range [-1, 8); indices
/// below 4 select the first input and indices 4..7 the second.
13546 // This routine only handles 128-bit shufps.
13547 assert(Mask.size() == 4 && "Unsupported mask size!");
13548 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13549 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13550 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13551 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13552
13553 // To lower with a single SHUFPS we need to have the low half and high half
13554 // each requiring a single input.
// Undef (negative) elements never conflict with the other element of a half.
13555 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13556 return false;
13557 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13558 return false;
13559
13560 return true;
13561}
13562
13563 /// Test whether the specified input (0 or 1) is in-place blended by the
13564 /// given mask.
13565 ///
13566 /// This returns true if the elements from a particular input are already in the
13567 /// slot required by the given mask and require no permutation. Undef mask
/// elements and elements taken from the other input are ignored.
13569 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13570 int Size = Mask.size();
// Mask[i] / Size identifies the input; Mask[i] % Size is the element within it.
13571 for (int i = 0; i < Size; ++i)
13572 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13573 return false;
13574
13575 return true;
13576}
13577
13578 /// Test whether the specified input (0 or 1) is a broadcast/splat blended by
13579 /// the given mask.
13580 /// Returns true if every mask element taken from that input refers to
/// element BroadcastableElement of it; undef elements and elements taken from
/// the other input are ignored.
13582 int BroadcastableElement = 0) {
13583 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13584 int Size = Mask.size();
13585 for (int i = 0; i < Size; ++i)
13586 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
13587 Mask[i] % Size != BroadcastableElement)
13588 return false;
13589 return true;
13590}
13591
13592 /// If we are extracting two 128-bit halves of a vector and shuffling the
13593 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13594 /// multi-shuffle lowering.
///
/// Returns an empty SDValue if the operands are not single-use extracts of
/// the two halves of one 256-bit vector, or if a simple narrow shuffle is
/// preferable.
13596 SDValue N1, ArrayRef<int> Mask,
13597 SelectionDAG &DAG) {
13598 MVT VT = N0.getSimpleValueType();
13599 assert((VT.is128BitVector() &&
13600 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13601 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13602
13603 // Check that both sources are extracts of the same source vector.
13604 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13606 N0.getOperand(0) != N1.getOperand(0) ||
13607 !N0.hasOneUse() || !N1.hasOneUse())
13608 return SDValue();
13609
13610 SDValue WideVec = N0.getOperand(0);
13611 MVT WideVT = WideVec.getSimpleValueType();
13612 if (!WideVT.is256BitVector())
13613 return SDValue();
13614
13615 // Match extracts of each half of the wide source vector. Commute the shuffle
13616 // if the extract of the low half is N1.
13617 unsigned NumElts = VT.getVectorNumElements();
13618 SmallVector<int, 4> NewMask(Mask);
13619 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13620 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13621 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13623 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13624 return SDValue();
13625
13626 // Final bailout: if the mask is simple, we are better off using an extract
13627 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13628 // because that avoids a constant load from memory.
13629 if (NumElts == 4 &&
13630 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13631 return SDValue();
13632
13633 // Extend the shuffle mask with undef elements.
13634 NewMask.append(NumElts, -1);
13635
13636 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13637 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13638 NewMask);
13639 // This is free: ymm -> xmm.
13640 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13641 DAG.getVectorIdxConstant(0, DL));
13642}
13643
13644 /// Try to lower broadcast of a single element.
13645 ///
13646 /// For convenience, this code also bundles all of the subtarget feature set
13647 /// filtering. While a little annoying to re-dispatch on type here, there isn't
13648 /// a convenient way to factor it out.
///
/// Returns an empty SDValue if the mask is not a broadcast, the subtarget
/// cannot broadcast this type, or no suitable broadcast source is found.
13650 SDValue V2, ArrayRef<int> Mask,
13651 const X86Subtarget &Subtarget,
13652 SelectionDAG &DAG) {
13653 MVT EltVT = VT.getVectorElementType();
// Supported: v2f64 with SSE3, f32/f64 elements with AVX, and integer or f16
// elements with AVX2.
13654 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13655 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13656 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13657 return SDValue();
13658
13659 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13660 // we can only broadcast from a register with AVX2.
13661 unsigned NumEltBits = VT.getScalarSizeInBits();
13662 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13663 ? X86ISD::MOVDDUP
13664 : X86ISD::VBROADCAST;
13665 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13666
13667 // Check that the mask is a broadcast.
13668 int BroadcastIdx = getSplatIndex(Mask);
13669 if (BroadcastIdx < 0) {
13670 // Check for hidden broadcast.
13671 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13672 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13673 return SDValue();
13674 BroadcastIdx = 0;
13675 }
13676 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13677 "a sorted mask where the broadcast "
13678 "comes from V1.");
13679 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13680
13681 // Go up the chain of (vector) values to find a scalar load that we can
13682 // combine with the broadcast.
13683 // TODO: Combine this logic with findEltLoadSrc() used by
13684 // EltsFromConsecutiveLoads().
// BitOffset tracks the bit position of the broadcast element within the
// current value V as the walk moves from node to operand.
13685 int BitOffset = BroadcastIdx * NumEltBits;
13686 SDValue V = V1;
13687 for (;;) {
13688 switch (V.getOpcode()) {
13689 case ISD::BITCAST: {
13690 V = V.getOperand(0);
13691 continue;
13692 }
13693 case ISD::CONCAT_VECTORS: {
// Step into the concatenated operand that contains the bit offset.
13694 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13695 int OpIdx = BitOffset / OpBitWidth;
13696 V = V.getOperand(OpIdx);
13697 BitOffset %= OpBitWidth;
13698 continue;
13699 }
13701 // The extraction index adds to the existing offset.
13702 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13703 unsigned Idx = V.getConstantOperandVal(1);
13704 unsigned BeginOffset = Idx * EltBitWidth;
13705 BitOffset += BeginOffset;
13706 V = V.getOperand(0);
13707 continue;
13708 }
13709 case ISD::INSERT_SUBVECTOR: {
// Follow the inserted subvector if it covers the bit offset, otherwise
// continue with the vector that was inserted into.
13710 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13711 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13712 int Idx = (int)V.getConstantOperandVal(2);
13713 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13714 int BeginOffset = Idx * EltBitWidth;
13715 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13716 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13717 BitOffset -= BeginOffset;
13718 V = VInner;
13719 } else {
13720 V = VOuter;
13721 }
13722 continue;
13723 }
13724 }
13725 break;
13726 }
13727 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13728 BroadcastIdx = BitOffset / NumEltBits;
13729
13730 // Do we need to bitcast the source to retrieve the original broadcast index?
13731 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13732
13733 // Check if this is a broadcast of a scalar. We special case lowering
13734 // for scalars so that we can more effectively fold with loads.
13735 // If the original value has a larger element type than the shuffle, the
13736 // broadcast element is in essence truncated. Make that explicit to ease
13737 // folding.
13738 if (BitCastSrc && VT.isInteger())
13739 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13740 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13741 return TruncBroadcast;
13742
13743 // Also check the simpler case, where we can directly reuse the scalar.
13744 if (!BitCastSrc &&
13745 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13746 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13747 V = V.getOperand(BroadcastIdx);
13748
13749 // If we can't broadcast from a register, check that the input is a load.
13750 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13751 return SDValue();
13752 } else if (ISD::isNormalLoad(V.getNode()) &&
13753 cast<LoadSDNode>(V)->isSimple()) {
13754 // We do not check for one-use of the vector load because a broadcast load
13755 // is expected to be a win for code size, register pressure, and possibly
13756 // uops even if the original vector load is not eliminated.
13757
13758 // Reduce the vector load and shuffle to a broadcasted scalar load.
13759 auto *Ld = cast<LoadSDNode>(V);
13760 SDValue BaseAddr = Ld->getBasePtr();
13761 MVT SVT = VT.getScalarType();
13762 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13763 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13764 SDValue NewAddr =
13766
13767 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13768 // than MOVDDUP.
13769 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13770 if (Opcode == X86ISD::VBROADCAST) {
13771 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13772 SDValue Ops[] = {Ld->getChain(), NewAddr};
13773 V = DAG.getMemIntrinsicNode(
13774 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13776 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13778 return DAG.getBitcast(VT, V);
13779 }
13780 assert(SVT == MVT::f64 && "Unexpected VT!");
13781 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13783 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13785 } else if (!BroadcastFromReg) {
13786 // We can't broadcast from a vector register.
13787 return SDValue();
13788 } else if (BitOffset != 0) {
13789 // We can only broadcast from the zero-element of a vector register,
13790 // but it can be advantageous to broadcast from the zero-element of a
13791 // subvector.
13792 if (!VT.is256BitVector() && !VT.is512BitVector())
13793 return SDValue();
13794
13795 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13796 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13797 return SDValue();
13798
13799 // If we are broadcasting an element from the lowest 128-bit subvector, try
13800 // to move the element in position.
13801 if (BitOffset < 128 && NumActiveElts > 1 &&
13802 V.getScalarValueSizeInBits() == NumEltBits) {
13803 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13804 "Unexpected bit-offset");
// Shuffle the wanted element into element 0 of the low 128-bit subvector.
13805 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13806 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13807 V = extractSubVector(V, 0, DAG, DL, 128);
13808 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13809 } else {
13810 // Only broadcast the zero-element of a 128-bit subvector.
13811 if ((BitOffset % 128) != 0)
13812 return SDValue();
13813
13814 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13815 "Unexpected bit-offset");
13816 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13817 "Unexpected vector size");
13818 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13819 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13820 }
13821 }
13822
13823 // On AVX we can use VBROADCAST directly for scalar sources.
13824 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13825 V = DAG.getBitcast(MVT::f64, V);
13826 if (Subtarget.hasAVX()) {
13827 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13828 return DAG.getBitcast(VT, V);
13829 }
// Without AVX, wrap the scalar in a vector so MOVDDUP can be applied below.
13830 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13831 }
13832
13833 // If this is a scalar, do the broadcast on this type and bitcast.
13834 if (!V.getValueType().isVector()) {
13835 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13836 "Unexpected scalar size");
13837 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13839 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13840 }
13841
13842 // We only support broadcasting from 128-bit vectors to minimize the
13843 // number of patterns we need to deal with in isel. So extract down to
13844 // 128-bits, removing as many bitcasts as possible.
13845 if (V.getValueSizeInBits() > 128)
13847
13848 // Otherwise cast V to a vector with the same element type as VT, but
13849 // possibly narrower than VT. Then perform the broadcast.
13850 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13851 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13852 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13853}
13854
13855// Check for whether we can use INSERTPS to perform the shuffle. We only use
13856// INSERTPS when the V1 elements are already in the correct locations
13857// because otherwise we can just always use two SHUFPS instructions which
13858// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13859// perform INSERTPS if a single V1 element is out of place and all V2
13860// elements are zeroable.
13862 unsigned &InsertPSMask,
13863 const APInt &Zeroable,
13864 ArrayRef<int> Mask, SelectionDAG &DAG) {
13865 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13866 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13867 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13868
13869 // Attempt to match INSERTPS with one element from VA or VB being
13870 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13871 // are updated.
13872 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13873 ArrayRef<int> CandidateMask) {
13874 unsigned ZMask = 0;
13875 int VADstIndex = -1;
13876 int VBDstIndex = -1;
13877 bool VAUsedInPlace = false;
13878
13879 for (int i = 0; i < 4; ++i) {
13880 // Synthesize a zero mask from the zeroable elements (includes undefs).
13881 if (Zeroable[i]) {
13882 ZMask |= 1 << i;
13883 continue;
13884 }
13885
13886 // Flag if we use any VA inputs in place.
13887 if (i == CandidateMask[i]) {
13888 VAUsedInPlace = true;
13889 continue;
13890 }
13891
13892 // We can only insert a single non-zeroable element.
13893 if (VADstIndex >= 0 || VBDstIndex >= 0)
13894 return false;
13895
13896 if (CandidateMask[i] < 4) {
13897 // VA input out of place for insertion.
13898 VADstIndex = i;
13899 } else {
13900 // VB input for insertion.
13901 VBDstIndex = i;
13902 }
13903 }
13904
13905 // Don't bother if we have no (non-zeroable) element for insertion.
13906 if (VADstIndex < 0 && VBDstIndex < 0)
13907 return false;
13908
13909 // Determine element insertion src/dst indices. The src index is from the
13910 // start of the inserted vector, not the start of the concatenated vector.
13911 unsigned VBSrcIndex = 0;
13912 if (VADstIndex >= 0) {
13913 // If we have a VA input out of place, we use VA as the V2 element
13914 // insertion and don't use the original V2 at all.
13915 VBSrcIndex = CandidateMask[VADstIndex];
13916 VBDstIndex = VADstIndex;
13917 VB = VA;
13918 } else {
13919 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13920 }
13921
13922 // If no V1 inputs are used in place, then the result is created only from
13923 // the zero mask and the V2 insertion - so remove V1 dependency.
13924 if (!VAUsedInPlace)
13925 VA = DAG.getUNDEF(MVT::v4f32);
13926
13927 // Update V1, V2 and InsertPSMask accordingly.
13928 V1 = VA;
13929 V2 = VB;
13930
13931 // Insert the V2 element into the desired position.
13932 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13934 return true;
13935 };
13936
13937 if (matchAsInsertPS(V1, V2, Mask))
13938 return true;
13939
13940 // Commute and try again.
13941 SmallVector<int, 4> CommutedMask(Mask);
13943 if (matchAsInsertPS(V2, V1, CommutedMask))
13944 return true;
13945
13946 return false;
13947}
13948
13950 ArrayRef<int> Mask, const APInt &Zeroable,
13951 SelectionDAG &DAG) {
13952 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13953 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13954
13955 // Attempt to match the insertps pattern.
13956 unsigned InsertPSMask = 0;
13957 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13958 return SDValue();
13959
13960 // Insert the V2 element into the desired position.
13961 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13962 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13963}
13964
13965/// Handle lowering of 2-lane 64-bit floating point shuffles.
13966///
13967/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13968/// support for floating point shuffles but not integer shuffles. These
13969/// instructions will incur a domain crossing penalty on some chips though so
13970/// it is better to avoid lowering through this for integer vectors where
13971/// possible.
13973 const APInt &Zeroable, SDValue V1, SDValue V2,
13974 const X86Subtarget &Subtarget,
13975 SelectionDAG &DAG) {
13976 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13977 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13978 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13979
13980 if (V2.isUndef()) {
13981 // Check for being able to broadcast a single element.
13982 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13983 Mask, Subtarget, DAG))
13984 return Broadcast;
13985
13986 // Straight shuffle of a single input vector. Simulate this by using the
13987 // single input as both of the "inputs" to this instruction..
13988 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13989
13990 if (Subtarget.hasAVX()) {
13991 // If we have AVX, we can use VPERMILPS which will allow folding a load
13992 // into the shuffle.
13993 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13994 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13995 }
13996
13997 return DAG.getNode(
13998 X86ISD::SHUFP, DL, MVT::v2f64,
13999 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14000 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14001 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14002 }
14003 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14004 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14005 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14006 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14007
14008 if (Subtarget.hasAVX2())
14009 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14010 return Extract;
14011
14012 // When loading a scalar and then shuffling it into a vector we can often do
14013 // the insertion cheaply.
14015 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14016 return Insertion;
14017 // Try inverting the insertion since for v2 masks it is easy to do and we
14018 // can't reliably sort the mask one way or the other.
14019 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14020 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14022 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14023 return Insertion;
14024
14025 // Try to use one of the special instruction patterns to handle two common
14026 // blend patterns if a zero-blend above didn't work.
14027 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14028 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14029 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14030 // We can either use a special instruction to load over the low double or
14031 // to move just the low double.
14032 return DAG.getNode(
14033 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14034 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14035
14036 if (Subtarget.hasSSE41())
14037 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14038 Zeroable, Subtarget, DAG))
14039 return Blend;
14040
14041 // Use dedicated unpack instructions for masks that match their pattern.
14042 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
14043 return V;
14044
14045 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14046 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14047 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14048}
14049
14050/// Handle lowering of 2-lane 64-bit integer shuffles.
14051///
14052/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14053/// the integer unit to minimize domain crossing penalties. However, for blends
14054/// it falls back to the floating point shuffle operation with appropriate bit
14055/// casting.
14057 const APInt &Zeroable, SDValue V1, SDValue V2,
14058 const X86Subtarget &Subtarget,
14059 SelectionDAG &DAG) {
14060 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14061 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14062 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14063
14064 if (V2.isUndef()) {
14065 // Check for being able to broadcast a single element.
14066 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14067 Mask, Subtarget, DAG))
14068 return Broadcast;
14069
14070 // Straight shuffle of a single input vector. For everything from SSE2
14071 // onward this has a single fast instruction with no scary immediates.
14072 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14073 V1 = DAG.getBitcast(MVT::v4i32, V1);
14074 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14075 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14076 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14077 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14078 return DAG.getBitcast(
14079 MVT::v2i64,
14080 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14081 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14082 }
14083 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14084 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14085 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14086 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14087
14088 if (Subtarget.hasAVX2())
14089 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14090 return Extract;
14091
14092 // Try to use shift instructions.
14093 if (SDValue Shift =
14094 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
14095 DAG, /*BitwiseOnly*/ false))
14096 return Shift;
14097
14098 // When loading a scalar and then shuffling it into a vector we can often do
14099 // the insertion cheaply.
14101 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14102 return Insertion;
14103 // Try inverting the insertion since for v2 masks it is easy to do and we
14104 // can't reliably sort the mask one way or the other.
14105 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14107 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14108 return Insertion;
14109
14110 // We have different paths for blend lowering, but they all must use the
14111 // *exact* same predicate.
14112 bool IsBlendSupported = Subtarget.hasSSE41();
14113 if (IsBlendSupported)
14114 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14115 Zeroable, Subtarget, DAG))
14116 return Blend;
14117
14118 // Use dedicated unpack instructions for masks that match their pattern.
14119 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
14120 return V;
14121
14122 // Try to use byte rotation instructions.
14123 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14124 if (Subtarget.hasSSSE3()) {
14125 if (Subtarget.hasVLX())
14126 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14127 Zeroable, Subtarget, DAG))
14128 return Rotate;
14129
14130 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14131 Subtarget, DAG))
14132 return Rotate;
14133 }
14134
14135 // If we have direct support for blends, we should lower by decomposing into
14136 // a permute. That will be faster than the domain cross.
14137 if (IsBlendSupported)
14138 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14139 Zeroable, Subtarget, DAG);
14140
14141 // We implement this with SHUFPD which is pretty lame because it will likely
14142 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14143 // However, all the alternatives are still more cycles and newer chips don't
14144 // have this problem. It would be really nice if x86 had better shuffles here.
14145 V1 = DAG.getBitcast(MVT::v2f64, V1);
14146 V2 = DAG.getBitcast(MVT::v2f64, V2);
14147 return DAG.getBitcast(MVT::v2i64,
14148 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14149}
14150
14151/// Lower a vector shuffle using the SHUFPS instruction.
14152///
14153/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14154/// It makes no assumptions about whether this is the *best* lowering, it simply
14155/// uses it.
14157 ArrayRef<int> Mask, SDValue V1,
14158 SDValue V2, SelectionDAG &DAG) {
14159 SDValue LowV = V1, HighV = V2;
14160 SmallVector<int, 4> NewMask(Mask);
14161 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14162
14163 if (NumV2Elements == 1) {
14164 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14165
14166 // Compute the index adjacent to V2Index and in the same half by toggling
14167 // the low bit.
14168 int V2AdjIndex = V2Index ^ 1;
14169
14170 if (Mask[V2AdjIndex] < 0) {
14171 // Handles all the cases where we have a single V2 element and an undef.
14172 // This will only ever happen in the high lanes because we commute the
14173 // vector otherwise.
14174 if (V2Index < 2)
14175 std::swap(LowV, HighV);
14176 NewMask[V2Index] -= 4;
14177 } else {
14178 // Handle the case where the V2 element ends up adjacent to a V1 element.
14179 // To make this work, blend them together as the first step.
14180 int V1Index = V2AdjIndex;
14181 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14182 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14183 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14184
14185 // Now proceed to reconstruct the final blend as we have the necessary
14186 // high or low half formed.
14187 if (V2Index < 2) {
14188 LowV = V2;
14189 HighV = V1;
14190 } else {
14191 HighV = V2;
14192 }
14193 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14194 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14195 }
14196 } else if (NumV2Elements == 2) {
14197 if (Mask[0] < 4 && Mask[1] < 4) {
14198 // Handle the easy case where we have V1 in the low lanes and V2 in the
14199 // high lanes.
14200 NewMask[2] -= 4;
14201 NewMask[3] -= 4;
14202 } else if (Mask[2] < 4 && Mask[3] < 4) {
14203 // We also handle the reversed case because this utility may get called
14204 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14205 // arrange things in the right direction.
14206 NewMask[0] -= 4;
14207 NewMask[1] -= 4;
14208 HighV = V1;
14209 LowV = V2;
14210 } else {
14211 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14212 // trying to place elements directly, just blend them and set up the final
14213 // shuffle to place them.
14214
14215 // The first two blend mask elements are for V1, the second two are for
14216 // V2.
14217 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14218 Mask[2] < 4 ? Mask[2] : Mask[3],
14219 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14220 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14221 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14222 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14223
14224 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14225 // a blend.
14226 LowV = HighV = V1;
14227 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14228 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14229 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14230 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14231 }
14232 } else if (NumV2Elements == 3) {
14233 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14234 // we can get here due to other paths (e.g repeated mask matching) that we
14235 // don't want to do another round of lowerVECTOR_SHUFFLE.
14237 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14238 }
14239 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14240 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14241}
14242
14243/// Lower 4-lane 32-bit floating point shuffles.
14244///
14245/// Uses instructions exclusively from the floating point unit to minimize
14246/// domain crossing penalties, as these are sufficient to implement all v4f32
14247/// shuffles.
14249 const APInt &Zeroable, SDValue V1, SDValue V2,
14250 const X86Subtarget &Subtarget,
14251 SelectionDAG &DAG) {
14252 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14253 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14254 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14255
14256 if (Subtarget.hasSSE41())
14257 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14258 Zeroable, Subtarget, DAG))
14259 return Blend;
14260
14261 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14262
14263 if (NumV2Elements == 0) {
14264 // Check for being able to broadcast a single element.
14265 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14266 Mask, Subtarget, DAG))
14267 return Broadcast;
14268
14269 // Use even/odd duplicate instructions for masks that match their pattern.
14270 if (Subtarget.hasSSE3()) {
14271 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14272 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14273 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14274 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14275 }
14276
14277 if (Subtarget.hasAVX()) {
14278 // If we have AVX, we can use VPERMILPS which will allow folding a load
14279 // into the shuffle.
14280 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14281 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14282 }
14283
14284 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14285 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14286 if (!Subtarget.hasSSE2()) {
14287 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14288 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14289 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14290 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14291 }
14292
14293 // Otherwise, use a straight shuffle of a single input vector. We pass the
14294 // input vector to both operands to simulate this with a SHUFPS.
14295 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14296 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14297 }
14298
14299 if (Subtarget.hasSSE2())
14301 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
14302 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
14303 return ZExt;
14304 }
14305
14306 if (Subtarget.hasAVX2())
14307 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14308 return Extract;
14309
14310 // There are special ways we can lower some single-element blends. However, we
14311 // have custom ways we can lower more complex single-element blends below that
14312 // we defer to if both this and BLENDPS fail to match, so restrict this to
14313 // when the V2 input is targeting element 0 of the mask -- that is the fast
14314 // case here.
14315 if (NumV2Elements == 1 && Mask[0] >= 4)
14316 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
14317 Zeroable, Subtarget, DAG))
14318 return V;
14319
14320 if (Subtarget.hasSSE41()) {
14321 bool MatchesShufPS = isSingleSHUFPSMask(Mask);
14322
14323 // Use INSERTPS if we can complete the shuffle efficiently.
14324 if (!MatchesShufPS || Zeroable == 0x3 || Zeroable == 0xC)
14325 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14326 return V;
14327
14328 if (!MatchesShufPS)
14329 if (SDValue BlendPerm =
14330 lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
14331 return BlendPerm;
14332 }
14333
14334 // Use low/high mov instructions. These are only valid in SSE1 because
14335 // otherwise they are widened to v2f64 and never get here.
14336 if (!Subtarget.hasSSE2()) {
14337 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14338 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14339 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14340 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14341 }
14342
14343 // Use dedicated unpack instructions for masks that match their pattern.
14344 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
14345 return V;
14346
14347 // Otherwise fall back to a SHUFPS lowering strategy.
14348 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14349}
14350
14351/// Lower 4-lane i32 vector shuffles.
14352///
14353/// We try to handle these with integer-domain shuffles where we can, but for
14354/// blends we use the floating point domain blend instructions.
14356 const APInt &Zeroable, SDValue V1, SDValue V2,
14357 const X86Subtarget &Subtarget,
14358 SelectionDAG &DAG) {
14359 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14360 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14361 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14362
14363 // Whenever we can lower this as a zext, that instruction is strictly faster
14364 // than any alternative. It also allows us to fold memory operands into the
14365 // shuffle in many cases.
14366 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14367 Zeroable, Subtarget, DAG))
14368 return ZExt;
14369
14370 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14371
14372 // Try to use shift instructions if fast.
14373 if (Subtarget.preferLowerShuffleAsShift()) {
14374 if (SDValue Shift =
14375 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
14376 Subtarget, DAG, /*BitwiseOnly*/ true))
14377 return Shift;
14378 if (NumV2Elements == 0)
14379 if (SDValue Rotate =
14380 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
14381 return Rotate;
14382 }
14383
14384 if (NumV2Elements == 0) {
14385 // Try to use broadcast unless the mask only has one non-undef element.
14386 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14387 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14388 Mask, Subtarget, DAG))
14389 return Broadcast;
14390 }
14391
14392 // Straight shuffle of a single input vector. For everything from SSE2
14393 // onward this has a single fast instruction with no scary immediates.
14394 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14395 // but we aren't actually going to use the UNPCK instruction because doing
14396 // so prevents folding a load into this instruction or making a copy.
14397 const int UnpackLoMask[] = {0, 0, 1, 1};
14398 const int UnpackHiMask[] = {2, 2, 3, 3};
14399 if (!isSingleElementRepeatedMask(Mask)) {
14400 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14401 Mask = UnpackLoMask;
14402 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14403 Mask = UnpackHiMask;
14404 }
14405
14406 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14408 }
14409
14410 if (Subtarget.hasAVX2())
14411 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14412 return Extract;
14413
14414 // Try to use shift instructions.
14415 if (SDValue Shift =
14416 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
14417 DAG, /*BitwiseOnly*/ false))
14418 return Shift;
14419
14420 // There are special ways we can lower some single-element blends.
14421 if (NumV2Elements == 1)
14423 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14424 return V;
14425
14426 // We have different paths for blend lowering, but they all must use the
14427 // *exact* same predicate.
14428 bool IsBlendSupported = Subtarget.hasSSE41();
14429 if (IsBlendSupported)
14430 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14431 Zeroable, Subtarget, DAG))
14432 return Blend;
14433
14434 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14435 Zeroable, Subtarget, DAG))
14436 return Masked;
14437
14438 // Use dedicated unpack instructions for masks that match their pattern.
14439 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
14440 return V;
14441
14442 // Try to use byte rotation instructions.
14443 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14444 if (Subtarget.hasSSSE3()) {
14445 if (Subtarget.hasVLX())
14446 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14447 Zeroable, Subtarget, DAG))
14448 return Rotate;
14449
14450 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14451 Subtarget, DAG))
14452 return Rotate;
14453 }
14454
14455 // Assume that a single SHUFPS is faster than an alternative sequence of
14456 // multiple instructions (even if the CPU has a domain penalty).
14457 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14458 if (!isSingleSHUFPSMask(Mask)) {
14459 // If we have direct support for blends, we should lower by decomposing into
14460 // a permute. That will be faster than the domain cross.
14461 if (IsBlendSupported)
14462 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14463 Zeroable, Subtarget, DAG);
14464
14465 // Try to lower by permuting the inputs into an unpack instruction.
14466 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14467 Mask, Subtarget, DAG))
14468 return Unpack;
14469 }
14470
14471 // We implement this with SHUFPS because it can blend from two vectors.
14472 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14473 // up the inputs, bypassing domain shift penalties that we would incur if we
14474 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14475 // relevant.
14476 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14477 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14478 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14479 return DAG.getBitcast(MVT::v4i32, ShufPS);
14480}
14481
14482/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14483/// shuffle lowering, and the most complex part.
14484///
14485/// The lowering strategy is to try to form pairs of input lanes which are
14486/// targeted at the same half of the final vector, and then use a dword shuffle
14487/// to place them onto the right half, and finally unpack the paired lanes into
14488/// their final position.
14489///
14490/// The exact breakdown of how to form these dword pairs and align them on the
14491/// correct sides is really tricky. See the comments within the function for
14492/// more of the details.
14493///
14494/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14495/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14496/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14497/// vector, form the analogous 128-bit 8-element Mask.
14499 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14500 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14501 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14502 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14503
14504 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14505 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14506 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14507
14508 // Attempt to directly match PSHUFLW or PSHUFHW.
14509 if (isUndefOrInRange(LoMask, 0, 4) &&
14510 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14511 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14512 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14513 }
14514 if (isUndefOrInRange(HiMask, 4, 8) &&
14515 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14516 for (int i = 0; i != 4; ++i)
14517 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14518 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14519 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14520 }
14521
14522 SmallVector<int, 4> LoInputs;
14523 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14524 array_pod_sort(LoInputs.begin(), LoInputs.end());
14525 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14526 SmallVector<int, 4> HiInputs;
14527 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14528 array_pod_sort(HiInputs.begin(), HiInputs.end());
14529 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14530 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14531 int NumHToL = LoInputs.size() - NumLToL;
14532 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14533 int NumHToH = HiInputs.size() - NumLToH;
14534 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14535 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14536 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14537 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14538
14539 // If we are shuffling values from one half - check how many different DWORD
14540 // pairs we need to create. If only 1 or 2 then we can perform this as a
14541 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14542 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14543 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14544 V = DAG.getNode(ShufWOp, DL, VT, V,
14545 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14546 V = DAG.getBitcast(PSHUFDVT, V);
14547 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14548 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14549 return DAG.getBitcast(VT, V);
14550 };
14551
14552 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14553 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14554 SmallVector<std::pair<int, int>, 4> DWordPairs;
14555 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14556
14557 // Collect the different DWORD pairs.
14558 for (int DWord = 0; DWord != 4; ++DWord) {
14559 int M0 = Mask[2 * DWord + 0];
14560 int M1 = Mask[2 * DWord + 1];
14561 M0 = (M0 >= 0 ? M0 % 4 : M0);
14562 M1 = (M1 >= 0 ? M1 % 4 : M1);
14563 if (M0 < 0 && M1 < 0)
14564 continue;
14565
14566 bool Match = false;
14567 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14568 auto &DWordPair = DWordPairs[j];
14569 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14570 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14571 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14572 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14573 PSHUFDMask[DWord] = DOffset + j;
14574 Match = true;
14575 break;
14576 }
14577 }
14578 if (!Match) {
14579 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14580 DWordPairs.push_back(std::make_pair(M0, M1));
14581 }
14582 }
14583
14584 if (DWordPairs.size() <= 2) {
14585 DWordPairs.resize(2, std::make_pair(-1, -1));
14586 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14587 DWordPairs[1].first, DWordPairs[1].second};
14588 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
14589 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
14590 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
14591 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
14592 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14593 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14594 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14595 }
14596 if ((NumHToL + NumHToH) == 0)
14597 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14598 if ((NumLToL + NumLToH) == 0)
14599 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14600 }
14601 }
14602
14603 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14604 // such inputs we can swap two of the dwords across the half mark and end up
14605 // with <=2 inputs to each half in each half. Once there, we can fall through
14606 // to the generic code below. For example:
14607 //
14608 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14609 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14610 //
14611 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14612 // and an existing 2-into-2 on the other half. In this case we may have to
14613 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14614 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14615 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14616 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14617 // half than the one we target for fixing) will be fixed when we re-enter this
14618 // path. We will also combine away any sequence of PSHUFD instructions that
14619 // result into a single instruction. Here is an example of the tricky case:
14620 //
14621 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14622 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14623 //
14624 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14625 //
14626 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14627 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14628 //
14629 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14630 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14631 //
14632 // The result is fine to be handled by the generic logic.
14633 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14634 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14635 int AOffset, int BOffset) {
14636 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14637 "Must call this with A having 3 or 1 inputs from the A half.");
14638 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14639 "Must call this with B having 1 or 3 inputs from the B half.");
14640 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14641 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14642
14643 bool ThreeAInputs = AToAInputs.size() == 3;
14644
14645 // Compute the index of dword with only one word among the three inputs in
14646 // a half by taking the sum of the half with three inputs and subtracting
14647 // the sum of the actual three inputs. The difference is the remaining
14648 // slot.
14649 int ADWord = 0, BDWord = 0;
14650 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14651 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14652 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14653 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14654 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14655 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14656 int TripleNonInputIdx =
14657 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14658 TripleDWord = TripleNonInputIdx / 2;
14659
14660 // We use xor with one to compute the adjacent DWord to whichever one the
14661 // OneInput is in.
14662 OneInputDWord = (OneInput / 2) ^ 1;
14663
14664 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14665 // and BToA inputs. If there is also such a problem with the BToB and AToB
14666 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14667 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14668 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14669 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14670 // Compute how many inputs will be flipped by swapping these DWords. We
14671 // need
14672 // to balance this to ensure we don't form a 3-1 shuffle in the other
14673 // half.
14674 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14675 llvm::count(AToBInputs, 2 * ADWord + 1);
14676 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14677 llvm::count(BToBInputs, 2 * BDWord + 1);
14678 if ((NumFlippedAToBInputs == 1 &&
14679 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14680 (NumFlippedBToBInputs == 1 &&
14681 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14682 // We choose whether to fix the A half or B half based on whether that
14683 // half has zero flipped inputs. At zero, we may not be able to fix it
14684 // with that half. We also bias towards fixing the B half because that
14685 // will more commonly be the high half, and we have to bias one way.
14686 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14687 ArrayRef<int> Inputs) {
14688 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14689 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14690 // Determine whether the free index is in the flipped dword or the
14691 // unflipped dword based on where the pinned index is. We use this bit
14692 // in an xor to conditionally select the adjacent dword.
14693 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14694 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14695 if (IsFixIdxInput == IsFixFreeIdxInput)
14696 FixFreeIdx += 1;
14697 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14698 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14699 "We need to be changing the number of flipped inputs!");
14700 int PSHUFHalfMask[] = {0, 1, 2, 3};
14701 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14702 V = DAG.getNode(
14703 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14704 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14705 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14706
14707 for (int &M : Mask)
14708 if (M >= 0 && M == FixIdx)
14709 M = FixFreeIdx;
14710 else if (M >= 0 && M == FixFreeIdx)
14711 M = FixIdx;
14712 };
14713 if (NumFlippedBToBInputs != 0) {
14714 int BPinnedIdx =
14715 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14716 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14717 } else {
14718 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14719 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14720 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14721 }
14722 }
14723 }
14724
14725 int PSHUFDMask[] = {0, 1, 2, 3};
14726 PSHUFDMask[ADWord] = BDWord;
14727 PSHUFDMask[BDWord] = ADWord;
14728 V = DAG.getBitcast(
14729 VT,
14730 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14731 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14732
14733 // Adjust the mask to match the new locations of A and B.
14734 for (int &M : Mask)
14735 if (M >= 0 && M/2 == ADWord)
14736 M = 2 * BDWord + M % 2;
14737 else if (M >= 0 && M/2 == BDWord)
14738 M = 2 * ADWord + M % 2;
14739
14740 // Recurse back into this routine to re-compute state now that this isn't
14741 // a 3 and 1 problem.
14742 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14743 };
14744 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14745 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14746 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14747 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14748
14749 // At this point there are at most two inputs to the low and high halves from
14750 // each half. That means the inputs can always be grouped into dwords and
14751 // those dwords can then be moved to the correct half with a dword shuffle.
14752 // We use at most one low and one high word shuffle to collect these paired
14753 // inputs into dwords, and finally a dword shuffle to place them.
14754 int PSHUFLMask[4] = {-1, -1, -1, -1};
14755 int PSHUFHMask[4] = {-1, -1, -1, -1};
14756 int PSHUFDMask[4] = {-1, -1, -1, -1};
14757
14758 // First fix the masks for all the inputs that are staying in their
14759 // original halves. This will then dictate the targets of the cross-half
14760 // shuffles.
14761 auto fixInPlaceInputs =
14762 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14763 MutableArrayRef<int> SourceHalfMask,
14764 MutableArrayRef<int> HalfMask, int HalfOffset) {
14765 if (InPlaceInputs.empty())
14766 return;
14767 if (InPlaceInputs.size() == 1) {
14768 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14769 InPlaceInputs[0] - HalfOffset;
14770 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14771 return;
14772 }
14773 if (IncomingInputs.empty()) {
14774 // Just fix all of the in place inputs.
14775 for (int Input : InPlaceInputs) {
14776 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14777 PSHUFDMask[Input / 2] = Input / 2;
14778 }
14779 return;
14780 }
14781
14782 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14783 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14784 InPlaceInputs[0] - HalfOffset;
14785 // Put the second input next to the first so that they are packed into
14786 // a dword. We find the adjacent index by toggling the low bit.
14787 int AdjIndex = InPlaceInputs[0] ^ 1;
14788 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14789 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14790 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14791 };
14792 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14793 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14794
14795 // Now gather the cross-half inputs and place them into a free dword of
14796 // their target half.
14797 // FIXME: This operation could almost certainly be simplified dramatically to
14798 // look more like the 3-1 fixing operation.
14799 auto moveInputsToRightHalf = [&PSHUFDMask](
14800 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14801 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14802 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14803 int DestOffset) {
14804 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14805 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14806 };
14807 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14808 int Word) {
14809 int LowWord = Word & ~1;
14810 int HighWord = Word | 1;
14811 return isWordClobbered(SourceHalfMask, LowWord) ||
14812 isWordClobbered(SourceHalfMask, HighWord);
14813 };
14814
14815 if (IncomingInputs.empty())
14816 return;
14817
14818 if (ExistingInputs.empty()) {
14819 // Map any dwords with inputs from them into the right half.
14820 for (int Input : IncomingInputs) {
14821 // If the source half mask maps over the inputs, turn those into
14822 // swaps and use the swapped lane.
14823 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14824 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14825 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14826 Input - SourceOffset;
14827 // We have to swap the uses in our half mask in one sweep.
14828 for (int &M : HalfMask)
14829 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14830 M = Input;
14831 else if (M == Input)
14832 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14833 } else {
14834 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14835 Input - SourceOffset &&
14836 "Previous placement doesn't match!");
14837 }
14838 // Note that this correctly re-maps both when we do a swap and when
14839 // we observe the other side of the swap above. We rely on that to
14840 // avoid swapping the members of the input list directly.
14841 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14842 }
14843
14844 // Map the input's dword into the correct half.
14845 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14846 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14847 else
14848 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14849 Input / 2 &&
14850 "Previous placement doesn't match!");
14851 }
14852
14853 // And just directly shift any other-half mask elements to be same-half
14854 // as we will have mirrored the dword containing the element into the
14855 // same position within that half.
14856 for (int &M : HalfMask)
14857 if (M >= SourceOffset && M < SourceOffset + 4) {
14858 M = M - SourceOffset + DestOffset;
14859 assert(M >= 0 && "This should never wrap below zero!");
14860 }
14861 return;
14862 }
14863
14864 // Ensure we have the input in a viable dword of its current half. This
14865 // is particularly tricky because the original position may be clobbered
14866 // by inputs being moved and *staying* in that half.
14867 if (IncomingInputs.size() == 1) {
14868 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14869 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14870 SourceOffset;
14871 SourceHalfMask[InputFixed - SourceOffset] =
14872 IncomingInputs[0] - SourceOffset;
14873 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14874 IncomingInputs[0] = InputFixed;
14875 }
14876 } else if (IncomingInputs.size() == 2) {
14877 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14878 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14879 // We have two non-adjacent or clobbered inputs we need to extract from
14880 // the source half. To do this, we need to map them into some adjacent
14881 // dword slot in the source mask.
14882 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14883 IncomingInputs[1] - SourceOffset};
14884
14885 // If there is a free slot in the source half mask adjacent to one of
14886 // the inputs, place the other input in it. We use (Index XOR 1) to
14887 // compute an adjacent index.
14888 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14889 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14890 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14891 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14892 InputsFixed[1] = InputsFixed[0] ^ 1;
14893 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14894 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14895 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14896 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14897 InputsFixed[0] = InputsFixed[1] ^ 1;
14898 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14899 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14900 // The two inputs are in the same DWord but it is clobbered and the
14901 // adjacent DWord isn't used at all. Move both inputs to the free
14902 // slot.
14903 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14904 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14905 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14906 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14907 } else {
14908 // The only way we hit this point is if there is no clobbering
14909 // (because there are no off-half inputs to this half) and there is no
14910 // free slot adjacent to one of the inputs. In this case, we have to
14911 // swap an input with a non-input.
14912 for (int i = 0; i < 4; ++i)
14913 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14914 "We can't handle any clobbers here!");
14915 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14916 "Cannot have adjacent inputs here!");
14917
14918 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14919 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14920
14921 // We also have to update the final source mask in this case because
14922 // it may need to undo the above swap.
14923 for (int &M : FinalSourceHalfMask)
14924 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14925 M = InputsFixed[1] + SourceOffset;
14926 else if (M == InputsFixed[1] + SourceOffset)
14927 M = (InputsFixed[0] ^ 1) + SourceOffset;
14928
14929 InputsFixed[1] = InputsFixed[0] ^ 1;
14930 }
14931
14932 // Point everything at the fixed inputs.
14933 for (int &M : HalfMask)
14934 if (M == IncomingInputs[0])
14935 M = InputsFixed[0] + SourceOffset;
14936 else if (M == IncomingInputs[1])
14937 M = InputsFixed[1] + SourceOffset;
14938
14939 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14940 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14941 }
14942 } else {
14943 llvm_unreachable("Unhandled input size!");
14944 }
14945
14946 // Now hoist the DWord down to the right half.
14947 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14948 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14949 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14950 for (int &M : HalfMask)
14951 for (int Input : IncomingInputs)
14952 if (M == Input)
14953 M = FreeDWord * 2 + Input % 2;
14954 };
14955 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14956 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14957 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14958 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14959
14960 // Now enact all the shuffles we've computed to move the inputs into their
14961 // target half.
14962 if (!isNoopShuffleMask(PSHUFLMask))
14963 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14964 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14965 if (!isNoopShuffleMask(PSHUFHMask))
14966 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14967 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14968 if (!isNoopShuffleMask(PSHUFDMask))
14969 V = DAG.getBitcast(
14970 VT,
14971 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14972 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14973
14974 // At this point, each half should contain all its inputs, and we can then
14975 // just shuffle them into their final position.
14976 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14977 "Failed to lift all the high half inputs to the low mask!");
14978 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14979 "Failed to lift all the low half inputs to the high mask!");
14980
14981 // Do a half shuffle for the low mask.
14982 if (!isNoopShuffleMask(LoMask))
14983 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14984 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14985
14986 // Do a half shuffle with the high mask after shifting its values down.
14987 for (int &M : HiMask)
14988 if (M >= 0)
14989 M -= 4;
14990 if (!isNoopShuffleMask(HiMask))
14991 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14992 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14993
14994 return V;
14995}
14996
// NOTE(review): the embedded listing numbers skip 14999 and 15002, so the line
// that names this function and the opening line of the assert are absent from
// this copy. The call near listing line 15255 passes the same argument list
// under the name lowerShuffleAsBlendOfPSHUFBs -- confirm against upstream.
14997/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14998/// blend if only one input is used.
//
// Parameters (as used by the body below):
//   VT        - result type; its bit size determines the number of byte lanes.
//   V1, V2    - the two shuffle inputs; each is bitcast to a byte vector before
//               being shuffled.
//   Mask      - element-level shuffle mask. Entries < 0 are skipped, entries
//               < Mask.size() select from V1, larger entries select from V2.
//   Zeroable  - per-mask-element flag; a set bit forces both byte indices for
//               that element to ZeroMask.
//   V1InUse,
//   V2InUse   - outputs; set to true when at least one byte index built for the
//               corresponding input is something other than ZeroMask.
// Returns the shuffled (and, if both inputs are used, OR-combined) value bitcast
// back to VT.
15000 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15001 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15003 "Lane crossing shuffle masks not supported");
15004
// Each mask element expands to Scale consecutive byte indices.
15005 int NumBytes = VT.getSizeInBits() / 8;
15006 int Size = Mask.size();
15007 int Scale = NumBytes / Size;
15008
// Both byte-index vectors start out fully undef; bytes whose mask element is
// negative are never overwritten and so stay undef in both.
15009 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15010 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15011 V1InUse = false;
15012 V2InUse = false;
15013
15014 for (int i = 0; i < NumBytes; ++i) {
15015 int M = Mask[i / Scale];
15016 if (M < 0)
15017 continue;
15018
// ZeroMask is the index written for a byte that must not come from that input.
// Presumably PSHUFB produces zero for an index with the high bit set, which is
// what lets the OR below act as a blend -- confirm against the ISA reference.
15019 const int ZeroMask = 0x80;
// Exactly one of V1Idx / V2Idx is a real byte index; the other is ZeroMask.
15020 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15021 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15022 if (Zeroable[i / Scale])
15023 V1Idx = V2Idx = ZeroMask;
15024
15025 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15026 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15027 V1InUse |= (ZeroMask != V1Idx);
15028 V2InUse |= (ZeroMask != V2Idx);
15029 }
15030
// Only emit a PSHUFB node for an input that actually contributes bytes.
15031 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15032 if (V1InUse)
15033 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15034 DAG.getBuildVector(ShufVT, DL, V1Mask));
15035 if (V2InUse)
15036 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15037 DAG.getBuildVector(ShufVT, DL, V2Mask));
15038
15039 // If we need shuffled inputs from both, blend the two.
15040 SDValue V;
15041 if (V1InUse && V2InUse)
15042 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15043 else
// When neither input is in use this selects V2, which at that point is still
// the unmodified V2 argument.
15044 V = V1InUse ? V1 : V2;
15045
15046 // Cast the result back to the correct type.
15047 return DAG.getBitcast(VT, V);
15048}
15049
// NOTE(review): the embedded listing numbers skip 15062 (the line carrying the
// return type, function name and first parameters) and 15138 (the opening of
// the `if (SDValue V = ...` under `NumV2Inputs == 1`). Both are absent from
// this copy; restore them from the upstream file before compiling.
15050/// Generic lowering of 8-lane i16 shuffles.
15051///
15052/// This handles both single-input shuffles and combined shuffle/blends with
15053/// two inputs. The single input shuffles are immediately delegated to
15054/// a dedicated lowering routine.
15055///
15056/// The blends are lowered in one of three fundamental ways. If there are few
15057/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15058/// of the input is significantly cheaper when lowered as an interleaving of
15059/// the two inputs, try to interleave them. Otherwise, blend the low and high
15060/// halves of the inputs separately (making them have relatively few inputs)
15061/// and then concatenate them.
//
// The body is an ordered list of lowering attempts: each helper returns a
// non-null SDValue on success, and the first success is returned immediately.
// The order of the attempts is therefore part of the behaviour.
// The asserts require V1 and V2 to be v8i16 and Mask to have 8 entries; mask
// entries >= 8 are counted as references to V2.
15063 const APInt &Zeroable, SDValue V1, SDValue V2,
15064 const X86Subtarget &Subtarget,
15065 SelectionDAG &DAG) {
15066 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15067 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15068 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15069
15070 // Whenever we can lower this as a zext, that instruction is strictly faster
15071 // than any alternative.
15072 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15073 Zeroable, Subtarget, DAG))
15074 return ZExt;
15075
15076 // Try to use lower using a truncation.
15077 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15078 Subtarget, DAG))
15079 return V;
15080
15081 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15082
// Single-input path: no mask entry refers to V2. This branch always returns,
// ending in the dedicated single-input routine.
15083 if (NumV2Inputs == 0) {
15084 // Try to use shift instructions.
15085 if (SDValue Shift =
15086 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
15087 Subtarget, DAG, /*BitwiseOnly*/ false))
15088 return Shift;
15089
15090 // Check for being able to broadcast a single element.
15091 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15092 Mask, Subtarget, DAG))
15093 return Broadcast;
15094
15095 // Try to use bit rotation instructions.
15096 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15097 Subtarget, DAG))
15098 return Rotate;
15099
15100 // Use dedicated unpack instructions for masks that match their pattern.
15101 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15102 return V;
15103
15104 // Use dedicated pack instructions for masks that match their pattern.
15105 if (SDValue V =
15106 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15107 return V;
15108
15109 // Try to use byte rotation instructions.
15110 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15111 Subtarget, DAG))
15112 return Rotate;
15113
15114 // Make a copy of the mask so it can be modified.
15115 SmallVector<int, 8> MutableMask(Mask);
15116 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15117 Subtarget, DAG);
15118 }
15119
// Two-input path from here on: at least one mask entry refers to V2, and the
// assert below requires at least one to refer to V1 as well.
15120 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15121 "All single-input shuffles should be canonicalized to be V1-input "
15122 "shuffles.");
15123
15124 // Try to use shift instructions.
15125 if (SDValue Shift =
15126 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
15127 DAG, /*BitwiseOnly*/ false))
15128 return Shift;
15129
15130 // See if we can use SSE4A Extraction / Insertion.
15131 if (Subtarget.hasSSE4A())
15132 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15133 Zeroable, DAG))
15134 return V;
15135
15136 // There are special ways we can lower some single-element blends.
// NOTE(review): the line that opens the call guarded by this condition is
// missing from this copy (see the note at the top of the function).
15137 if (NumV2Inputs == 1)
15139 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15140 return V;
15141
15142 // We have different paths for blend lowering, but they all must use the
15143 // *exact* same predicate.
15144 bool IsBlendSupported = Subtarget.hasSSE41();
15145 if (IsBlendSupported)
15146 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15147 Zeroable, Subtarget, DAG))
15148 return Blend;
15149
15150 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15151 Zeroable, Subtarget, DAG))
15152 return Masked;
15153
15154 // Use dedicated unpack instructions for masks that match their pattern.
15155 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15156 return V;
15157
15158 // Use dedicated pack instructions for masks that match their pattern.
15159 if (SDValue V =
15160 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15161 return V;
15162
15163 // Try to use lower using a truncation.
15164 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15165 Subtarget, DAG))
15166 return V;
15167
15168 // Try to use byte rotation instructions.
15169 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15170 Subtarget, DAG))
15171 return Rotate;
15172
15173 if (SDValue BitBlend =
15174 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15175 return BitBlend;
15176
15177 // Try to use byte shift instructions to mask.
15178 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15179 Zeroable, Subtarget, DAG))
15180 return V;
15181
15182 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
// Three preparation strategies are tried below; each sets PackOpc when it
// applies. If none applies (SSSE3 available but not SSE4.1, one even drop)
// PackOpc stays 0 and control falls through to the later strategies.
15183 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15184 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
15185 !Subtarget.hasVLX()) {
15186 // Check if this is part of a 256-bit vector truncation.
15187 unsigned PackOpc = 0;
15188 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15189 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15190 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// Rejoin the two halves, blend against a zero vector with immediate 0xEE on
// the 16-lane value, then split the result back into two 128-bit v4i32 halves.
15191 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15192 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15193 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15194 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15195 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15196 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15197 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15198 PackOpc = X86ISD::PACKUS;
15199 } else if (Subtarget.hasSSE41()) {
// Build a v4i32 AND mask: dwords visited by the loop keep their low 16 bits
// (0xFFFF), all others are cleared. The stride is 1 for one drop (every dword
// kept) and 2 for two drops (dwords 0 and 2 kept).
15200 SmallVector<SDValue, 4> DWordClearOps(4,
15201 DAG.getConstant(0, DL, MVT::i32));
15202 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15203 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15204 SDValue DWordClearMask =
15205 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15206 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15207 DWordClearMask);
15208 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15209 DWordClearMask);
15210 PackOpc = X86ISD::PACKUS;
15211 } else if (!Subtarget.hasSSSE3()) {
// Shift each dword left then arithmetically right by 16, i.e. sign-extend its
// low 16 bits, and pair that with PACKSS. Presumably this keeps the signed
// saturation of the pack from altering the values -- confirm against the ISA.
15212 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
15213 V1 = DAG.getBitcast(MVT::v4i32, V1);
15214 V2 = DAG.getBitcast(MVT::v4i32, V2);
15215 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
15216 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
15217 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
15218 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
15219 PackOpc = X86ISD::PACKSS;
15220 }
15221 if (PackOpc) {
15222 // Now pack things back together.
15223 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
// For two drops the packed result is reinterpreted as v4i32 and packed with
// itself a second time.
15224 if (NumEvenDrops == 2) {
15225 Result = DAG.getBitcast(MVT::v4i32, Result);
15226 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
15227 }
15228 return Result;
15229 }
15230 }
15231
15232 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
// The shift kind and the pack kind are chosen together: logical shift with
// PACKUS when SSE4.1 is available, arithmetic shift with PACKSS otherwise.
15233 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15234 if (NumOddDrops == 1) {
15235 bool HasSSE41 = Subtarget.hasSSE41();
15236 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15237 DAG.getBitcast(MVT::v4i32, V1),
15238 DAG.getTargetConstant(16, DL, MVT::i8));
15239 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15240 DAG.getBitcast(MVT::v4i32, V2),
15241 DAG.getTargetConstant(16, DL, MVT::i8));
15242 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15243 MVT::v8i16, V1, V2);
15244 }
15245
15246 // Try to lower by permuting the inputs into an unpack instruction.
15247 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15248 Mask, Subtarget, DAG))
15249 return Unpack;
15250
15251 // If we can't directly blend but can use PSHUFB, that will be better as it
15252 // can both shuffle and set up the inefficient blend.
15253 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
// The two in-use flags are required by the callee's signature; their values
// are not read here.
15254 bool V1InUse, V2InUse;
15255 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15256 Zeroable, DAG, V1InUse, V2InUse);
15257 }
15258
15259 // We can always bit-blend if we have to so the fallback strategy is to
15260 // decompose into single-input permutes and blends/unpacks.
15261 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
15262 Zeroable, Subtarget, DAG);
15263}
15264
// NOTE(review): the embedded listing numbers skip 15266 (the line carrying the
// return type, function name and first parameters) and 15283 (the opening of
// the `if (SDValue V = ...` call whose arguments appear on 15284). Both are
// absent from this copy; restore them from the upstream file before compiling.
15265/// Lower 8-lane 16-bit floating point shuffles.
//
// The asserts require V1 and V2 to be v8f16 and Mask to have 8 entries. Mask
// entries >= 8 are counted as V2 elements. FP16-specific lowerings are tried
// only when the subtarget reports FP16 support; otherwise, or when they do not
// apply, the shuffle is re-expressed on v8i16 and bitcast back to v8f16.
15267 const APInt &Zeroable, SDValue V1, SDValue V2,
15268 const X86Subtarget &Subtarget,
15269 SelectionDAG &DAG) {
15270 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15271 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15272 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15273 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15274
15275 if (Subtarget.hasFP16()) {
15276 if (NumV2Elements == 0) {
15277 // Check for being able to broadcast a single element.
15278 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15279 Mask, Subtarget, DAG))
15280 return Broadcast;
15281 }
// Special case: exactly one element comes from V2 and it lands in lane 0.
// NOTE(review): the callee name is on the missing line 15283.
15282 if (NumV2Elements == 1 && Mask[0] >= 8)
15284 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15285 return V;
15286 }
15287
// Fallback: reinterpret both inputs as v8i16, build a generic vector shuffle
// with the same mask, and bitcast the result back to v8f16.
15288 V1 = DAG.getBitcast(MVT::v8i16, V1);
15289 V2 = DAG.getBitcast(MVT::v8i16, V2);
15290 return DAG.getBitcast(MVT::v8f16,
15291 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15292}
15293
// NOTE(review): the embedded listing numbers skip 15297 (the line carrying the
// return type, function name and first parameters) and 15305 (a statement
// inside the commute block, just before std::swap). Both are absent from this
// copy; restore them from the upstream file before compiling.
15294// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
15295// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15296// the active subvector is extracted.
//
// OriginalMask is copied into a local Mask, so the caller's mask is never
// modified. V2 being undef selects the single-input VPERMV form; otherwise the
// two-input VPERMV3 form is used. The result always has type VT.
15298 ArrayRef<int> OriginalMask, SDValue V1,
15299 SDValue V2, const X86Subtarget &Subtarget,
15300 SelectionDAG &DAG) {
15301 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
15302 SmallVector<int, 32> Mask(OriginalMask);
15303 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
15304 !isShuffleFoldableLoad(V2)) {
// NOTE(review): only the operand swap is visible here. Swapping operands
// without also commuting Mask would change which input each index refers to,
// so the missing line 15305 presumably commutes Mask -- verify upstream.
15306 std::swap(V1, V2);
15307 }
15308
// The index vector uses the integer type with the same lane layout as VT.
15309 MVT MaskVT = VT.changeTypeToInteger();
15310 SDValue MaskNode;
15311 MVT ShuffleVT = VT;
15312 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
// Without VLX, narrower vectors are widened to 512 bits and shuffled there.
15313 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15314 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15315 ShuffleVT = V1.getSimpleValueType();
15316
15317 // Adjust mask to correct indices for the second input.
// Indices referring to the second input (>= NumElts) are moved up by
// (Scale - 1) * NumElts; presumably this is because the widened first input
// now spans Scale * NumElts lanes. First-input indices are left unchanged.
15318 int NumElts = VT.getVectorNumElements();
15319 unsigned Scale = 512 / VT.getSizeInBits();
15320 SmallVector<int, 32> AdjustedMask(Mask);
15321 for (int &M : AdjustedMask)
15322 if (NumElts <= M)
15323 M += (Scale - 1) * NumElts;
15324 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15325 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15326 } else {
15327 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15328 }
15329
// Operand order differs between the two nodes: VPERMV takes (indices, input),
// VPERMV3 takes (first input, indices, second input).
15330 SDValue Result;
15331 if (V2.isUndef())
15332 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15333 else
15334 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15335
// If the shuffle was performed on a widened type, take the subvector at
// offset 0 with the original width.
15336 if (VT != ShuffleVT)
15337 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15338
15339 return Result;
15340}
15341
15342 /// Generic lowering of v16i8 shuffles.
15343 ///
15344 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15345 /// detect any complexity reducing interleaving. If that doesn't help, it uses
15346 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15347 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15348 /// back together.
// NOTE(review): listing line 15349 (the line that opens this definition:
// return type, name and leading parameters) is absent from this listing. The
// 128-bit dispatcher calls this routine as
// lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG).
15350 const APInt &Zeroable, SDValue V1, SDValue V2,
15351 const X86Subtarget &Subtarget,
15352 SelectionDAG &DAG) {
15353 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15354 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15355 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15356
15357 // Try to use shift instructions.
15358 if (SDValue Shift =
15359 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
15360 DAG, /*BitwiseOnly*/ false))
15361 return Shift;
15362
15363 // Try to use byte rotation instructions.
15364 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15365 Subtarget, DAG))
15366 return Rotate;
15367
15368 // Use dedicated pack instructions for masks that match their pattern.
15369 if (SDValue V =
15370 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15371 return V;
15372
15373 // Try to use a zext lowering.
15374 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15375 Zeroable, Subtarget, DAG))
15376 return ZExt;
15377
15378 // Try to lower using a truncation.
15379 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15380 Subtarget, DAG))
15381 return V;
15382
15383 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15384 Subtarget, DAG))
15385 return V;
15386
15387 // See if we can use SSE4A Extraction / Insertion.
15388 if (Subtarget.hasSSE4A())
15389 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15390 Zeroable, DAG))
15391 return V;
15392
// Count the mask entries that index past the 16 elements of the first input.
15393 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15394
15395 // For single-input shuffles, there are some nicer lowering tricks we can use.
15396 if (NumV2Elements == 0) {
15397 // Check for being able to broadcast a single element.
15398 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15399 Mask, Subtarget, DAG))
15400 return Broadcast;
15401
15402 // Try to use bit rotation instructions.
15403 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15404 Subtarget, DAG))
15405 return Rotate;
15406
15407 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15408 return V;
15409
15410 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15411 // Notably, this handles splat and partial-splat shuffles more efficiently.
15412 // However, it only makes sense if the pre-duplication shuffle simplifies
15413 // things significantly. Currently, this means we need to be able to
15414 // express the pre-duplication shuffle as an i16 shuffle.
15415 //
15416 // FIXME: We should check for other patterns which can be widened into an
15417 // i16 shuffle as well.
// Widening is possible when, in every pair of adjacent result bytes, the two
// entries are equal or at least one of them is undef.
15418 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15419 for (int i = 0; i < 16; i += 2)
15420 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15421 return false;
15422
15423 return true;
15424 };
15425 auto tryToWidenViaDuplication = [&]() -> SDValue {
15426 if (!canWidenViaDuplication(Mask))
15427 return SDValue();
// Collect the sorted, de-duplicated source bytes taken from the low half
// (0..7) and from the high half (>= 8) of the input.
15428 SmallVector<int, 4> LoInputs;
15429 copy_if(Mask, std::back_inserter(LoInputs),
15430 [](int M) { return M >= 0 && M < 8; });
15431 array_pod_sort(LoInputs.begin(), LoInputs.end());
15432 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
15433 SmallVector<int, 4> HiInputs;
15434 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15435 array_pod_sort(HiInputs.begin(), HiInputs.end());
15436 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
15437
// Keep the half that supplies more distinct bytes in place and move the
// bytes of the other half into it.
15438 bool TargetLo = LoInputs.size() >= HiInputs.size();
15439 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15440 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15441
15442 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
// NOTE(review): listing line 15443 is absent here. Presumably it declares
// LaneMap, which the code below uses as a map from an original byte index to
// that byte's index after the pre-duplication shuffle; confirm against
// upstream.
15444 for (int I : InPlaceInputs) {
15445 PreDupI16Shuffle[I/2] = I/2;
15446 LaneMap[I] = I;
15447 }
// j walks the four i16 slots of the target half looking for free slots.
15448 int j = TargetLo ? 0 : 4, je = j + 4;
15449 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15450 // Check if j is already a shuffle of this input. This happens when
15451 // there are two adjacent bytes after we move the low one.
15452 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15453 // If we haven't yet mapped the input, search for a slot into which
15454 // we can map it.
15455 while (j < je && PreDupI16Shuffle[j] >= 0)
15456 ++j;
15457
15458 if (j == je)
15459 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15460 return SDValue();
15461
15462 // Map this input with the i16 shuffle.
15463 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15464 }
15465
15466 // Update the lane map based on the mapping we ended up with.
15467 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15468 }
15469 V1 = DAG.getBitcast(
15470 MVT::v16i8,
15471 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15472 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15473
15474 // Unpack the bytes to form the i16s that will be shuffled into place.
// An UNPCK operand is replaced by undef when no even (resp. odd) result byte
// is actually used.
15475 bool EvenInUse = false, OddInUse = false;
15476 for (int i = 0; i < 16; i += 2) {
15477 EvenInUse |= (Mask[i + 0] >= 0);
15478 OddInUse |= (Mask[i + 1] >= 0);
15479 if (EvenInUse && OddInUse)
15480 break;
15481 }
15482 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15483 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15484 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15485
// Build the final i16 shuffle: each result word takes the mapped position of
// whichever of its two bytes is defined.
15486 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15487 for (int i = 0; i < 16; ++i)
15488 if (Mask[i] >= 0) {
15489 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15490 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15491 if (PostDupI16Shuffle[i / 2] < 0)
15492 PostDupI16Shuffle[i / 2] = MappedMask;
15493 else
15494 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15495 "Conflicting entries in the original shuffle!");
15496 }
15497 return DAG.getBitcast(
15498 MVT::v16i8,
15499 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15500 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15501 };
15502 if (SDValue V = tryToWidenViaDuplication())
15503 return V;
15504 }
15505
15506 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15507 Zeroable, Subtarget, DAG))
15508 return Masked;
15509
15510 // Use dedicated unpack instructions for masks that match their pattern.
15511 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15512 return V;
15513
15514 // Try to use byte shift instructions to mask.
15515 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15516 Zeroable, Subtarget, DAG))
15517 return V;
15518
15519 // Check for compaction patterns.
15520 bool IsSingleInput = V2.isUndef();
15521 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
15522
15523 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15524 // with PSHUFB. It is important to do this before we attempt to generate any
15525 // blends but after all of the single-input lowerings. If the single input
15526 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15527 // want to preserve that and we can DAG combine any longer sequences into
15528 // a PSHUFB in the end. But once we start blending from multiple inputs,
15529 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15530 // and there are *very* few patterns that would actually be faster than the
15531 // PSHUFB approach because of its ability to zero lanes.
15532 //
15533 // If the mask is a binary compaction, we can more efficiently perform this
15534 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15535 //
15536 // FIXME: The only exceptions to the above are blends which are exact
15537 // interleavings with direct instructions supporting them. We currently don't
15538 // handle those well here.
15539 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15540 bool V1InUse = false;
15541 bool V2InUse = false;
15542
// NOTE(review): listing line 15543 is absent here. It is the opening of the
// statement that initialises the `PSHUFB` value returned at the end of this
// branch from a helper call; the helper's name is not visible in this
// listing. V1InUse/V2InUse are presumably set by that call; confirm.
15544 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15545
15546 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15547 // do so. This avoids using them to handle blends-with-zero which is
15548 // important as a single pshufb is significantly faster for that.
15549 if (V1InUse && V2InUse) {
15550 if (Subtarget.hasSSE41())
15551 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15552 Zeroable, Subtarget, DAG))
15553 return Blend;
15554
15555 // We can use an unpack to do the blending rather than an or in some
15556 // cases. Even though the or may be (very minorly) more efficient, we
15557 // prefer this lowering because there are common cases where part of
15558 // the complexity of the shuffles goes away when we do the final blend as
15559 // an unpack.
15560 // FIXME: It might be worth trying to detect if the unpack-feeding
15561 // shuffles will both be pshufb, in which case we shouldn't bother with
15562 // this.
// NOTE(review): listing line 15563 is absent here. It is the opening of the
// `if (SDValue Unpack = <helper>(` call whose arguments continue on the next
// line; the helper's name is not visible in this listing.
15564 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15565 return Unpack;
15566
15567 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15568 if (Subtarget.hasVBMI())
15569 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15570 DAG);
15571
15572 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15573 if (Subtarget.hasXOP()) {
15574 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15575 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15576 }
15577
15578 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15579 // PALIGNR will be cheaper than the second PSHUFB+OR.
// NOTE(review): listing line 15580 is absent here. It is the opening of the
// `if (SDValue V = <helper>(` call whose arguments continue on the next line;
// the helper's name is not visible in this listing.
15581 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15582 return V;
15583 }
15584
15585 return PSHUFB;
15586 }
15587
15588 // There are special ways we can lower some single-element blends.
15589 if (NumV2Elements == 1)
// NOTE(review): listing line 15590 is absent here. It is the opening of the
// `if (SDValue V = <helper>(` call whose arguments continue on the next line;
// the helper's name is not visible in this listing.
15591 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15592 return V;
15593
15594 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15595 return Blend;
15596
15597 // Check whether a compaction lowering can be done. This handles shuffles
15598 // which take every Nth element for some even N. See the helper function for
15599 // details.
15600 //
15601 // We special case these as they can be particularly efficiently handled with
15602 // the PACKUSWB instruction on x86 and they show up in common patterns of
15603 // rearranging bytes to truncate wide elements.
15604 if (NumEvenDrops) {
15605 // NumEvenDrops is the power of two stride of the elements. Another way of
15606 // thinking about it is that we need to drop the even elements this many
15607 // times to get the original input.
15608
15609 // First we need to zero all the dropped bytes.
15610 assert(NumEvenDrops <= 3 &&
15611 "No support for dropping even elements more than 3 times.");
// Start from an all-zero word mask and set 0xFF (keep the low byte) in every
// (1 << (NumEvenDrops - 1))-th word.
15612 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15613 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15614 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15615 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15616 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15617 WordClearMask);
15618 if (!IsSingleInput)
15619 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15620 WordClearMask);
15621
15622 // Now pack things back together.
15623 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15624 IsSingleInput ? V1 : V2);
// Each additional drop is one more PACKUS of the result with itself.
15625 for (int i = 1; i < NumEvenDrops; ++i) {
15626 Result = DAG.getBitcast(MVT::v8i16, Result);
15627 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15628 }
15629 return Result;
15630 }
15631
// Same idea for dropping the odd elements once: shift each word right by 8 so
// the wanted byte lands in the low byte, then pack.
15632 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15633 if (NumOddDrops == 1) {
15634 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15635 DAG.getBitcast(MVT::v8i16, V1),
15636 DAG.getTargetConstant(8, DL, MVT::i8));
15637 if (!IsSingleInput)
15638 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15639 DAG.getBitcast(MVT::v8i16, V2),
15640 DAG.getTargetConstant(8, DL, MVT::i8));
15641 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15642 IsSingleInput ? V1 : V2);
15643 }
15644
15645 // Handle multi-input cases by blending/unpacking single-input shuffles.
15646 if (NumV2Elements > 0)
15647 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15648 Zeroable, Subtarget, DAG);
15649
15650 // The fallback path for single-input shuffles widens this into two v8i16
15651 // vectors with unpacks, shuffles those, and then pulls them back together
15652 // with a pack.
15653 SDValue V = V1;
15654
// Split the 16-entry mask into the masks producing the low and the high eight
// result bytes.
15655 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15656 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15657 for (int i = 0; i < 16; ++i)
15658 if (Mask[i] >= 0)
15659 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15660
15661 SDValue VLoHalf, VHiHalf;
15662 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15663 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15664 // i16s.
15665 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15666 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15667 // Use a mask to drop the high bytes.
15668 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15669 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15670 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15671
15672 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15673 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15674
15675 // Squash the masks to point directly into VLoHalf.
15676 for (int &M : LoBlendMask)
15677 if (M >= 0)
15678 M /= 2;
15679 for (int &M : HiBlendMask)
15680 if (M >= 0)
15681 M /= 2;
15682 } else {
15683 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15684 // VHiHalf so that we can blend them as i16s.
15685 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15686
15687 VLoHalf = DAG.getBitcast(
15688 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15689 VHiHalf = DAG.getBitcast(
15690 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15691 }
15692
15693 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15694 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15695
15696 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15697}
15698
15699 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
15700 ///
15701 /// This routine breaks down the specific type of 128-bit shuffle and
15702 /// dispatches to the lowering routines accordingly.
///
/// v8bf16 has no dedicated routine: it is bitcast to v8i16, shuffled with the
/// same mask and bitcast back. Any other type without a case below hits
/// llvm_unreachable.
// NOTE(review): listing line 15703 (the line that opens this definition:
// return type, name and leading parameters) is absent from this listing. The
// body uses DL and Mask, which must be declared on that line.
15704 MVT VT, SDValue V1, SDValue V2,
15705 const APInt &Zeroable,
15706 const X86Subtarget &Subtarget,
15707 SelectionDAG &DAG) {
15708 if (VT == MVT::v8bf16) {
15709 V1 = DAG.getBitcast(MVT::v8i16, V1);
15710 V2 = DAG.getBitcast(MVT::v8i16, V2);
15711 return DAG.getBitcast(VT,
15712 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15713 }
15714
15715 switch (VT.SimpleTy) {
15716 case MVT::v2i64:
15717 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15718 case MVT::v2f64:
15719 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15720 case MVT::v4i32:
15721 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15722 case MVT::v4f32:
15723 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15724 case MVT::v8i16:
15725 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15726 case MVT::v8f16:
15727 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15728 case MVT::v16i8:
15729 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15730
15731 default:
15732 llvm_unreachable("Unimplemented!");
15733 }
15734}
15735
15736 /// Generic routine to split vector shuffle into half-sized shuffles.
15737 ///
15738 /// This routine just extracts two subvectors, shuffles them independently, and
15739 /// then concatenates them back together. This should work effectively with all
15740 /// AVX vector shuffle types.
///
/// VT must be at least 256 bits wide and both inputs must have type VT
/// (asserted below). When SimpleOnly is set, the routine returns an empty
/// SDValue instead of a shuffle if either half of the mask reads from the
/// upper half of V1 or V2.
// NOTE(review): listing line 15741 (the line that opens this definition:
// return type, name and leading parameters) is absent from this listing. The
// routine calls itself below as
// splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask, DAG, SimpleOnly).
15742 SDValue V2, ArrayRef<int> Mask,
15743 SelectionDAG &DAG, bool SimpleOnly) {
15744 assert(VT.getSizeInBits() >= 256 &&
15745 "Only for 256-bit or wider vector shuffles!");
15746 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15747 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15748
15749 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
// If the recursive call yields nothing, fall through to the generic split.
15750 if (VT == MVT::v8f32) {
15751 SDValue BC1 = peekThroughBitcasts(V1);
15752 SDValue BC2 = peekThroughBitcasts(V2);
15753 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15754 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15755 DAG, SimpleOnly))
15756 return DAG.getBitcast(VT, Split);
15757 }
15758 }
15759
15760 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15761 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15762
15763 int NumElements = VT.getVectorNumElements();
15764 int SplitNumElements = NumElements / 2;
15765 MVT ScalarVT = VT.getVectorElementType();
15766 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15767
15768 // Use splitVector/extractSubVector so that split build-vectors just build two
15769 // narrower build vectors. This helps shuffling with splats and zeros.
15770 auto SplitVector = [&](SDValue V) {
15771 SDValue LoV, HiV;
15772 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15773 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15774 DAG.getBitcast(SplitVT, HiV));
15775 };
15776
15777 SDValue LoV1, HiV1, LoV2, HiV2;
15778 std::tie(LoV1, HiV1) = SplitVector(V1);
15779 std::tie(LoV2, HiV2) = SplitVector(V2);
15780
15781 // Now create two 4-way blends of these half-width vectors.
// Reports, through the four output flags, which of the split pieces
// (low/high half of V1, low/high half of V2) a half mask reads from.
15782 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15783 bool &UseHiV1, bool &UseLoV2,
15784 bool &UseHiV2) {
15785 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15786 for (int i = 0; i < SplitNumElements; ++i) {
15787 int M = HalfMask[i];
15788 if (M >= NumElements) {
15789 if (M >= NumElements + SplitNumElements)
15790 UseHiV2 = true;
15791 else
15792 UseLoV2 = true;
15793 } else if (M >= 0) {
15794 if (M >= SplitNumElements)
15795 UseHiV1 = true;
15796 else
15797 UseLoV1 = true;
15798 }
15799 }
15800 };
15801
// In SimpleOnly mode a half mask is acceptable only if it uses neither the
// high half of V1 nor the high half of V2; otherwise every mask is accepted.
15802 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15803 if (!SimpleOnly)
15804 return true;
15805
15806 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15807 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15808
15809 return !(UseHiV1 || UseHiV2);
15810 };
15811
// Builds the SplitVT-typed shuffle for one half of the result.
15812 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
// V1BlendMask/V2BlendMask select elements out of (LoV1, HiV1) and
// (LoV2, HiV2); BlendMask then picks, per lane, between those two results.
15813 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15814 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15815 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15816 for (int i = 0; i < SplitNumElements; ++i) {
15817 int M = HalfMask[i];
15818 if (M >= NumElements) {
15819 V2BlendMask[i] = M - NumElements;
15820 BlendMask[i] = SplitNumElements + i;
15821 } else if (M >= 0) {
15822 V1BlendMask[i] = M;
15823 BlendMask[i] = i;
15824 }
15825 }
15826
15827 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15828 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15829
15830 // Because the lowering happens after all combining takes place, we need to
15831 // manually combine these blend masks as much as possible so that we create
15832 // a minimal number of high-level vector shuffle nodes.
15833 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15834
15835 // First try just blending the halves of V1 or V2.
15836 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15837 return DAG.getUNDEF(SplitVT);
15838 if (!UseLoV2 && !UseHiV2)
15839 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15840 if (!UseLoV1 && !UseHiV1)
15841 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15842
15843 SDValue V1Blend, V2Blend;
15844 if (UseLoV1 && UseHiV1) {
15845 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15846 } else {
15847 // We only use half of V1 so map the usage down into the final blend mask.
15848 V1Blend = UseLoV1 ? LoV1 : HiV1;
15849 for (int i = 0; i < SplitNumElements; ++i)
15850 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15851 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15852 }
15853 if (UseLoV2 && UseHiV2) {
15854 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15855 } else {
15856 // We only use half of V2 so map the usage down into the final blend mask.
15857 V2Blend = UseLoV2 ? LoV2 : HiV2;
15858 for (int i = 0; i < SplitNumElements; ++i)
15859 if (BlendMask[i] >= SplitNumElements)
15860 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15861 }
15862 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15863 };
15864
// Bail out before creating any nodes if SimpleOnly rules out either half.
15865 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15866 return SDValue();
15867
15868 SDValue Lo = HalfBlend(LoMask);
15869 SDValue Hi = HalfBlend(HiMask);
15870 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15871}
15872
15873 /// Either split a vector in halves or decompose the shuffles and the
15874 /// blend/unpack.
15875 ///
15876 /// This is provided as a good fallback for many lowerings of non-single-input
15877 /// shuffles with more than one 128-bit lane. In those cases, we want to select
15878 /// between splitting the shuffle into 128-bit components and stitching those
15879 /// back together vs. extracting the single-input shuffles and blending those
15880 /// results.
///
/// V2 must not be undef (asserted below). The decision is made in this order:
/// decompose if each input contributes a single (or equivalent) element,
/// split if each input is read from at most one 128-bit lane, split on
/// pre-AVX2 targets when both inputs are splats or free to split, and
/// otherwise decompose.
// NOTE(review): listing line 15881 (the line that opens this definition:
// return type, name and leading parameters) is absent from this listing. The
// body uses DL, VT and V1, which must be declared on that line.
15882 SDValue V2, ArrayRef<int> Mask,
15883 const APInt &Zeroable,
15884 const X86Subtarget &Subtarget,
15885 SelectionDAG &DAG) {
15886 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15887 "shuffles as it could then recurse on itself.");
15888 int Size = Mask.size();
15889
15890 // If this can be modeled as a broadcast of two elements followed by a blend,
15891 // prefer that lowering. This is especially important because broadcasts can
15892 // often fold with memory operands.
// True when all defined mask entries taken from V1 refer to one element (or
// to elements IsElementEquivalent treats as the same), and likewise for V2.
15893 auto DoBothBroadcast = [&] {
15894 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15895 for (int M : Mask)
15896 if (M >= Size) {
15897 if (V2BroadcastIdx < 0)
15898 V2BroadcastIdx = M - Size;
15899 else if ((M - Size) != V2BroadcastIdx &&
15900 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15901 return false;
15902 } else if (M >= 0) {
15903 if (V1BroadcastIdx < 0)
15904 V1BroadcastIdx = M;
15905 else if (M != V1BroadcastIdx &&
15906 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15907 return false;
15908 }
15909 return true;
15910 };
15911 if (DoBothBroadcast())
15912 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15913 Subtarget, DAG);
15914
15915 // If the inputs all stem from a single 128-bit lane of each input, then we
15916 // split them rather than blending because the split will decompose to
15917 // unusually few instructions.
// LaneInputs[I][L] is set when some mask entry reads lane L of input I.
15918 int LaneCount = VT.getSizeInBits() / 128;
15919 int LaneSize = Size / LaneCount;
15920 SmallBitVector LaneInputs[2];
15921 LaneInputs[0].resize(LaneCount, false);
15922 LaneInputs[1].resize(LaneCount, false);
15923 for (int i = 0; i < Size; ++i)
15924 if (Mask[i] >= 0)
15925 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15926 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15927 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15928 /*SimpleOnly*/ false);
15929
15930 // Without AVX2, if we can freely split the subvectors then we're better off
15931 // performing half width shuffles.
15932 if (!Subtarget.hasAVX2()) {
15933 SDValue BC1 = peekThroughBitcasts(V1);
15934 SDValue BC2 = peekThroughBitcasts(V2);
15935 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15936 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15937 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15938 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15939 if (SplatOrSplitV1 && SplatOrSplitV2)
15940 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15941 /*SimpleOnly*/ false);
15942 }
15943
15944 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15945 // requires that the decomposed single-input shuffles don't end up here.
15946 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15947 Subtarget, DAG);
15948}
15949
15950 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15951 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
//
// Only valid for v4f64 (asserted below). Two intermediate shuffles of V1/V2
// are built, one feeding the even result elements (LHS) and one feeding the
// odd result elements (RHS); an X86ISD::SHUFP node then picks one element per
// position from them.
// NOTE(review): listing line 15952 (the line that opens this definition:
// return type, name and leading parameters) is absent from this listing. A
// caller in this file invokes it as
// lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG).
15953 SDValue V1, SDValue V2,
15954 ArrayRef<int> Mask,
15955 SelectionDAG &DAG) {
15956 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15957
15958 int LHSMask[4] = {-1, -1, -1, -1};
15959 int RHSMask[4] = {-1, -1, -1, -1};
15960 int SHUFPDMask[4] = {-1, -1, -1, -1};
15961
15962 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15963 // perform the shuffle once the lanes have been shuffled in place.
15964 for (int i = 0; i != 4; ++i) {
15965 int M = Mask[i];
15966 if (M < 0)
15967 continue;
// LaneBase is the first element index of the destination lane of result i.
15968 int LaneBase = i & ~1;
// Even result elements are sourced from LHS, odd ones from RHS.
15969 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
// Place source element M in the destination lane at the slot given by M's
// parity, and record that parity as the SHUFPD selector for result i.
15970 LaneMask[LaneBase + (M & 1)] = M;
15971 SHUFPDMask[i] = M & 1;
15972 }
15973
15974 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15975 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15976 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15977 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15978}
15979
15980 /// Lower a vector shuffle crossing multiple 128-bit lanes as
15981 /// a lane permutation followed by a per-lane permutation.
15982 ///
15983 /// This is mainly for cases where we can have non-repeating permutes
15984 /// in each lane.
15985 ///
15986 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15987 /// we should investigate merging them.
///
/// Returns an empty SDValue when no suitable decomposition is found. Full
/// 128-bit lanes are tried first; with AVX2 and an undef V2, 64-bit sublanes
/// are tried next, and 32-bit sublanes only if the subtarget reports fast
/// variable cross-lane shuffles.
// NOTE(review): listing line 15988 (the line that opens this definition:
// return type and name) is absent from this listing.
15989 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15990 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15991 int NumElts = VT.getVectorNumElements();
15992 int NumLanes = VT.getSizeInBits() / 128;
15993 int NumEltsPerLane = NumElts / NumLanes;
15994 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15995
15996 /// Attempts to find a sublane permute with the given size
15997 /// that gets all elements into their target lanes.
15998 ///
15999 /// If successful, returns the lowered shuffle: a cross-lane shuffle of V1/V2
16000 /// followed by an in-lane shuffle. If unsuccessful, returns an empty SDValue.
16001 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16002 int NumSublanesPerLane = NumSublanes / NumLanes;
16003 int NumEltsPerSublane = NumElts / NumSublanes;
16004
16005 SmallVector<int, 16> CrossLaneMask;
16006 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16007 // CrossLaneMask but one entry == one sublane.
16008 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
// Tracks which elements of the cross-lane shuffle result the in-lane shuffle
// actually reads.
16009 APInt DemandedCrossLane = APInt::getZero(NumElts);
16010
16011 for (int i = 0; i != NumElts; ++i) {
16012 int M = Mask[i];
16013 if (M < 0)
16014 continue;
16015
16016 int SrcSublane = M / NumEltsPerSublane;
16017 int DstLane = i / NumEltsPerLane;
16018
16019 // We only need to get the elements into the right lane, not sublane.
16020 // So search all sublanes that make up the destination lane.
16021 bool Found = false;
16022 int DstSubStart = DstLane * NumSublanesPerLane;
16023 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16024 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
// A destination sublane is usable if it is still unassigned or already holds
// this source sublane.
16025 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16026 continue;
16027
16028 Found = true;
16029 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16030 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16031 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16032 DemandedCrossLane.setBit(InLaneMask[i]);
16033 break;
16034 }
16035 if (!Found)
16036 return SDValue();
16037 }
16038
16039 // Fill CrossLaneMask using CrossLaneMaskLarge.
16040 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16041
16042 if (!CanUseSublanes) {
16043 // If we're only shuffling a single lowest lane and the rest are identity
16044 // then don't bother.
16045 // TODO - isShuffleMaskInputInPlace could be extended to something like
16046 // this.
16047 int NumIdentityLanes = 0;
16048 bool OnlyShuffleLowestLane = true;
16049 for (int i = 0; i != NumLanes; ++i) {
16050 int LaneOffset = i * NumEltsPerLane;
16051 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16052 i * NumEltsPerLane))
16053 NumIdentityLanes++;
16054 else if (CrossLaneMask[LaneOffset] != 0)
16055 OnlyShuffleLowestLane = false;
16056 }
16057 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16058 return SDValue();
16059 }
16060
16061 // Simplify CrossLaneMask based on the actual demanded elements.
16062 if (V1.hasOneUse())
16063 for (int i = 0; i != NumElts; ++i)
16064 if (!DemandedCrossLane[i])
16065 CrossLaneMask[i] = SM_SentinelUndef;
16066
16067 // Avoid returning the same shuffle operation. For example,
16068 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
16069 // undef:v16i16
16070 if (CrossLaneMask == Mask || InLaneMask == Mask)
16071 return SDValue();
16072
16073 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16074 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16075 InLaneMask);
16076 };
16077
16078 // First attempt a solution with full lanes.
16079 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16080 return V;
16081
16082 // The rest of the solutions use sublanes.
16083 if (!CanUseSublanes)
16084 return SDValue();
16085
16086 // Then attempt a solution with 64-bit sublanes (vpermq).
16087 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16088 return V;
16089
16090 // If that doesn't work and we have fast variable cross-lane shuffle,
16091 // attempt 32-bit sublanes (vpermd).
16092 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16093 return SDValue();
16094
16095 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16096}
16097
16098/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
16099static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
16100 SmallVector<int> &InLaneMask) {
16101 int Size = Mask.size();
16102 InLaneMask.assign(Mask.begin(), Mask.end());
16103 for (int i = 0; i < Size; ++i) {
16104 int &M = InLaneMask[i];
16105 if (M < 0)
16106 continue;
16107 if (((M % Size) / LaneSize) != (i / LaneSize))
16108 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16109 }
16110}
16111
16112 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16113 /// source with a lane permutation.
16114 ///
16115 /// This lowering strategy results in four instructions in the worst case for a
16116 /// single-input cross lane shuffle which is lower than any other fully general
16117 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16118 /// shuffle pattern should be handled prior to trying this lowering.
///
/// Flow of the visible body: v4f64 masks that reference any index >= LaneSize
/// are handed to lowerShuffleAsLanePermuteAndSHUFP; everything else requires
/// V2 to be undef (asserted) and is either split via splitAndLowerShuffle or
/// lowered as an in-lane shuffle of V1 against a half-swapped copy of V1.
/// NOTE(review): listing line 16119, which should carry the return type and
/// the function name, is absent from this listing - confirm against upstream.
16120 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16121 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16122 // FIXME: This should probably be generalized for 512-bit vectors as well.
16123 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16124 int Size = Mask.size();
// A 256-bit vector has two 128-bit lanes, so each lane holds half the mask.
16125 int LaneSize = Size / 2;
16126
16127 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16128 // Only do this if the elements aren't all from the lower lane,
16129 // otherwise we're (probably) better off doing a split.
16130 if (VT == MVT::v4f64 &&
16131 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16132 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
16133
16134 // If there are only inputs from one 128-bit lane, splitting will in fact be
16135 // less expensive. The flags track whether the given lane contains an element
16136 // that crosses to another lane.
16137 bool AllLanes;
16138 if (!Subtarget.hasAVX2()) {
// Without AVX2: AllLanes requires a lane-crossing element sourced from each
// of the two lanes.
16139 bool LaneCrossing[2] = {false, false};
16140 for (int i = 0; i < Size; ++i)
16141 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16142 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16143 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16144 } else {
// With AVX2: it is enough that each source lane is referenced at all,
// whether or not the reference crosses lanes.
16145 bool LaneUsed[2] = {false, false};
16146 for (int i = 0; i < Size; ++i)
16147 if (Mask[i] >= 0)
16148 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16149 AllLanes = LaneUsed[0] && LaneUsed[1];
16150 }
16151
16152 // TODO - we could support shuffling V2 in the Flipped input.
16153 assert(V2.isUndef() &&
16154 "This last part of this routine only works on single input shuffles");
16155
// Rewrite every lane-crossing element so that it reads the same in-lane
// offset from the second operand of the final shuffle (the flipped copy of
// V1 built below).
16156 SmallVector<int> InLaneMask;
16157 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16158
16159 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16160 "In-lane shuffle mask expected");
16161
16162 // If we're not using both lanes in each lane and the inlane mask is not
16163 // repeating, then we're better off splitting.
16164 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16165 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
16166 /*SimpleOnly*/ false);
16167
16168 // Flip the lanes, and shuffle the results which should now be in-lane.
// The flip is done on a 4 x 64-bit view of V1: {2, 3, 0, 1} swaps its two
// 128-bit halves.
16169 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16170 SDValue Flipped = DAG.getBitcast(PVT, V1);
16171 Flipped =
16172 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16173 Flipped = DAG.getBitcast(VT, Flipped);
16174 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16175 }
16176
16177 /// Handle lowering 2-lane 128-bit shuffles.
///
/// The visible body tries, in order: a subvector broadcast load (unary case),
/// an insert of V1's low half into a zero vector, a blend, a single 128-bit
/// subvector insert, SHUF128 (when VLX is available), and finally a
/// VPERM2X128 node whose immediate is built from the widened 2-element mask.
/// NOTE(review): listing lines 16178 (declarator), 16188, 16191 and 16240 are
/// absent from this listing, so the condition that closes the broadcast-load
/// `if`, the definition of `Ld`, and the guard that opens the inner block
/// around the subvector insert are not visible here - confirm upstream.
16179 SDValue V2, ArrayRef<int> Mask,
16180 const APInt &Zeroable,
16181 const X86Subtarget &Subtarget,
16182 SelectionDAG &DAG) {
16183 if (V2.isUndef()) {
16184 // Attempt to match VBROADCAST*128 subvector broadcast load.
16185 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16186 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16187 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16189 MVT MemVT = VT.getHalfNumVectorElementsVT();
// Splatting the high half starts the load one half-vector into the memory.
16190 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16192 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16193 VT, MemVT, Ld, Ofs, DAG))
16194 return BcstLd;
16195 }
16196
16197 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16198 if (Subtarget.hasAVX2())
16199 return SDValue();
16200 }
16201
16202 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16203
// The rest of the routine works on a 2-element mask (one entry per 128-bit
// half); give up if the 4-element mask cannot be widened.
16204 SmallVector<int, 4> WidenedMask;
16205 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16206 return SDValue();
16207
// Zeroable bits 0-1 describe the low pair of result elements, bits 2-3 the
// high pair; a half counts as zero only if both of its elements are zeroable.
16208 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16209 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16210
16211 // Try to use an insert into a zero vector.
16212 if (WidenedMask[0] == 0 && IsHighZero) {
16213 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16214 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16215 DAG.getVectorIdxConstant(0, DL));
16216 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16217 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16218 DAG.getVectorIdxConstant(0, DL));
16219 }
16220
16221 // TODO: If minimizing size and one of the inputs is a zero vector and the
16222 // zero vector has only one use, we could use a VPERM2X128 to save the
16223 // instruction bytes needed to explicitly generate the zero vector.
16224
16225 // Blends are faster and handle all the non-lane-crossing cases.
16226 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16227 Subtarget, DAG))
16228 return Blend;
16229
16230 // If either input operand is a zero vector, use VPERM2X128 because its mask
16231 // allows us to replace the zero input with an implicit zero.
16232 if (!IsLowZero && !IsHighZero) {
16233 // Check for patterns which can be matched with a single insert of a 128-bit
16234 // subvector.
16235 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16236 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16237
16238 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16239 // this will likely become vinsertf128 which can't fold a 256-bit memop.
// Insert the low 128 bits of V1 (or of V2) into the upper half of V1.
16241 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16242 SDValue SubVec =
16243 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16244 DAG.getVectorIdxConstant(0, DL));
16245 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16246 DAG.getVectorIdxConstant(2, DL));
16247 }
16248 }
16249
16250 // Try to use SHUF128 if possible.
16251 if (Subtarget.hasVLX()) {
// Only usable when the low result half comes from V1 and the high result
// half from V2; one immediate bit per half then picks the 128-bit subvector.
16252 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16253 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16254 ((WidenedMask[1] % 2) << 1);
16255 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16256 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16257 }
16258 }
16259 }
16260
16261 // Otherwise form a 128-bit permutation. After accounting for undefs,
16262 // convert the 64-bit shuffle mask selection values into 128-bit
16263 // selection bits by dividing the indexes by 2 and shifting into positions
16264 // defined by a vperm2*128 instruction's immediate control byte.
16265
16266 // The immediate permute control byte looks like this:
16267 // [1:0] - select 128 bits from sources for low half of destination
16268 // [2] - ignore
16269 // [3] - zero low half of destination
16270 // [5:4] - select 128 bits from sources for high half of destination
16271 // [6] - ignore
16272 // [7] - zero high half of destination
16273
16274 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16275 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16276
16277 unsigned PermMask = 0;
16278 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16279 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16280
16281 // Check the immediate mask and replace unused sources with undef.
// Per half, bit 1 of the selector distinguishes V2 (set) from V1 (clear) and
// bit 3 zeroes the half. V1 is unread when neither half is a non-zeroed V1
// selection; V2 is unread when neither half is a non-zeroed V2 selection.
16282 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16283 V1 = DAG.getUNDEF(VT);
16284 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16285 V2 = DAG.getUNDEF(VT);
16286
16287 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16288 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16289 }
16290
16291 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
16292 /// shuffling each lane.
16293 ///
16294 /// This attempts to create a repeated lane shuffle where each lane uses one
16295 /// or two of the lanes of the inputs. The lanes of the input vectors are
16296 /// shuffled in one or two independent shuffles to get the lanes into the
16297 /// position needed by the final shuffle.
///
/// Returns an empty SDValue when V2 is undef, when the mask is already
/// lane-repeated, when a destination lane needs more than two source lanes,
/// when the per-lane masks cannot be merged into one repeated mask, or when
/// either lane-permuting shuffle comes back with the original mask.
/// NOTE(review): listing lines 16298 (declarator) and 16372 (the statement
/// following the std::swap in the first pass) are absent from this listing -
/// confirm against upstream.
16299 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16300 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16301 // This is only useful for binary shuffle with a non-repeating mask.
16302 if (V2.isUndef() || is128BitLaneRepeatedShuffleMask(VT, Mask))
16303 return SDValue();
16304
16305 int NumElts = Mask.size();
16306 int NumLanes = VT.getSizeInBits() / 128;
16307 int NumLaneElts = 128 / VT.getScalarSizeInBits();
// RepeatMask: the per-lane mask shared by all lanes; entries >= NumElts select
// from the second lane-permuted vector.
// LaneSrcs[Lane]: the (up to two) input lanes feeding destination lane Lane,
// numbered across V1 then V2; -1 means unassigned.
16308 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16309 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16310
16311 // First pass will try to fill in the RepeatMask from lanes that need two
16312 // sources.
16313 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16314 int Srcs[2] = {-1, -1};
16315 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16316 for (int i = 0; i != NumLaneElts; ++i) {
16317 int M = Mask[(Lane * NumLaneElts) + i];
16318 if (M < 0)
16319 continue;
16320 // Determine which of the possible input lanes (NumLanes from each source)
16321 // this element comes from. Assign that as one of the sources for this
16322 // lane. We can assign up to 2 sources for this lane. If we run out
16323 // sources we can't do anything.
16324 int LaneSrc = M / NumLaneElts;
16325 int Src;
16326 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16327 Src = 0;
16328 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16329 Src = 1;
16330 else
16331 return SDValue();
16332
16333 Srcs[Src] = LaneSrc;
16334 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16335 }
16336
16337 // If this lane has two sources, see if it fits with the repeat mask so far.
16338 if (Srcs[1] < 0)
16339 continue;
16340
16341 LaneSrcs[Lane][0] = Srcs[0];
16342 LaneSrcs[Lane][1] = Srcs[1];
16343
// Two masks are compatible when they agree wherever both are defined.
16344 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16345 assert(M1.size() == M2.size() && "Unexpected mask size");
16346 for (int i = 0, e = M1.size(); i != e; ++i)
16347 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16348 return false;
16349 return true;
16350 };
16351
// Copy the defined entries of Mask into MergedMask; callers check
// MatchMasks first, so a conflicting entry is an assertion failure.
16352 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16353 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16354 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16355 int M = Mask[i];
16356 if (M < 0)
16357 continue;
16358 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16359 "Unexpected mask element");
16360 MergedMask[i] = M;
16361 }
16362 };
16363
16364 if (MatchMasks(InLaneMask, RepeatMask)) {
16365 // Merge this lane mask into the final repeat mask.
16366 MergeMasks(InLaneMask, RepeatMask);
16367 continue;
16368 }
16369
16370 // Didn't find a match. Swap the operands and try again.
16371 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16373
16374 if (MatchMasks(InLaneMask, RepeatMask)) {
16375 // Merge this lane mask into the final repeat mask.
16376 MergeMasks(InLaneMask, RepeatMask);
16377 continue;
16378 }
16379
16380 // Couldn't find a match with the operands in either order.
16381 return SDValue();
16382 }
16383
16384 // Now handle any lanes with only one source.
16385 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16386 // If this lane has already been processed, skip it.
16387 if (LaneSrcs[Lane][0] >= 0)
16388 continue;
16389
16390 for (int i = 0; i != NumLaneElts; ++i) {
16391 int M = Mask[(Lane * NumLaneElts) + i];
16392 if (M < 0)
16393 continue;
16394
16395 // If RepeatMask isn't defined yet we can define it ourself.
16396 if (RepeatMask[i] < 0)
16397 RepeatMask[i] = M % NumLaneElts;
16398
// The repeat-mask entry decides which of the two lane-permuted vectors this
// element must come from, and therefore which LaneSrcs slot the lane fills.
16399 if (RepeatMask[i] < NumElts) {
16400 if (RepeatMask[i] != M % NumLaneElts)
16401 return SDValue();
16402 LaneSrcs[Lane][0] = M / NumLaneElts;
16403 } else {
16404 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16405 return SDValue();
16406 LaneSrcs[Lane][1] = M / NumLaneElts;
16407 }
16408 }
16409
16410 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16411 return SDValue();
16412 }
16413
// First lane permutation: place every lane's first source lane in position.
16414 SmallVector<int, 16> NewMask(NumElts, -1);
16415 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16416 int Src = LaneSrcs[Lane][0];
16417 for (int i = 0; i != NumLaneElts; ++i) {
16418 int M = -1;
16419 if (Src >= 0)
16420 M = Src * NumLaneElts + i;
16421 NewMask[Lane * NumLaneElts + i] = M;
16422 }
16423 }
16424 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16425 // Ensure we didn't get back the shuffle we started with.
16426 // FIXME: This is a hack to make up for some splat handling code in
16427 // getVectorShuffle.
16428 if (isa<ShuffleVectorSDNode>(NewV1) &&
16429 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16430 return SDValue();
16431
// Second lane permutation: same construction for every lane's second source.
16432 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16433 int Src = LaneSrcs[Lane][1];
16434 for (int i = 0; i != NumLaneElts; ++i) {
16435 int M = -1;
16436 if (Src >= 0)
16437 M = Src * NumLaneElts + i;
16438 NewMask[Lane * NumLaneElts + i] = M;
16439 }
16440 }
16441 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16442 // Ensure we didn't get back the shuffle we started with.
16443 // FIXME: This is a hack to make up for some splat handling code in
16444 // getVectorShuffle.
16445 if (isa<ShuffleVectorSDNode>(NewV2) &&
16446 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16447 return SDValue();
16448
// Final shuffle: apply RepeatMask inside every lane of (NewV1, NewV2),
// keeping elements that were undef in the original mask undef.
16449 for (int i = 0; i != NumElts; ++i) {
16450 if (Mask[i] < 0) {
16451 NewMask[i] = -1;
16452 continue;
16453 }
16454 NewMask[i] = RepeatMask[i % NumLaneElts];
16455 if (NewMask[i] < 0)
16456 continue;
16457
16458 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16459 }
16460 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16461 }
16462
16463 /// If the input shuffle mask results in a vector that is undefined in all upper
16464 /// or lower half elements and that mask accesses only 2 halves of the
16465 /// shuffle's operands, return true. A mask of half the width with mask indexes
16466 /// adjusted to access the extracted halves of the original shuffle operands is
16467 /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16468 /// lower half of each input operand is accessed.
///
/// HalfIdx1/HalfIdx2 use the numbering 0 = lower V1, 1 = upper V1,
/// 2 = lower V2, 3 = upper V2, and stay -1 when unused. On a false return
/// HalfMask, HalfIdx1 and HalfIdx2 may have been partially written.
/// NOTE(review): listing line 16470 (function name and leading parameters) is
/// absent from this listing - confirm against upstream.
16469 static bool
16471 int &HalfIdx1, int &HalfIdx2) {
16472 assert((Mask.size() == HalfMask.size() * 2) &&
16473 "Expected input mask to be twice as long as output");
16474
16475 // Exactly one half of the result must be undef to allow narrowing.
16476 bool UndefLower = isUndefLowerHalf(Mask);
16477 bool UndefUpper = isUndefUpperHalf(Mask);
16478 if (UndefLower == UndefUpper)
16479 return false;
16480
16481 unsigned HalfNumElts = HalfMask.size();
// Read whichever half of Mask is defined: the upper half when the lower half
// is undef, otherwise the lower half.
16482 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16483 HalfIdx1 = -1;
16484 HalfIdx2 = -1;
16485 for (unsigned i = 0; i != HalfNumElts; ++i) {
16486 int M = Mask[i + MaskIndexOffset];
16487 if (M < 0) {
// Negative sentinel entries are copied through unchanged.
16488 HalfMask[i] = M;
16489 continue;
16490 }
16491
16492 // Determine which of the 4 half vectors this element is from.
16493 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16494 int HalfIdx = M / HalfNumElts;
16495
16496 // Determine the element index into its half vector source.
16497 int HalfElt = M % HalfNumElts;
16498
16499 // We can shuffle with up to 2 half vectors, set the new 'half'
16500 // shuffle mask accordingly.
16501 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16502 HalfMask[i] = HalfElt;
16503 HalfIdx1 = HalfIdx;
16504 continue;
16505 }
16506 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
// Elements from the second half vector are offset by HalfNumElts.
16507 HalfMask[i] = HalfElt + HalfNumElts;
16508 HalfIdx2 = HalfIdx;
16509 continue;
16510 }
16511
16512 // Too many half vectors referenced.
16513 return false;
16514 }
16515
16516 return true;
16517 }
16518
16519 /// Given the output values from getHalfShuffleMask(), create a half width
16520 /// shuffle of extracted vectors followed by an insert back to full width.
///
/// A negative HalfIdx yields an undef half vector. With UseConcat the result
/// is built as CONCAT_VECTORS of the half shuffle and an undef half (swapped
/// when UndefLower is set); otherwise as INSERT_SUBVECTOR into an undef vector.
/// NOTE(review): listing lines 16521 (declarator) and 16555 (the last operand
/// line of the final INSERT_SUBVECTOR) are absent from this listing; `Offset`
/// is presumably consumed on the missing line - confirm against upstream.
16522 ArrayRef<int> HalfMask, int HalfIdx1,
16523 int HalfIdx2, bool UndefLower,
16524 SelectionDAG &DAG, bool UseConcat = false) {
16525 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16526 assert(V1.getValueType().isSimple() && "Expecting only simple types");
16527
16528 MVT VT = V1.getSimpleValueType();
16529 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16530 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16531
// Map a half index (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2)
// to an EXTRACT_SUBVECTOR of the matching operand, or undef if negative.
16532 auto getHalfVector = [&](int HalfIdx) {
16533 if (HalfIdx < 0)
16534 return DAG.getUNDEF(HalfVT);
16535 SDValue V = (HalfIdx < 2 ? V1 : V2);
// Reuse HalfIdx as the element offset of the selected half: 0 or HalfNumElts.
16536 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16537 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16538 DAG.getVectorIdxConstant(HalfIdx, DL));
16539 };
16540
16541 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16542 SDValue Half1 = getHalfVector(HalfIdx1);
16543 SDValue Half2 = getHalfVector(HalfIdx2);
16544 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16545 if (UseConcat) {
16546 SDValue Op0 = V;
16547 SDValue Op1 = DAG.getUNDEF(HalfVT);
// When the lower half is the undef one, the shuffled half goes on top.
16548 if (UndefLower)
16549 std::swap(Op0, Op1);
16550 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16551 }
16552
16553 unsigned Offset = UndefLower ? HalfNumElts : 0;
16554 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16556 }
16557
16558 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16559 /// This allows for fast cases such as subvector extraction/insertion
16560 /// or shuffling smaller vector types which can lower more efficiently.
///
/// Returns an empty SDValue when neither half of Mask is undef, when the
/// defined half references more than two half vectors, or when the heuristics
/// below judge a full-width lowering to be preferable.
/// NOTE(review): listing line 16561 (declarator) is absent from this listing -
/// confirm against upstream.
16562 SDValue V2, ArrayRef<int> Mask,
16563 const X86Subtarget &Subtarget,
16564 SelectionDAG &DAG) {
16565 assert((VT.is256BitVector() || VT.is512BitVector()) &&
16566 "Expected 256-bit or 512-bit vector");
16567
16568 bool UndefLower = isUndefLowerHalf(Mask);
16569 if (!UndefLower && !isUndefUpperHalf(Mask))
16570 return SDValue();
16571
16572 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16573 "Completely undef shuffle mask should have been simplified already");
16574
16575 // Upper half is undef and lower half is whole upper subvector.
16576 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16577 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16578 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16579 if (!UndefLower &&
16580 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16581 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16582 DAG.getVectorIdxConstant(HalfNumElts, DL));
16583 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16584 DAG.getVectorIdxConstant(0, DL));
16585 }
16586
16587 // Lower half is undef and upper half is whole lower subvector.
16588 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16589 if (UndefLower &&
16590 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
// NOTE: despite its name, 'Hi' holds the lower half of V1 here (extracted at
// index 0); it is inserted into the upper half of the result.
16591 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16592 DAG.getVectorIdxConstant(0, DL));
16593 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16594 DAG.getVectorIdxConstant(HalfNumElts, DL));
16595 }
16596
16597 int HalfIdx1, HalfIdx2;
16598 SmallVector<int, 8> HalfMask(HalfNumElts);
16599 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16600 return SDValue();
16601
16602 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16603
16604 // Only shuffle the halves of the inputs when useful.
// Half indices 0 and 2 are the lower halves of V1 and V2; 1 and 3 are the
// upper halves. An unused index (-1) counts towards neither total.
16605 unsigned NumLowerHalves =
16606 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16607 unsigned NumUpperHalves =
16608 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16609 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16610
16611 // Determine the larger pattern of undef/halves, then decide if it's worth
16612 // splitting the shuffle based on subtarget capabilities and types.
16613 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16614 if (!UndefLower) {
16615 // XXXXuuuu: no insert is needed.
16616 // Always extract lowers when setting lower - these are all free subreg ops.
16617 if (NumUpperHalves == 0)
16618 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16619 UndefLower, DAG);
16620
16621 if (NumUpperHalves == 1) {
16622 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16623 if (Subtarget.hasAVX2()) {
16624 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16625 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16626 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16627 (!isSingleSHUFPSMask(HalfMask) ||
16628 Subtarget.hasFastVariableCrossLaneShuffle()))
16629 return SDValue();
16630 // If this is an unary shuffle (assume that the 2nd operand is
16631 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16632 // are better off extracting the upper half of 1 operand and using a
16633 // narrow shuffle.
16634 if (EltWidth == 64 && V2.isUndef())
16635 return SDValue();
16636 // If this is an unary vXi8 shuffle with inplace halves, then perform as
16637 // full width pshufb, and then merge.
16638 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16639 return SDValue();
16640 }
16641 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16642 if (Subtarget.hasAVX512() && VT.is512BitVector())
16643 return SDValue();
16644 // Extract + narrow shuffle is better than the wide alternative.
16645 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16646 UndefLower, DAG);
16647 }
16648
16649 // Don't extract both uppers, instead shuffle and then extract.
16650 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16651 return SDValue();
16652 }
16653
16654 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16655 if (NumUpperHalves == 0) {
16656 // AVX2 has efficient 64-bit element cross-lane shuffles.
16657 // TODO: Refine to account for unary shuffle, splat, and other masks?
16658 if (Subtarget.hasAVX2() && EltWidth == 64)
16659 return SDValue();
16660 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16661 if (Subtarget.hasAVX512() && VT.is512BitVector())
16662 return SDValue();
16663 // Narrow shuffle + insert is better than the wide alternative.
16664 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16665 UndefLower, DAG);
16666 }
16667
16668 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16669 return SDValue();
16670 }
16671
16672 /// Handle case where shuffle sources are coming from the same 128-bit lane and
16673 /// every lane can be represented as the same repeating mask - allowing us to
16674 /// shuffle the sources with the repeating shuffle and then permute the result
16675 /// to the destination lanes.
///
/// Two strategies are visible: on AVX2, shuffle the lowest elements into place
/// and broadcast them; otherwise (or if that does not match) build a repeated
/// per-sub-lane shuffle followed by a sub-lane permute. Returns an empty
/// SDValue when neither applies or when a strategy would reproduce Mask.
/// NOTE(review): listing line 16676 (declarator) is absent from this listing -
/// confirm against upstream.
16677 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16678 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16679 int NumElts = VT.getVectorNumElements();
16680 int NumLanes = VT.getSizeInBits() / 128;
16681 int NumLaneElts = NumElts / NumLanes;
16682
16683 // On AVX2 we may be able to just shuffle the lowest elements and then
16684 // broadcast the result.
16685 if (Subtarget.hasAVX2()) {
// Candidate broadcast widths in bits; widths that are not larger than the
// element width are skipped.
16686 for (unsigned BroadcastSize : {16, 32, 64}) {
16687 if (BroadcastSize <= VT.getScalarSizeInBits())
16688 continue;
16689 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16690
16691 // Attempt to match a repeating pattern every NumBroadcastElts,
16692 // accounting for UNDEFs but only references the lowest 128-bit
16693 // lane of the inputs.
16694 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16695 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16696 for (int j = 0; j != NumBroadcastElts; ++j) {
16697 int M = Mask[i + j];
16698 if (M < 0)
16699 continue;
16700 int &R = RepeatMask[j];
// Reject sources outside lane 0 of either input.
16701 if (0 != ((M % NumElts) / NumLaneElts))
16702 return false;
// Reject a position that would need two different sources.
16703 if (0 <= R && R != M)
16704 return false;
16705 R = M;
16706 }
16707 return true;
16708 };
16709
16710 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16711 if (!FindRepeatingBroadcastMask(RepeatMask))
16712 continue;
16713
16714 // Shuffle the (lowest) repeated elements in place for broadcast.
16715 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16716
16717 // Shuffle the actual broadcast.
16718 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16719 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16720 for (int j = 0; j != NumBroadcastElts; ++j)
16721 BroadcastMask[i + j] = j;
16722
16723 // Avoid returning the same shuffle operation. For example,
16724 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16725 if (BroadcastMask == Mask)
16726 return SDValue();
16727
16728 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16729 BroadcastMask);
16730 }
16731 }
16732
16733 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16734 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16735 return SDValue();
16736
16737 // Bail if we already have a repeated lane shuffle mask.
16738 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16739 return SDValue();
16740
16741 // Helper to look for repeated mask in each split sublane, and that those
16742 // sublanes can then be permuted into place.
// SubLaneScale is the number of sub-lanes per 128-bit lane.
16743 auto ShuffleSubLanes = [&](int SubLaneScale) {
16744 int NumSubLanes = NumLanes * SubLaneScale;
16745 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16746
16747 // Check that all the sources are coming from the same lane and see if we
16748 // can form a repeating shuffle mask (local to each sub-lane). At the same
16749 // time, determine the source sub-lane for each destination sub-lane.
16750 int TopSrcSubLane = -1;
16751 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16752 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16753 SubLaneScale,
16754 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16755
16756 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16757 // Extract the sub-lane mask, check that it all comes from the same lane
16758 // and normalize the mask entries to come from the first lane.
16759 int SrcLane = -1;
16760 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16761 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16762 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16763 if (M < 0)
16764 continue;
16765 int Lane = (M % NumElts) / NumLaneElts;
16766 if ((0 <= SrcLane) && (SrcLane != Lane))
16767 return SDValue();
16768 SrcLane = Lane;
// Keep the in-lane offset and preserve which operand (V1 or V2) is read.
16769 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16770 SubLaneMask[Elt] = LocalM;
16771 }
16772
16773 // Whole sub-lane is UNDEF.
16774 if (SrcLane < 0)
16775 continue;
16776
16777 // Attempt to match against the candidate repeated sub-lane masks.
16778 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Masks match when they agree wherever both are defined.
16779 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16780 for (int i = 0; i != NumSubLaneElts; ++i) {
16781 if (M1[i] < 0 || M2[i] < 0)
16782 continue;
16783 if (M1[i] != M2[i])
16784 return false;
16785 }
16786 return true;
16787 };
16788
16789 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16790 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16791 continue;
16792
16793 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16794 for (int i = 0; i != NumSubLaneElts; ++i) {
16795 int M = SubLaneMask[i];
16796 if (M < 0)
16797 continue;
16798 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16799 "Unexpected mask element");
16800 RepeatedSubLaneMask[i] = M;
16801 }
16802
16803 // Track the top most source sub-lane - by setting the remaining to
16804 // UNDEF we can greatly simplify shuffle matching.
16805 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16806 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16807 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16808 break;
16809 }
16810
16811 // Bail if we failed to find a matching repeated sub-lane mask.
16812 if (Dst2SrcSubLanes[DstSubLane] < 0)
16813 return SDValue();
16814 }
16815 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16816 "Unexpected source lane");
16817
16818 // Create a repeating shuffle mask for the entire vector.
// Sub-lanes above TopSrcSubLane are left undef.
16819 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16820 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16821 int Lane = SubLane / SubLaneScale;
16822 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16823 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16824 int M = RepeatedSubLaneMask[Elt];
16825 if (M < 0)
16826 continue;
16827 int Idx = (SubLane * NumSubLaneElts) + Elt;
16828 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16829 }
16830 }
16831
16832 // Shuffle each source sub-lane to its destination.
16833 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16834 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16835 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16836 if (SrcSubLane < 0)
16837 continue;
16838 for (int j = 0; j != NumSubLaneElts; ++j)
16839 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16840 }
16841
16842 // Avoid returning the same shuffle operation.
16843 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16844 if (RepeatedMask == Mask || SubLaneMask == Mask)
16845 return SDValue();
16846
16847 SDValue RepeatedShuffle =
16848 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16849
16850 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16851 SubLaneMask);
16852 };
16853
16854 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16855 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16856 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16857 // Otherwise we can only permute whole 128-bit lanes.
16858 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16859 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16860 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16861 MinSubLaneScale = 2;
16862 MaxSubLaneScale =
16863 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16864 }
16865 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16866 MinSubLaneScale = MaxSubLaneScale = 4;
16867
// Scale doubles each iteration, so only power-of-two scales in
// [MinSubLaneScale, MaxSubLaneScale] are tried, coarsest first.
16868 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16869 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16870 return Shuffle;
16871
16872 return SDValue();
16873 }
16874
// Try to express Mask as a SHUFPD-style shuffle: result element i must come
// from the same 64-bit pair position of one operand, with even and odd result
// elements reading different operands. Returns false if Mask fits neither the
// direct nor the operand-commuted form. On success ForceV1Zero/ForceV2Zero
// report whether all even/odd result elements are zeroable, ShuffleImm holds
// getSHUFPDImm of the per-element low bits, and V1/V2 are swapped if only the
// commuted form matched.
// NOTE(review): listing line 16875 (return type, name and leading parameters)
// is absent from this listing; V1/V2 are presumably reference parameters so
// that the swap reaches the caller - confirm against upstream.
16876 bool &ForceV1Zero, bool &ForceV2Zero,
16877 unsigned &ShuffleImm, ArrayRef<int> Mask,
16878 const APInt &Zeroable) {
16879 int NumElts = VT.getVectorNumElements();
16880 assert(VT.getScalarSizeInBits() == 64 &&
16881 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16882 "Unexpected data type for VSHUFPD");
16883 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16884 "Illegal shuffle mask");
16885
// ZeroLane[0] / ZeroLane[1]: every even / odd result element is zeroable.
16886 bool ZeroLane[2] = { true, true };
16887 for (int i = 0; i < NumElts; ++i)
16888 ZeroLane[i & 1] &= Zeroable[i];
16889
16890 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16891 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
16892 bool IsSHUFPD = true;
16893 bool IsCommutable = true;
16894 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16895 for (int i = 0; i < NumElts; ++i) {
// Undef elements, and elements whose whole parity class is zeroable, place
// no constraint on the match.
16896 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16897 continue;
16898 if (Mask[i] < 0)
16899 return false;
// Val is the first index of the allowed pair in the direct form, CommutVal in
// the commuted form. (i & 6) and (i & 0xe) are equal here since i < 8.
16900 int Val = (i & 6) + NumElts * (i & 1);
16901 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16902 if (Mask[i] < Val || Mask[i] > Val + 1)
16903 IsSHUFPD = false;
16904 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16905 IsCommutable = false;
// Only the low bit (which element of the pair) is kept for the immediate.
16906 SHUFPDMask[i] = Mask[i] % 2;
16907 }
16908
16909 if (!IsSHUFPD && !IsCommutable)
16910 return false;
16911
16912 if (!IsSHUFPD && IsCommutable)
16913 std::swap(V1, V2);
16914
16915 ForceV1Zero = ZeroLane[0];
16916 ForceV2Zero = ZeroLane[1];
16917 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16918 return true;
16919 }
16920
// Lowers a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when
// matchShuffleWithSHUFPD() accepts the mask; returns an empty SDValue
// otherwise.
// NOTE(review): the first line of this signature (listing line 16921) is
// missing from this listing; the name lowerShuffleWithSHUFPD is taken from the
// call in the v4f64 lowering routine.
16922 SDValue V2, ArrayRef<int> Mask,
16923 const APInt &Zeroable,
16924 const X86Subtarget &Subtarget,
16925 SelectionDAG &DAG) {
16926 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16927 "Unexpected data type for VSHUFPD");
16928
16929 unsigned Immediate = 0;
16930 bool ForceV1Zero = false, ForceV2Zero = false;
// The matcher may exchange V1 and V2 and reports which operand, if any, has
// to be replaced by zero.
16931 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16932 Mask, Zeroable))
16933 return SDValue();
16934
16935 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16936 if (ForceV1Zero)
16937 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16938 if (ForceV2Zero)
16939 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16940
// The immediate is emitted as an i8 target constant operand.
16941 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16942 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16943}
16944
16945// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements, followed
16946// by zeroable elements in the remaining 24 elements. Turn this into two
16947// vmovqb instructions shuffled together.
// Returns an empty SDValue when the mask does not have that shape.
// NOTE(review): the first line of this signature (listing line 16948) is
// missing from this listing, so the function name is not visible here.
16949 SDValue V1, SDValue V2,
16950 ArrayRef<int> Mask,
16951 const APInt &Zeroable,
16952 SelectionDAG &DAG) {
16953 assert(VT == MVT::v32i8 && "Unexpected type!");
16954
16955 // The first 8 indices should be every 8th element.
16956 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16957 return SDValue();
16958
16959 // Remaining elements need to be zeroable.
// countl_one() counts the set bits from the top of Zeroable, so this requires
// the upper Mask.size() - 8 elements to all be zeroable.
16960 if (Zeroable.countl_one() < (Mask.size() - 8))
16961 return SDValue();
16962
// Reinterpret both inputs as four 64-bit elements and truncate each of them
// into a v16i8 result.
16963 V1 = DAG.getBitcast(MVT::v4i64, V1);
16964 V2 = DAG.getBitcast(MVT::v4i64, V2);
16965
16966 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16967 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16968
16969 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16970 // the upper bits of the result using an unpckldq.
// Bytes 0-3 come from V1's truncation, bytes 4-7 from V2's; the remaining
// positions read the zeroed bytes 4-7 of each truncation.
16971 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16972 { 0, 1, 2, 3, 16, 17, 18, 19,
16973 4, 5, 6, 7, 20, 21, 22, 23 });
16974 // Insert the unpckldq into a zero vector to widen to v32i8.
16975 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16976 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16977 DAG.getVectorIdxConstant(0, DL));
16978}
16979
16980// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16981// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16982// =>
16983// ul = unpckl v1, v2
16984// uh = unpckh v1, v2
16985// a = vperm ul, uh
16986// b = vperm ul, uh
16987//
16988// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16989// and permute. We cannot directly match v3 because it is split into two
16990// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16991// pair of 256-bit shuffles and makes sure the masks are consecutive.
16992//
16993// Once unpck and permute nodes are created, the permute corresponding to this
16994// shuffle is returned, while the other permute replaces the other half of the
16995// shuffle in the selection dag.
//
// Returns an empty SDValue when VT is not one of v8f32/v8i32/v16i16/v32i8, when
// Mask is neither half of the interleave, or when V1 and V2 do not have exactly
// two VECTOR_SHUFFLE users forming the two halves.
// NOTE(review): the first line of this signature (listing line 16996) is
// missing from this listing; the name lowerShufflePairAsUNPCKAndPermute is
// taken from the call sites in the per-type lowering routines.
16997 SDValue V1, SDValue V2,
16998 ArrayRef<int> Mask,
16999 SelectionDAG &DAG) {
17000 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
17001 VT != MVT::v32i8)
17002 return SDValue();
17003 // <B0, B1, B0+1, B1+1, ..., >
// Exact match only: undef (negative) mask entries make the check fail.
17004 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
17005 unsigned Begin1) {
17006 size_t Size = Mask.size();
17007 assert(Size % 2 == 0 && "Expected even mask size");
17008 for (unsigned I = 0; I < Size; I += 2) {
17009 if (Mask[I] != (int)(Begin0 + I / 2) ||
17010 Mask[I + 1] != (int)(Begin1 + I / 2))
17011 return false;
17012 }
17013 return true;
17014 };
17015 // Check which half is this shuffle node
// The first half interleaves starting at V1[0] / V2[0]; the second half starts
// at the middle element of each input (NumElts / 2 and NumElts + NumElts / 2).
17016 int NumElts = VT.getVectorNumElements();
17017 size_t FirstQtr = NumElts / 2;
17018 size_t ThirdQtr = NumElts + NumElts / 2;
17019 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
17020 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
17021 if (!IsFirstHalf && !IsSecondHalf)
17022 return SDValue();
17023
17024 // Find the intersection between shuffle users of V1 and V2.
// Only shuffles with V1 as operand 0 and V2 as operand 1 are collected.
17025 SmallVector<SDNode *, 2> Shuffles;
17026 for (SDNode *User : V1->users())
17027 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
17028 User->getOperand(1) == V2)
17029 Shuffles.push_back(User);
17030 // Limit user size to two for now.
17031 if (Shuffles.size() != 2)
17032 return SDValue();
17033 // Find out which half of the 512-bit shuffles is each smaller shuffle
17034 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
17035 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
// Both pointers are assigned in either matching branch below; the final else
// returns, so they are never read uninitialized.
17036 SDNode *FirstHalf;
17037 SDNode *SecondHalf;
17038 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
17039 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
17040 FirstHalf = Shuffles[0];
17041 SecondHalf = Shuffles[1];
17042 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
17043 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
17044 FirstHalf = Shuffles[1];
17045 SecondHalf = Shuffles[0];
17046 } else {
17047 return SDValue();
17048 }
17049 // Lower into unpck and perm. Return the perm of this shuffle and replace
17050 // the other.
// NOTE(review): immediates 0x20 / 0x31 presumably select the low / high
// 128-bit halves of (Unpckl, Unpckh) - confirm against the VPERM2F128 encoding.
17051 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
17052 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
17053 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17054 DAG.getTargetConstant(0x20, DL, MVT::i8));
17055 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17056 DAG.getTargetConstant(0x31, DL, MVT::i8));
// Side effect: the sibling shuffle node is rewritten in the DAG here, not only
// the shuffle currently being lowered.
17057 if (IsFirstHalf) {
17058 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
17059 return Perm1;
17060 }
17061 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
17062 return Perm2;
17063}
17064
17065/// Handle lowering of 4-lane 64-bit floating point shuffles.
17066///
17067/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17068/// isn't available.
// Strategies are tried in order and the first one that yields a node wins; the
// final fallbacks return unconditionally.
// NOTE(review): the first line of this signature (listing line 17069) and the
// opening lines of three `if (SDValue V = ...(` calls (listing lines 17107,
// 17158, 17167) are missing from this listing.
17070 const APInt &Zeroable, SDValue V1, SDValue V2,
17071 const X86Subtarget &Subtarget,
17072 SelectionDAG &DAG) {
17073 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17074 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17075 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17076
17077 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17078 Subtarget, DAG))
17079 return V;
17080
// Single-input shuffles: every path inside this block returns.
17081 if (V2.isUndef()) {
17082 // Check for being able to broadcast a single element.
17083 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17084 Mask, Subtarget, DAG))
17085 return Broadcast;
17086
17087 // Use low duplicate instructions for masks that match their pattern.
17088 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17089 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17090
17091 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17092 // Non-half-crossing single input shuffles can be lowered with an
17093 // interleaved permutation.
// Bit i of the immediate is set when result element i takes the high
// element of its own 128-bit lane (index 1 in lane 0, index 3 in lane 1).
17094 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17095 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17096 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17097 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17098 }
17099
17100 // With AVX2 we have direct support for this permutation.
17101 if (Subtarget.hasAVX2())
17102 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17103 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17104
17105 // Try to create an in-lane repeating shuffle mask and then shuffle the
17106 // results into the target lanes.
17108 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17109 return V;
17110
17111 // Try to permute the lanes and then use a per-lane permute.
17112 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17113 Mask, DAG, Subtarget))
17114 return V;
17115
17116 // Otherwise, fall back.
17117 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17118 DAG, Subtarget);
17119 }
17120
// Two-input shuffles from here on.
17121 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17122 Zeroable, Subtarget, DAG))
17123 return Blend;
17124
17125 // Use dedicated unpack instructions for masks that match their pattern.
17126 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
17127 return V;
17128
17129 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17130 Zeroable, Subtarget, DAG))
17131 return Op;
17132
17133 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
17134 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
17135 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
17136 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
17137
17138 // If we have lane crossing shuffles AND they don't all come from the lower
17139 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17140 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17141 // canonicalize to a blend of splat which isn't necessary for this combine.
// The lambda accepts indices 0-1 and 4-5 (the low 128-bit lane of V1 and V2)
// as well as negative sentinel values.
17142 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17143 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17144 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17145 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
17146 (!Subtarget.hasAVX2() ||
17147 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
17148 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
17149
17150 // If we have one input in place, then we can permute the other input and
17151 // blend the result.
17152 if (V1IsInPlace || V2IsInPlace)
17153 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17154 Zeroable, Subtarget, DAG);
17155
17156 // Try to create an in-lane repeating shuffle mask and then shuffle the
17157 // results into the target lanes.
17159 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17160 return V;
17161
17162 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17163 // shuffle. However, if we have AVX2 and either inputs are already in place,
17164 // we will be able to shuffle even across lanes the other input in a single
17165 // instruction so skip this pattern.
// NOTE(review): the in-place case already returned unconditionally above, so
// both flags are false here and this condition always holds.
17166 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
17168 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17169 return V;
17170
17171 // If we have VLX support, we can use VEXPAND.
17172 if (Subtarget.hasVLX())
17173 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
17174 Zeroable, Subtarget, DAG))
17175 return V;
17176
17177 // If we have AVX2 then we always want to lower with a blend because at v4 we
17178 // can fully permute the elements.
17179 if (Subtarget.hasAVX2())
17180 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17181 Zeroable, Subtarget, DAG);
17182
17183 // Otherwise fall back on generic lowering.
17184 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17185 Subtarget, DAG);
17186}
17187
17188/// Handle lowering of 4-lane 64-bit integer shuffles.
17189///
17190/// This routine is only called when we have AVX2 and thus a reasonable
17191/// instruction set for v4i64 shuffling.
// Strategies are tried in order and the first one that yields a node wins; the
// final fallback returns unconditionally.
// NOTE(review): the first line of this signature (listing line 17192) and the
// opening lines of two `if (SDValue ... = ...(` calls (listing lines 17278,
// 17292) are missing from this listing.
17193 const APInt &Zeroable, SDValue V1, SDValue V2,
17194 const X86Subtarget &Subtarget,
17195 SelectionDAG &DAG) {
17196 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17197 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17198 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17199 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17200
17201 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17202 Subtarget, DAG))
17203 return V;
17204
17205 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17206 Zeroable, Subtarget, DAG))
17207 return Blend;
17208
17209 // Check for being able to broadcast a single element.
17210 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17211 Subtarget, DAG))
17212 return Broadcast;
17213
17214 // Try to use shift instructions if fast.
// Early attempt passes BitwiseOnly = true; the general attempt further down
// passes false.
17215 if (Subtarget.preferLowerShuffleAsShift())
17216 if (SDValue Shift =
17217 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17218 Subtarget, DAG, /*BitwiseOnly*/ true))
17219 return Shift;
17220
// Single-input shuffles: every path inside this block returns.
17221 if (V2.isUndef()) {
17222 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17223 // can use lower latency instructions that will operate on both lanes.
17224 SmallVector<int, 2> RepeatedMask;
17225 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// The 2-element per-lane mask is rescaled (factor 2) into PSHUFDMask and the
// shuffle is performed on the v8i32 bitcast of V1, then cast back.
17226 SmallVector<int, 4> PSHUFDMask;
17227 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17228 return DAG.getBitcast(
17229 MVT::v4i64,
17230 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17231 DAG.getBitcast(MVT::v8i32, V1),
17232 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17233 }
17234
17235 // AVX2 provides a direct instruction for permuting a single input across
17236 // lanes.
17237 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17238 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17239 }
17240
17241 // Try to use shift instructions.
17242 if (SDValue Shift =
17243 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
17244 DAG, /*BitwiseOnly*/ false))
17245 return Shift;
17246
17247 // If we have VLX support, we can use VALIGN or VEXPAND.
17248 if (Subtarget.hasVLX()) {
17249 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17250 Zeroable, Subtarget, DAG))
17251 return Rotate;
17252
17253 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
17254 Zeroable, Subtarget, DAG))
17255 return V;
17256 }
17257
17258 // Try to use PALIGNR.
17259 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17260 Subtarget, DAG))
17261 return Rotate;
17262
17263 // Use dedicated unpack instructions for masks that match their pattern.
17264 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
17265 return V;
17266
17267 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
17268 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
17269
17270 // If we have one input in place, then we can permute the other input and
17271 // blend the result.
17272 if (V1IsInPlace || V2IsInPlace)
17273 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17274 Zeroable, Subtarget, DAG);
17275
17276 // Try to create an in-lane repeating shuffle mask and then shuffle the
17277 // results into the target lanes.
17279 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17280 return V;
17281
17282 // Try to lower to PERMQ(BLENDD(V1,V2)).
17283 if (SDValue V =
17284 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
17285 return V;
17286
17287 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17288 // shuffle. However, if we have AVX2 and either inputs are already in place,
17289 // we will be able to shuffle even across lanes the other input in a single
17290 // instruction so skip this pattern.
// NOTE(review): the in-place case already returned unconditionally above, so
// this condition always holds at this point.
17291 if (!V1IsInPlace && !V2IsInPlace)
17293 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17294 return Result;
17295
17296 // Otherwise fall back on generic blend lowering.
17297 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17298 Zeroable, Subtarget, DAG);
17299}
17300
17301/// Handle lowering of 8-lane 32-bit floating point shuffles.
17302///
17303/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17304/// isn't available.
// Strategies are tried in order and the first one that yields a node wins; the
// final fallbacks return unconditionally.
// NOTE(review): the first line of this signature (listing line 17305), the
// opening lines of two `if (SDValue ... = ...(` calls (listing lines 17363,
// 17385) and part of the interleave condition (listing lines 17399-17400) are
// missing from this listing.
17306 const APInt &Zeroable, SDValue V1, SDValue V2,
17307 const X86Subtarget &Subtarget,
17308 SelectionDAG &DAG) {
17309 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17310 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17311 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17312
17313 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17314 Zeroable, Subtarget, DAG))
17315 return Blend;
17316
17317 // Check for being able to broadcast a single element.
17318 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17319 Subtarget, DAG))
17320 return Broadcast;
17321
// Without AVX2, try a simple-only split first when the in-lane form of the
// mask is not repeated across the two 128-bit lanes.
17322 if (!Subtarget.hasAVX2()) {
17323 SmallVector<int> InLaneMask;
17324 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17325
17326 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
17327 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
17328 /*SimpleOnly*/ true))
17329 return R;
17330 }
// The extend helper is invoked with the integer type v8i32; its result is
// bitcast back to v8f32.
17331 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17332 Zeroable, Subtarget, DAG))
17333 return DAG.getBitcast(MVT::v8f32, ZExt);
17334
17335 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17336 // options to efficiently lower the shuffle.
// Every path inside this block returns.
17337 SmallVector<int, 4> RepeatedMask;
17338 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17339 assert(RepeatedMask.size() == 4 &&
17340 "Repeated masks must be half the mask width!");
17341
17342 // Use even/odd duplicate instructions for masks that match their pattern.
17343 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17344 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17345 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17346 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17347
17348 if (V2.isUndef())
17349 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17350 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17351
17352 // Use dedicated unpack instructions for masks that match their pattern.
17353 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
17354 return V;
17355
17356 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17357 // have already handled any direct blends.
17358 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17359 }
17360
17361 // Try to create an in-lane repeating shuffle mask and then shuffle the
17362 // results into the target lanes.
17364 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17365 return V;
17366
17367 // If we have a single input shuffle with different shuffle patterns in the
17368 // two 128-bit lanes use the variable mask to VPERMILPS.
// Every path inside this block returns. Note the operand order: VPERMILPV
// takes (V1, mask) while VPERMV takes (mask, V1).
17369 if (V2.isUndef()) {
17370 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17371 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17372 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17373 }
17374 if (Subtarget.hasAVX2()) {
17375 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17376 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17377 }
17378 // Otherwise, fall back.
17379 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17380 DAG, Subtarget);
17381 }
17382
17383 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17384 // shuffle.
17386 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17387 return Result;
17388
17389 // If we have VLX support, we can use VEXPAND.
17390 if (Subtarget.hasVLX())
17391 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
17392 Zeroable, Subtarget, DAG))
17393 return V;
17394
17395 // Try to match an interleave of two v8f32s and lower them as unpck and
17396 // permutes using ymms. This needs to go before we try to split the vectors.
17397 // Don't attempt on AVX1 if we're likely to split vectors anyway.
17398 if ((Subtarget.hasAVX2() ||
17401 !Subtarget.hasAVX512())
17402 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
17403 Mask, DAG))
17404 return V;
17405
17406 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17407 // since after split we get a more efficient code using vpunpcklwd and
17408 // vpunpckhwd instrs than vblend.
17409 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17410 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17411 Subtarget, DAG);
17412
17413 // If we have AVX2 then we always want to lower with a blend because at v8 we
17414 // can fully permute the elements.
17415 if (Subtarget.hasAVX2())
17416 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17417 Zeroable, Subtarget, DAG);
17418
17419 // Otherwise fall back on generic lowering.
17420 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17421 Subtarget, DAG);
17422}
17423
17424/// Handle lowering of 8-lane 32-bit integer shuffles.
17425///
17426/// This routine is only called when we have AVX2 and thus a reasonable
17427/// instruction set for v8i32 shuffling.
// Strategies are tried in order and the first one that yields a node wins; the
// final fallback returns unconditionally.
// NOTE(review): the first line of this signature (listing line 17428) and the
// opening lines of two `if (SDValue ... = ...(` calls (listing lines 17528,
// 17557) are missing from this listing.
17429 const APInt &Zeroable, SDValue V1, SDValue V2,
17430 const X86Subtarget &Subtarget,
17431 SelectionDAG &DAG) {
17432 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17433 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17434 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17435 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17436
// Number of mask entries that select from V2 (index >= 8).
17437 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
17438
17439 // Whenever we can lower this as a zext, that instruction is strictly faster
17440 // than any alternative. It also allows us to fold memory operands into the
17441 // shuffle in many cases.
17442 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17443 Zeroable, Subtarget, DAG))
17444 return ZExt;
17445
17446 // Try to match an interleave of two v8i32s and lower them as unpck and
17447 // permutes using ymms. This needs to go before we try to split the vectors.
17448 if (!Subtarget.hasAVX512())
17449 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
17450 Mask, DAG))
17451 return V;
17452
17453 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17454 // since after split we get a more efficient code than vblend by using
17455 // vpunpcklwd and vpunpckhwd instrs.
17456 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
17457 !Subtarget.hasAVX512())
17458 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17459 Subtarget, DAG);
17460
17461 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17462 Zeroable, Subtarget, DAG))
17463 return Blend;
17464
17465 // Check for being able to broadcast a single element.
17466 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17467 Subtarget, DAG))
17468 return Broadcast;
17469
17470 // Try to use shift instructions if fast.
// On such subtargets the bit-rotate attempt is also made here; otherwise it is
// made after the general shift attempt below.
17471 if (Subtarget.preferLowerShuffleAsShift()) {
17472 if (SDValue Shift =
17473 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17474 Subtarget, DAG, /*BitwiseOnly*/ true))
17475 return Shift;
17476 if (NumV2Elements == 0)
17477 if (SDValue Rotate =
17478 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17479 return Rotate;
17480 }
17481
17482 // If the shuffle mask is repeated in each 128-bit lane we can use more
17483 // efficient instructions that mirror the shuffles across the two 128-bit
17484 // lanes.
17485 SmallVector<int, 4> RepeatedMask;
17486 bool Is128BitLaneRepeatedShuffle =
17487 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17488 if (Is128BitLaneRepeatedShuffle) {
17489 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17490 if (V2.isUndef())
17491 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17492 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17493
17494 // Use dedicated unpack instructions for masks that match their pattern.
17495 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
17496 return V;
17497 }
17498
17499 // Try to use shift instructions.
17500 if (SDValue Shift =
17501 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
17502 DAG, /*BitwiseOnly*/ false))
17503 return Shift;
17504
17505 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17506 if (SDValue Rotate =
17507 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17508 return Rotate;
17509
17510 // If we have VLX support, we can use VALIGN or EXPAND.
17511 if (Subtarget.hasVLX()) {
17512 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17513 Zeroable, Subtarget, DAG))
17514 return Rotate;
17515
17516 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
17517 Zeroable, Subtarget, DAG))
17518 return V;
17519 }
17520
17521 // Try to use byte rotation instructions.
17522 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17523 Subtarget, DAG))
17524 return Rotate;
17525
17526 // Try to create an in-lane repeating shuffle mask and then shuffle the
17527 // results into the target lanes.
17529 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17530 return V;
17531
// Single-input shuffles that reach this point always return from this block.
17532 if (V2.isUndef()) {
17533 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17534 // because that should be faster than the variable permute alternatives.
17535 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
17536 return V;
17537
17538 // If the shuffle patterns aren't repeated but it's a single input, directly
17539 // generate a cross-lane VPERMD instruction.
17540 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17541 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17542 }
17543
17544 // Assume that a single SHUFPS is faster than an alternative sequence of
17545 // multiple instructions (even if the CPU has a domain penalty).
17546 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
// The shuffle is performed on v8f32 bitcasts of both inputs and the result is
// cast back to v8i32.
17547 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17548 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17549 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17550 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17551 CastV1, CastV2, DAG);
17552 return DAG.getBitcast(MVT::v8i32, ShufPS);
17553 }
17554
17555 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17556 // shuffle.
17558 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17559 return Result;
17560
17561 // Otherwise fall back on generic blend lowering.
17562 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17563 Zeroable, Subtarget, DAG);
17564}
17565
17566/// Handle lowering of 16-lane 16-bit integer shuffles.
17567///
17568/// This routine is only called when we have AVX2 and thus a reasonable
17569/// instruction set for v16i16 shuffling.
// Strategies are tried in order and the first one that yields a node wins; the
// final fallback returns unconditionally.
// NOTE(review): the first line of this signature (listing line 17570) and the
// opening lines of several calls (listing lines 17582, 17622, 17640, 17653,
// 17668, 17673) are missing from this listing.
17571 const APInt &Zeroable, SDValue V1, SDValue V2,
17572 const X86Subtarget &Subtarget,
17573 SelectionDAG &DAG) {
17574 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17575 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17576 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17577 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17578
17579 // Whenever we can lower this as a zext, that instruction is strictly faster
17580 // than any alternative. It also allows us to fold memory operands into the
17581 // shuffle in many cases.
17583 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17584 return ZExt;
17585
17586 // Check for being able to broadcast a single element.
17587 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17588 Subtarget, DAG))
17589 return Broadcast;
17590
17591 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17592 Zeroable, Subtarget, DAG))
17593 return Blend;
17594
17595 // Use dedicated unpack instructions for masks that match their pattern.
17596 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
17597 return V;
17598
17599 // Use dedicated pack instructions for masks that match their pattern.
17600 if (SDValue V =
17601 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17602 return V;
17603
17604 // Try to use lower using a truncation.
17605 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17606 Subtarget, DAG))
17607 return V;
17608
17609 // Try to use shift instructions.
17610 if (SDValue Shift =
17611 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17612 Subtarget, DAG, /*BitwiseOnly*/ false))
17613 return Shift;
17614
17615 // Try to use byte rotation instructions.
17616 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17617 Subtarget, DAG))
17618 return Rotate;
17619
17620 // Try to create an in-lane repeating shuffle mask and then shuffle the
17621 // results into the target lanes.
17623 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17624 return V;
17625
// Single-input shuffles. Unlike the wider-element routines, this block can fall
// through: a mask that is neither lane-crossing nor lane-repeated continues
// with the PSHUFB attempt below.
17626 if (V2.isUndef()) {
17627 // Try to use bit rotation instructions.
17628 if (SDValue Rotate =
17629 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17630 return Rotate;
17631
17632 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17633 // because that should be faster than the variable permute alternatives.
17634 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17635 return V;
17636
17637 // There are no generalized cross-lane shuffle operations available on i16
17638 // element types.
17639 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17641 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17642 return V;
17643
17644 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17645 DAG, Subtarget);
17646 }
17647
17648 SmallVector<int, 8> RepeatedMask;
17649 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17650 // As this is a single-input shuffle, the repeated mask should be
17651 // a strictly valid v8i16 mask that we can pass through to the v8i16
17652 // lowering to handle even the v16 case.
17654 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17655 }
17656 }
17657
17658 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17659 Zeroable, Subtarget, DAG))
17660 return PSHUFB;
17661
17662 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
// Unconditional return: with BWI none of the strategies below are reached.
17663 if (Subtarget.hasBWI())
17664 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17665
17666 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17667 // shuffle.
17669 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17670 return Result;
17671
17672 // Try to permute the lanes and then use a per-lane permute.
17674 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17675 return V;
17676
17677 // Try to match an interleave of two v16i16s and lower them as unpck and
17678 // permutes using ymms.
17679 if (!Subtarget.hasAVX512())
17680 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17681 Mask, DAG))
17682 return V;
17683
17684 // Otherwise fall back on generic lowering.
17685 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17686 Subtarget, DAG);
17687}
17688
17689/// Handle lowering of 32-lane 8-bit integer shuffles.
17690///
17691/// This routine is only called when we have AVX2 and thus a reasonable
17692/// instruction set for v32i8 shuffling.
     ///
     /// Lowering strategies are attempted in the order written below. The first
     /// helper that yields a non-null SDValue wins; the generic split-or-blend
     /// lowering at the end is the unconditional fallback.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v32i8.
     /// \param Subtarget Queried here for AVX2 (asserted), VBMI, VLX and AVX512.
     /// \param DAG       Selection DAG handed to every lowering helper.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of several helper calls.
17694 const APInt &Zeroable, SDValue V1, SDValue V2,
17695 const X86Subtarget &Subtarget,
17696 SelectionDAG &DAG) {
17697 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17698 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17699 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17700 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17701
17702 // Whenever we can lower this as a zext, that instruction is strictly faster
17703 // than any alternative. It also allows us to fold memory operands into the
17704 // shuffle in many cases.
17705 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17706 Zeroable, Subtarget, DAG))
17707 return ZExt;
17708
17709 // Check for being able to broadcast a single element.
17710 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17711 Subtarget, DAG))
17712 return Broadcast;
17713
      // Try the blend lowering of V1 and V2.
17714 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17715 Zeroable, Subtarget, DAG))
17716 return Blend;
17717
17718 // Use dedicated unpack instructions for masks that match their pattern.
17719 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17720 return V;
17721
17722 // Use dedicated pack instructions for masks that match their pattern.
17723 if (SDValue V =
17724 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17725 return V;
17726
17727 // Try to lower using a truncation.
17728 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17729 Subtarget, DAG))
17730 return V;
17731
17732 // Try to use shift instructions.
17733 if (SDValue Shift =
17734 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17735 DAG, /*BitwiseOnly*/ false))
17736 return Shift;
17737
17738 // Try to use byte rotation instructions.
17739 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17740 Subtarget, DAG))
17741 return Rotate;
17742
17743 // Try to use bit rotation instructions.
      // Only attempted for single-input shuffles; the helper is given V1 alone.
17744 if (V2.isUndef())
17745 if (SDValue Rotate =
17746 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17747 return Rotate;
17748
17749 // Try to create an in-lane repeating shuffle mask and then shuffle the
17750 // results into the target lanes.
17752 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17753 return V;
17754
17755 // There are no generalized cross-lane shuffle operations available on i8
17756 // element types.
      // This branch only handles single-input, lane-crossing masks and always
      // returns: the result of its last helper is returned unconditionally.
17757 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17758 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17759 // because that should be faster than the variable permute alternatives.
17760 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17761 return V;
17762
17764 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17765 return V;
17766
17767 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17768 DAG, Subtarget);
17769 }
17770
      // Try the PSHUFB-based lowering.
17771 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17772 Zeroable, Subtarget, DAG))
17773 return PSHUFB;
17774
17775 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
      // With VBMI this is the final answer; nothing below is tried.
17776 if (Subtarget.hasVBMI())
17777 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17778
17779 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17780 // shuffle.
17782 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17783 return Result;
17784
17785 // Try to permute the lanes and then use a per-lane permute.
17787 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17788 return V;
17789
17790 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
17791 // by zeroable elements in the remaining 24 elements. Turn this into two
17792 // vmovqb instructions shuffled together.
17793 if (Subtarget.hasVLX())
17794 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17795 Mask, Zeroable, DAG))
17796 return V;
17797
17798 // Try to match an interleave of two v32i8s and lower them as unpck and
17799 // permutes using ymms.
17800 if (!Subtarget.hasAVX512())
17801 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17802 Mask, DAG))
17803 return V;
17804
17805 // Otherwise fall back on generic lowering.
17806 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17807 Subtarget, DAG);
17808}
17809
17810/// High-level routine to lower various 256-bit x86 vector shuffles.
17811///
17812/// This routine either breaks down the specific type of a 256-bit x86 vector
17813/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17814/// together based on the available instructions.
     ///
     /// Order of operations in the body:
     ///  1. an element-insertion attempt when exactly one mask entry is
     ///     >= NumElts and that entry is entry 0;
     ///  2. the undef-half special case;
     ///  3. for integer VT without AVX2: bit-mask / bit-blend / split for elements
     ///     narrower than 32 bits, otherwise a re-shuffle on the floating point
     ///     vector type of the same element width;
     ///  4. v16f16 / v16bf16 are re-shuffled as v16i16;
     ///  5. dispatch on VT.SimpleTy to the per-type routine.
     ///
     /// \param V1, V2    Shuffle inputs (bitcast in place on paths 3 and 4).
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param Subtarget Queried here for AVX2 only; otherwise forwarded.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL, Mask and VT, and the opening line of the insertion call.
17816 SDValue V1, SDValue V2, const APInt &Zeroable,
17817 const X86Subtarget &Subtarget,
17818 SelectionDAG &DAG) {
17819 // If we have a single input to the zero element, insert that into V1 if we
17820 // can do so cheaply.
17821 int NumElts = VT.getVectorNumElements();
17822 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17823
17824 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17826 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17827 return Insertion;
17828
17829 // Handle special cases where the lower or upper half is UNDEF.
17830 if (SDValue V =
17831 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17832 return V;
17833
17834 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17835 // can check for those subtargets here and avoid much of the subtarget
17836 // querying in the per-vector-type lowering routines. With AVX1 we have
17837 // essentially *zero* ability to manipulate a 256-bit vector with integer
17838 // types. Since we'll use floating point types there eventually, just
17839 // immediately cast everything to a float and operate entirely in that domain.
17840 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17841 int ElementBits = VT.getScalarSizeInBits();
17842 if (ElementBits < 32) {
17843 // No floating point type available, if we can't use the bit operations
17844 // for masking/blending then decompose into 128-bit vectors.
17845 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17846 Subtarget, DAG))
17847 return V;
17848 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17849 return V;
17850 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17851 }
17852
      // Elements of 32 bits or more: build the FP vector type with the same
      // element width, emit the same mask as a shuffle of that type, and cast
      // the result back to VT.
17853 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17855 V1 = DAG.getBitcast(FpVT, V1);
17856 V2 = DAG.getBitcast(FpVT, V2);
17857 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17858 }
17859
      // 16-bit FP vectors have no routine in the switch below: shuffle them as
      // v16i16 with the same mask and cast the result back.
17860 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17861 V1 = DAG.getBitcast(MVT::v16i16, V1);
17862 V2 = DAG.getBitcast(MVT::v16i16, V2);
17863 return DAG.getBitcast(VT,
17864 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17865 }
17866
      // Dispatch to the per-type lowering routine.
17867 switch (VT.SimpleTy) {
17868 case MVT::v4f64:
17869 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17870 case MVT::v4i64:
17871 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17872 case MVT::v8f32:
17873 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17874 case MVT::v8i32:
17875 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17876 case MVT::v16i16:
17877 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17878 case MVT::v32i8:
17879 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17880
17881 default:
17882 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17883 }
17884}
17885
17886/// Try to lower a vector shuffle as 128-bit shuffles.
     ///
     /// Requires a 512-bit VT with 64-bit elements (both asserted). Returns an
     /// empty SDValue when Mask cannot be widened to four 128-bit entries, or
     /// when the two widened entries that feed the same half of the result come
     /// from different inputs. Otherwise returns, in priority order: an insert of
     /// the low part of V1 into a zero vector, an insert of a 4-element
     /// subvector at element 4, an insert of the low 128 bits of V2 into V1, or
     /// an X86ISD::SHUF128 node.
     ///
     /// \param Zeroable  Tested against the literal masks 0xf0 and 0x0c below.
     /// \param V1, V2    Shuffle inputs.
     /// \param Subtarget Only forwarded to getZeroVector.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): the first signature line, which declares DL, VT and Mask,
     /// is missing from this listing (gap in the embedded line numbers).
17888 const APInt &Zeroable, SDValue V1, SDValue V2,
17889 const X86Subtarget &Subtarget,
17890 SelectionDAG &DAG) {
17891 assert(VT.getScalarSizeInBits() == 64 &&
17892 "Unexpected element type size for 128bit shuffle.");
17893
17894 // To handle 256 bit vector requires VLX and most probably
17895 // function lowerV2X128VectorShuffle() is better solution.
17896 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17897
17898 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17899 SmallVector<int, 4> Widened128Mask;
17900 if (!canWidenShuffleElements(Mask, Widened128Mask))
17901 return SDValue();
17902 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17903
17904 // Try to use an insert into a zero vector.
      // Needs widened entry 0 to be 0 and Zeroable bits 4-7 all set, plus either
      // widened entry 1 equal to 1 or Zeroable bits 2-3 both set. The extracted
      // low part has 2 elements when bits 2-3 are set, otherwise 4.
      // NOTE(review): presumably bit i of Zeroable describes result element i;
      // confirm against the code that computes Zeroable.
17905 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17906 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17907 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17908 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17909 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17910 DAG.getVectorIdxConstant(0, DL));
17911 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17912 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17913 DAG.getVectorIdxConstant(0, DL));
17914 }
17915
17916 // Check for patterns which can be matched with a single insert of a 256-bit
17917 // subvector.
      // The low 4 elements of V1 (when OnlyUsesV1) or of V2 are inserted into V1
      // at element index 4.
17918 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17919 if (OnlyUsesV1 ||
17920 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17921 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17922 SDValue SubVec =
17923 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17924 DAG.getVectorIdxConstant(0, DL));
17925 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17926 DAG.getVectorIdxConstant(4, DL));
17927 }
17928
17929 // See if this is an insertion of the lower 128-bits of V2 into V1.
      // V2Index records which of the four widened slots takes the V2 subvector;
      // it stays -1 if no slot reads V2.
17930 bool IsInsert = true;
17931 int V2Index = -1;
17932 for (int i = 0; i < 4; ++i) {
17933 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
      // Negative entries place no constraint on the match.
17934 if (Widened128Mask[i] < 0)
17935 continue;
17936
17937 // Make sure all V1 subvectors are in place.
17938 if (Widened128Mask[i] < 4) {
17939 if (Widened128Mask[i] != i) {
17940 IsInsert = false;
17941 break;
17942 }
17943 } else {
17944 // Make sure we only have a single V2 index and it's the lowest 128-bits.
17945 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17946 IsInsert = false;
17947 break;
17948 }
17949 V2Index = i;
17950 }
17951 }
17952 if (IsInsert && V2Index >= 0) {
17953 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17954 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17955 DAG.getVectorIdxConstant(0, DL));
      // NOTE(review): V2Index * 2 presumably converts the 128-bit slot number to
      // an element index (two 64-bit elements per slot); confirm against
      // insert128BitVector.
17956 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17957 }
17958
17959 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17960 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17961 // possible we at least ensure the lanes stay sequential to help later
17962 // combines.
17963 SmallVector<int, 2> Widened256Mask;
17964 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17965 Widened128Mask.clear();
17966 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17967 }
17968
17969 // Try to lower to vshuf64x2/vshuf32x4.
      // Ops[0] supplies widened slots 0-1 and Ops[1] supplies slots 2-3 (the
      // operand index is i / 2); each starts as UNDEF until a slot claims it.
17970 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17971 int PermMask[4] = {-1, -1, -1, -1};
17972 // Ensure elements came from the same Op.
17973 for (int i = 0; i < 4; ++i) {
17974 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17975 if (Widened128Mask[i] < 0)
17976 continue;
17977
17978 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17979 unsigned OpIndex = i / 2;
17980 if (Ops[OpIndex].isUndef())
17981 Ops[OpIndex] = Op;
17982 else if (Ops[OpIndex] != Op)
17983 return SDValue();
17984
      // Slot index (0-3) within whichever input was selected above.
17985 PermMask[i] = Widened128Mask[i] % 4;
17986 }
17987
17988 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17989 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17990}
17991
17992/// Handle lowering of 8-lane 64-bit floating point shuffles.
     ///
     /// Single-input shuffles are tried first (MOVDDUP, an immediate VPERMILPI
     /// when the mask does not cross 128-bit lanes, then VPERMI when the mask
     /// repeats per 256-bit lane). After that the two-input helpers are tried in
     /// the order written, with lowerShuffleWithPERMV as the unconditional
     /// fallback.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v8f64.
     /// \param Subtarget Forwarded to the helpers that accept it.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): the first signature line, which declares DL and Mask, is
     /// missing from this listing (gap in the embedded line numbers).
17994 const APInt &Zeroable, SDValue V1, SDValue V2,
17995 const X86Subtarget &Subtarget,
17996 SelectionDAG &DAG) {
17997 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17998 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17999 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18000
18001 if (V2.isUndef()) {
18002 // Use low duplicate instructions for masks that match their pattern.
18003 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18004 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18005
18006 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18007 // Non-half-crossing single input shuffles can be lowered with an
18008 // interleaved permutation.
      // Bit i of the immediate is set exactly when Mask[i] equals the odd
      // index of its element pair (1, 3, 5 or 7); every other value, including
      // a negative entry, leaves the bit clear.
18009 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18010 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18011 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18012 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18013 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18014 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18015 }
18016
      // Lane-crossing single-input mask: use VPERMI if the same 4-element
      // pattern repeats in both 256-bit halves.
18017 SmallVector<int, 4> RepeatedMask;
18018 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18019 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18020 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18021 }
18022
      // Try the 128-bit subvector shuffle lowering.
18023 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18024 V2, Subtarget, DAG))
18025 return Shuf128;
18026
      // Try the unpack lowering.
18027 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
18028 return Unpck;
18029
18030 // Check if the blend happens to exactly fit that of SHUFPD.
18031 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18032 Zeroable, Subtarget, DAG))
18033 return Op;
18034
      // Try the EXPAND lowering.
18035 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
18036 Subtarget, DAG))
18037 return V;
18038
      // Try the blend lowering.
18039 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18040 Zeroable, Subtarget, DAG))
18041 return Blend;
18042
      // Nothing cheaper matched: use the PERMV lowering.
18043 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18044}
18045
18046/// Handle lowering of 16-lane 32-bit floating point shuffles.
     ///
     /// When the mask repeats in every 128-bit lane, the block handling that case
     /// always returns (its last statement is an unconditional SHUFPS lowering).
     /// Otherwise the remaining helpers are tried in the order written, with
     /// lowerShuffleWithPERMV as the unconditional fallback.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v16f32.
     /// \param Subtarget Forwarded to the helpers that accept it.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of two helper calls.
18048 const APInt &Zeroable, SDValue V1, SDValue V2,
18049 const X86Subtarget &Subtarget,
18050 SelectionDAG &DAG) {
18051 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18052 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18053 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18054
18055 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18056 // options to efficiently lower the shuffle.
18057 SmallVector<int, 4> RepeatedMask;
18058 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18059 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18060
18061 // Use even/odd duplicate instructions for masks that match their pattern.
18062 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18063 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18064 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18065 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18066
      // Single input: one VPERMILPI with an immediate built from the repeated
      // 4-element mask.
18067 if (V2.isUndef())
18068 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18069 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18070
18071 // Use dedicated unpack instructions for masks that match their pattern.
18072 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
18073 return V;
18074
      // Try the blend lowering.
18075 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18076 Zeroable, Subtarget, DAG))
18077 return Blend;
18078
18079 // Otherwise, fall back to a SHUFPS sequence.
18080 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18081 }
18082
      // Non-repeated mask: try the blend lowering on the full mask.
18083 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18084 Zeroable, Subtarget, DAG))
18085 return Blend;
18086
      // This helper is invoked with MVT::v16i32, so its result is cast back to
      // v16f32 before being returned.
18088 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18089 return DAG.getBitcast(MVT::v16f32, ZExt);
18090
18091 // Try to create an in-lane repeating shuffle mask and then shuffle the
18092 // results into the target lanes.
18094 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18095 return V;
18096
18097 // If we have a single input shuffle with different shuffle patterns in the
18098 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18099 if (V2.isUndef() &&
18100 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
      // The control operand is a v16i32 constant vector built from Mask.
18101 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18102 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18103 }
18104
18105 // If we have AVX512F support, we can use VEXPAND.
18106 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
18107 Zeroable, Subtarget, DAG))
18108 return V;
18109
      // Nothing cheaper matched: use the PERMV lowering.
18110 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18111}
18112
18113/// Handle lowering of 8-lane 64-bit integer shuffles.
     ///
     /// Helpers are tried in the order written; the first one that yields a
     /// non-null SDValue wins and lowerShuffleWithPERMV is the unconditional
     /// fallback. The shift lowering is attempted twice: early with
     /// BitwiseOnly=true when the subtarget prefers shifts, and later with
     /// BitwiseOnly=false regardless of that preference.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v8i64.
     /// \param Subtarget Queried here for preferLowerShuffleAsShift() and BWI.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): the first signature line, which declares DL and Mask, is
     /// missing from this listing (gap in the embedded line numbers).
18115 const APInt &Zeroable, SDValue V1, SDValue V2,
18116 const X86Subtarget &Subtarget,
18117 SelectionDAG &DAG) {
18118 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18119 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18120 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18121
18122 // Try to use shift instructions if fast.
18123 if (Subtarget.preferLowerShuffleAsShift())
18124 if (SDValue Shift =
18125 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18126 Subtarget, DAG, /*BitwiseOnly*/ true))
18127 return Shift;
18128
18129 if (V2.isUndef()) {
18130 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18131 // can use lower latency instructions that will operate on all four
18132 // 128-bit lanes.
18133 SmallVector<int, 2> Repeated128Mask;
18134 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      // Expand the 2-entry i64 lane mask to a 4-entry mask (scale 2), then
      // emit PSHUFD on the v16i32 view of V1 and cast back to v8i64.
18135 SmallVector<int, 4> PSHUFDMask;
18136 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18137 return DAG.getBitcast(
18138 MVT::v8i64,
18139 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18140 DAG.getBitcast(MVT::v16i32, V1),
18141 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18142 }
18143
      // Otherwise use VPERMI if the same 4-element pattern repeats in both
      // 256-bit halves.
18144 SmallVector<int, 4> Repeated256Mask;
18145 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18146 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18147 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18148 }
18149
      // Try the 128-bit subvector shuffle lowering.
18150 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18151 V2, Subtarget, DAG))
18152 return Shuf128;
18153
18154 // Try to use shift instructions.
18155 if (SDValue Shift =
18156 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
18157 DAG, /*BitwiseOnly*/ false))
18158 return Shift;
18159
18160 // Try to use VALIGN.
18161 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18162 Zeroable, Subtarget, DAG))
18163 return Rotate;
18164
18165 // Try to use PALIGNR.
      // Only attempted when the subtarget has BWI.
18166 if (Subtarget.hasBWI())
18167 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18168 Subtarget, DAG))
18169 return Rotate;
18170
      // Try the unpack lowering.
18171 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
18172 return Unpck;
18173
18174 // If we have AVX512F support, we can use VEXPAND.
18175 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18176 Subtarget, DAG))
18177 return V;
18178
      // Try the blend lowering.
18179 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18180 Zeroable, Subtarget, DAG))
18181 return Blend;
18182
      // Nothing cheaper matched: use the PERMV lowering.
18183 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18184}
18185
18186/// Handle lowering of 16-lane 32-bit integer shuffles.
     ///
     /// Helpers are tried in the order written; the first one that yields a
     /// non-null SDValue wins and lowerShuffleWithPERMV is the unconditional
     /// fallback. The shift lowering is attempted early with BitwiseOnly=true
     /// when the subtarget prefers shifts, and again later with
     /// BitwiseOnly=false.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v16i32.
     /// \param Subtarget Queried here for preferLowerShuffleAsShift() and BWI.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of two helper calls.
18188 const APInt &Zeroable, SDValue V1, SDValue V2,
18189 const X86Subtarget &Subtarget,
18190 SelectionDAG &DAG) {
18191 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18192 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18193 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18194
      // Number of mask entries whose index is 16 or greater.
18195 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
18196
18197 // Whenever we can lower this as a zext, that instruction is strictly faster
18198 // than any alternative. It also allows us to fold memory operands into the
18199 // shuffle in many cases.
18201 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18202 return ZExt;
18203
18204 // Try to use shift instructions if fast.
18205 if (Subtarget.preferLowerShuffleAsShift()) {
18206 if (SDValue Shift =
18207 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18208 Subtarget, DAG, /*BitwiseOnly*/ true))
18209 return Shift;
      // Bit rotation takes V1 only, so it is tried when no entry is >= 16.
18210 if (NumV2Elements == 0)
18211 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
18212 Subtarget, DAG))
18213 return Rotate;
18214 }
18215
18216 // If the shuffle mask is repeated in each 128-bit lane we can use more
18217 // efficient instructions that mirror the shuffles across the four 128-bit
18218 // lanes.
18219 SmallVector<int, 4> RepeatedMask;
18220 bool Is128BitLaneRepeatedShuffle =
18221 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18222 if (Is128BitLaneRepeatedShuffle) {
18223 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
      // Single input: one PSHUFD with an immediate built from the repeated
      // 4-element mask.
18224 if (V2.isUndef())
18225 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18226 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18227
18228 // Use dedicated unpack instructions for masks that match their pattern.
18229 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
18230 return V;
18231 }
18232
18233 // Try to use shift instructions.
18234 if (SDValue Shift =
18235 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18236 Subtarget, DAG, /*BitwiseOnly*/ false))
18237 return Shift;
18238
      // NOTE(review): the earlier bit-rotate attempt is guarded by
      // NumV2Elements == 0, whereas this one requires NumV2Elements != 0 even
      // though only V1 is passed to the helper. Confirm the condition is
      // intended and not an inverted comparison.
18239 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
18240 if (SDValue Rotate =
18241 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
18242 return Rotate;
18243
18244 // Try to use VALIGN.
18245 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18246 Zeroable, Subtarget, DAG))
18247 return Rotate;
18248
18249 // Try to use byte rotation instructions.
      // Only attempted when the subtarget has BWI.
18250 if (Subtarget.hasBWI())
18251 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18252 Subtarget, DAG))
18253 return Rotate;
18254
18255 // Assume that a single SHUFPS is faster than using a permv shuffle.
18256 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18257 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
      // Shuffle on the v16f32 view of both inputs, then cast back to v16i32.
18258 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18259 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18260 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18261 CastV1, CastV2, DAG);
18262 return DAG.getBitcast(MVT::v16i32, ShufPS);
18263 }
18264
18265 // Try to create an in-lane repeating shuffle mask and then shuffle the
18266 // results into the target lanes.
18268 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18269 return V;
18270
18271 // If we have AVX512F support, we can use VEXPAND.
18272 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
18273 Zeroable, Subtarget, DAG))
18274 return V;
18275
      // Try the blend lowering.
18276 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18277 Zeroable, Subtarget, DAG))
18278 return Blend;
18279
      // Nothing cheaper matched: use the PERMV lowering.
18280 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18281}
18282
18283/// Handle lowering of 32-lane 16-bit integer shuffles.
     ///
     /// Asserts that the subtarget has BWI. Helpers are tried in the order
     /// written; the first one that yields a non-null SDValue wins and
     /// lowerShuffleWithPERMV is the unconditional fallback.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v32i16.
     /// \param Subtarget Queried here for BWI (asserted); otherwise forwarded.
     /// \param DAG       Selection DAG handed to every lowering helper.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of two helper calls.
18285 const APInt &Zeroable, SDValue V1, SDValue V2,
18286 const X86Subtarget &Subtarget,
18287 SelectionDAG &DAG) {
18288 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18289 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18290 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18291 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18292
18293 // Whenever we can lower this as a zext, that instruction is strictly faster
18294 // than any alternative. It also allows us to fold memory operands into the
18295 // shuffle in many cases.
18297 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18298 return ZExt;
18299
18300 // Use dedicated unpack instructions for masks that match their pattern.
18301 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
18302 return V;
18303
18304 // Use dedicated pack instructions for masks that match their pattern.
18305 if (SDValue V =
18306 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18307 return V;
18308
18309 // Try to use shift instructions.
18310 if (SDValue Shift =
18311 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
18312 Subtarget, DAG, /*BitwiseOnly*/ false))
18313 return Shift;
18314
18315 // Try to use byte rotation instructions.
18316 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18317 Subtarget, DAG))
18318 return Rotate;
18319
      // Single-input-only strategies.
18320 if (V2.isUndef()) {
18321 // Try to use bit rotation instructions.
18322 if (SDValue Rotate =
18323 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18324 return Rotate;
18325
18326 SmallVector<int, 8> RepeatedMask;
18327 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18328 // As this is a single-input shuffle, the repeated mask should be
18329 // a strictly valid v8i16 mask that we can pass through to the v8i16
18330 // lowering to handle even the v32 case.
18331 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18332 RepeatedMask, Subtarget, DAG);
18333 }
18334 }
18335
      // Try the blend lowering.
18336 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18337 Zeroable, Subtarget, DAG))
18338 return Blend;
18339
      // Try the PSHUFB-based lowering.
18340 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18341 Zeroable, Subtarget, DAG))
18342 return PSHUFB;
18343
18344 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18345 // shuffle.
18347 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18348 return Result;
18349
      // Nothing cheaper matched: use the PERMV lowering.
18350 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18351}
18352
18353/// Handle lowering of 64-lane 8-bit integer shuffles.
     ///
     /// Asserts that the subtarget has BWI. Helpers are tried in the order
     /// written and the first one that yields a non-null SDValue wins. Masks
     /// that do not cross 128-bit lanes always return from the dedicated block
     /// near the end; lane-crossing masks end in PERMV on VBMI targets and in
     /// splitAndLowerShuffle otherwise.
     ///
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param V1, V2    Shuffle inputs; both are asserted to be v64i8.
     /// \param Subtarget Queried here for BWI (asserted) and VBMI.
     /// \param DAG       Selection DAG handed to every lowering helper.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of several helper calls.
18355 const APInt &Zeroable, SDValue V1, SDValue V2,
18356 const X86Subtarget &Subtarget,
18357 SelectionDAG &DAG) {
18358 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18359 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18360 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18361 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18362
18363 // Whenever we can lower this as a zext, that instruction is strictly faster
18364 // than any alternative. It also allows us to fold memory operands into the
18365 // shuffle in many cases.
18367 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18368 return ZExt;
18369
18370 // Use dedicated unpack instructions for masks that match their pattern.
18371 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
18372 return V;
18373
18374 // Use dedicated pack instructions for masks that match their pattern.
18375 if (SDValue V =
18376 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18377 return V;
18378
18379 // Try to use shift instructions.
18380 if (SDValue Shift =
18381 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
18382 DAG, /*BitwiseOnly*/ false))
18383 return Shift;
18384
18385 // Try to use byte rotation instructions.
18386 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18387 Subtarget, DAG))
18388 return Rotate;
18389
18390 // Try to use bit rotation instructions.
      // Only attempted for single-input shuffles; the helper is given V1 alone.
18391 if (V2.isUndef())
18392 if (SDValue Rotate =
18393 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18394 return Rotate;
18395
18396 // Lower as AND if possible.
18397 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18398 Zeroable, Subtarget, DAG))
18399 return Masked;
18400
      // Try the PSHUFB-based lowering.
18401 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18402 Zeroable, Subtarget, DAG))
18403 return PSHUFB;
18404
18405 // Try to create an in-lane repeating shuffle mask and then shuffle the
18406 // results into the target lanes.
18407 // FIXME: Avoid on VBMI targets as the post lane permute often interferes
18408 // with shuffle combining (should be fixed by topological DAG sorting).
18409 if (!Subtarget.hasVBMI())
18411 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18412 return V;
18413
18415 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18416 return Result;
18417
      // Try the blend lowering.
18418 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18419 Zeroable, Subtarget, DAG))
18420 return Blend;
18421
      // In-lane masks are fully handled here: every path in this block returns.
18422 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18423 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18424 // PALIGNR will be cheaper than the second PSHUFB+OR.
18425 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18426 Mask, Subtarget, DAG))
18427 return V;
18428
18429 // VBMI can use VPERMV/VPERMV3 byte shuffles more efficiently than
18430 // OR(PSHUFB,PSHUFB).
18431 if (Subtarget.hasVBMI())
18432 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget,
18433 DAG);
18434
18435 // If we can't directly blend but can use PSHUFB, that will be better as it
18436 // can both shuffle and set up the inefficient blend.
      // V1InUse / V2InUse are passed to the helper but not read afterwards.
18437 bool V1InUse, V2InUse;
18438 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18439 DAG, V1InUse, V2InUse);
18440 }
18441
18442 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18443 // shuffle.
18445 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18446 return Result;
18447
18448 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18449 if (Subtarget.hasVBMI())
18450 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18451
      // Lane-crossing mask without VBMI: split into halves and lower those.
18452 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
18453 /*SimpleOnly*/ false);
18454}
18455
18456/// High-level routine to lower various 512-bit x86 vector shuffles.
18457///
18458/// This routine either breaks down the specific type of a 512-bit x86 vector
18459/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18460/// together based on the available instructions.
     ///
     /// Order of operations in the body:
     ///  1. an element-insertion attempt when exactly one mask entry is
     ///     >= NumElts and that entry is entry 0;
     ///  2. the undef-half special case;
     ///  3. the broadcast lowering;
     ///  4. v32i16 / v64i8 without BWI: bit-mask, bit-blend, then split;
     ///  5. v32f16 / v32bf16: split without BWI, otherwise re-shuffle as v32i16;
     ///  6. dispatch on VT.SimpleTy to the per-type routine.
     ///
     /// \param VT        Vector type being shuffled.
     /// \param V1, V2    Shuffle inputs (bitcast in place on path 5).
     /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
     /// \param Subtarget Must have AVX512 (asserted); also queried for BWI.
     /// \param DAG       Selection DAG used to build nodes.
     ///
     /// NOTE(review): this listing omits some source lines (see the gaps in the
     /// embedded line numbers), including the first signature line, which
     /// declares DL and Mask, and the opening line of the insertion call.
18462 MVT VT, SDValue V1, SDValue V2,
18463 const APInt &Zeroable,
18464 const X86Subtarget &Subtarget,
18465 SelectionDAG &DAG) {
18466 assert(Subtarget.hasAVX512() &&
18467 "Cannot lower 512-bit vectors w/ basic ISA!");
18468
18469 // If we have a single input to the zero element, insert that into V1 if we
18470 // can do so cheaply.
18471 int NumElts = Mask.size();
18472 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18473
18474 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18476 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18477 return Insertion;
18478
18479 // Handle special cases where the lower or upper half is UNDEF.
18480 if (SDValue V =
18481 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18482 return V;
18483
18484 // Check for being able to broadcast a single element.
18485 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18486 Subtarget, DAG))
18487 return Broadcast;
18488
18489 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18490 // Try using bit ops for masking and blending before falling back to
18491 // splitting.
18492 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18493 Subtarget, DAG))
18494 return V;
18495 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18496 return V;
18497
18498 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18499 }
18500
      // 16-bit FP vectors have no routine in the switch below: without BWI they
      // are split; with BWI they are shuffled as v32i16 with the same mask and
      // the result is cast back.
18501 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18502 if (!Subtarget.hasBWI())
18503 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
18504 /*SimpleOnly*/ false);
18505
18506 V1 = DAG.getBitcast(MVT::v32i16, V1);
18507 V2 = DAG.getBitcast(MVT::v32i16, V2);
18508 return DAG.getBitcast(VT,
18509 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18510 }
18511
18512 // Dispatch to each element type for lowering. If we don't have support for
18513 // specific element type shuffles at 512 bits, immediately split them and
18514 // lower them. Each lowering routine of a given type is allowed to assume that
18515 // the requisite ISA extensions for that element type are available.
18516 switch (VT.SimpleTy) {
18517 case MVT::v8f64:
18518 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18519 case MVT::v16f32:
18520 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18521 case MVT::v8i64:
18522 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18523 case MVT::v16i32:
18524 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18525 case MVT::v32i16:
18526 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18527 case MVT::v64i8:
18528 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18529
18530 default:
18531 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18532 }
18533}
18534
// Try to lower a unary mask-vector shuffle as a single X86ISD::KSHIFTR.
// Returns an empty SDValue unless V2 is undef and every defined mask element
// at position i selects source element i + ShiftAmt for one common
// ShiftAmt > 0. Undef mask elements are ignored.
// NOTE(review): the first line of the signature is missing from this listing;
// the visible call site names this helper lower1BitShuffleAsKSHIFTR.
18536 MVT VT, SDValue V1, SDValue V2,
18537 const X86Subtarget &Subtarget,
18538 SelectionDAG &DAG) {
18539 // Shuffle should be unary.
18540 if (!V2.isUndef())
18541 return SDValue();
18542
 // -1 means "no defined mask element seen yet".
18543 int ShiftAmt = -1;
18544 int NumElts = Mask.size();
18545 for (int i = 0; i != NumElts; ++i) {
18546 int M = Mask[i];
18547 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18548 "Unexpected mask index.");
18549 if (M < 0)
18550 continue;
18551
18552 // The first non-undef element determines our shift amount.
18553 if (ShiftAmt < 0) {
18554 ShiftAmt = M - i;
18555 // Need to be shifting right.
18556 if (ShiftAmt <= 0)
18557 return SDValue();
18558 }
18559 // All non-undef elements must shift by the same amount.
18560 if (ShiftAmt != M - i)
18561 return SDValue();
18562 }
 // Reaching here with ShiftAmt still negative would mean no defined element
 // was found; the assert treats that as a caller error.
18563 assert(ShiftAmt >= 0 && "All undef?");
18564
18565 // Great we found a shift right.
 // Widen the mask vector, shift in the widened type, then extract the
 // original-width result starting at element 0.
18566 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
18567 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
18568 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18569 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18570 DAG.getVectorIdxConstant(0, DL));
18571}
18572
18573// Determine if this shuffle can be implemented with a KSHIFT instruction.
18574// Returns the shift amount if possible or -1 if not. This is a simplified
18575// version of matchShuffleAsShift.
18576static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18577 int MaskOffset, const APInt &Zeroable) {
18578 int Size = Mask.size();
18579
18580 auto CheckZeros = [&](int Shift, bool Left) {
18581 for (int j = 0; j < Shift; ++j)
18582 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18583 return false;
18584
18585 return true;
18586 };
18587
18588 auto MatchShift = [&](int Shift, bool Left) {
18589 unsigned Pos = Left ? Shift : 0;
18590 unsigned Low = Left ? 0 : Shift;
18591 unsigned Len = Size - Shift;
18592 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18593 };
18594
18595 for (int Shift = 1; Shift != Size; ++Shift)
18596 for (bool Left : {true, false})
18597 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18598 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18599 return Shift;
18600 }
18601
18602 return -1;
18603}
18604
18605
18606// Lower vXi1 vector shuffles.
18607// There is no dedicated instruction on AVX-512 that shuffles the masks.
18608// The only way to shuffle bits is to sign-extend the mask vector to SIMD
18609// vector, shuffle and then truncate it back.
//
// Before that generic fallback, cheaper forms are tried in order: a
// zero-padded subvector, a KSHIFTR over undef lanes, KSHIFTL/KSHIFTR with
// zeroable lanes, shuffling the operands of a single-use SETCC, and an AND
// with a constant mask.
// NOTE(review): the first line of the signature is missing from this listing.
18611 MVT VT, SDValue V1, SDValue V2,
18612 const APInt &Zeroable,
18613 const X86Subtarget &Subtarget,
18614 SelectionDAG &DAG) {
18615 assert(Subtarget.hasAVX512() &&
18616 "Cannot lower 512-bit vectors w/o basic ISA!");
18617
18618 int NumElts = Mask.size();
18619 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18620
18621 // Try to recognize shuffles that are just padding a subvector with zeros.
 // SubvecElts counts the leading lanes that are undef or identity lanes of a
 // single source; Src records which source (0 = V1, 1 = V2).
18622 int SubvecElts = 0;
18623 int Src = -1;
18624 for (int i = 0; i != NumElts; ++i) {
18625 if (Mask[i] >= 0) {
18626 // Grab the source from the first valid mask. All subsequent elements need
18627 // to use this same source.
18628 if (Src < 0)
18629 Src = Mask[i] / NumElts;
18630 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18631 break;
18632 }
18633
18634 ++SubvecElts;
18635 }
18636 assert(SubvecElts != NumElts && "Identity shuffle?");
18637
18638 // Clip to a power 2.
18639 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18640
18641 // Make sure the number of zeroable bits in the top at least covers the bits
18642 // not covered by the subvector.
18643 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18644 assert(Src >= 0 && "Expected a source!");
 // Extract the low SubvecElts lanes of the source and insert them at index 0
 // of an all-zero vector of the result type.
18645 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18646 SDValue Extract =
18647 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18648 DAG.getVectorIdxConstant(0, DL));
18649 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18650 DAG.getConstant(0, DL, VT), Extract,
18651 DAG.getVectorIdxConstant(0, DL));
18652 }
18653
18654 // Try a simple shift right with undef elements. Later we'll try with zeros.
18655 if (SDValue Shift =
18656 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18657 return Shift;
18658
18659 // Try to match KSHIFTs.
 // Offset is 0 while testing V1 and NumElts while testing V2, so the mask is
 // compared against indices belonging to the operand under test.
18660 unsigned Offset = 0;
18661 for (SDValue V : {V1, V2}) {
18662 unsigned Opcode;
18663 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18664 if (ShiftAmt >= 0) {
18665 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18666 MVT WideVT = Res.getSimpleValueType();
18667 // Widened right shifts need two shifts to ensure we shift in zeroes.
18668 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18669 int WideElts = WideVT.getVectorNumElements();
18670 // Shift left to put the original vector in the MSBs of the new size.
18671 Res =
18672 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18673 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18674 // Increase the shift amount to account for the left shift.
18675 ShiftAmt += WideElts - NumElts;
18676 }
18677
18678 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18679 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18680 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18681 DAG.getVectorIdxConstant(0, DL));
18682 }
18683 Offset += NumElts; // Increment for next iteration.
18684 }
18685
18686 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18687 // ops instead.
18688 // TODO: What other unary shuffles would benefit from this?
18689 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18690 SDValue Op0 = V1.getOperand(0);
18691 SDValue Op1 = V1.getOperand(1);
 // NOTE(review): the declaration of `CC` (used below) is missing from this
 // listing.
18693 EVT OpVT = Op0.getValueType();
 // Only done when the compare operands have elements of at least 32 bits or
 // the mask is a broadcast.
18694 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18695 return DAG.getSetCC(
18696 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18697 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18698 }
18699
18700 // If this is a sequential shuffle with zero'd elements - then lower to AND.
18701 bool IsBlendWithZero = all_of(enumerate(Mask), [&Zeroable](auto M) {
18702 return Zeroable[M.index()] || (M.value() == (int)M.index());
18703 });
18704 if (IsBlendWithZero) {
 // Build an integer constant (at least 8 bits wide) whose set bits are the
 // non-zeroable lanes, reinterpret it as a vXi1 vector, take its low lanes
 // as a VT-typed mask and AND that with V1.
18705 const unsigned Width = std::max<unsigned>(NumElts, 8u);
18706 MVT IntVT = MVT::getIntegerVT(Width);
18707
18708 APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
18709 SDValue MaskNode = DAG.getConstant(MaskValue, DL, IntVT);
18710
18711 MVT MaskVecVT = MVT::getVectorVT(MVT::i1, Width);
18712 SDValue MaskVecNode = DAG.getBitcast(MaskVecVT, MaskNode);
18713
18714 SDValue MaskVec = DAG.getExtractSubvector(DL, VT, MaskVecNode, 0);
18715 return DAG.getNode(ISD::AND, DL, VT, V1, MaskVec);
18716 }
18717
 // Generic fallback: pick the SIMD type the mask is sign-extended to.
18718 MVT ExtVT;
18719 switch (VT.SimpleTy) {
18720 default:
18721 llvm_unreachable("Expected a vector of i1 elements");
18722 case MVT::v2i1:
18723 ExtVT = MVT::v2i64;
18724 break;
18725 case MVT::v4i1:
18726 ExtVT = MVT::v4i32;
18727 break;
18728 case MVT::v8i1:
18729 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18730 // shuffle.
18731 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18732 break;
18733 case MVT::v16i1:
18734 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18735 // 256-bit operation available.
18736 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18737 break;
18738 case MVT::v32i1:
18739 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18740 // 256-bit operation available.
18741 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18742 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18743 break;
18744 case MVT::v64i1:
18745 // Fall back to scalarization. FIXME: We can do better if the shuffle
18746 // can be partitioned cleanly.
18747 if (!Subtarget.useBWIRegs())
18748 return SDValue();
18749 ExtVT = MVT::v64i8;
18750 break;
18751 }
18752
18753 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18754 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18755
18756 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18757 // i1 was sign extended we can use X86ISD::CVT2MASK.
 // The compare emitted here is "0 > Shuffle" (signed), i.e. a lane is true
 // when its extended value is negative.
18758 int NumElems = VT.getVectorNumElements();
18759 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18760 (Subtarget.hasDQI() && (NumElems < 32)))
18761 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18762 Shuffle, ISD::SETGT);
18763
18764 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18765}
18766
18767/// Helper function that returns true if the shuffle mask should be
18768/// commuted to improve canonicalization.
///
/// Mask elements below Mask.size() are counted as V1 references, elements at
/// or above it as V2 references, and negative elements are ignored.
// NOTE(review): the signature line is missing from this listing.
18770 int NumElements = Mask.size();
18771
18772 int NumV1Elements = 0, NumV2Elements = 0;
18773 for (int M : Mask)
18774 if (M < 0)
18775 continue;
18776 else if (M < NumElements)
18777 ++NumV1Elements;
18778 else
18779 ++NumV2Elements;
18780
18781 // Commute the shuffle as needed such that more elements come from V1 than
18782 // V2. This allows us to match the shuffle pattern strictly on how many
18783 // elements come from V1 without handling the symmetric cases.
18784 if (NumV2Elements > NumV1Elements)
18785 return true;
18786
 // Past this point V1 supplies at least as many elements as V2, so a mask
 // with no V1 reference would have to be entirely undef.
18787 assert(NumV1Elements > 0 && "No V1 indices");
18788
18789 if (NumV2Elements == 0)
18790 return false;
18791
18792 // When the number of V1 and V2 elements are the same, try to minimize the
18793 // number of uses of V2 in the low half of the vector. When that is tied,
18794 // ensure that the sum of indices for V1 is equal to or lower than the sum
18795 // indices for V2. When those are equal, try to ensure that the number of odd
18796 // indices for V1 is lower than the number of odd indices for V2.
18797 if (NumV1Elements == NumV2Elements) {
 // Tie-break 1: which input is referenced more often in the low half.
18798 int LowV1Elements = 0, LowV2Elements = 0;
18799 for (int M : Mask.slice(0, NumElements / 2))
18800 if (M >= NumElements)
18801 ++LowV2Elements;
18802 else if (M >= 0)
18803 ++LowV1Elements;
18804 if (LowV2Elements > LowV1Elements)
18805 return true;
18806 if (LowV2Elements == LowV1Elements) {
 // Tie-break 2: sum of the result positions (i, not Mask[i]) fed by each
 // input.
18807 int SumV1Indices = 0, SumV2Indices = 0;
18808 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18809 if (Mask[i] >= NumElements)
18810 SumV2Indices += i;
18811 else if (Mask[i] >= 0)
18812 SumV1Indices += i;
18813 if (SumV2Indices < SumV1Indices)
18814 return true;
18815 if (SumV2Indices == SumV1Indices) {
 // Tie-break 3: number of odd result positions fed by each input.
18816 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18817 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18818 if (Mask[i] >= NumElements)
18819 NumV2OddIndices += i % 2;
18820 else if (Mask[i] >= 0)
18821 NumV1OddIndices += i % 2;
18822 if (NumV2OddIndices < NumV1OddIndices)
18823 return true;
18824 }
18825 }
18826 }
18827
18828 return false;
18829}
18830
// Returns true when V is a single-use node with one of the opcodes listed in
// the switch below and its type passes the subtarget checks: AVX512 is
// required, the value type must be simple, and i8/i16 elements additionally
// require BWI and a vector of at least 512 bits.
// NOTE(review): the first line of the signature is missing from this listing;
// the visible call sites name this helper canCombineAsMaskOperation.
18832 const X86Subtarget &Subtarget) {
18833 if (!Subtarget.hasAVX512())
18834 return false;
18835
18836 if (!V.getValueType().isSimple())
18837 return false;
18838
 // Note: from here on VT is the scalar (element) type, not the vector type.
18839 MVT VT = V.getSimpleValueType().getScalarType();
18840 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18841 return false;
18842
18843 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18844 // are preferable to blendw/blendvb/masked-mov.
18845 if ((VT == MVT::i16 || VT == MVT::i8) &&
18846 V.getSimpleValueType().getSizeInBits() < 512)
18847 return false;
18848
 // Accepts only the opcodes enumerated below, and only when the node has a
 // single use.
18849 auto HasMaskOperation = [&](SDValue V) {
18850 // TODO: Currently we only check limited opcode. We probably extend
18851 // it to all binary operation by checking TLI.isBinOp().
18852 switch (V->getOpcode()) {
18853 default:
18854 return false;
18855 case ISD::ADD:
18856 case ISD::SUB:
18857 case ISD::AND:
18858 case ISD::XOR:
18859 case ISD::OR:
18860 case ISD::SMAX:
18861 case ISD::SMIN:
18862 case ISD::UMAX:
18863 case ISD::UMIN:
18864 case ISD::ABS:
18865 case ISD::SHL:
18866 case ISD::SRL:
18867 case ISD::SRA:
18868 case ISD::MUL:
18869 break;
18870 }
18871 if (!V->hasOneUse())
18872 return false;
18873
18874 return true;
18875 };
18876
18877 if (HasMaskOperation(V))
18878 return true;
18879
18880 return false;
18881}
18882
18883// Forward declaration.
18886 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18887 const X86Subtarget &Subtarget);
18888
18889 /// Top-level lowering for x86 vector shuffles.
18890///
18891/// This handles decomposition, canonicalization, and lowering of all x86
18892/// vector shuffles. Most of the specific lowering strategies are encapsulated
18893/// above in helper routines. The canonicalization attempts to widen shuffles
18894/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18895/// s.t. only one of the two inputs needs to be tested, etc.
// NOTE(review): the first line of the signature and the declaration of `SVOp`
// are missing from this listing.
18897 SelectionDAG &DAG) {
18899 ArrayRef<int> OrigMask = SVOp->getMask();
18900 SDValue V1 = Op.getOperand(0);
18901 SDValue V2 = Op.getOperand(1);
18902 MVT VT = Op.getSimpleValueType();
18903 int NumElements = VT.getVectorNumElements();
18904 SDLoc DL(Op);
18905 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18906
18907 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18908 "Can't lower MMX shuffles");
18909
18910 bool V1IsUndef = V1.isUndef();
18911 bool V2IsUndef = V2.isUndef();
18912 if (V1IsUndef && V2IsUndef)
18913 return DAG.getUNDEF(VT);
18914
18915 // When we create a shuffle node we put the UNDEF node to second operand,
18916 // but in some cases the first operand may be transformed to UNDEF.
18917 // In this case we should just commute the node.
18918 if (V1IsUndef)
18919 return DAG.getCommutedVectorShuffle(*SVOp);
18920
18921 // Check for non-undef masks pointing at an undef vector and make the masks
18922 // undef as well. This makes it easier to match the shuffle based solely on
18923 // the mask.
18924 if (V2IsUndef &&
18925 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18926 SmallVector<int, 8> NewMask(OrigMask);
18927 for (int &M : NewMask)
18928 if (M >= NumElements)
18929 M = -1;
18930 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18931 }
18932
18933 // Check for illegal shuffle mask element index values.
 // The (void) cast keeps the variable "used" when the assert is compiled out.
18934 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18935 (void)MaskUpperLimit;
18936 assert(llvm::all_of(OrigMask,
18937 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18938 "Out of bounds shuffle index");
18939
18940 // We actually see shuffles that are entirely re-arrangements of a set of
18941 // zero inputs. This mostly happens while decomposing complex shuffles into
18942 // simple ones. Directly lower these as a buildvector of zeros.
18943 APInt KnownUndef, KnownZero;
18944 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18945
 // A lane is "zeroable" if it is known undef or known zero.
18946 APInt Zeroable = KnownUndef | KnownZero;
18947 if (Zeroable.isAllOnes())
18948 return getZeroVector(VT, Subtarget, DAG, DL);
18949
18950 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18951
18952 // Try to collapse shuffles into using a vector type with fewer elements but
18953 // wider element types. We cap this to not form integers or floating point
18954 // elements wider than 64 bits. It does not seem beneficial to form i128
18955 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18956 SmallVector<int, 16> WidenedMask;
18957 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18958 !canCombineAsMaskOperation(V1, Subtarget) &&
18959 !canCombineAsMaskOperation(V2, Subtarget) &&
18960 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18961 // Shuffle mask widening should not interfere with a broadcast opportunity
18962 // by obfuscating the operands with bitcasts.
18963 // TODO: Avoid lowering directly from this top-level function: make this
18964 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18965 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18966 Subtarget, DAG))
18967 return Broadcast;
18968
 // NOTE(review): the two arms of this conditional expression are missing from
 // this listing.
18969 MVT NewEltVT = VT.isFloatingPoint()
18972 int NewNumElts = NumElements / 2;
18973 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18974 // Make sure that the new vector type is legal. For example, v2f64 isn't
18975 // legal on SSE1.
18976 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18977 if (V2IsZero) {
18978 // Modify the new Mask to take all zeros from the all-zero vector.
18979 // Choose indices that are blend-friendly.
18980 bool UsedZeroVector = false;
18981 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18982 "V2's non-undef elements are used?!");
 // Lane i reads element i of V2 (index i + NewNumElts), so each zero lane
 // stays in its own position.
18983 for (int i = 0; i != NewNumElts; ++i)
18984 if (WidenedMask[i] == SM_SentinelZero) {
18985 WidenedMask[i] = i + NewNumElts;
18986 UsedZeroVector = true;
18987 }
18988 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18989 // some elements to be undef.
18990 if (UsedZeroVector)
18991 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18992 }
18993 V1 = DAG.getBitcast(NewVT, V1);
18994 V2 = DAG.getBitcast(NewVT, V2);
18995 return DAG.getBitcast(
18996 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18997 }
18998 }
18999
 // Mutable copies: the horizontal-op canonicalization below may rewrite them.
19000 SmallVector<SDValue> Ops = {V1, V2};
19001 SmallVector<int> Mask(OrigMask);
19002
19003 // Canonicalize the shuffle with any horizontal ops inputs.
19004 // Don't attempt this if the shuffle can still be widened as we may lose
19005 // whole lane shuffle patterns.
19006 // NOTE: This may update Ops and Mask.
19007 if (!canWidenShuffleElements(Mask)) {
 // NOTE(review): the line that declares `HOp` and opens the call whose
 // arguments follow is missing from this listing.
19009 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19010 return DAG.getBitcast(VT, HOp);
19011
19012 V1 = DAG.getBitcast(VT, Ops[0]);
19013 V2 = DAG.getBitcast(VT, Ops[1]);
19014 assert(NumElements == (int)Mask.size() &&
19015 "canonicalizeShuffleMaskWithHorizOp "
19016 "shouldn't alter the shuffle mask size");
19017 }
19018
19019 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
19020 // These will be materialized uniformly anyway, so make splat matching easier.
19021 // TODO: Allow all int constants?
19022 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
19023 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
19024 BitVector Undefs;
19025 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
 // NOTE(review): the rest of this condition is missing from this listing.
19026 if (Undefs.any() &&
19029 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
19030 }
19031 }
19032 }
19033 return V;
19034 };
19035 V1 = CanonicalizeConstant(V1);
19036 V2 = CanonicalizeConstant(V2);
19037
19038 // Commute the shuffle if it will improve canonicalization.
 // NOTE(review): the `if` that guards this swap (and whatever precedes the
 // swap inside it) is missing from this listing.
19041 std::swap(V1, V2);
19042 }
19043
19044 // For each vector width, delegate to a specialized lowering routine.
19045 if (VT.is128BitVector())
19046 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19047
19048 if (VT.is256BitVector())
19049 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19050
19051 if (VT.is512BitVector())
19052 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19053
19054 if (Is1BitVector)
19055 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19056
19057 llvm_unreachable("Unimplemented!");
19058}
19059
19060// As legal vpcompress instructions depend on various AVX512 extensions, try to
19061// convert illegal vector sizes to legal ones to avoid expansion.
//
// Only 128- and 256-bit inputs are handled. 32/64-bit elements are widened to
// a 512-bit vector with more elements; the listed i8/i16 vectors keep their
// element count and have each element extended so the vector becomes 512 bits.
// Anything else returns an empty SDValue.
// NOTE(review): the first line of the signature is missing from this listing.
19063 SelectionDAG &DAG) {
19064 assert(Subtarget.hasAVX512() &&
19065 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
19066
19067 SDLoc DL(Op);
19068 SDValue Vec = Op.getOperand(0);
19069 SDValue Mask = Op.getOperand(1);
19070 SDValue Passthru = Op.getOperand(2);
19071
19072 EVT VecVT = Vec.getValueType();
19073 EVT ElementVT = VecVT.getVectorElementType();
19074 unsigned NumElements = VecVT.getVectorNumElements();
19075 unsigned NumVecBits = VecVT.getFixedSizeInBits();
19076 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
19077
19078 // 128- and 256-bit vectors with <= 16 elements can be converted to and
19079 // compressed as 512-bit vectors in AVX512F.
19080 if (NumVecBits != 128 && NumVecBits != 256)
19081 return SDValue();
19082
19083 if (NumElementBits == 32 || NumElementBits == 64) {
19084 unsigned NumLargeElements = 512 / NumElementBits;
19085 MVT LargeVecVT =
19086 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
19087 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
19088
 // The added mask lanes are zeroed (ZeroNewElements=true) while the added
 // data lanes are not.
19089 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
19090 DAG, DL);
19091 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
19092 Subtarget, DAG, DL);
19093 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
19094 : widenSubVector(LargeVecVT, Passthru,
19095 /*ZeroNewElements=*/false,
19096 Subtarget, DAG, DL);
19097
19098 SDValue Compressed =
19099 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
19100 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
19101 DAG.getConstant(0, DL, MVT::i64));
19102 }
19103
 // NOTE(review): v8i8 is a 64-bit vector, so the 128/256-bit guard above has
 // already returned for it; that comparison looks unreachable here.
19104 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
19105 VecVT == MVT::v16i16) {
 // Same element count, element width 512 / NumElements. The mask is passed
 // through unchanged.
19106 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
19107 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
19108
19109 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
19110 Passthru = Passthru.isUndef()
19111 ? DAG.getUNDEF(LargeVecVT)
19112 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
19113
19114 SDValue Compressed =
19115 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
19116 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
19117 }
19118
19119 return SDValue();
19120}
19121
19122/// Try to lower a VSELECT instruction to a vector shuffle.
///
/// Returns a VECTOR_SHUFFLE of LHS and RHS when a shuffle mask can be derived
/// from the condition operand, and an empty SDValue otherwise.
// NOTE(review): the first line of the signature and the lines that test the
// condition and build `Mask` are missing from this listing.
19124 const X86Subtarget &Subtarget,
19125 SelectionDAG &DAG) {
19126 SDValue Cond = Op.getOperand(0);
19127 SDValue LHS = Op.getOperand(1);
19128 SDValue RHS = Op.getOperand(2);
19129 MVT VT = Op.getSimpleValueType();
19130
19131 // Only non-legal VSELECTs reach this lowering, convert those into generic
19132 // shuffles and re-use the shuffle lowering path for blends.
19136 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19137 }
19138
19139 return SDValue();
19140}
19141
// Custom lowering for ISD::VSELECT. Returns Op when the node can be matched
// as-is, a replacement node when it is rewritten, and an empty SDValue when
// the caller should fall back to expansion.
19142SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19143 SDValue Cond = Op.getOperand(0);
19144 SDValue LHS = Op.getOperand(1);
19145 SDValue RHS = Op.getOperand(2);
19146
19147 SDLoc dl(Op);
19148 MVT VT = Op.getSimpleValueType();
 // Soft-f16 vectors: select on the same-width integer vector type and bitcast
 // the result back.
19149 if (isSoftF16(VT, Subtarget)) {
19150 MVT NVT = VT.changeVectorElementTypeToInteger();
19151 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
19152 DAG.getBitcast(NVT, LHS),
19153 DAG.getBitcast(NVT, RHS)));
19154 }
19155
19156 // A vselect where all conditions and data are constants can be optimized into
19157 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
 // NOTE(review): the condition guarding this return is missing from this
 // listing.
19161 return SDValue();
19162
19163 // Try to lower this to a blend-style vector shuffle. This can handle all
19164 // constant condition cases.
19165 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19166 return BlendOp;
19167
19168 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19169 // with patterns on the mask registers on AVX-512.
19170 MVT CondVT = Cond.getSimpleValueType();
19171 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19172 if (CondEltSize == 1)
19173 return Op;
19174
19175 // Variable blends are only legal from SSE4.1 onward.
19176 if (!Subtarget.hasSSE41())
19177 return SDValue();
19178
19179 unsigned EltSize = VT.getScalarSizeInBits();
19180 unsigned NumElts = VT.getVectorNumElements();
19181
19182 // Expand v32i16/v64i8 without BWI.
19183 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19184 return SDValue();
19185
19186 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19187 // into an i1 condition so that we can use the mask-based 512-bit blend
19188 // instructions.
19189 if (VT.getSizeInBits() == 512) {
19190 // Build a mask by testing the condition against zero.
19191 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19192 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19193 DAG.getConstant(0, dl, CondVT),
19194 ISD::SETNE);
19195 // Now return a new VSELECT using the mask.
19196 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19197 }
19198
19199 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19200 if (CondEltSize != EltSize) {
19201 // If we don't have a sign splat, rely on the expansion.
 // i.e. every bit of each condition element must be a copy of its sign bit.
19202 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19203 return SDValue();
19204
19205 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19206 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19207 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19208 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19209 }
19210
19211 // v16i16/v32i8 selects without AVX2, if the condition and another operand
19212 // are free to split, then better to split before expanding the
19213 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
19214 // TODO: This is very similar to narrowVectorSelect.
19215 // TODO: Add Load splitting to isFreeToSplitVector ?
19216 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
19217 !Subtarget.hasXOP()) {
 // A single-use normal load is also treated as free to split for the data
 // operands.
19218 bool FreeCond = isFreeToSplitVector(Cond, DAG);
19219 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
19220 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
19221 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
19222 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
19223 if (FreeCond && (FreeLHS || FreeRHS))
19224 return splitVectorOp(Op, DAG, dl);
19225 }
19226
19227 // Only some types will be legal on some subtargets. If we can emit a legal
19228 // VSELECT-matching blend, return Op, but if we need to expand, return
19229 // a null value.
19230 switch (VT.SimpleTy) {
19231 default:
19232 // Most of the vector types have blends past SSE4.1.
19233 return Op;
19234
19235 case MVT::v32i8:
19236 // The byte blends for AVX vectors were introduced only in AVX2.
19237 if (Subtarget.hasAVX2())
19238 return Op;
19239
19240 return SDValue();
19241
19242 case MVT::v8i16:
19243 case MVT::v16i16:
19244 case MVT::v8f16:
19245 case MVT::v16f16: {
19246 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
 // Each 16-bit element becomes two bytes, hence NumElts * 2.
19247 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19248 Cond = DAG.getBitcast(CastVT, Cond);
19249 LHS = DAG.getBitcast(CastVT, LHS);
19250 RHS = DAG.getBitcast(CastVT, RHS);
19251 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19252 return DAG.getBitcast(VT, Select);
19253 }
19254 }
19255}
19256
// Lowers a constant-index element extract by result type: 8-bit results use
// PEXTRB (or a v4i32 extract plus truncate), f32 results are extracted as i32
// only for specific single users, i32/i64 results are returned unchanged, and
// every other type yields an empty SDValue.
// NOTE(review): the signature line and two conditions inside the body are
// missing from this listing.
19258 MVT VT = Op.getSimpleValueType();
19259 SDValue Vec = Op.getOperand(0);
19260 SDValue Idx = Op.getOperand(1);
19261 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19262 SDLoc dl(Op);
19263
 // NOTE(review): the condition guarding this early return is missing here.
19265 return SDValue();
19266
19267 if (VT.getSizeInBits() == 8) {
19268 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19269 // we're going to zero extend the register or fold the store.
 // NOTE(review): the condition guarding this return is missing here.
19272 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19273 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19274 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19275
 // PEXTRB is built with an i32 result, which is then truncated to VT.
19276 unsigned IdxVal = Idx->getAsZExtVal();
19277 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19278 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19279 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19280 }
19281
19282 if (VT == MVT::f32) {
19283 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19284 // the result back to FR32 register. It's only worth matching if the
19285 // result has a single use which is a store or a bitcast to i32. And in
19286 // the case of a store, it's not worth it if the index is a constant 0,
19287 // because a MOVSSmr can be used instead, which is smaller and faster.
19288 if (!Op.hasOneUse())
19289 return SDValue();
19290 SDNode *User = *Op.getNode()->user_begin();
 // Continue only if the user is a store with a non-zero index, or a bitcast
 // to i32.
19291 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19292 (User->getOpcode() != ISD::BITCAST ||
19293 User->getValueType(0) != MVT::i32))
19294 return SDValue();
19295 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19296 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19297 return DAG.getBitcast(MVT::f32, Extract);
19298 }
19299
19300 if (VT == MVT::i32 || VT == MVT::i64)
19301 return Op;
19302
19303 return SDValue();
19304}
19305
19306/// Extract one bit from mask vector, like v16i1 or v8i1.
19307/// AVX-512 feature.
///
/// A variable index is handled by sign-extending the mask to a wider-element
/// vector and extracting from that. A constant index of 0 is returned
/// unchanged. Any other constant index shifts the widened mask right by the
/// index and extracts element 0.
// NOTE(review): the first line of the signature is missing from this listing.
19309 const X86Subtarget &Subtarget) {
19310 SDValue Vec = Op.getOperand(0);
19311 SDLoc dl(Vec);
19312 MVT VecVT = Vec.getSimpleValueType();
19313 SDValue Idx = Op.getOperand(1);
 // Null when the index is not a compile-time constant.
19314 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19315 MVT EltVT = Op.getSimpleValueType();
19316
19317 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19318 "Unexpected vector type in ExtractBitFromMaskVector");
19319
19320 // variable index can't be handled in mask registers,
19321 // extend vector to VR512/128
19322 if (!IdxC) {
19323 unsigned NumElts = VecVT.getVectorNumElements();
19324 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
19325 // than extending to 128/256bit.
19326 if (NumElts == 1) {
19327 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
 // NOTE(review): the declaration of `IntVT` is missing from this listing.
19329 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
19330 }
 // Up to 8 elements: element width 128 / NumElts; more than 8: i8 elements.
19331 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19332 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19333 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19334 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19335 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19336 }
19337
19338 unsigned IdxVal = IdxC->getZExtValue();
19339 if (IdxVal == 0) // the operation is legal
19340 return Op;
19341
19342 // Extend to natively supported kshift.
19343 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19344
19345 // Use kshiftr instruction to move to the lower element.
19346 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19347 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19348
19349 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19350 DAG.getVectorIdxConstant(0, dl));
19351}
19352
19353// Helper to find all the extracted elements from a vector.
//
// Walks the users of N and returns an APInt with one bit per element of N's
// value type, set for every element some user extracts. Any user it cannot
// analyse makes it give up and return all bits set.
// NOTE(review): the signature line and part of the first switch case are
// missing from this listing.
19355 MVT VT = N->getSimpleValueType(0);
19356 unsigned NumElts = VT.getVectorNumElements();
19357 APInt DemandedElts = APInt::getZero(NumElts);
19358 for (SDNode *User : N->users()) {
19359 switch (User->getOpcode()) {
19360 case X86ISD::PEXTRB:
19361 case X86ISD::PEXTRW:
 // NOTE(review): any further case labels and the condition guarding the
 // bail-out below are missing from this listing.
19364 DemandedElts.setAllBits();
19365 return DemandedElts;
19366 }
 // Operand 1 of the extract is the (constant) element index.
19367 DemandedElts.setBit(User->getConstantOperandVal(1));
19368 break;
19369 case ISD::BITCAST: {
19370 if (!User->getValueType(0).isSimple() ||
19371 !User->getValueType(0).isVector()) {
19372 DemandedElts.setAllBits();
19373 return DemandedElts;
19374 }
 // Recurse into the bitcast's own users, then rescale the result to this
 // vector's element count.
19375 APInt DemandedSrcElts = getExtractedDemandedElts(User);
19376 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
19377 break;
19378 }
19379 default:
19380 DemandedElts.setAllBits();
19381 return DemandedElts;
19382 }
19383 }
19384 return DemandedElts;
19385}
19386
/// Custom lowering for an element extraction: operand 0 is the source vector
/// and operand 1 is the element index.
///
/// - i1-element sources are forwarded to ExtractBitFromMaskVector.
/// - A non-constant index returns an empty SDValue (see the throughput
///   analysis in the body).
/// - 256-bit and 512-bit sources are reduced to the 128-bit chunk containing
///   the element, and a new extract is emitted on that chunk.
/// - 128-bit sources are then handled by result type: i16, the SSE4.1 helper,
///   i8, f16 / 32-bit, and 64-bit results.
///
/// \returns the lowered value, \p Op itself when nothing needs to change, or
/// an empty SDValue when no case applies.
19387SDValue
19388X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19389 SelectionDAG &DAG) const {
19390 SDLoc dl(Op);
19391 SDValue Vec = Op.getOperand(0);
19392 MVT VecVT = Vec.getSimpleValueType();
19393 SDValue Idx = Op.getOperand(1);
19394 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19395
19396 if (VecVT.getVectorElementType() == MVT::i1)
19397 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19398
19399 if (!IdxC) {
19400 // It's more profitable to go through memory (1 cycle throughput)
19401 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
19402 // IACA tool was used to get performance estimation
19403 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19404 //
19405 // example : extractelement <16 x i8> %a, i32 %i
19406 //
19407 // Block Throughput: 3.00 Cycles
19408 // Throughput Bottleneck: Port5
19409 //
19410 // | Num Of | Ports pressure in cycles | |
19411 // | Uops | 0 - DV | 5 | 6 | 7 | |
19412 // ---------------------------------------------
19413 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19414 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19415 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19416 // Total Num Of Uops: 4
19417 //
19418 //
19419 // Block Throughput: 1.00 Cycles
19420 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19421 //
19422 // | | Ports pressure in cycles | |
19423 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19424 // ---------------------------------------------------------
19425 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19426 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19427 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19428 // Total Num Of Uops: 4
19429
19430 return SDValue();
19431 }
19432
19433 unsigned IdxVal = IdxC->getZExtValue();
19434
19435 // If the source is a 256-bit or 512-bit vector, first extract the 128-bit
19436 // chunk that holds the element and then extract the element from it.
19437 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19438 // Get the 128-bit vector.
19439 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19440 MVT EltVT = VecVT.getVectorElementType();
19441
19442 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19443 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19444
19445 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19446 // this can be done with a mask.
19447 IdxVal &= ElemsPerChunk - 1;
19448 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19449 DAG.getVectorIdxConstant(IdxVal, dl));
19450 }
19451
19452 assert(VecVT.is128BitVector() && "Unexpected vector length");
19453
19454 MVT VT = Op.getSimpleValueType();
19455
19456 if (VT == MVT::i16) {
19457 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19458 // we're going to zero extend the register or fold the store (SSE41 only).
19459 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19460 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19461 if (Subtarget.hasFP16())
19462 return Op;
19463
 // Extract element 0 as an i32 from the v4i32 view and truncate to i16.
19464 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19465 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19466 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19467 }
19468
19469 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19470 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19471 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19472 }
19473
19474 if (Subtarget.hasSSE41())
19475 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19476 return Res;
19477
19478 // Only extract a single element from a v16i8 source - determine the common
19479 // DWORD/WORD that all extractions share, and extract the sub-byte.
19480 // TODO: Add QWORD MOVQ extraction?
19481 if (VT == MVT::i8) {
19482 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
19483 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
19484
19485 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19486 int DWordIdx = IdxVal / 4;
 // Taken only when every demanded byte lies in the lowest four bytes.
19487 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19488 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19489 DAG.getBitcast(MVT::v4i32, Vec),
19490 DAG.getVectorIdxConstant(DWordIdx, dl));
19491 int ShiftVal = (IdxVal % 4) * 8;
19492 if (ShiftVal != 0)
19493 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19494 DAG.getConstant(ShiftVal, dl, MVT::i8));
19495 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19496 }
19497
19498 int WordIdx = IdxVal / 2;
 // Taken only when every demanded byte lies in the 16-bit word at WordIdx.
19499 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19500 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19501 DAG.getBitcast(MVT::v8i16, Vec),
19502 DAG.getVectorIdxConstant(WordIdx, dl));
19503 int ShiftVal = (IdxVal % 2) * 8;
19504 if (ShiftVal != 0)
19505 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19506 DAG.getConstant(ShiftVal, dl, MVT::i8));
19507 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19508 }
19509 }
19510
19511 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19512 if (IdxVal == 0)
19513 return Op;
19514
19515 // Shuffle the element to the lowest element, then movss or movsh.
19516 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19517 Mask[0] = static_cast<int>(IdxVal);
19518 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19519 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19520 DAG.getVectorIdxConstant(0, dl));
19521 }
19522
19523 if (VT.getSizeInBits() == 64) {
19524 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19525 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19526 // to match extract_elt for f64.
19527 if (IdxVal == 0)
19528 return Op;
19529
19530 // UNPCKHPD the element to the lowest double word, then movsd.
19531 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19532 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19533 int Mask[2] = { 1, -1 };
19534 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19535 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19536 DAG.getVectorIdxConstant(0, dl));
19537 }
19538
19539 return SDValue();
19540}
19541
19542/// Insert one bit to mask vector, like v16i1 or v8i1.
19543/// AVX-512 feature.
///
/// Operand 0 of \p Op is the mask vector, operand 1 the element to insert and
/// operand 2 the index.
///  - Non-constant index: the vector and the element are sign-extended to
///    wider integer types, the insertion is done on the extended vector and
///    the result is truncated back to the mask type.
///  - Constant index: the element is turned into a v1i1 vector with
///    SCALAR_TO_VECTOR and inserted with INSERT_SUBVECTOR at the index.
// NOTE(review): the first line of the signature (listing line 19544) is not
// present in this listing.
19545 const X86Subtarget &Subtarget) {
19546 SDLoc dl(Op);
19547 SDValue Vec = Op.getOperand(0);
19548 SDValue Elt = Op.getOperand(1);
19549 SDValue Idx = Op.getOperand(2);
19550 MVT VecVT = Vec.getSimpleValueType();
19551
19552 if (!isa<ConstantSDNode>(Idx)) {
19553 // Non constant index. Extend source and destination,
19554 // insert element and then truncate the result.
19555 unsigned NumElts = VecVT.getVectorNumElements();
 // Up to 8 lanes use (128 / NumElts)-bit elements; more lanes use i8.
19556 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19557 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19558 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19559 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19560 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19561 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19562 }
19563
19564 // Copy into a k-register, extract to v1i1 and insert_subvector.
19565 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19566 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19567}
19568
/// Custom lowering for an element insertion: operand 0 is the vector (N0),
/// operand 1 the element (N1) and operand 2 the index (N2).
///
/// - i1-element vectors are forwarded to InsertBitToMaskVector.
/// - bf16 elements are handled by redoing the insertion on the integer
///   vector of the same shape and bitcasting back.
/// - A non-constant index is either rejected (empty SDValue) or lowered to a
///   compare of a splatted index against {0,1,2,...} plus a select.
/// - A constant index that is >= the element count returns an empty SDValue.
/// - Zero / all-ones elements may become an OR or a blend with a constant
///   vector.
/// - 256/512-bit vectors use a blend, a broadcast+blend, or an insertion
///   into the relevant 128-bit chunk.
/// - 128-bit vectors are lowered to moves, PINSRW/PINSRB, BLENDI, INSERTPS,
///   or left as-is.
///
/// \returns the lowered value, \p Op itself when it is already acceptable, or
/// an empty SDValue when no case applies.
19569SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19570 SelectionDAG &DAG) const {
19571 MVT VT = Op.getSimpleValueType();
19572 MVT EltVT = VT.getVectorElementType();
19573 unsigned NumElts = VT.getVectorNumElements();
19574 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19575
19576 if (EltVT == MVT::i1)
19577 return InsertBitToMaskVector(Op, DAG, Subtarget);
19578
19579 SDLoc dl(Op);
19580 SDValue N0 = Op.getOperand(0);
19581 SDValue N1 = Op.getOperand(1);
19582 SDValue N2 = Op.getOperand(2);
19583 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19584
 // bf16: insert the i16 bit pattern into the integer-typed vector instead.
19585 if (EltVT == MVT::bf16) {
19586 MVT IVT = VT.changeVectorElementTypeToInteger();
19587 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
19588 DAG.getBitcast(IVT, N0),
19589 DAG.getBitcast(MVT::i16, N1), N2);
19590 return DAG.getBitcast(VT, Res);
19591 }
19592
19593 if (!N2C) {
19594 // Variable insertion indices, usually we're better off spilling to stack,
19595 // but AVX512 can use a variable compare+select by comparing against all
19596 // possible vector indices, and FP insertion has less gpr->simd traffic.
19597 if (!(Subtarget.hasBWI() ||
19598 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19599 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
19600 return SDValue();
19601
19602 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19603 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19604 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19605 return SDValue();
19606
19607 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19608 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19609 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19610
19611 SmallVector<SDValue, 16> RawIndices;
19612 for (unsigned I = 0; I != NumElts; ++I)
19613 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19614 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19615
19616 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
 // NOTE(review): the final argument of this call (listing line 19618) is
 // not present in this listing.
19617 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19619 }
19620
19621 if (N2C->getAPIntValue().uge(NumElts))
19622 return SDValue();
19623 uint64_t IdxVal = N2C->getZExtValue();
19624
19625 bool IsZeroElt = X86::isZeroNode(N1);
19626 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19627
19628 if (IsZeroElt || IsAllOnesElt) {
19629 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
19630 // We don't deal with i8 0 since it appears to be handled elsewhere.
19631 if (IsAllOnesElt &&
19632 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
19633 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
19634 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19635 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19636 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19637 CstVectorElts[IdxVal] = OnesCst;
19638 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19639 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19640 }
19641 // See if we can do this more efficiently with a blend shuffle with a
19642 // rematerializable vector.
19643 if (Subtarget.hasSSE41() &&
19644 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19645 SmallVector<int, 8> BlendMask;
19646 for (unsigned i = 0; i != NumElts; ++i)
19647 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19648 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19649 : getOnesVector(VT, DAG, dl);
19650 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19651 }
19652 }
19653
19654 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19655 // into that, and then insert the subvector back into the result.
19656 if (VT.is256BitVector() || VT.is512BitVector()) {
19657 // With a 256-bit vector, we can insert into the zero element efficiently
19658 // using a blend if we have AVX or AVX2 and the right data type.
19659 if (VT.is256BitVector() && IdxVal == 0) {
19660 // TODO: It is worthwhile to cast integer to floating point and back
19661 // and incur a domain crossing penalty if that's what we'll end up
19662 // doing anyway after extracting to a 128-bit vector.
19663 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19664 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19665 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19666 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19667 DAG.getTargetConstant(1, dl, MVT::i8));
19668 }
19669 }
19670
19671 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19672 assert(isPowerOf2_32(NumEltsIn128) &&
19673 "Vectors will always have power-of-two number of elements.");
19674
19675 // If we are not inserting into the low 128-bit vector chunk,
19676 // then prefer the broadcast+blend sequence.
19677 // FIXME: relax the profitability check iff all N1 uses are insertions.
19678 if (IdxVal >= NumEltsIn128 &&
19679 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19680 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19681 X86::mayFoldLoad(N1, Subtarget)))) {
19682 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19683 SmallVector<int, 8> BlendMask;
19684 for (unsigned i = 0; i != NumElts; ++i)
19685 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19686 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19687 }
19688
19689 // Get the desired 128-bit vector chunk.
19690 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19691
19692 // Insert the element into the desired chunk.
19693 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19694 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19695
19696 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19697 DAG.getVectorIdxConstant(IdxIn128, dl));
19698
19699 // Insert the changed part back into the bigger vector
19700 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19701 }
19702 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19703
19704 // This will be just movw/movd/movq/movsh/movss/movsd.
19705 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19706 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19707 EltVT == MVT::f16 || EltVT == MVT::i64) {
19708 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19709 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19710 }
19711
19712 // We can't directly insert an i8 or i16 into a vector, so zero extend
19713 // it to i32 first.
19714 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19715 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19716 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19717 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19718 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19719 return DAG.getBitcast(VT, N1);
19720 }
19721 }
19722
19723 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19724 // argument. SSE41 required for pinsrb.
19725 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19726 unsigned Opc;
19727 if (VT == MVT::v8i16) {
19728 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19729 Opc = X86ISD::PINSRW;
19730 } else {
19731 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19732 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19733 Opc = X86ISD::PINSRB;
19734 }
19735
19736 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19737 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19738 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19739 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19740 }
19741
19742 if (Subtarget.hasSSE41()) {
19743 if (EltVT == MVT::f32) {
19744 // Bits [7:6] of the constant are the source select. This will always be
19745 // zero here. The DAG Combiner may combine an extract_elt index into
19746 // these bits. For example (insert (extract, 3), 2) could be matched by
19747 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19748 // Bits [5:4] of the constant are the destination select. This is the
19749 // value of the incoming immediate.
19750 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19751 // combine either bitwise AND or insert of float 0.0 to set these bits.
19752
19753 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19754 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19755 // If this is an insertion of 32-bits into the low 32-bits of
19756 // a vector, we prefer to generate a blend with immediate rather
19757 // than an insertps. Blends are simpler operations in hardware and so
19758 // will always have equal or better performance than insertps.
19759 // But if optimizing for size and there's a load folding opportunity,
19760 // generate insertps because blendps does not have a 32-bit memory
19761 // operand form.
19762 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19763 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19764 DAG.getTargetConstant(1, dl, MVT::i8));
19765 }
19766 // Create this as a scalar to vector.
19767 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19768 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19769 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19770 }
19771
19772 // PINSR* works with constant index.
19773 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19774 return Op;
19775 }
19776
19777 return SDValue();
19778}
19779
19780static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
19781 SelectionDAG &DAG) {
19782 SDLoc DL(Op);
19783 SDValue X = Op.getOperand(0);
19784 MVT XTy = X.getSimpleValueType();
19785 SDValue Exp = Op.getOperand(1);
19786
19787 switch (XTy.SimpleTy) {
19788 default:
19789 return SDValue();
19790 case MVT::f16:
19791 if (!Subtarget.hasFP16())
19792 X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
19793 [[fallthrough]];
19794 case MVT::f32:
19795 case MVT::f64: {
19796 MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
19797 128 / X.getSimpleValueType().getSizeInBits());
19798 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
19799 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
19800 SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
19801 SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
19802 SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
19803 return DAG.getFPExtendOrRound(Final, DL, XTy);
19804 }
19805 case MVT::v4f32:
19806 case MVT::v2f64:
19807 case MVT::v8f32:
19808 case MVT::v4f64:
19809 case MVT::v16f32:
19810 case MVT::v8f64:
19811 if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
19812 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19813 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19814 }
19815 break;
19816 case MVT::v8f16:
19817 case MVT::v16f16:
19818 if (Subtarget.hasFP16()) {
19819 if (Subtarget.hasVLX()) {
19820 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19821 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19822 }
19823 break;
19824 }
19825 X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
19826 Exp = DAG.getSExtOrTrunc(Exp, DL,
19827 X.getSimpleValueType().changeTypeToInteger());
19828 break;
19829 case MVT::v32f16:
19830 if (Subtarget.hasFP16()) {
19831 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19832 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19833 }
19834 return splitVectorOp(Op, DAG, DL);
19835 }
19836 SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
19837 SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
19838 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp);
19839 SDValue Scalef =
19840 DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
19841 SDValue Final =
19842 DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
19843 return DAG.getFPExtendOrRound(Final, DL, XTy);
19844}
19845
// Custom lowering for a scalar-to-vector node; operand 0 of Op is the scalar.
//  - A zero scalar becomes a zero vector.
//  - A result that is not a 128-bit vector is built by creating the 128-bit
//    scalar-to-vector node first and inserting it at position 0 of an undef
//    vector of the result type.
//  - v4i32 (and v8i16 when the subtarget has FP16) is returned unchanged.
//  - Any other 128-bit integer type is any-extended to i32, turned into a
//    v4i32 and bitcast to the result type.
// NOTE(review): the first line of the signature (listing line 19846) and the
// declaration of VT128 (listing line 19861) are not present in this listing.
19847 SelectionDAG &DAG) {
19848 SDLoc dl(Op);
19849 MVT OpVT = Op.getSimpleValueType();
19850
19851 // It's always cheaper to replace a xor+movd with xorps and simplifies further
19852 // combines.
19853 if (X86::isZeroNode(Op.getOperand(0)))
19854 return getZeroVector(OpVT, Subtarget, DAG, dl);
19855
19856 // If the result is not a 128-bit vector, first insert into a 128-bit
19857 // vector and then insert that into the wider result vector.
19858 if (!OpVT.is128BitVector()) {
19859 // Insert into a 128-bit vector.
19860 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19862 OpVT.getVectorNumElements() / SizeFactor);
19863
19864 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19865
19866 // Insert the 128-bit vector.
19867 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19868 }
19869 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19870 "Expected an SSE type!");
19871
19872 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19873 // tblgen.
19874 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19875 return Op;
19876
19877 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19878 return DAG.getBitcast(
19879 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19880}
19881
19882// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19883// simple superregister reference or explicit instructions to insert
19884// the upper bits of a vector.
// Only i1-element result types are expected here (asserted below); the work
// is delegated to insert1BitVector.
// NOTE(review): the first line of the signature (listing line 19885) is not
// present in this listing.
19886 SelectionDAG &DAG) {
19887 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19888
19889 return insert1BitVector(Op, DAG, Subtarget);
19890}
19891
// Custom lowering for a subvector extraction from an i1-element vector.
// Operand 0 of Op is the source mask and operand 1 the constant start index.
// Index 0 is returned unchanged; otherwise the mask is widened, shifted right
// by the index with X86ISD::KSHIFTR, and the subvector is extracted from
// position 0.
// NOTE(review): the first line of the signature (listing line 19892) is not
// present in this listing.
19893 SelectionDAG &DAG) {
19894 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19895 "Only vXi1 extract_subvectors need custom lowering");
19896
19897 SDLoc dl(Op);
19898 SDValue Vec = Op.getOperand(0);
19899 uint64_t IdxVal = Op.getConstantOperandVal(1);
19900
19901 if (IdxVal == 0) // the operation is legal
19902 return Op;
19903
19904 // Extend to natively supported kshift.
19905 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19906
19907 // Shift to the LSB.
19908 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19909 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19910
19911 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19912 DAG.getVectorIdxConstant(0, dl));
19913}
19914
19915// Returns the appropriate wrapper opcode for a global reference.
19916unsigned X86TargetLowering::getGlobalWrapperKind(
19917 const GlobalValue *GV, const unsigned char OpFlags) const {
19918 // References to absolute symbols are never PC-relative.
19919 if (GV && GV->isAbsoluteSymbolRef())
19920 return X86ISD::Wrapper;
19921
19922 // The following OpFlags under RIP-rel PIC use RIP.
19923 if (Subtarget.isPICStyleRIPRel() &&
19924 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19925 OpFlags == X86II::MO_DLLIMPORT))
19926 return X86ISD::WrapperRIP;
19927
19928 // GOTPCREL references must always use RIP.
19929 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19930 return X86ISD::WrapperRIP;
19931
19932 return X86ISD::Wrapper;
19933}
19934
19935// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19936// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19937// one of the above mentioned nodes. It has to be wrapped because otherwise
19938// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19939// be used to form addressing mode. These wrapped nodes will be selected
19940// into MOV32ri.
//
// LowerConstantPool wraps the target node for the constant-pool entry with
// the opcode chosen by getGlobalWrapperKind and, when the classified operand
// flag is non-zero, adds the X86ISD::GlobalBaseReg value to it.
// NOTE(review): the line that declares Result and starts the call whose
// arguments appear at 19951 (listing line 19950) is not present in this
// listing.
19941SDValue
19942X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19943 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19944
19945 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19946 // global base reg.
19947 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19948
19949 auto PtrVT = getPointerTy(DAG.getDataLayout());
19951 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19952 SDLoc DL(CP);
19953 Result =
19954 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19955 // With PIC, the address is actually $g + Offset.
19956 if (OpFlag) {
 // The GlobalBaseReg node is created with a default-constructed SDLoc.
19957 Result =
19958 DAG.getNode(ISD::ADD, DL, PtrVT,
19959 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19960 }
19961
19962 return Result;
19963}
19964
19965SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19966 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19967
19968 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19969 // global base reg.
19970 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19971
19972 EVT PtrVT = Op.getValueType();
19973 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19974 SDLoc DL(JT);
19975 Result =
19976 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19977
19978 // With PIC, the address is actually $g + Offset.
19979 if (OpFlag)
19980 Result =
19981 DAG.getNode(ISD::ADD, DL, PtrVT,
19982 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19983
19984 return Result;
19985}
19986
/// Lower an external-symbol node by forwarding to LowerGlobalOrExternal with
/// ForCall set to false and no IsImpCall out-parameter.
19987SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19988 SelectionDAG &DAG) const {
19989 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19990}
19991
19992SDValue
19993X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19994 // Create the TargetBlockAddressAddress node.
19995 unsigned char OpFlags =
19996 Subtarget.classifyBlockAddressReference();
19997 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19998 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19999 SDLoc dl(Op);
20000 EVT PtrVT = Op.getValueType();
20001 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20002 Result =
20003 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
20004
20005 // With PIC, the address is actually $g + Offset.
20006 if (isGlobalRelativeToPICBase(OpFlags)) {
20007 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20008 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20009 }
20010
20011 return Result;
20012}
20013
20014/// Creates target global address or external symbol nodes for calls or
20015/// other uses.
///
/// \param Op         a GlobalAddressSDNode or an ExternalSymbolSDNode.
/// \param ForCall    selects classifyGlobalFunctionReference instead of
///                   classifyGlobalReference for the operand flags, and
///                   allows returning the unwrapped target node for a direct
///                   call that needs no load, no PIC base and no offset.
/// \param IsImpCall  optional out-parameter; set to true when the reference
///                   is MO_DLLIMPORT and the module carries the
///                   "import-call-optimization" flag, in which case the
///                   unwrapped target node is returned.
/// \returns the address value: the target node, wrapped, plus the PIC base,
///          a stub load and an explicit offset add where each is required.
// NOTE(review): listing lines 20033 (declaration of Mod), 20042, 20044
// (declaration of Result), 20054 (rest of the fold condition) and 20090
// (remaining arguments of getLoad) are not present in this listing.
20016SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20017 bool ForCall,
20018 bool *IsImpCall) const {
20019 // Unpack the global address or external symbol.
20020 SDLoc dl(Op);
20021 const GlobalValue *GV = nullptr;
20022 int64_t Offset = 0;
20023 const char *ExternalSym = nullptr;
20024 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20025 GV = G->getGlobal();
20026 Offset = G->getOffset();
20027 } else {
20028 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20029 ExternalSym = ES->getSymbol();
20030 }
20031
20032 // Calculate some flags for address lowering.
20034 unsigned char OpFlags;
20035 if (ForCall)
20036 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20037 else
20038 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20039 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20040 bool NeedsLoad = isGlobalStubReference(OpFlags);
20041
20043 EVT PtrVT = Op.getValueType();
20045
20046 if (GV) {
20047 // Create a target global address if this is a global. If possible, fold the
20048 // offset into the global address reference. Otherwise, ADD it on later.
20049 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20050 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20051 // relocation will compute to a negative value, which is invalid.
20052 int64_t GlobalOffset = 0;
20053 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
 // After the swap Offset is 0, so no explicit ADD is emitted at the end.
20055 std::swap(GlobalOffset, Offset);
20056 }
20057 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20058 } else {
20059 // If this is not a global address, this must be an external symbol.
20060 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20061 }
20062
20063 // If this is a direct call, avoid the wrapper if we don't need to do any
20064 // loads or adds. This allows SDAG ISel to match direct calls.
20065 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20066 return Result;
20067
20068 // If Import Call Optimization is enabled and this is an imported function
20069 // then make a note of it and return the global address without wrapping.
20070 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
20071 Mod.getModuleFlag("import-call-optimization")) {
20072 assert(ForCall && "Should only enable import call optimization if we are "
20073 "lowering a call");
20074 *IsImpCall = true;
20075 return Result;
20076 }
20077
20078 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20079
20080 // With PIC, the address is actually $g + Offset.
20081 if (HasPICReg) {
20082 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20083 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20084 }
20085
20086 // For globals that require a load from a stub to get the address, emit the
20087 // load.
20088 if (NeedsLoad)
20089 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20091
20092 // If there was a non-zero offset that we didn't fold, create an explicit
20093 // addition for it.
20094 if (Offset != 0)
20095 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20096 DAG.getSignedConstant(Offset, dl, PtrVT));
20097
20098 return Result;
20099}
20100
/// Lower a global-address node by forwarding to LowerGlobalOrExternal with
/// ForCall set to false and no IsImpCall out-parameter.
20101SDValue
20102X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20103 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
20104}
20105
// Build the DAG that obtains a TLS address through a call-like node.
//
// The target node (TGA) is either the "_TLS_MODULE_BASE_" external symbol
// (local-dynamic with TLSDESC) or the target global address of GA. Unless an
// existing result can be reused, the function emits
//   CALLSEQ_START -> [copy GlobalBaseReg to EBX] -> TLSDESC / TLSBASEADDR /
//   TLSADDR -> CALLSEQ_END -> CopyFromReg(ReturnReg)
// and records in MFI that the function has calls. Without TLSDESC the copied
// register value is returned directly; with TLSDESC a value loaded through
// MachinePointerInfo(Ptr) is added to it.
//
// Parameters visible here: PtrVT (pointer value type), ReturnReg (register
// the result is copied from), OperandFlags (target flags for TGA),
// LoadGlobalBaseReg (copy the PIC base into EBX before the call node) and
// LocalDynamic (use the module-base form).
// NOTE(review): the first line of the signature (listing line 20106) and the
// declarations of MFI (20111) and Ptr (20172) are not present in this
// listing. Seg is computed just before Ptr, so Ptr presumably refers to
// address space Seg - confirm against the full source.
20107 const EVT PtrVT, unsigned ReturnReg,
20108 unsigned char OperandFlags,
20109 bool LoadGlobalBaseReg = false,
20110 bool LocalDynamic = false) {
20112 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20113 SDLoc dl(GA);
20114 SDValue TGA;
20115 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
20116 SDValue Chain = DAG.getEntryNode();
20117 SDValue Ret;
20118 if (LocalDynamic && UseTLSDESC) {
20119 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
20120 // Reuse existing GetTLSADDR node if we can find it.
20121 if (TGA->hasOneUse()) {
20122 // TLSDESC uses TGA.
20123 SDNode *TLSDescOp = *TGA->user_begin();
20124 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
20125 "Unexpected TLSDESC DAG");
20126 // CALLSEQ_END uses TGA via a chain and glue.
20127 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
20128 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
20129 "Unexpected TLSDESC DAG");
20130 // CopyFromReg uses CALLSEQ_END via a chain and glue.
20131 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
20132 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
20133 "Unexpected TLSDESC DAG");
20134 Ret = SDValue(CopyFromRegOp, 0);
20135 }
20136 } else {
20137 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20138 GA->getOffset(), OperandFlags);
20139 }
20140
 // No reusable result was found: emit the full call sequence.
20141 if (!Ret) {
20142 unsigned CallType = UseTLSDESC ? X86ISD::TLSDESC
20143 : LocalDynamic ? X86ISD::TLSBASEADDR
20144 : X86ISD::TLSADDR;
20145
20146 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
20147 if (LoadGlobalBaseReg) {
20148 SDValue InGlue;
20149 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
20150 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
20151 InGlue);
20152 InGlue = Chain.getValue(1);
20153 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
20154 } else {
20155 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
20156 }
20157 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
20158
20159 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20160 MFI.setHasCalls(true);
20161
20162 SDValue Glue = Chain.getValue(1);
20163 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
20164 }
20165
20166 if (!UseTLSDESC)
20167 return Ret;
20168
20169 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
 // FS is used on 64-bit subtargets and GS otherwise.
20170 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
20171
20173 SDValue Offset =
20174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20175 MachinePointerInfo(Ptr));
20176 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
20177}
20178
20179// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
// Forwards to GetTLSADDR with EAX as the return register, the MO_TLSGD operand
// flag, and the global base register loaded before the call.
// NOTE(review): the line carrying the function name and leading parameters is
// absent from this listing; the name is inferred from the caller in
// LowerGlobalTLSAddress.
20180static SDValue
20182 const EVT PtrVT) {
20183 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
20184 /*LoadGlobalBaseReg=*/true);
20185}
20186
20187// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
// Forwards to GetTLSADDR with RAX as the return register and the MO_TLSGD
// operand flag.
// NOTE(review): the line carrying the function name and leading parameters is
// absent from this listing; the name is inferred from the caller in
// LowerGlobalTLSAddress.
20188static SDValue
20190 const EVT PtrVT) {
20191 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
20192}
20193
20194// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
// Same as the LP64 variant except that the result is read from EAX.
// NOTE(review): the line carrying the function name and leading parameters is
// absent from this listing; the name is inferred from the caller in
// LowerGlobalTLSAddress.
20195static SDValue
20197 const EVT PtrVT) {
20198 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
20199}
20200
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: obtain the
// module's TLS base through GetTLSADDR (LocalDynamic = true) and add the
// variable's x@dtpoff offset to it.
// NOTE(review): the first signature line and a few lines after the
// "start address" comment are absent from this listing; the name is inferred
// from the caller in LowerGlobalTLSAddress.
20202 SelectionDAG &DAG, const EVT PtrVT,
20203 bool Is64Bit, bool Is64BitLP64) {
20204 SDLoc dl(GA);
20205
20206 // Get the start address of the TLS block for this module.
20210
20211 SDValue Base;
20212 if (Is64Bit) {
// 64-bit: MO_TLSLD, result in RAX (LP64) or EAX (ILP32), no base register.
20213 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20214 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
20215 /*LoadGlobalBaseReg=*/false,
20216 /*LocalDynamic=*/true);
20217 } else {
// 32-bit: MO_TLSLDM, result in EAX, global base register loaded first.
20218 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
20219 /*LoadGlobalBaseReg=*/true,
20220 /*LocalDynamic=*/true);
20221 }
20222
20223 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20224 // of Base.
20225
20226 // Build x@dtpoff.
20227 unsigned char OperandFlags = X86II::MO_DTPOFF;
20228 unsigned WrapperKind = X86ISD::Wrapper;
20229 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20230 GA->getValueType(0),
20231 GA->getOffset(), OperandFlags);
20232 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20233
20234 // Add x@dtpoff with the base.
20235 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20236}
20237
20238// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// The result is ThreadPointer + Offset. For local exec, Offset is the wrapped
// symbol itself; for initial exec it is loaded from memory, after adding the
// global base register on 32-bit PIC.
// NOTE(review): the signature's first line, the `Ptr` declaration and the
// final argument of the initial-exec load are absent from this listing; the
// name is inferred from the caller in LowerGlobalTLSAddress.
20240 const EVT PtrVT, TLSModel::Model model,
20241 bool is64Bit, bool isPIC) {
20242 SDLoc dl(GA);
20243
20244 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20247
20248 SDValue ThreadPointer =
20249 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20250 MachinePointerInfo(Ptr));
20251
20252 unsigned char OperandFlags = 0;
20253 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20254 // initialexec.
20255 unsigned WrapperKind = X86ISD::Wrapper;
20256 if (model == TLSModel::LocalExec) {
20257 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20258 } else if (model == TLSModel::InitialExec) {
20259 if (is64Bit) {
20260 OperandFlags = X86II::MO_GOTTPOFF;
20261 WrapperKind = X86ISD::WrapperRIP;
20262 } else {
20263 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20264 }
20265 } else {
20266 llvm_unreachable("Unexpected model");
20267 }
20268
20269 // emit "addl x@ntpoff,%eax" (local exec)
20270 // or "addl x@indntpoff,%eax" (initial exec)
20271 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20272 SDValue TGA =
20273 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20274 GA->getOffset(), OperandFlags);
20275 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20276
20277 if (model == TLSModel::InitialExec) {
// 32-bit PIC: the symbol is relative to the global base register.
20278 if (isPIC && !is64Bit) {
20279 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20280 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20281 Offset);
20282 }
20283
// Initial exec reads the actual offset from memory at the computed address.
20284 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20286 }
20287
20288 // The address of the thread local variable is the add of the thread
20289 // pointer with the offset of the variable.
20290 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20291}
20292
// Custom lowering for a thread-local global address. Dispatches, in order, on
// emulated TLS, ELF (by TLS model), Darwin (TLSCALL sequence) and Windows
// (implicit TLS through the per-thread TLS array); any other target hits
// llvm_unreachable.
// NOTE(review): this listing is missing several lines inside the body (the
// `case` labels of the ELF switch, the Darwin `Result` declaration, and parts
// of the Windows path); restore them from upstream before building.
20293SDValue
20294X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20295
20296 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20297
20298 if (DAG.getTarget().useEmulatedTLS())
20299 return LowerToTLSEmulatedModel(GA, DAG);
20300
20301 const GlobalValue *GV = GA->getGlobal();
20302 EVT PtrVT = Op.getValueType();
20303 bool PositionIndependent = isPositionIndependent();
20304
20305 if (Subtarget.isTargetELF()) {
20306 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
// NOTE(review): the `case TLSModel::...:` labels are not visible here; the
// three groups below presumably correspond to general-dynamic, local-dynamic
// and the two exec models, in that order.
20307 switch (model) {
20309 if (Subtarget.is64Bit()) {
20310 if (Subtarget.isTarget64BitLP64())
20311 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20312 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20313 }
20314 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20316 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20317 Subtarget.isTarget64BitLP64());
20320 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20321 PositionIndependent);
20322 }
20323 llvm_unreachable("Unknown TLS model.");
20324 }
20325
20326 if (Subtarget.isTargetDarwin()) {
20327 // Darwin only has one model of TLS. Lower to that.
20328 unsigned char OpFlag = 0;
20329 unsigned WrapperKind = 0;
20330
20331 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20332 // global base reg.
20333 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20334 if (PIC32) {
20335 OpFlag = X86II::MO_TLVP_PIC_BASE;
20336 WrapperKind = X86ISD::Wrapper;
20337 } else {
20338 OpFlag = X86II::MO_TLVP;
20339 WrapperKind = X86ISD::WrapperRIP;
20340 }
20341 SDLoc DL(Op);
20343 GA->getValueType(0),
20344 GA->getOffset(), OpFlag);
20345 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20346
20347 // With PIC32, the address is actually $g + Offset.
20348 if (PIC32)
20349 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20350 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20351 Offset);
20352
20353 // Lowering the machine isd will make sure everything is in the right
20354 // location.
20355 SDValue Chain = DAG.getEntryNode();
20356 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20357 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20358 SDValue Args[] = { Chain, Offset };
20359 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20360 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
20361
20362 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
// NOTE(review): the call below sets AdjustsStack, not HasCalls as the comment
// above says; confirm which of the two is intended.
20363 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20364 MFI.setAdjustsStack(true);
20365
20366 // And our return value (tls address) is in the standard call return value
20367 // location.
20368 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20369 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20370 }
20371
20372 if (Subtarget.isOSWindows()) {
20373 // Just use the implicit TLS architecture
20374 // Need to generate something similar to:
20375 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20376 // ; from TEB
20377 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20378 // mov rcx, qword [rdx+rcx*8]
20379 // mov eax, .tls$:tlsvar
20380 // [rax+rcx] contains the address
20381 // Windows 64bit: gs:0x58
20382 // Windows 32bit: fs:__tls_array
20383
20384 SDLoc dl(GA);
20385 SDValue Chain = DAG.getEntryNode();
20386
20387 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20388 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20389 // use its literal value of 0x2C.
// NOTE(review): the lines around the next one that declare `Ptr` are
// incomplete in this listing.
20391 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
20393
20394 SDValue TlsArray = Subtarget.is64Bit()
20395 ? DAG.getIntPtrConstant(0x58, dl)
20396 : (Subtarget.isTargetWindowsGNU()
20397 ? DAG.getIntPtrConstant(0x2C, dl)
20398 : DAG.getExternalSymbol("_tls_array", PtrVT));
20399
20400 SDValue ThreadPointer =
20401 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20402
// NOTE(review): the `if (...) {` guarding the assignment below is missing from
// this listing.
20403 SDValue res;
20405 res = ThreadPointer;
20406 } else {
20407 // Load the _tls_index variable
20408 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
// On 64-bit the index is loaded as an i32 and zero-extended to PtrVT.
20409 if (Subtarget.is64Bit())
20410 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20411 MachinePointerInfo(), MVT::i32);
20412 else
20413 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20414
// Scale the index by the pointer size (as a left shift) to index the array.
20415 const DataLayout &DL = DAG.getDataLayout();
20416 SDValue Scale =
20417 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20418 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20419
20420 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20421 }
20422
20423 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20424
20425 // Get the offset of start of .tls section
20426 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20427 GA->getValueType(0),
20429 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20430
20431 // The address of the thread local variable is the add of the thread
20432 // pointer with the offset of the variable.
20433 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20434 }
20435
20436 llvm_unreachable("TLS not implemented for this target.");
20437}
20438
// Reports whether the TLS model of global `GV` yields %fs-relative addresses
// that can be folded into an addressing mode. Only 64-bit ELF is considered;
// everything else returns false.
// NOTE(review): the signature line and the `case` labels of the switch are
// absent from this listing (the function is presumably
// X86TargetLowering::addressingModeSupportsTLS; confirm against upstream).
20440 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20441 const TargetMachine &TM = getTargetMachine();
20442 TLSModel::Model Model = TM.getTLSModel(&GV);
20443 switch (Model) {
20446 // We can include the %fs segment register in addressing modes.
20447 return true;
20450 // These models do not result in %fs relative addresses unless
20451 // TLS descriptors are used.
20452 //
20453 // Even in the case of TLS descriptors we currently have no way to model
20454 // the difference between %fs access and the computations needed for the
20455 // offset and returning `true` for TLS-desc currently duplicates both
20456 // which is detrimental :-/
20457 return false;
20458 }
20459 }
20460 return false;
20461}
20462
20463/// Lower SRA_PARTS and friends, which return two i32 values
20464/// and take a 2 x i32 value to shift plus a shift amount.
20465/// TODO: Can this be moved to general expansion code?
// Delegates to the generic TargetLowering::expandShiftParts and merges the
// resulting Lo/Hi halves into one multi-result node.
// NOTE(review): the signature line is absent from this listing.
20467 SDValue Lo, Hi;
20468 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20469 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20470}
20471
20472// Try to use a packed vector operation to handle i64 on 32-bit targets when
20473// AVX512DQ is enabled.
// Returns an empty SDValue unless the subtarget has DQI, is not 64-bit, the
// source is i64 and the result is f32 or f64. For strict opcodes the result
// is a merge of {value, chain}.
// NOTE(review): the first signature line is absent from this listing; the name
// is inferred from the call in LowerSINT_TO_FP.
20475 SelectionDAG &DAG,
20476 const X86Subtarget &Subtarget) {
20477 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20478 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20479 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20480 Op.getOpcode() == ISD::UINT_TO_FP) &&
20481 "Unexpected opcode!");
// Strict nodes carry the chain as operand 0, so the source is operand 1.
20482 bool IsStrict = Op->isStrictFPOpcode();
20483 unsigned OpNo = IsStrict ? 1 : 0;
20484 SDValue Src = Op.getOperand(OpNo);
20485 MVT SrcVT = Src.getSimpleValueType();
20486 MVT VT = Op.getSimpleValueType();
20487
20488 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20489 (VT != MVT::f32 && VT != MVT::f64))
20490 return SDValue();
20491
20492 // Pack the i64 into a vector, do the operation and extract.
20493
20494 // Using 256-bit to ensure result is 128-bits for f32 case.
// Without VLX, 8 elements (a 512-bit integer vector) are used instead.
20495 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20496 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20497 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20498
20499 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20500 if (IsStrict) {
20501 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20502 {Op.getOperand(0), InVec});
20503 SDValue Chain = CvtVec.getValue(1);
20504 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20505 DAG.getVectorIdxConstant(0, dl));
20506 return DAG.getMergeValues({Value, Chain}, dl);
20507 }
20508
20509 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20510
20511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20512 DAG.getVectorIdxConstant(0, dl));
20513}
20514
20515// Try to use a packed vector operation to handle i64 on 32-bit targets.
// Only handles i64 -> f16 on non-64-bit subtargets; any other combination
// returns an empty SDValue. The conversion goes through v2i64 -> v2f16 and
// extracts element 0. For strict opcodes the result is {value, chain}.
// NOTE(review): the first signature line is absent from this listing; the name
// is inferred from the call in LowerSINT_TO_FP.
20517 const X86Subtarget &Subtarget) {
20518 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20519 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20520 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20521 Op.getOpcode() == ISD::UINT_TO_FP) &&
20522 "Unexpected opcode!");
20523 bool IsStrict = Op->isStrictFPOpcode();
20524 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20525 MVT SrcVT = Src.getSimpleValueType();
20526 MVT VT = Op.getSimpleValueType();
20527
20528 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20529 return SDValue();
20530
20531 // Pack the i64 into a vector, do the operation and extract.
20532
20533 assert(Subtarget.hasFP16() && "Expected FP16");
20534
20535 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20536 if (IsStrict) {
20537 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20538 {Op.getOperand(0), InVec});
20539 SDValue Chain = CvtVec.getValue(1);
20540 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20541 DAG.getVectorIdxConstant(0, dl));
20542 return DAG.getMergeValues({Value, Chain}, dl);
20543 }
20544
20545 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20546
20547 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20548 DAG.getVectorIdxConstant(0, dl));
20549}
20550
20551static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20552 const X86Subtarget &Subtarget) {
20553 switch (Opcode) {
20554 case ISD::SINT_TO_FP:
20555 // TODO: Handle wider types with AVX/AVX512.
20556 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20557 return false;
20558 // CVTDQ2PS or (V)CVTDQ2PD
20559 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20560
20561 case ISD::UINT_TO_FP:
20562 // TODO: Handle wider types and i64 elements.
20563 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20564 return false;
20565 // VCVTUDQ2PS or VCVTUDQ2PD
20566 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20567
20568 default:
20569 return false;
20570 }
20571}
20572
20573/// Given a scalar cast operation that is extracted from a vector, try to
20574/// vectorize the cast op followed by extraction. This will avoid an expensive
20575/// round-trip between XMM and GPR.
// Returns an empty SDValue when the cast operand is not an EXTRACT_VECTOR_ELT
// with a constant index, or when useVectorCast rejects the 128-bit types.
// NOTE(review): the first signature line is absent from this listing; the name
// is inferred from the call in LowerSINT_TO_FP.
20577 SelectionDAG &DAG,
20578 const X86Subtarget &Subtarget) {
20579 // TODO: This could be enhanced to handle smaller integer types by peeking
20580 // through an extend.
20581 SDValue Extract = Cast.getOperand(0);
20582 MVT DestVT = Cast.getSimpleValueType();
20583 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20584 !isa<ConstantSDNode>(Extract.getOperand(1)))
20585 return SDValue();
20586
20587 // See if we have a 128-bit vector cast op for this type of cast.
20588 SDValue VecOp = Extract.getOperand(0);
20589 EVT FromVT = VecOp.getValueType();
20590 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20591 MVT Vec128VT =
20592 MVT::getVectorVT(FromVT.getScalarType().getSimpleVT(), NumEltsInXMM);
20593 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20594 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20595 return SDValue();
20596
20597 // If we are extracting from a non-zero element, first shuffle the source
20598 // vector to allow extracting from element zero.
20599 if (!isNullConstant(Extract.getOperand(1))) {
// Only lane 0 of the mask is defined; every other lane is left undef (-1).
20600 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20601 Mask[0] = Extract.getConstantOperandVal(1);
20602 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20603 }
20604 // If the source vector is wider than 128-bits, extract the low part. Do not
20605 // create an unnecessarily wide vector cast op.
20606 if (FromVT != Vec128VT)
20607 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20608
20609 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20610 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20611 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20612 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20613 DAG.getVectorIdxConstant(0, DL));
20614}
20615
20616/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20617/// try to vectorize the cast ops. This will avoid an expensive round-trip
20618/// between XMM and GPR.
20619static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
20620 SelectionDAG &DAG,
20621 const X86Subtarget &Subtarget) {
20622 SDValue CastToInt = CastToFP.getOperand(0);
20623 MVT VT = CastToFP.getSimpleValueType();
20624 if ((CastToInt.getOpcode() != ISD::FP_TO_SINT &&
20625 CastToInt.getOpcode() != ISD::FP_TO_UINT) ||
20626 VT.isVector())
20627 return SDValue();
20628
20629 MVT IntVT = CastToInt.getSimpleValueType();
20630 SDValue X = CastToInt.getOperand(0);
20631 MVT SrcVT = X.getSimpleValueType();
20632 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20633 return SDValue();
20634
20635 // See if we have 128-bit vector cast instructions for this type of cast.
20636 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20637 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20638 (IntVT != MVT::i32 && IntVT != MVT::i64))
20639 return SDValue();
20640
20641 unsigned SrcSize = SrcVT.getSizeInBits();
20642 unsigned IntSize = IntVT.getSizeInBits();
20643 unsigned VTSize = VT.getSizeInBits();
20644 bool IsUnsigned = CastToInt.getOpcode() == ISD::FP_TO_UINT;
20645 unsigned ToIntOpcode =
20646 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20647 unsigned ToFPOpcode =
20648 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20649 unsigned Width = 128;
20650
20651 if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20652 // AVX512DQ+VLX
20653 if (IsUnsigned) {
20654 ToIntOpcode =
20655 SrcSize != IntSize ? X86ISD::CVTTP2UI : (unsigned)ISD::FP_TO_UINT;
20656 ToFPOpcode =
20657 IntSize != VTSize ? X86ISD::CVTUI2P : (unsigned)ISD::UINT_TO_FP;
20658 }
20659 } else {
20660 if (IsUnsigned || IntVT == MVT::i64) {
20661 // SSE2 can only perform f64/f32 <-> i32 signed.
20662 if (!Subtarget.useAVX512Regs() || !Subtarget.hasDQI())
20663 return SDValue();
20664
20665 // Need to extend width for AVX512DQ without AVX512VL.
20666 Width = 512;
20667 ToIntOpcode = CastToInt.getOpcode();
20668 ToFPOpcode = IsUnsigned ? ISD::UINT_TO_FP : ISD::SINT_TO_FP;
20669 }
20670 }
20671
20672 MVT VecSrcVT, VecIntVT, VecVT;
20673 unsigned NumElts;
20674 unsigned SrcElts, VTElts;
20675 // Some conversions are only legal with uniform vector sizes on AVX512DQ.
20676 if (Width == 512) {
20677 NumElts = std::min(Width / IntSize, Width / SrcSize);
20678 SrcElts = NumElts;
20679 VTElts = NumElts;
20680 } else {
20681 NumElts = Width / IntSize;
20682 SrcElts = Width / SrcSize;
20683 VTElts = Width / VTSize;
20684 }
20685 VecIntVT = MVT::getVectorVT(IntVT, NumElts);
20686 VecSrcVT = MVT::getVectorVT(SrcVT, SrcElts);
20687 VecVT = MVT::getVectorVT(VT, VTElts);
20688 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20689 //
20690 // We are not defining the high elements (for example, zero them) because
20691 // that could nullify any performance advantage that we hoped to gain from
20692 // this vector op hack. We do not expect any adverse effects (like denorm
20693 // penalties) with cast ops.
20694 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20695 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20696 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20697 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20699}
20700
// Custom lowering for v2i64/v4i64 integer-to-FP conversions.
//   - With AVX512DQ (and, per the assert, without VLX): widen the source to
//     v8i64, convert at 512 bits and extract the low subvector.
//   - Otherwise only unsigned -> v4f32 is handled, by converting a halved
//     value per element with signed conversions and doubling the result.
// Strict opcodes return {value, chain}; unhandled cases return an empty
// SDValue.
// NOTE(review): the first signature line is absent from this listing; the name
// is inferred from the call in LowerSINT_TO_FP.
20702 SelectionDAG &DAG,
20703 const X86Subtarget &Subtarget) {
20704 bool IsStrict = Op->isStrictFPOpcode();
20705 MVT VT = Op->getSimpleValueType(0);
20706 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20707
20708 if (Subtarget.hasDQI()) {
20709 assert(!Subtarget.hasVLX() && "Unexpected features");
20710
20711 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20712 Src.getSimpleValueType() == MVT::v4i64) &&
20713 "Unsupported custom type");
20714
20715 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20716 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20717 "Unexpected VT!");
20718 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20719
20720 // Need to concat with zero vector for strict fp to avoid spurious
20721 // exceptions.
20722 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20723 : DAG.getUNDEF(MVT::v8i64);
20724 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20725 DAG.getVectorIdxConstant(0, DL));
20726 SDValue Res, Chain;
20727 if (IsStrict) {
20728 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20729 {Op->getOperand(0), Src});
20730 Chain = Res.getValue(1);
20731 } else {
20732 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20733 }
20734
20735 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20736 DAG.getVectorIdxConstant(0, DL));
20737
20738 if (IsStrict)
20739 return DAG.getMergeValues({Res, Chain}, DL);
20740 return Res;
20741 }
20742
20743 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20744 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20745 if (VT != MVT::v4f32 || IsSigned)
20746 return SDValue();
20747
// Unsigned v4i64 -> v4f32. For lanes that are negative when read as signed,
// use (Src >> 1) | (Src & 1): halve the value while keeping the low bit ORed
// in, so it fits a signed conversion; the doubling happens after the convert.
20748 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20749 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20750 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20751 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20752 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20753 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20754 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20755 SmallVector<SDValue, 4> SignCvts(4);
20756 SmallVector<SDValue, 4> Chains(4);
// Convert each of the four lanes with a scalar signed i64 -> f32 conversion.
20757 for (int i = 0; i != 4; ++i) {
20758 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20759 DAG.getVectorIdxConstant(i, DL));
20760 if (IsStrict) {
20761 SignCvts[i] =
20762 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20763 {Op.getOperand(0), Elt});
20764 Chains[i] = SignCvts[i].getValue(1);
20765 } else {
20766 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20767 }
20768 }
20769 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20770
// Slow = SignCvt + SignCvt, i.e. the doubled result used for halved lanes.
20771 SDValue Slow, Chain;
20772 if (IsStrict) {
20773 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20774 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20775 {Chain, SignCvt, SignCvt});
20776 Chain = Slow.getValue(1);
20777 } else {
20778 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20779 }
20780
// Pick the doubled value only for the lanes that were halved above.
20781 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20782 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20783
20784 if (IsStrict)
20785 return DAG.getMergeValues({Cvt, Chain}, DL);
20786
20787 return Cvt;
20788}
20789
// Performs an integer-to-FP conversion by first converting to f32 (or a vector
// of f32 with the same element count) using the original opcode, then rounding
// that result to the requested type VT with FP_ROUND / STRICT_FP_ROUND.
// NOTE(review): the first signature line is absent from this listing; the name
// is inferred from the call in LowerSINT_TO_FP.
20791 SelectionDAG &DAG) {
20792 bool IsStrict = Op->isStrictFPOpcode();
20793 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20794 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20795 MVT VT = Op.getSimpleValueType();
20796 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20797
20798 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
// NOTE(review): in the strict form both the inner conversion and the outer
// STRICT_FP_ROUND take the incoming Chain; the inner node's output chain is
// not threaded into the round. Confirm this is intended.
20799 if (IsStrict)
20800 return DAG.getNode(
20801 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20802 {Chain,
20803 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20804 Rnd});
20805 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20806 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20807}
20808
20809static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20810 const X86Subtarget &Subtarget) {
20811 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20812 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20813 return true;
20814 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20815 return true;
20816 }
20817 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20818 return true;
20819 if (Subtarget.useAVX512Regs()) {
20820 if (VT == MVT::v16i32)
20821 return true;
20822 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20823 return true;
20824 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20825 return true;
20826 }
20827 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20828 (VT == MVT::v2i64 || VT == MVT::v4i64))
20829 return true;
20830 return false;
20831}
20832
// Custom lowering for SINT_TO_FP / STRICT_SINT_TO_FP. Tries, in order:
// promotion through f32, already-legal conversions, the Win64 i128 helper,
// vectorized forms, vector-specific lowerings, scalar SSE conversions, and
// finally a store to a stack slot followed by an x87 FILD.
// Returns Op itself when the node is legal as-is and an empty SDValue when no
// custom lowering applies here.
// NOTE(review): the initializer line of `MPI` near the end is absent from this
// listing; restore it from upstream before building.
20833SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20834 SelectionDAG &DAG) const {
20835 bool IsStrict = Op->isStrictFPOpcode();
20836 unsigned OpNo = IsStrict ? 1 : 0;
20837 SDValue Src = Op.getOperand(OpNo);
20838 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20839 MVT SrcVT = Src.getSimpleValueType();
20840 MVT VT = Op.getSimpleValueType();
20841 SDLoc dl(Op);
20842
20843 if (isBF16orSoftF16(VT, Subtarget))
20844 return promoteXINT_TO_FP(Op, dl, DAG);
20845 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20846 return Op;
20847
20848 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20849 return LowerWin64_INT128_TO_FP(Op, DAG);
20850
20851 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20852 return Extract;
20853
20854 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20855 return R;
20856
20857 if (SrcVT.isVector()) {
20858 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20859 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20860 // source for strict FP.
20861 if (IsStrict)
20862 return DAG.getNode(
20863 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20864 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20865 DAG.getUNDEF(SrcVT))});
20866 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20867 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20868 DAG.getUNDEF(SrcVT)));
20869 }
20870 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20871 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20872
20873 return SDValue();
20874 }
20875
20876 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20877 "Unknown SINT_TO_FP to lower!");
20878
20879 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20880
20881 // These are really Legal; return the operand so the caller accepts it as
20882 // Legal.
20883 if (SrcVT == MVT::i32 && UseSSEReg)
20884 return Op;
20885 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20886 return Op;
20887
20888 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20889 return V;
20890 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20891 return V;
20892
20893 // SSE doesn't have an i16 conversion so we need to promote.
20894 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20895 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20896 if (IsStrict)
20897 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20898 {Chain, Ext});
20899
20900 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20901 }
20902
// The remaining path goes through the x87 FILD, so it is skipped for f128 and
// when x87 is unavailable.
20903 if (VT == MVT::f128 || !Subtarget.hasX87())
20904 return SDValue();
20905
20906 SDValue ValueToStore = Src;
20907 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20908 // Bitcasting to f64 here allows us to do a single 64-bit store from
20909 // an SSE register, avoiding the store forwarding penalty that would come
20910 // with two 32-bit stores.
20911 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20912
// Spill the integer to a fresh stack object sized and aligned to the source
// type's store size, then load it back with FILD.
20913 unsigned Size = SrcVT.getStoreSize();
20914 Align Alignment(Size);
20915 MachineFunction &MF = DAG.getMachineFunction();
20916 auto PtrVT = getPointerTy(MF.getDataLayout());
20917 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20918 MachinePointerInfo MPI =
20920 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20921 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20922 std::pair<SDValue, SDValue> Tmp =
20923 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20924
20925 if (IsStrict)
20926 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20927
20928 return Tmp.first;
20929}
20930
// Builds an X86ISD::FILD that loads an integer of type SrcVT from `Pointer`
// and produces a floating-point value. Returns {result value, output chain}.
// When DstVT lives in an SSE register, the FILD result is produced as f80,
// stored to a new stack slot with X86ISD::FST as DstVT, and reloaded as DstVT.
// NOTE(review): the declarations of `MF` and `StoreMMO` and the pointer-info
// argument of the final load are absent from this listing; restore them from
// upstream before building.
20931std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20932 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20933 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20934 // Build the FILD
20935 SDVTList Tys;
20936 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20937 if (useSSE)
20938 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20939 else
20940 Tys = DAG.getVTList(DstVT, MVT::Other);
20941
20942 SDValue FILDOps[] = {Chain, Pointer};
20943 SDValue Result =
20944 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20945 Alignment, MachineMemOperand::MOLoad);
20946 Chain = Result.getValue(1);
20947
20948 if (useSSE) {
// Round-trip through memory: FST writes the value as DstVT into a stack slot
// sized and aligned to DstVT's store size, and a normal load reads it back.
20950 unsigned SSFISize = DstVT.getStoreSize();
20951 int SSFI =
20952 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20953 auto PtrVT = getPointerTy(MF.getDataLayout());
20954 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20955 Tys = DAG.getVTList(MVT::Other);
20956 SDValue FSTOps[] = {Chain, Result, StackSlot};
20959 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20960
20961 Chain =
20962 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20963 Result = DAG.getLoad(
20964 DstVT, DL, Chain, StackSlot,
20966 Chain = Result.getValue(1);
20967 }
20968
20969 return { Result, Chain };
20970}
20971
20972/// Horizontal vector math instructions may be slower than normal math with
20973/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20974/// implementation, and likely shuffle complexity of the alternate sequence.
20975static bool shouldUseHorizontalOp(bool IsSingleSource, const SelectionDAG &DAG,
20976 const X86Subtarget &Subtarget) {
20977 bool IsOptimizingSize = DAG.shouldOptForSize();
20978 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20979 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20980}
20981
20982/// 64-bit unsigned integer to double expansion.
// Interleaves the two 32-bit halves of the input with the exponent words
// 0x43300000 / 0x45300000, subtracts the matching magic doubles, and sums the
// two lanes (FHADD, or shuffle + FADD) to form the f64 result.
// NOTE(review): the first signature line, the declaration of `CV1` and the
// pointer-info arguments of both constant-pool loads are absent from this
// listing; the function is presumably LowerUINT_TO_FP_i64 (confirm against
// upstream).
20984 SelectionDAG &DAG,
20985 const X86Subtarget &Subtarget) {
20986 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20987 // when converting 0 when rounding toward negative infinity. Caller will
20988 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20989 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20990 // This algorithm is not obvious. Here it is what we're trying to output:
20991 /*
20992 movq %rax, %xmm0
20993 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20994 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20995 #ifdef __SSE3__
20996 haddpd %xmm0, %xmm0
20997 #else
20998 pshufd $0x4e, %xmm0, %xmm1
20999 addpd %xmm1, %xmm0
21000 #endif
21001 */
21002
21003 LLVMContext *Context = DAG.getContext();
21004
21005 // Build some magic constants.
21006 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21007 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21008 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21009 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21010
// The two doubles below are c1 from the listing above, given as bit patterns.
21012 CV1.push_back(
21013 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21014 APInt(64, 0x4330000000000000ULL))));
21015 CV1.push_back(
21016 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21017 APInt(64, 0x4530000000000000ULL))));
21018 Constant *C1 = ConstantVector::get(CV1);
21019 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21020
21021 // Load the 64-bit value into an XMM register.
21022 SDValue XR1 =
21023 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21024 SDValue CLod0 = DAG.getLoad(
21025 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21027 SDValue Unpck1 =
21028 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21029
// The second constant load is chained after the first one.
21030 SDValue CLod1 = DAG.getLoad(
21031 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21033 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21034 // TODO: Are there any fast-math-flags to propagate here?
21035 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21036 SDValue Result;
21037
// Sum the two lanes: horizontal add when SSE3 is present and judged
// worthwhile, otherwise move lane 1 into lane 0 and add.
21038 if (Subtarget.hasSSE3() &&
21039 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21040 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21041 } else {
21042 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21043 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21044 }
21045 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21046 DAG.getVectorIdxConstant(0, dl));
21047 return Result;
21048}
21049
21050/// 32-bit unsigned integer to float expansion.
///
/// ORs the 32-bit input (upper parts of the register zeroed) into the bit
/// pattern 0x4330000000000000 viewed as a double, subtracts that same bias as
/// an f64, and finally extends or rounds the f64 difference to the result
/// type of Op. For strict-FP opcodes the source is operand 1 and the chain
/// (operand 0) is threaded through STRICT_FSUB and the final extend/round.
///
/// NOTE(review): the line carrying the function name and its leading
/// parameters is absent from this listing. The call site in
/// X86TargetLowering::LowerUINT_TO_FP invokes it as
/// LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget) -- confirm against upstream.
21052 SelectionDAG &DAG,
21053 const X86Subtarget &Subtarget) {
 // Strict opcodes carry the chain in operand 0, so the source is operand 1.
21054 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21055 // FP constant to bias correct the final result.
21056 SDValue Bias = DAG.getConstantFP(
21057 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21058
21059 // Load the 32-bit value into an XMM register.
21060 SDValue Load =
21061 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21062
21063 // Zero out the upper parts of the register.
21064 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21065
21066 // Or the load with the bias.
21067 SDValue Or = DAG.getNode(
21068 ISD::OR, dl, MVT::v2i64,
21069 DAG.getBitcast(MVT::v2i64, Load),
21070 DAG.getBitcast(MVT::v2i64,
21071 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
 // Reinterpret the OR result as v2f64 and take lane 0 as the biased double.
21072 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21073 DAG.getBitcast(MVT::v2f64, Or),
21074 DAG.getVectorIdxConstant(0, dl));
21075
21076 if (Op.getNode()->isStrictFPOpcode()) {
21077 // Subtract the bias.
21078 // TODO: Are there any fast-math-flags to propagate here?
21079 SDValue Chain = Op.getOperand(0);
21080 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21081 {Chain, Or, Bias});
21082
 // An f64 result needs no further conversion; Sub already carries the chain.
21083 if (Op.getValueType() == Sub.getValueType())
21084 return Sub;
21085
21086 // Handle final rounding.
21087 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21088 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21089
21090 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21091 }
21092
21093 // Subtract the bias.
21094 // TODO: Are there any fast-math-flags to propagate here?
21095 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21096
21097 // Handle final rounding.
21098 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21099}
21100
/// Custom lowering of a (strict) unsigned v2i32 -> v2f64 conversion.
///
/// Returns an empty SDValue when the result type is not v2f64, and for the
/// non-strict AVX512-without-VLX case, which is left to generic type
/// legalization. With AVX512 the input is widened to v4i32 and converted with
/// a target node (or a widened copy of the original opcode); otherwise the
/// 2^52 bias trick described in the body is used.
///
/// NOTE(review): the line carrying the function name and its leading
/// parameters is absent from this listing. The call site in
/// lowerUINT_TO_FP_vec invokes it as lowerUINT_TO_FP_v2i32(Op, dl, DAG,
/// Subtarget), and the body names the SDLoc parameter DL -- confirm against
/// upstream.
21102 SelectionDAG &DAG,
21103 const X86Subtarget &Subtarget) {
21104 if (Op.getSimpleValueType() != MVT::v2f64)
21105 return SDValue();
21106
21107 bool IsStrict = Op->isStrictFPOpcode();
21108
 // Strict opcodes carry the chain in operand 0, so the source is operand 1.
21109 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21110 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21111
21112 if (Subtarget.hasAVX512()) {
21113 if (!Subtarget.hasVLX()) {
21114 // Let generic type legalization widen this.
21115 if (!IsStrict)
21116 return SDValue();
21117 // Otherwise pad the integer input with 0s and widen the operation.
21118 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21119 DAG.getConstant(0, DL, MVT::v2i32));
21120 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21121 {Op.getOperand(0), N0});
21122 SDValue Chain = Res.getValue(1);
 // Keep only the low two lanes of the widened result.
21123 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21124 DAG.getVectorIdxConstant(0, DL));
21125 return DAG.getMergeValues({Res, Chain}, DL);
21126 }
21127
21128 // Legalize to v4i32 type.
21129 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21130 DAG.getUNDEF(MVT::v2i32));
21131 if (IsStrict)
21132 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21133 {Op.getOperand(0), N0});
21134 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21135 }
21136
21137 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21138 // This gives us the floating point equivalent of 2^52 + the i32 integer
21139 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21140 // point leaving just our i32 integers in double format.
21141 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21142 SDValue VBias = DAG.getConstantFP(
21143 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21144 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21145 DAG.getBitcast(MVT::v2i64, VBias));
21146 Or = DAG.getBitcast(MVT::v2f64, Or);
21147
 // Subtract the bias; the strict form takes the incoming chain (operand 0).
21148 if (IsStrict)
21149 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21150 {Op.getOperand(0), Or, VBias});
21151 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21152}
21153
/// Custom lowering of a (strict) unsigned v4i32 / v8i32 to FP conversion.
///
/// Three strategies, tried in order:
///  - AVX512 (asserted to be without VLX): v8f64 results are returned
///    unchanged; other results widen the input to a 512-bit-result
///    conversion and extract the low subvector.
///  - AVX with v4i32 -> v4f64: zero-extend, OR with a broadcast 2^52 bias
///    bit pattern, then subtract the bias.
///  - Otherwise, only for v4i32 -> v4f32 and v8i32 -> v8f32: split each
///    element into 16-bit halves, embed them in float bit patterns and
///    recombine with one FSUB and one FADD (see the algorithm sketch in the
///    body). Any other result type yields an empty SDValue.
///
/// NOTE(review): the line carrying the function name and its leading
/// parameters is absent from this listing. The call site in
/// lowerUINT_TO_FP_vec invokes it as lowerUINT_TO_FP_vXi32(Op, dl, DAG,
/// Subtarget), and the body names the SDLoc parameter DL -- confirm against
/// upstream.
21155 SelectionDAG &DAG,
21156 const X86Subtarget &Subtarget) {
21157 bool IsStrict = Op->isStrictFPOpcode();
 // Strict opcodes carry the chain in operand 0, so the source is operand 1.
21158 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21159 MVT VecIntVT = V.getSimpleValueType();
21160 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21161 "Unsupported custom type");
21162
21163 if (Subtarget.hasAVX512()) {
21164 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21165 assert(!Subtarget.hasVLX() && "Unexpected features");
21166 MVT VT = Op->getSimpleValueType(0);
21167
21168 // v8i32->v8f64 is legal with AVX512 so just return it.
21169 if (VT == MVT::v8f64)
21170 return Op;
21171
21172 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
21173 VT == MVT::v8f16) &&
21174 "Unexpected VT!");
 // Pick the widened FP and integer types for the result type at hand.
21175 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
21176 MVT WideIntVT = MVT::v16i32;
21177 if (VT == MVT::v4f64) {
21178 WideVT = MVT::v8f64;
21179 WideIntVT = MVT::v8i32;
21180 }
21181
21182 // Need to concat with zero vector for strict fp to avoid spurious
21183 // exceptions.
21184 SDValue Tmp =
21185 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21186 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21187 DAG.getVectorIdxConstant(0, DL));
21188 SDValue Res, Chain;
21189 if (IsStrict) {
21190 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21191 {Op->getOperand(0), V});
21192 Chain = Res.getValue(1);
21193 } else {
21194 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21195 }
21196
 // Narrow the widened result back to the requested type.
21197 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21198 DAG.getVectorIdxConstant(0, DL));
21199
21200 if (IsStrict)
21201 return DAG.getMergeValues({Res, Chain}, DL);
21202 return Res;
21203 }
21204
 // v4i32 -> v4f64 with AVX: same 2^52 bias trick as the v2i32 lowering, with
 // the bias broadcast from the constant pool.
21205 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21206 Op->getSimpleValueType(0) == MVT::v4f64) {
21207 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21208 Constant *Bias = ConstantFP::get(
21209 *DAG.getContext(),
21210 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21211 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21212 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21213 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21214 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
 // NOTE(review): the trailing getMemIntrinsicNode arguments are absent from
 // this listing.
21215 SDValue VBias = DAG.getMemIntrinsicNode(
21216 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21219
21220 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21221 DAG.getBitcast(MVT::v4i64, VBias));
21222 Or = DAG.getBitcast(MVT::v4f64, Or);
21223
21224 if (IsStrict)
21225 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21226 {Op.getOperand(0), Or, VBias});
21227 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21228 }
21229
21230 // The algorithm is the following:
21231 // #ifdef __SSE4_1__
21232 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21233 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21234 // (uint4) 0x53000000, 0xaa);
21235 // #else
21236 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21237 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21238 // #endif
21239 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21240 // return (float4) lo + fhi;
21241
21242 bool Is128 = VecIntVT == MVT::v4i32;
21243 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21244 // If we convert to something else than the supported type, e.g., to v4f64,
21245 // abort early.
21246 if (VecFloatVT != Op->getSimpleValueType(0))
21247 return SDValue();
21248
21249 // In the #ifdef/#else code, we have in common:
21250 // - The vector of constants:
21251 // -- 0x4b000000
21252 // -- 0x53000000
21253 // - A shift:
21254 // -- v >> 16
21255
21256 // Create the splat vector for 0x4b000000.
21257 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21258 // Create the splat vector for 0x53000000.
21259 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21260
21261 // Create the right shift.
21262 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21263 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21264
21265 SDValue Low, High;
21266 if (Subtarget.hasSSE41()) {
21267 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21268 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21269 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21270 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21271 // Low will be bitcasted right away, so do not bother bitcasting back to its
21272 // original type.
21273 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21274 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21275 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21276 // (uint4) 0x53000000, 0xaa);
21277 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21278 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21279 // High will be bitcasted right away, so do not bother bitcasting back to
21280 // its original type.
21281 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21282 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21283 } else {
21284 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21285 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21286 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21287 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21288
21289 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21290 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21291 }
21292
21293 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21294 SDValue VecCstFSub = DAG.getConstantFP(
21295 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21296
21297 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21298 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21299 // constant, we avoid reassociation in MachineCombiner when reassoc is
21300 // enabled. See PR24512.
21301 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21302 // TODO: Are there any fast-math-flags to propagate here?
21303 // (float4) lo;
21304 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21305 // return (float4) lo + fhi;
21306 if (IsStrict) {
 // The FSUB takes the incoming chain; the FADD is chained after the FSUB.
21307 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21308 {Op.getOperand(0), HighBitcast, VecCstFSub});
21309 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21310 {FHigh.getValue(1), LowBitcast, FHigh});
21311 }
21312
21313 SDValue FHigh =
21314 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21315 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21316}
21317
/// Dispatch a vector (strict) UINT_TO_FP to the lowering routine that matches
/// the type of its integer source operand:
///  - v2i32          -> lowerUINT_TO_FP_v2i32
///  - v4i32 / v8i32  -> lowerUINT_TO_FP_vXi32
///  - v2i64 / v4i64  -> lowerINT_TO_FP_vXi64
/// Any other source type is treated as unreachable.
///
/// NOTE(review): the line carrying the function name and its leading
/// parameters is absent from this listing. The call site in
/// X86TargetLowering::LowerUINT_TO_FP invokes it as
/// lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget) -- confirm against upstream.
21319 const X86Subtarget &Subtarget) {
 // Strict opcodes carry the chain in operand 0, so the source is operand 1.
21320 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21321 SDValue N0 = Op.getOperand(OpNo);
21322 MVT SrcVT = N0.getSimpleValueType();
21323
21324 switch (SrcVT.SimpleTy) {
21325 default:
21326 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21327 case MVT::v2i32:
21328 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
21329 case MVT::v4i32:
21330 case MVT::v8i32:
21331 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
21332 case MVT::v2i64:
21333 case MVT::v4i64:
21334 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
21335 }
21336}
21337
/// Custom lowering for UINT_TO_FP and its strict-FP variant.
///
/// The strategies are tried in this order:
///  1. f128 results: return an empty SDValue.
///  2. bf16 / soft-f16 results: promoteXINT_TO_FP.
///  3. Already-legal conversions: return Op unchanged.
///  4. lowerFPToIntToFP, then the vector path (lowerUINT_TO_FP_vec), the
///     Win64 i128 path, and vectorizeExtractedCast.
///  5. AVX512 scalar i32 (and i64 in 64-bit mode) into an SSE-register FP
///     type: return Op unchanged.
///  6. i32 on 64-bit targets: zero-extend to i64 and convert as signed.
///  7. LowerI64IntToFP_AVX512DQ and LowerI64IntToFP16.
///  8. Non-strict only, with SSE2: the i64->f64 and i32 bias expansions.
///  9. i64 -> f32/f64 on 64-bit targets: return an empty SDValue.
/// 10. Otherwise: store the value to a 64-bit stack temporary and FILD it;
///     i64 sources additionally get a fudge-factor add in f80 when the sign
///     bit of the input is set.
///
/// For strict opcodes the source is operand 1 and operand 0 is the chain;
/// results produced on the strict paths carry an output chain.
21338SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21339 SelectionDAG &DAG) const {
21340 bool IsStrict = Op->isStrictFPOpcode();
21341 unsigned OpNo = IsStrict ? 1 : 0;
21342 SDValue Src = Op.getOperand(OpNo);
21343 SDLoc dl(Op);
21344 auto PtrVT = getPointerTy(DAG.getDataLayout());
21345 MVT SrcVT = Src.getSimpleValueType();
21346 MVT DstVT = Op->getSimpleValueType(0);
 // Non-strict nodes have no incoming chain, so start from the entry node.
21347 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21348
21349 // Bail out when we don't have native conversion instructions.
21350 if (DstVT == MVT::f128)
21351 return SDValue();
21352
21353 if (isBF16orSoftF16(DstVT, Subtarget))
21354 return promoteXINT_TO_FP(Op, dl, DAG);
21355 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
21356 return Op;
21357
21358 if (SDValue V = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
21359 return V;
21360
21361 if (DstVT.isVector())
21362 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
21363
21364 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21365 return LowerWin64_INT128_TO_FP(Op, DAG);
21366
21367 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
21368 return Extract;
21369
21370 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21371 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21372 // Conversions from unsigned i32 to f32/f64 are legal,
21373 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21374 return Op;
21375 }
21376
21377 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21378 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21379 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21380 if (IsStrict)
21381 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21382 {Chain, Src});
21383 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21384 }
21385
21386 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
21387 return V;
21388 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
21389 return V;
21390
21391 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21392 // infinity. It produces -0.0, so disable under strictfp.
21393 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21394 !IsStrict)
21395 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
21396 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21397 // negative infinity. So disable under strictfp. Using FILD instead.
21398 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21399 !IsStrict)
21400 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
 // No custom lowering here for i64 -> f32/f64 on 64-bit targets.
21401 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21402 (DstVT == MVT::f32 || DstVT == MVT::f64))
21403 return SDValue();
21404
21405 // Make a 64-bit buffer, and use it to build an FILD.
21406 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21407 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21408 Align SlotAlign(8);
 // NOTE(review): the initializer of MPI is absent from this listing; SSFI is
 // not referenced anywhere else in the visible body, so it is presumably
 // consumed there -- confirm against upstream.
21409 MachinePointerInfo MPI =
21411 if (SrcVT == MVT::i32) {
 // Write the i32 into the low 4 bytes of the slot and a zero at offset 4, then
 // FILD the slot as an i64.
21412 SDValue OffsetSlot =
21413 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
21414 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21415 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21416 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21417 std::pair<SDValue, SDValue> Tmp =
21418 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21419 if (IsStrict)
21420 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21421
21422 return Tmp.first;
21423 }
21424
21425 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21426 SDValue ValueToStore = Src;
21427 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21428 // Bitcasting to f64 here allows us to do a single 64-bit store from
21429 // an SSE register, avoiding the store forwarding penalty that would come
21430 // with two 32-bit stores.
21431 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21432 }
21433 SDValue Store =
21434 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21435 // For i64 source, we need to add the appropriate power of 2 if the input
21436 // was negative. We must be careful to do the computation in x87 extended
21437 // precision, not in SSE.
21438 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21439 SDValue Ops[] = {Store, StackSlot};
21440 SDValue Fild =
21441 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21442 SlotAlign, MachineMemOperand::MOLoad);
21443 Chain = Fild.getValue(1);
21444
21445 // Check whether the sign bit is set.
21446 SDValue SignSet = DAG.getSetCC(
21447 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21448 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21449
21450 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
 // 0x5F800000 is the IEEE single-precision encoding of 2^64.
21451 APInt FF(64, 0x5F80000000000000ULL);
21452 SDValue FudgePtr =
21453 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21454 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21455
21456 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21457 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21458 SDValue Four = DAG.getIntPtrConstant(4, dl);
21459 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21460 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21461
21462 // Load the value out, extending it from f32 to f80.
 // NOTE(review): one argument line of this getExtLoad call is absent from this
 // listing.
21463 SDValue Fudge = DAG.getExtLoad(
21464 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21466 CPAlignment);
21467 Chain = Fudge.getValue(1);
21468 // Extend everything to 80 bits to force it to be done on x87.
21469 // TODO: Are there any fast-math-flags to propagate here?
21470 if (IsStrict) {
21471 unsigned Opc = ISD::STRICT_FADD;
21472 // Windows needs the precision control changed to 80bits around this add.
21473 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21474 Opc = X86ISD::STRICT_FP80_ADD;
21475
21476 SDValue Add =
21477 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21478 // STRICT_FP_ROUND can't handle equal types.
21479 if (DstVT == MVT::f80)
21480 return Add;
21481 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21482 {Add.getValue(1), Add,
21483 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
21484 }
21485 unsigned Opc = ISD::FADD;
21486 // Windows needs the precision control changed to 80bits around this add.
21487 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21488 Opc = X86ISD::FP80_ADD;
21489
 // Add the fudge factor in f80, then round to the requested result type.
21490 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
21491 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21492 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21493}
21494
21495// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21496// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21497// just return an SDValue().
21498// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21499// to i16, i32 or i64, and we lower it to a legal sequence and return the
21500// result.
21501SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21502 bool IsSigned,
21503 SDValue &Chain) const {
21504 bool IsStrict = Op->isStrictFPOpcode();
21505 SDLoc DL(Op);
21506
21507 EVT DstTy = Op.getValueType();
21508 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21509 EVT TheVT = Value.getValueType();
21510 auto PtrVT = getPointerTy(DAG.getDataLayout());
21511
21512 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21513 // f16 must be promoted before using the lowering in this routine.
21514 // fp128 does not use this lowering.
21515 return SDValue();
21516 }
21517
21518 // If using FIST to compute an unsigned i64, we'll need some fixup
21519 // to handle values above the maximum signed i64. A FIST is always
21520 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21521 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21522
21523 // FIXME: This does not generate an invalid exception if the input does not
21524 // fit in i32. PR44019
21525 if (!IsSigned && DstTy != MVT::i64) {
21526 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21527 // The low 32 bits of the fist result will have the correct uint32 result.
21528 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21529 DstTy = MVT::i64;
21530 }
21531
21532 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21533 DstTy.getSimpleVT() >= MVT::i16 &&
21534 "Unknown FP_TO_INT to lower!");
21535
21536 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21537 // stack slot.
21538 MachineFunction &MF = DAG.getMachineFunction();
21539 unsigned MemSize = DstTy.getStoreSize();
21540 int SSFI =
21541 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21542 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21543
21544 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21545
21546 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21547
21548 if (UnsignedFixup) {
21549 //
21550 // Conversion to unsigned i64 is implemented with a select,
21551 // depending on whether the source value fits in the range
21552 // of a signed i64. Let Thresh be the FP equivalent of
21553 // 0x8000000000000000ULL.
21554 //
21555 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21556 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
21557 // FistSrc = (Value - FltOfs);
21558 // Fist-to-mem64 FistSrc
21559 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21560 // to XOR'ing the high 32 bits with Adjust.
21561 //
21562 // Being a power of 2, Thresh is exactly representable in all FP formats.
21563 // For X87 we'd like to use the smallest FP type for this constant, but
21564 // for DAG type consistency we have to match the FP operand type.
21565
21566 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21567 [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
21568 bool LosesInfo = false;
21569 if (TheVT == MVT::f64)
21570 // The rounding mode is irrelevant as the conversion should be exact.
21571 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21572 &LosesInfo);
21573 else if (TheVT == MVT::f80)
21574 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21575 APFloat::rmNearestTiesToEven, &LosesInfo);
21576
21577 assert(Status == APFloat::opOK && !LosesInfo &&
21578 "FP conversion should have been exact");
21579
21580 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21581
21582 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21583 *DAG.getContext(), TheVT);
21584 SDValue Cmp;
21585 if (IsStrict) {
21586 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21587 /*IsSignaling*/ true);
21588 Chain = Cmp.getValue(1);
21589 } else {
21590 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21591 }
21592
21593 // Our preferred lowering of
21594 //
21595 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21596 //
21597 // is
21598 //
21599 // (Value >= Thresh) << 63
21600 //
21601 // but since we can get here after LegalOperations, DAGCombine might do the
21602 // wrong thing if we create a select. So, directly create the preferred
21603 // version.
21604 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND,