1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
72static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
74 cl::desc(
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
78 cl::Hidden);
79
80static cl::opt<int> BrMergingBaseCostThresh(
81 "x86-br-merging-base-cost", cl::init(2),
82 cl::desc(
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus being split into multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
89 cl::Hidden);
90
91static cl::opt<int> BrMergingCcmpBias(
92 "x86-br-merging-ccmp-bias", cl::init(6),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases where the target "
94 "supports conditional compare instructions."),
95 cl::Hidden);
96
97static cl::opt<bool>
98 WidenShift("x86-widen-shift", cl::init(true),
99 cl::desc("Replace narrow shifts with wider shifts."),
100 cl::Hidden);
101
102static cl::opt<int> BrMergingLikelyBias(
103 "x86-br-merging-likely-bias", cl::init(0),
104 cl::desc("Increases 'x86-br-merging-base-cost' in cases where it is likely "
105 "that all conditionals will be executed. For example, when merging "
106 "the conditionals (a == b && c > d), if it's known that a == b is "
107 "likely, then it is likely that if the conditionals are split "
108 "both sides will be executed, so it may be desirable to increase "
109 "the instruction cost threshold. Set to -1 to never merge likely "
110 "branches."),
111 cl::Hidden);
112
113static cl::opt<int> BrMergingUnlikelyBias(
114 "x86-br-merging-unlikely-bias", cl::init(-1),
115 cl::desc(
116 "Decreases 'x86-br-merging-base-cost' in cases where it is unlikely "
117 "that all conditionals will be executed. For example, when merging "
118 "the conditionals (a == b && c > d), if it's known that a == b is "
119 "unlikely, then it is unlikely that if the conditionals are split "
120 "both sides will be executed, so it may be desirable to decrease "
121 "the instruction cost threshold. Set to -1 to never merge unlikely "
122 "branches."),
123 cl::Hidden);
124
125static cl::opt<bool> MulConstantOptimization(
126 "mul-constant-optimization", cl::init(true),
127 cl::desc("Replace 'mul x, Const' with more effective instructions like "
128 "SHIFT, LEA, etc."),
129 cl::Hidden);
130
131X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
132 const X86Subtarget &STI)
133 : TargetLowering(TM, STI), Subtarget(STI) {
134 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
135 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
136
137 // Set up the TargetLowering object.
138
139 // X86 is weird. It always uses i8 for shift amounts and setcc results.
140 setBooleanContents(ZeroOrOneBooleanContent);
141 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
142 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
143
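// NOTE (illustrative, not in the original source): with these settings a
// scalar setcc selects to a SETcc instruction producing 0 or 1 in an i8
// register (e.g. "sete %al"), while vector compares such as PCMPEQD/CMPPS
// produce all-zeros or all-ones lanes, matching
// ZeroOrNegativeOneBooleanContent.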
144 // X86 instruction cache is coherent with its data cache so we can use the
145 // default expansion to a no-op.
146 setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Expand);
147
148 // For 64-bit, since we have so many registers, use the ILP scheduler.
149 // For 32-bit, use the register pressure specific scheduling.
150 // For Atom, always use ILP scheduling.
151 if (Subtarget.isAtom())
152 setSchedulingPreference(Sched::ILP);
153 else if (Subtarget.is64Bit())
154 setSchedulingPreference(Sched::ILP);
155 else
156 setSchedulingPreference(Sched::RegPressure);
157 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
158 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
159
160 // Bypass expensive divides and use cheaper ones.
161 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
162 if (Subtarget.hasSlowDivide32())
163 addBypassSlowDiv(32, 8);
164 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
165 addBypassSlowDiv(64, 32);
166 }
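// NOTE (illustrative sketch, not in the original source): addBypassSlowDiv
// registers a bit-width pair with the shared BypassSlowDivision transform,
// which guards the expensive divide with a runtime check, roughly:
//   if (((a | b) >> 8) == 0)   // both operands fit in 8 bits
//     use the cheap 8-bit DIV;
//   else
//     use the full-width divide;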
167
168 if (Subtarget.canUseCMPXCHG16B())
169 setMaxAtomicSizeInBitsSupported(128);
170 else if (Subtarget.canUseCMPXCHG8B())
171 setMaxAtomicSizeInBitsSupported(64);
172 else
173 setMaxAtomicSizeInBitsSupported(32);
174
175 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
176
178
179 // Set up the register classes.
180 addRegisterClass(MVT::i8, &X86::GR8RegClass);
181 addRegisterClass(MVT::i16, &X86::GR16RegClass);
182 addRegisterClass(MVT::i32, &X86::GR32RegClass);
183 if (Subtarget.is64Bit())
184 addRegisterClass(MVT::i64, &X86::GR64RegClass);
185
186 for (MVT VT : MVT::integer_valuetypes())
188
189 // We don't accept any truncstore of integer registers.
190 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
191 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
193 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
194 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
195 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
196
197 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
198
199 // SETOEQ and SETUNE require checking two conditions.
200 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
201 setCondCodeAction(ISD::SETOEQ, VT, Expand);
202 setCondCodeAction(ISD::SETUNE, VT, Expand);
203 }
204
205 // Integer absolute.
206 if (Subtarget.canUseCMOV()) {
207 setOperationAction(ISD::ABS , MVT::i16 , Custom);
208 setOperationAction(ISD::ABS , MVT::i32 , Custom);
209 if (Subtarget.is64Bit())
210 setOperationAction(ISD::ABS , MVT::i64 , Custom);
211 }
212
213 // Absolute difference.
214 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
215 setOperationAction(Op , MVT::i8 , Custom);
216 setOperationAction(Op , MVT::i16 , Custom);
217 setOperationAction(Op , MVT::i32 , Custom);
218 if (Subtarget.is64Bit())
219 setOperationAction(Op , MVT::i64 , Custom);
220 }
221
222 // Signed saturation subtraction.
223 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
224 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
225 setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
226 if (Subtarget.is64Bit())
227 setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
228
229 // Funnel shifts.
230 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
231 // For targets where SHLD is slow, we only lower to it when optimizing for code size.
232 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
233
234 setOperationAction(ShiftOp , MVT::i8 , Custom);
235 setOperationAction(ShiftOp , MVT::i16 , Custom);
236 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
237 if (Subtarget.is64Bit())
238 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
239 }
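// NOTE (illustrative, not in the original source): the i32/i64 funnel
// shifts map onto x86's double-precision shift instructions, e.g.
// fshl(a, b, c) == (a << c) | (b >> (32 - c)) is exactly SHLD. On
// subtargets where SHLD/SHRD are slow, the Custom action falls back to
// shift/or sequences unless optimizing for size.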
240
241 if (!Subtarget.useSoftFloat()) {
242 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
243 // operation.
248 // We have an algorithm for SSE2, and we turn this into a 64-bit
249 // FILD or VCVTUSI2SS/SD for other targets.
252 // We have an algorithm for SSE2->double, and we turn this into a
253 // 64-bit FILD followed by conditional FADD for other targets.
256
257 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
258 // this operation.
261 // SSE has no i16 to fp conversion, only i32. We promote in the handler
262 // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
265 // f32 and f64 cases are Legal with SSE1/SSE2; the f80 case is not.
268 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
269 // are Legal, f80 is custom lowered.
272
273 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
274 // this operation.
276 // FIXME: This doesn't generate an invalid exception when it should. PR44019.
282 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
283 // are Legal, f80 is custom lowered.
286
287 // Handle FP_TO_UINT by promoting the destination to a larger signed
288 // conversion.
290 // FIXME: This doesn't generate an invalid exception when it should. PR44019.
293 // FIXME: This doesn't generate an invalid exception when it should. PR44019.
299
304
305 if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
308 }
309 }
310
311 if (Subtarget.hasSSE2()) {
312 // Custom lowering for saturating float to int conversions.
313 // We handle promotion to larger result types manually.
314 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
317 }
318 if (Subtarget.is64Bit()) {
321 }
322 }
323 if (Subtarget.hasAVX10_2()) {
328 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
329 MVT::v4i64}) {
332 }
333 if (Subtarget.is64Bit()) {
336 }
337 }
338
339 // Handle address space casts between mixed sized pointers.
342
343 // TODO: when we have SSE, these could be more efficient by using movd/movq.
344 if (!Subtarget.hasSSE2()) {
347 if (Subtarget.is64Bit()) {
349 // Without SSE, i64->f64 goes through memory.
351 }
352 } else if (!Subtarget.is64Bit())
354
355 // Scalar integer divide and remainder are lowered to use operations that
356 // produce two results, to match the available instructions. This exposes
357 // the two-result form to trivial CSE, which is able to combine x/y and x%y
358 // into a single instruction.
359 //
360 // Scalar integer multiply-high is also lowered to use two-result
361 // operations, to match the available instructions. However, plain multiply
362 // (low) operations are left as Legal, as there are single-result
363 // instructions for this in x86. Using the two-result multiply instructions
364 // when both high and low results are needed must be arranged by dagcombine.
365 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
372 }
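// NOTE (illustrative, not in the original source): with the two-result
// form, IR such as
//   %q = sdiv i32 %x, %y
//   %r = srem i32 %x, %y
// CSEs into a single ISD::SDIVREM node, which selects to one IDIV that
// leaves the quotient in EAX and the remainder in EDX.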
373
374 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
376 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
377 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
380 }
381 if (Subtarget.is64Bit())
386
391
392 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
398 }
399
400 // Promote the i8 variants and force them up to i32, which has a shorter
401 // encoding.
402 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
404 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs. For BSF,
405 // we emit a REP prefix to encode it as TZCNT for modern CPUs, so it makes
406 // sense to promote that too.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
409
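// NOTE (illustrative, not in the original source): when an i8/i16 CTTZ is
// promoted to i32, the standard integer promotion ORs in a guard bit just
// above the original width (e.g. x | 0x10000 for i16) so that a zero input
// still yields the correct count rather than scanning undefined high bits.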
410 if (!Subtarget.hasBMI()) {
411 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
413 if (Subtarget.is64Bit()) {
414 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
416 }
417 }
418
419 if (Subtarget.hasLZCNT()) {
420 // When promoting the i8 variants, force them to i32 for a shorter
421 // encoding.
422 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
424 } else {
425 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
426 if (VT == MVT::i64 && !Subtarget.is64Bit())
427 continue;
430 }
431 }
432
435 // Special handling for half-precision floating point conversions.
436 // If we don't have F16C support, then lower half float conversions
437 // into library calls.
439 Op, MVT::f32,
440 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
441 // There's never any support for operations beyond MVT::f32.
442 setOperationAction(Op, MVT::f64, Expand);
443 setOperationAction(Op, MVT::f80, Expand);
444 setOperationAction(Op, MVT::f128, Expand);
445 }
446
447 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
450 }
451
452 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
454 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
455 setTruncStoreAction(VT, MVT::f16, Expand);
456 setTruncStoreAction(VT, MVT::bf16, Expand);
457
460 }
461
465 if (Subtarget.is64Bit())
467 if (Subtarget.hasPOPCNT()) {
468 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
469 // popcntw is longer to encode than popcntl and also has a false dependency
470 // on the dest that popcntl hasn't had since Cannon Lake.
471 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
472 } else {
477 }
478
480
481 if (!Subtarget.hasMOVBE())
483
484 // X86 wants to expand cmov itself.
485 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
490 }
491 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
492 if (VT == MVT::i64 && !Subtarget.is64Bit())
493 continue;
496 }
497
499
500 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
503
505 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
506 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
510
511 // Darwin ABI issue.
512 for (auto VT : { MVT::i32, MVT::i64 }) {
513 if (VT == MVT::i64 && !Subtarget.is64Bit())
514 continue;
521 }
522
523 // 64-bit shl, sra, srl (iff 32-bit x86)
524 for (auto VT : { MVT::i32, MVT::i64 }) {
525 if (VT == MVT::i64 && !Subtarget.is64Bit())
526 continue;
530 }
531
532 if (Subtarget.hasSSEPrefetch())
534
536
537 // Expand certain atomics
538 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
546 }
547
548 if (!Subtarget.is64Bit())
550
551 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
552 // All CPUs supporting AVX will atomically load/store aligned 128-bit
553 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
556 }
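// NOTE (illustrative, not in the original source): this lets, for example,
// "load atomic i128, ptr %p acquire, align 16" select to a single
// VMOVDQA/VMOVAPS instead of a CMPXCHG16B loop on AVX-capable CPUs.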
557
558 if (Subtarget.canUseCMPXCHG16B())
560
561 // FIXME - use subtarget debug flags
562 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
563 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
564 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
566 }
567
570
573
574 setOperationAction(ISD::TRAP, MVT::Other, Legal);
576 if (Subtarget.isTargetPS())
578 else
580
581 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
583 setOperationAction(ISD::VAEND , MVT::Other, Expand);
584 bool Is64Bit = Subtarget.is64Bit();
585 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
586 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
587
590
592
593 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
596
598
599 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
600 setOperationAction(ISD::FABS, VT, Action);
601 setOperationAction(ISD::FNEG, VT, Action);
603 setOperationAction(ISD::FREM, VT, Action);
604 setOperationAction(ISD::FMA, VT, Action);
605 setOperationAction(ISD::FMINNUM, VT, Action);
606 setOperationAction(ISD::FMAXNUM, VT, Action);
611 setOperationAction(ISD::FSIN, VT, Action);
612 setOperationAction(ISD::FCOS, VT, Action);
613 setOperationAction(ISD::FSINCOS, VT, Action);
614 setOperationAction(ISD::FTAN, VT, Action);
615 setOperationAction(ISD::FSQRT, VT, Action);
616 setOperationAction(ISD::FPOW, VT, Action);
617 setOperationAction(ISD::FPOWI, VT, Action);
618 setOperationAction(ISD::FLOG, VT, Action);
619 setOperationAction(ISD::FLOG2, VT, Action);
620 setOperationAction(ISD::FLOG10, VT, Action);
621 setOperationAction(ISD::FEXP, VT, Action);
622 setOperationAction(ISD::FEXP2, VT, Action);
623 setOperationAction(ISD::FEXP10, VT, Action);
624 setOperationAction(ISD::FCEIL, VT, Action);
625 setOperationAction(ISD::FFLOOR, VT, Action);
627 setOperationAction(ISD::FRINT, VT, Action);
628 setOperationAction(ISD::BR_CC, VT, Action);
629 setOperationAction(ISD::SETCC, VT, Action);
632 setOperationAction(ISD::FROUND, VT, Action);
634 setOperationAction(ISD::FTRUNC, VT, Action);
635 setOperationAction(ISD::FLDEXP, VT, Action);
637 };
638
639 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
640 // f16, f32 and f64 use SSE.
641 // Set up the FP register classes.
642 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
643 : &X86::FR16RegClass);
644 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
645 : &X86::FR32RegClass);
646 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
647 : &X86::FR64RegClass);
648
649 // Disable f32->f64 extload as we can only generate this in one instruction
650 // under optsize. So it's easier to pattern match (fpext (load)) for that
651 // case instead of needing to emit 2 instructions for extload in the
652 // non-optsize case.
653 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
654
655 for (auto VT : { MVT::f32, MVT::f64 }) {
656 // Use ANDPD to simulate FABS.
658
659 // Use XORP to simulate FNEG.
661
662 // Use ANDPD and ORPD to simulate FCOPYSIGN.
664
665 // These might be better off as horizontal vector ops.
668
669 // We don't support sin/cos/fmod
673 }
674
675 // Half type will be promoted by default.
676 setF16Action(MVT::f16, Promote);
687
717
722
727
728 // Lower this to MOVMSK plus an AND.
731
732 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
733 (UseX87 || Is64Bit)) {
734 // Use SSE for f32, x87 for f64.
735 // Set up the FP register classes.
736 addRegisterClass(MVT::f32, &X86::FR32RegClass);
737 if (UseX87)
738 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
739
740 // Use ANDPS to simulate FABS.
742
743 // Use XORP to simulate FNEG.
745
746 if (UseX87)
748
749 // Use ANDPS and ORPS to simulate FCOPYSIGN.
750 if (UseX87)
753
754 // We don't support sin/cos/fmod
758
759 if (UseX87) {
760 // Always expand sin/cos functions even though x87 has an instruction.
764 }
765 } else if (UseX87) {
766 // f32 and f64 in x87.
767 // Set up the FP register classes.
768 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
769 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
770
771 for (auto VT : { MVT::f32, MVT::f64 }) {
774
775 // Always expand sin/cos functions even though x87 has an instruction.
779 }
780 }
781
782 // Expand FP32 immediates into loads from the stack, save special cases.
783 if (isTypeLegal(MVT::f32)) {
784 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
785 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
786 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
787 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
788 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
789 } else // SSE immediates.
790 addLegalFPImmediate(APFloat(+0.0f)); // xorps
791 }
792 // Expand FP64 immediates into loads from the stack, save special cases.
793 if (isTypeLegal(MVT::f64)) {
794 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
795 addLegalFPImmediate(APFloat(+0.0)); // FLD0
796 addLegalFPImmediate(APFloat(+1.0)); // FLD1
797 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
798 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
799 } else // SSE immediates.
800 addLegalFPImmediate(APFloat(+0.0)); // xorpd
801 }
802 // Support fp16 0 immediate.
803 if (isTypeLegal(MVT::f16))
804 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
805
806 // Handle constrained floating-point operations of scalar.
819
820 // We don't support FMA.
823
824 // f80 always uses X87.
825 if (UseX87) {
826 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
829 {
831 addLegalFPImmediate(TmpFlt); // FLD0
832 TmpFlt.changeSign();
833 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
834
835 bool ignored;
836 APFloat TmpFlt2(+1.0);
838 &ignored);
839 addLegalFPImmediate(TmpFlt2); // FLD1
840 TmpFlt2.changeSign();
841 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
842 }
843
844 // Always expand sin/cos functions even though x87 has an instruction.
845 // clang-format off
857 // clang-format on
858
870
871 // Handle constrained floating-point operations of scalar.
878 if (isTypeLegal(MVT::f16)) {
881 } else {
883 }
884 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
885 // as Custom.
887 }
888
889 // f128 uses xmm registers, but most operations require libcalls.
890 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
891 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
892 : &X86::VR128RegClass);
893
894 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
895
906
910
911 // clang-format off
919 // clang-format on
920 // No STRICT_FSINCOS
923
926 // We need to custom handle any FP_ROUND with an f128 input, but
927 // LegalizeDAG uses the result type to know when to run a custom handler.
928 // So we have to list all legal floating point result types here.
929 if (isTypeLegal(MVT::f32)) {
932 }
933 if (isTypeLegal(MVT::f64)) {
936 }
937 if (isTypeLegal(MVT::f80)) {
941 }
942
944
945 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
946 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
947 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
948 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
949 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
950 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
951 }
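// NOTE (illustrative, not in the original source): "most operations require
// libcalls" means e.g. an fadd on fp128 lowers to a call to __addtf3 with
// the operands and result passed in XMM registers; only moves and bitwise
// logic on f128 stay as inline vector instructions.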
952
953 // Always use a library call for pow.
954 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
955 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
956 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
957 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
958
967
968 // Some FP actions are always expanded for vector types.
969 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
970 MVT::v4f32, MVT::v8f32, MVT::v16f32,
971 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
972 // clang-format off
986 // clang-format on
987 }
988
989 // First set operation action for all vector types to either promote
990 // (for widening) or expand (for scalarization). Then we will selectively
991 // turn on ones that can be effectively codegen'd.
1031 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1032 setTruncStoreAction(InnerVT, VT, Expand);
1033
1034 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1035 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1036
1037 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1038 // types; we have to deal with them whether we ask for Expansion or not.
1039 // Setting Expand causes its own optimisation problems though, so leave
1040 // them legal.
1041 if (VT.getVectorElementType() == MVT::i1)
1042 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1043
1044 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1045 // split/scalarized right now.
1046 if (VT.getVectorElementType() == MVT::f16 ||
1047 VT.getVectorElementType() == MVT::bf16)
1048 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1049 }
1050 }
1051
1052 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1053 // with -msoft-float, disable use of MMX as well.
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1055 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1056 // No operations on x86mmx supported, everything uses intrinsics.
1057 }
1058
1059 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1060 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1061 : &X86::VR128RegClass);
1062
1067
1068 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1069 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1077
1078 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1079 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1081
1087 }
1088
1089 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1090 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092
1093 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1094 // registers cannot be used even for integer operations.
1095 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1096 : &X86::VR128RegClass);
1097 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1098 : &X86::VR128RegClass);
1099 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1100 : &X86::VR128RegClass);
1101 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105
1106 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1111 }
1112
1113 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1114 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1115 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1116
1117 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1119 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1120 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1121 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1123 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1124 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1125 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1126 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1129
1130 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1131 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1132 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1133
1134 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1136 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1138
1139 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1140 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1141
1142 setOperationAction(ISD::AND, MVT::i128, Custom);
1143 setOperationAction(ISD::OR, MVT::i128, Custom);
1144 setOperationAction(ISD::XOR, MVT::i128, Custom);
1146
1147 if (Subtarget.hasPCLMUL()) {
1148 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1151 }
1155 }
1156
1157 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1158 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1159 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1160 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1161 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1162 }
1163
1174
1179
1180 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1186
1187 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1188 // setcc all the way to isel and prefer SETGT in some isel patterns.
1191 }
1192
1193 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1194 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1199
1200 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1206 }
1207
1208 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1212
1213 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1214 continue;
1215
1218 }
1219 setF16Action(MVT::v8f16, Expand);
1220 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1221 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1222 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1223 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1224 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1225 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1227
1228 // Custom lower v2i64 and v2f64 selects.
1235
1242
1243 // Custom legalize these to avoid over promotion or custom promotion.
1244 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1249 }
1250
1255
1258
1261
1262 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1267
1272
1273 // We want to legalize this to an f64 load rather than an i64 load on
1274 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1275 // store.
1276 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1277 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1278 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1279 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1280 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1281 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1282
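// NOTE (illustrative, not in the original source): per the comment above, a
// "load <2 x i32>" on a 64-bit target is legalized as one scalar f64 load
// (MOVSD) and a bitcast rather than being split into two 32-bit loads.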
1283 // Add 32-bit vector stores to help vectorization opportunities.
1284 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1286
1290 if (!Subtarget.hasAVX512())
1292
1296
1298
1315
1316 // In the customized shift lowering, the legal v4i32/v2i64 cases
1317 // in AVX2 will be recognized.
1318 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1322 if (VT == MVT::v2i64) continue;
1327 }
1328
1334 }
1335
1336 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1341
1342 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1344 }
1345
1346 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1347 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
1348 }
1349
1350 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1351 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1352 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1353 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1354
1355 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1358 }
1360
1361 // These might be better off as horizontal vector ops.
1366 }
1367
1368 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1369 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1372 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1376 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1382
1384 }
1385
1386 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1387 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1388 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1389 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1390 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1391 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1392 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1393 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1394
1398
1399 // FIXME: Do we need to handle scalar-to-vector here?
1400 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1401 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1402
1403 // We directly match byte blends in the backend as they match the VSELECT
1404 // condition form.
1406
1407 // SSE41 brings specific instructions for doing vector sign extend even in
1408 // cases where we don't have SRA.
1409 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1412 }
1413
1414 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1415 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1416 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1420 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1421 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1422 }
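// NOTE (illustrative, not in the original source): with these actions a
// "sextload <8 x i8> to <8 x i16>" selects directly to PMOVSXBW with a
// memory operand instead of a plain load plus a separate sign-extension.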
1423
1424 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1425 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1426 // do the pre and post work in the vector domain.
1429 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1430 // so that DAG combine doesn't try to turn it into uint_to_fp.
1433 }
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1438 }
1439
1440 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1441 for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1442 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1445 }
1446
1447 // XOP can efficiently perform BITREVERSE with VPPERM.
1448 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1450 }
1451
1452 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1453 bool HasInt256 = Subtarget.hasInt256();
1454
1455 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1456 : &X86::VR256RegClass);
1457 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1458 : &X86::VR256RegClass);
1459 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1460 : &X86::VR256RegClass);
1461 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1462 : &X86::VR256RegClass);
1463 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1464 : &X86::VR256RegClass);
1465 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1466 : &X86::VR256RegClass);
1467 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1468 : &X86::VR256RegClass);
1469
1470 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1483
1485
1489
1495 }
1496
1497 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1498 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1499
1500 setOperationAction(ISD::AND, MVT::i256, Custom);
1501 setOperationAction(ISD::OR, MVT::i256, Custom);
1502 setOperationAction(ISD::XOR, MVT::i256, Custom);
1505
1506 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1507 // even though v8i16 is a legal type.
1508 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1509 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1510 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1511 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
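// NOTE (illustrative, not in the original source): the promotion means
// "fptosi <8 x float> to <8 x i16>" is performed as a v8i32 conversion
// (VCVTTPS2DQ) whose result is then truncated back to v8i16, since no
// direct float->i16 vector conversion exists at this feature level.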
1515
1522
1534
1535 if (!Subtarget.hasAVX512())
1537
1538 // In the customized shift lowering, the legal v8i32/v4i64 cases
1539 // in AVX2 will be recognized.
1540 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1546 if (VT == MVT::v4i64) continue;
1551 }
1552
1553 // These types need custom splitting if their input is a 128-bit vector.
1558
1562 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1563 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1566
1567 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1571 }
1572
1577
1578 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1583
1584 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1585 // setcc all the way to isel and prefer SETGT in some isel patterns.
1588 }
1589
1590 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1591 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1596
1597 if (Subtarget.hasAnyFMA()) {
1598 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1599 MVT::v2f64, MVT::v4f64 }) {
1602 }
1603 }
1604
1605 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1606 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1607 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1608 }
1609
1610 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1611 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1614
1615 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1616 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1617 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1619 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1620 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1621 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1622 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1623
1624 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1625 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1626
1627 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1628 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1629 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1630 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1631 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1632
1633 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1634 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1635 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1636 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1637 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1638 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1639 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1645
1646 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1647 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1648 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1649 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1650 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1651 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1652 }
1653
1654 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1657 }
1658
1659 if (HasInt256) {
1660 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1661 // when we have a 256bit-wide blend with immediate.
1664
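// NOTE (rough sketch, not in the original source): the trick blends the low
// 16 bits of each lane with one magic exponent constant and the high 16
// bits with another, forming two exact floats, then recombines them with an
// FP subtract/add, avoiding a slow per-element unsigned conversion.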
1665 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1666 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1667 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1668 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1669 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1670 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1671 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1672 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1673 }
1674 }
1675
1676 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1677 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1678 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1680 }
1681
1682 // Extract subvector is special because the value type
1683 // (result) is 128-bit but the source is 256-bit wide.
1684 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1685 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1687 }
1688
1689 // Custom lower several nodes for 256-bit types.
1690 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1691 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1701 }
1702 setF16Action(MVT::v16f16, Expand);
1703 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1704 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1706 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1707 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1708 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1709 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1710
1711 // Only PCLMUL required as we always unroll clmul vectors.
1712 if (Subtarget.hasPCLMUL()) {
1713 for (auto VT : {MVT::v8i32, MVT::v4i64}) {
1716 }
1717 }
1718
1719 if (HasInt256) {
1721
1722 // Custom legalize 2x32 to get a little better code.
1725
1726 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1727 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1729 }
1730
1731 if (Subtarget.hasGFNI()) {
1732 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1733 setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
1734 }
1735 }
1736
1737 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1738 Subtarget.hasF16C()) {
1739 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1742 }
1743 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1746 }
1747 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1748 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1749 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1750 }
1751 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1752 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1753 }
1754
1755 // This block controls legalization of the mask vector sizes that are
1756 // available with AVX512. 512-bit vectors are in a separate block controlled
1757 // by useAVX512Regs.
1758 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1759 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1760 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1761 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1762 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1763 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1764
1768
1769 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1770 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1771 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1772 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1773 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1774 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1775 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1776 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1784
1785 // There is no byte-sized k-register load or store without AVX512DQ.
1786 if (!Subtarget.hasDQI()) {
1787 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1788 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1789 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1790 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1791
1796 }
1797
1798 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1799 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1803 }
1804
1805 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1807
1808 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1812
1819 }
1820
1821 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1823 }
1824 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1825 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1828 }
1829 }
1830
1831 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1832 // elements. 512-bits can be disabled based on prefer-vector-width and
1833 // required-vector-width function attributes.
1834 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1835 bool HasBWI = Subtarget.hasBWI();
1836
1837 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1838 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1839 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1840 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1841 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1842 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1843 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1844
1845 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1846 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1847 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1848 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1849 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1850 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1851 if (HasBWI)
1852 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1853 }
1854
1855 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1866 }
1867 setOperationAction(ISD::LRINT, MVT::v16f32,
1868 Subtarget.hasDQI() ? Legal : Custom);
1869 setOperationAction(ISD::LRINT, MVT::v8f64,
1870 Subtarget.hasDQI() ? Legal : Custom);
1871 if (Subtarget.hasDQI())
1872 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1873
1874 setOperationAction(ISD::AND, MVT::i512, Custom);
1875 setOperationAction(ISD::OR, MVT::i512, Custom);
1876 setOperationAction(ISD::XOR, MVT::i512, Custom);
1877 setOperationAction(ISD::ADD, MVT::i512, Custom);
1878 setOperationAction(ISD::SUB, MVT::i512, Custom);
1879 setOperationAction(ISD::SRL, MVT::i512, Custom);
1880 setOperationAction(ISD::SHL, MVT::i512, Custom);
1881 setOperationAction(ISD::SRA, MVT::i512, Custom);
1882 setOperationAction(ISD::FSHR, MVT::i512, Custom);
1883 setOperationAction(ISD::FSHL, MVT::i512, Custom);
1884 setOperationAction(ISD::FSHR, MVT::i256, Custom);
1885 setOperationAction(ISD::FSHL, MVT::i256, Custom);
1888
1889 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1894 }
1895
1896 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1901 }
1902
1909
1921
1922 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1923 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1924 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1925 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1926 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1927 if (HasBWI)
1928 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1929
1930 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1931 // to 512-bit rather than use the AVX2 instructions so that we can use
1932 // k-masks.
1933 if (!Subtarget.hasVLX()) {
1934 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1935 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1938 }
1939 }
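// NOTE (illustrative, not in the original source): e.g. a masked load of
// v8f32 is widened to a 512-bit masked load whose k-register mask has the
// upper eight bits clear, so the wider instruction operates safely on the
// narrower value.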
1940
1942 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1943 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1953
1954 if (HasBWI) {
1955 // Extends from v64i1 masks to 512-bit vectors.
1959 }
1960
1961 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1974
1976 }
1977
1978 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1981 }
1982
1983 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1984 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1985 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1986 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1987
1988 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1989 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1990 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1991 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1992
1993 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1994 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1995 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1996 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1997 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1998 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1999 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
2000 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
2001
2002 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
2003 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
2004
2005 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2015
2016 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
2017 // setcc all the way to isel and prefer SETGT in some isel patterns.
2020 }
2021
2022 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
2023 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
2028
2029 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2038 }
2039
2040 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2041 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2042 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2044 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2045 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2046 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2047 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2052 }
2053
2054 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2055 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2056 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2057 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2058 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2059 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2060
2061 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2065 setOperationAction(Opc, MVT::v8i64, Custom);
2066
2067 if (Subtarget.hasDQI())
2068 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2069
2070 if (Subtarget.hasCDI()) {
2071 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2072 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2074 }
2075 } // Subtarget.hasCDI()
2076
2077 if (Subtarget.hasVPOPCNTDQ()) {
2078 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2081 }
2082
2083 // Extract subvector is special because the value type
2084 // (result) is 256-bit but the source is 512-bit wide.
2085 // 128-bit was made Legal under AVX1.
2086 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2087 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2089
2090 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2091 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2101 }
2102 setF16Action(MVT::v32f16, Expand);
2107 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2108 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2109 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2110
2111 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2116 }
2117 if (HasBWI) {
2118 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2121 }
2122 } else {
2123 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2124 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2125 }
2126
2127 if (Subtarget.hasVBMI2()) {
2128 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2131 }
2132
2133 setOperationAction(ISD::ROTL, MVT::v32i16, Legal);
2134 setOperationAction(ISD::ROTR, MVT::v32i16, Legal);
2135 }
2136
2137 // Only PCLMUL required as we always unroll clmul vectors.
2138 if (Subtarget.hasPCLMUL()) {
2139 for (auto VT : {MVT::v16i32, MVT::v8i64}) {
2142 }
2143 }
2144
2145 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2146 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2148
2149 if (Subtarget.hasGFNI()) {
2150 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
2151 setOperationAction(ISD::CTTZ, MVT::v64i8, Custom);
2152 }
2153 }// useAVX512Regs
2154
2155 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2156 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2157 MVT::v4i64}) {
2160 }
2161
2162 setOperationAction(ISD::ROTL, MVT::v16i16, Legal);
2163 setOperationAction(ISD::ROTR, MVT::v16i16, Legal);
2164 setOperationAction(ISD::ROTL, MVT::v8i16, Legal);
2165 setOperationAction(ISD::ROTR, MVT::v8i16, Legal);
2166 }
2167
2168 // This block controls legalization for operations that don't have
2169 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2170 // narrower widths.
2171 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2172 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2173 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2174 MVT::v16f32, MVT::v8f64})
2176
2177 // These operations are handled on non-VLX by artificially widening in
2178 // isel patterns.
2182
2183 if (Subtarget.hasDQI()) {
2184 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2185 // v2f32 UINT_TO_FP is already custom under SSE2.
2188 "Unexpected operation action!");
2189 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2194 }
2195
2196 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2202 }
2203
2204 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2207 }
2208
2209 // Custom legalize 2x32 to get a little better code.
2212
2213 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2214 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2216
2217 if (Subtarget.hasDQI()) {
2221 setOperationAction(Opc, MVT::v2i64, Custom);
2222 setOperationAction(Opc, MVT::v4i64, Custom);
2223 }
2224 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2225 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2226 }
2227
2228 if (Subtarget.hasCDI()) {
2229 for (auto VT : {MVT::i256, MVT::i512}) {
2230 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2231 continue;
2236 }
2237 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2239 }
2240 } // Subtarget.hasCDI()
2241
2242 if (Subtarget.hasVPOPCNTDQ()) {
2243 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
2246 }
2247
2248 // We can try to convert vectors to different sizes to leverage legal
2249 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2250 // then specialize to Legal below.
2251 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2252 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2253 MVT::v16i16, MVT::v8i8})
2255
2256 // Legal vpcompress depends on various AVX512 extensions.
2257 // Legal in AVX512F
2258 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2260
2261 // Legal in AVX512F + AVX512VL
2262 if (Subtarget.hasVLX())
2263 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2264 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2266
2267 // Legal in AVX512F + AVX512VBMI2
2268 if (Subtarget.hasVBMI2())
2269 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2271
2272 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2273 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2274 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2276 }
2277
2278 // This block controls legalization of v32i1/v64i1, which are available with
2279 // AVX512BW.
2280 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2281 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2282 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2283
2284 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2295 }
2296
2297 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2299
2300 // Extends from v32i1 masks to 256-bit vectors.
2304
2305 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2306 MVT::v16f16, MVT::v8f16}) {
2307 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2308 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2309 }
2310
2311 // These operations are handled on non-VLX by artificially widening in
2312 // isel patterns.
2313 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2314
2315 if (Subtarget.hasBITALG()) {
2316 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2318 }
2319 }
2320
2321 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2322 auto setGroup = [&] (MVT VT) {
2333
2346
2348
2351
2357
2363
2367 };
2368
2369 // AVX512_FP16 scalar operations
2370 setGroup(MVT::f16);
2388
2391
2392 if (Subtarget.useAVX512Regs()) {
2393 setGroup(MVT::v32f16);
2399 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2406
2411 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2413 MVT::v32i16);
2414 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2416 MVT::v32i16);
2417 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2419 MVT::v32i16);
2420 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2422 MVT::v32i16);
2423
2427
2428 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2429 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2430
2435 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2436 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2437 }
2438
2443
2444 if (Subtarget.hasVLX()) {
2445 setGroup(MVT::v8f16);
2446 setGroup(MVT::v16f16);
2447
2458
2465
2466 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2469
2473
2474 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2475 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2476 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2477 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2478
2479 // Need to custom widen these to prevent scalarization.
2480 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2481 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2482
2487
2492 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2493 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2494 }
2495 }
2496
2497 if (!Subtarget.useSoftFloat() &&
2498 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2499 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2500 : &X86::VR128RegClass);
2501 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2502 : &X86::VR256RegClass);
2503 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2504 // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2505 // Set the operation action to Custom and do the customization later.
2508 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2509 setF16Action(VT, Expand);
2510 if (!Subtarget.hasBF16())
2516 }
2517 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2518 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2519 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2520 }
2521 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2522 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2524 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2525 }
2526
2527 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2528 Subtarget.useAVX512Regs()) {
2529 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2530 setF16Action(MVT::v32bf16, Expand);
2531 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2532 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2533 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2535 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2539 }
2540
2541 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2542 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2543 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2544 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2545 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2546 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2547 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2548 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2549 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2550 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2553 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2565 }
2566 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2569 }
2570 }
2571
2572 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2573 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2574 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2575 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2576 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2577 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2578
2579 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2580 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2581 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2582 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2583 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2584
2585 if (Subtarget.hasBWI()) {
2586 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2587 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2588 }
2589
2590 if (Subtarget.hasFP16()) {
2591 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2600 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2609 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2614 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2619 }
2620 }
2621
2622 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2623 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2624 }
2625
2626 // We want to custom lower some of our intrinsics.
2630 if (!Subtarget.is64Bit()) {
2631     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2632 }
2633
2634 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2635 // handle type legalization for these operations here.
2636 //
2637 // FIXME: We really should do custom legalization for addition and
2638 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2639 // than generic legalization for 64-bit multiplication-with-overflow, though.
2640 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2641 if (VT == MVT::i64 && !Subtarget.is64Bit())
2642 continue;
2643 // Add/Sub/Mul with overflow operations are custom lowered.
2644     setOperationAction(ISD::SADDO, VT, Custom);
2645     setOperationAction(ISD::UADDO, VT, Custom);
2646     setOperationAction(ISD::SSUBO, VT, Custom);
2647     setOperationAction(ISD::USUBO, VT, Custom);
2648     setOperationAction(ISD::SMULO, VT, Custom);
2649     setOperationAction(ISD::UMULO, VT, Custom);
2650
2651 // Support carry in as value rather than glue.
2652     setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2653     setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2654     setOperationAction(ISD::SETCCCARRY, VT, Custom);
2655     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2656     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2657 }
2658
2659 // Combine sin / cos into _sincos_stret if it is available.
2660   setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2661   setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2662
2663 if (Subtarget.isTargetWin64()) {
2664 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2665 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2666 setOperationAction(ISD::SREM, MVT::i128, Custom);
2667 setOperationAction(ISD::UREM, MVT::i128, Custom);
2676 }
2677
2678   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2679   // is. We should promote the value to 64 bits to solve this.
2680 // This is what the CRT headers do - `fmodf` is an inline header
2681 // function casting to f64 and calling `fmod`.
2682 if (Subtarget.is32Bit() &&
2683 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2684 // clang-format off
2685 for (ISD::NodeType Op :
2703        // TODO: Add ISD::STRICT_FMODF too once implemented.
2704 ISD::FMODF})
2705 if (isOperationExpandOrLibCall(Op, MVT::f32))
2706 setOperationAction(Op, MVT::f32, Promote);
2707 // clang-format on
2708
2709 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2710 // it, but it's just a wrapper around ldexp.
2711 if (Subtarget.isOSWindows()) {
2712     for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP})
2713 if (isOperationExpand(Op, MVT::f32))
2714 setOperationAction(Op, MVT::f32, Promote);
2715 }
2716
2717 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
2718 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
2719 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
2720
2721 // We have target-specific dag combine patterns for the following nodes:
2732 ISD::SHL,
2733 ISD::SRA,
2734 ISD::SRL,
2735 ISD::OR,
2736 ISD::AND,
2742 ISD::ADD,
2745 ISD::FADD,
2746 ISD::FSUB,
2747 ISD::FNEG,
2748 ISD::FMA,
2752 ISD::SUB,
2753 ISD::LOAD,
2754 ISD::LRINT,
2756 ISD::MLOAD,
2757 ISD::STORE,
2774 ISD::SETCC,
2775 ISD::MUL,
2776 ISD::XOR,
2784 ISD::ROTL,
2785 ISD::ROTR,
2786 ISD::FSHL,
2787 ISD::FSHR,
2791
2792 computeRegisterProperties(Subtarget.getRegisterInfo());
2793
2794 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2796 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2798 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2800
2801 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2802   // that needs to be benchmarked and balanced with the potential use of vector
2803 // load/store types (PR33329, PR33914).
2806
2807 // Default loop alignment, which can be overridden by -align-loops.
2809
2810 // An out-of-order CPU can speculatively execute past a predictable branch,
2811 // but a conditional move could be stalled by an expensive earlier operation.
2812 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2813 EnableExtLdPromotion = true;
2815
2817
2818 // Default to having -disable-strictnode-mutation on
2819 IsStrictFPEnabled = true;
2820}
2821
2822// This has so far only been implemented for 64-bit MachO.
2823 bool X86TargetLowering::useLoadStackGuardNode(const Module &M) const {
2824 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2825}
2826
2827 bool X86TargetLowering::useStackGuardXorFP() const {
2828 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2829 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2830}
2831
2832 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2833 const SDLoc &DL) const {
2834 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2835 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2836 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2837 return SDValue(Node, 0);
2838}
2839
2840 TargetLoweringBase::LegalizeTypeAction
2841 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2842 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2843 !Subtarget.hasBWI())
2844 return TypeSplitVector;
2845
2846 // Since v8f16 is legal, widen anything over v4f16.
2847 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2848 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2849 VT.getVectorElementType() == MVT::f16)
2850 return TypeSplitVector;
2851
2852 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2853 VT.getVectorElementType() != MVT::i1)
2854 return TypeWidenVector;
2855
2856   return TargetLoweringBase::getPreferredVectorAction(VT);
2857}
2858
2859 FastISel *X86TargetLowering::createFastISel(
2860 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2861 const LibcallLoweringInfo *libcallLowering) const {
2862 return X86::createFastISel(funcInfo, libInfo, libcallLowering);
2863}
2864
2865//===----------------------------------------------------------------------===//
2866// Other Lowering Hooks
2867//===----------------------------------------------------------------------===//
2868
2869 bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
2870 bool AssumeSingleUse, bool IgnoreAlignment) {
2871 if (!AssumeSingleUse && !Op.hasOneUse())
2872 return false;
2873 if (!ISD::isNormalLoad(Op.getNode()))
2874 return false;
2875
2876 // If this is an unaligned vector, make sure the target supports folding it.
2877 auto *Ld = cast<LoadSDNode>(Op.getNode());
2878 if (!IgnoreAlignment && !Subtarget.hasAVX() &&
2879 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2880 Ld->getAlign() < Align(16))
2881 return false;
2882
2883 // TODO: If this is a non-temporal load and the target has an instruction
2884 // for it, it should not be folded. See "useNonTemporalLoad()".
2885
2886 return true;
2887}
2888
2889 bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
2890 const X86Subtarget &Subtarget,
2891 bool AssumeSingleUse) {
2892 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2893 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2894 return false;
2895
2896   // We cannot replace a wide volatile load with a broadcast-from-memory,
2897   // because that would narrow the load, which isn't legal for volatiles.
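  // e.g. replacing a volatile 128-bit vector load with a broadcast of a single
  // f32 element would only touch 4 of the 16 bytes (illustrative sketch).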
2898 auto *Ld = cast<LoadSDNode>(Op.getNode());
2899 return !Ld->isVolatile() ||
2900 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2901}
2902
2903 bool X86::mayFoldIntoStore(SDValue Op) {
2904 if (!Op.hasOneUse())
2905 return false;
2906 // Peek through (oneuse) bitcast users
2907 SDNode *User = *Op->user_begin();
2908 while (User->getOpcode() == ISD::BITCAST) {
2909 if (!User->hasOneUse())
2910 return false;
2911 User = *User->user_begin();
2912 }
2913 return ISD::isNormalStore(User);
2914}
2915
2916 bool X86::mayFoldIntoZeroExtend(SDValue Op) {
2917 if (Op.hasOneUse()) {
2918 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2919 return (ISD::ZERO_EXTEND == Opcode);
2920 }
2921 return false;
2922}
2923
2924 // Return true if it's cheap to bitcast this to a vector type.
2925 static bool mayFoldIntoVector(SDValue Op, SelectionDAG &DAG,
2926 const X86Subtarget &Subtarget) {
2927 if (peekThroughBitcasts(Op).getValueType().isVector())
2928 return true;
2930 return true;
2931
2932 EVT VT = Op.getValueType();
2933 unsigned Opcode = Op.getOpcode();
2934 if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
2935 DAG.getTargetLoweringInfo().getOperationAction(Opcode, VT) ==
2937 // Check for larger than legal scalar integer ops that might have been
2938     // custom lowered to a vector instruction.
2939 switch (Opcode) {
2940 case ISD::BITREVERSE:
2941 return true;
2942 case ISD::SHL:
2943 case ISD::SRL:
2944 case ISD::SRA:
2945 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
2946 case ISD::AND:
2947 case ISD::OR:
2948 case ISD::XOR:
2949 case ISD::ADD:
2950 case ISD::SUB:
2951 case ISD::FSHL:
2952 case ISD::FSHR:
2953 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
2954 mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
2955 case ISD::SELECT:
2956 return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
2957 mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
2958 }
2959 }
2960 return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
2961 /*IgnoreAlignment=*/true);
2962}
2963
2964static bool isLogicOp(unsigned Opcode) {
2965 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2966 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2967}
2968
2969static bool isTargetShuffle(unsigned Opcode) {
2970 switch(Opcode) {
2971 default: return false;
2972 case X86ISD::BLENDI:
2973 case X86ISD::PSHUFB:
2974 case X86ISD::PSHUFD:
2975 case X86ISD::PSHUFHW:
2976 case X86ISD::PSHUFLW:
2977 case X86ISD::SHUFP:
2978 case X86ISD::INSERTPS:
2979 case X86ISD::EXTRQI:
2980 case X86ISD::INSERTQI:
2981 case X86ISD::VALIGN:
2982 case X86ISD::PALIGNR:
2983 case X86ISD::VSHLDQ:
2984 case X86ISD::VSRLDQ:
2985 case X86ISD::MOVLHPS:
2986 case X86ISD::MOVHLPS:
2987 case X86ISD::MOVSHDUP:
2988 case X86ISD::MOVSLDUP:
2989 case X86ISD::MOVDDUP:
2990 case X86ISD::MOVSS:
2991 case X86ISD::MOVSD:
2992 case X86ISD::MOVSH:
2993 case X86ISD::UNPCKL:
2994 case X86ISD::UNPCKH:
2995 case X86ISD::VBROADCAST:
2996 case X86ISD::VPERMILPI:
2997 case X86ISD::VPERMILPV:
2998 case X86ISD::VPERM2X128:
2999 case X86ISD::SHUF128:
3000 case X86ISD::VPERMIL2:
3001 case X86ISD::VPERMI:
3002 case X86ISD::VPPERM:
3003 case X86ISD::VPERMV:
3004 case X86ISD::VPERMV3:
3005 case X86ISD::VZEXT_MOVL:
3006 case X86ISD::COMPRESS:
3007 case X86ISD::EXPAND:
3008 return true;
3009 }
3010}
3011
3012static bool isTargetShuffleVariableMask(unsigned Opcode) {
3013 switch (Opcode) {
3014 default: return false;
3015 // Target Shuffles.
3016 case X86ISD::PSHUFB:
3017 case X86ISD::VPERMILPV:
3018 case X86ISD::VPERMIL2:
3019 case X86ISD::VPPERM:
3020 case X86ISD::VPERMV:
3021 case X86ISD::VPERMV3:
3022 return true;
3023 // 'Faux' Target Shuffles.
3024 case ISD::OR:
3025 case ISD::AND:
3026 case X86ISD::ANDNP:
3027 return true;
3028 }
3029}
3030
3031 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3032   MachineFunction &MF = DAG.getMachineFunction();
3033 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3034   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3035 int ReturnAddrIndex = FuncInfo->getRAIndex();
3036
3037 if (ReturnAddrIndex == 0) {
3038 // Set up a frame object for the return address.
3039 unsigned SlotSize = RegInfo->getSlotSize();
3040 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
3041 -(int64_t)SlotSize,
3042 false);
3043 FuncInfo->setRAIndex(ReturnAddrIndex);
3044 }
3045
3046 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3047}
3048
3049 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM,
3050 bool HasSymbolicDisplacement) {
3051 // Offset should fit into 32 bit immediate field.
3052 if (!isInt<32>(Offset))
3053 return false;
3054
3055   // If we don't have a symbolic displacement, we don't have any extra
3056   // restrictions.
3057 if (!HasSymbolicDisplacement)
3058 return true;
3059
3060 // We can fold large offsets in the large code model because we always use
3061 // 64-bit offsets.
3062 if (CM == CodeModel::Large)
3063 return true;
3064
3065   // For the kernel code model we know that all objects reside in the negative
3066   // half of the 32-bit address space. We must not accept negative offsets, as
3067   // they may fall just out of range, but pretty large positive ones are fine.
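  // e.g. with objects in [-2GiB, 0), a positive offset moves the address toward
  // zero and stays representable, while a negative one can fall below the range.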
3068 if (CM == CodeModel::Kernel)
3069 return Offset >= 0;
3070
3071   // For other non-large code models we assume that the latest small object is
3072   // 16MB before the end of the 31-bit boundary. We may also accept pretty
3073   // large negative constants knowing that all objects are in the positive half
3074   // of the address space.
3075 return Offset < 16 * 1024 * 1024;
3076}
3077
3078 /// Return true if the condition is a signed comparison operation.
3079static bool isX86CCSigned(X86::CondCode X86CC) {
3080 switch (X86CC) {
3081 default:
3082 llvm_unreachable("Invalid integer condition!");
3083 case X86::COND_E:
3084 case X86::COND_NE:
3085 case X86::COND_B:
3086 case X86::COND_A:
3087 case X86::COND_BE:
3088 case X86::COND_AE:
3089 return false;
3090 case X86::COND_G:
3091 case X86::COND_GE:
3092 case X86::COND_L:
3093 case X86::COND_LE:
3094 return true;
3095 }
3096}
3097
3098 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3099 switch (SetCCOpcode) {
3100 // clang-format off
3101 default: llvm_unreachable("Invalid integer condition!");
3102 case ISD::SETEQ: return X86::COND_E;
3103 case ISD::SETGT: return X86::COND_G;
3104 case ISD::SETGE: return X86::COND_GE;
3105 case ISD::SETLT: return X86::COND_L;
3106 case ISD::SETLE: return X86::COND_LE;
3107 case ISD::SETNE: return X86::COND_NE;
3108 case ISD::SETULT: return X86::COND_B;
3109 case ISD::SETUGT: return X86::COND_A;
3110 case ISD::SETULE: return X86::COND_BE;
3111 case ISD::SETUGE: return X86::COND_AE;
3112 // clang-format on
3113 }
3114}
3115
3116/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3117/// condition code, returning the condition code and the LHS/RHS of the
3118/// comparison to make.
3119 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3120 bool isFP, SDValue &LHS, SDValue &RHS,
3121 SelectionDAG &DAG) {
3122 if (!isFP) {
3123     if (auto *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3124 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
3125 // X > -1 -> X == 0, jump !sign.
3126 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3127 return X86::COND_NS;
3128 }
3129 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3130 // X < 0 -> X == 0, jump on sign.
3131 return X86::COND_S;
3132 }
3133 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3134 // X >= 0 -> X == 0, jump on !sign.
3135 return X86::COND_NS;
3136 }
3137 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3138 // X < 1 -> X <= 0
3139 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3140 return X86::COND_LE;
3141 }
3142 }
3143
3144 return TranslateIntegerX86CC(SetCCOpcode);
3145 }
3146
3147 // First determine if it is required or is profitable to flip the operands.
3148
3149 // If LHS is a foldable load, but RHS is not, flip the condition.
3150 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3151 !ISD::isNON_EXTLoad(RHS.getNode())) {
3152 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3153 std::swap(LHS, RHS);
3154 }
3155
3156 switch (SetCCOpcode) {
3157 default: break;
3158 case ISD::SETOLT:
3159 case ISD::SETOLE:
3160 case ISD::SETUGT:
3161 case ISD::SETUGE:
3162 std::swap(LHS, RHS);
3163 break;
3164 }
3165
3166 // On a floating point condition, the flags are set as follows:
3167 // ZF PF CF op
3168 // 0 | 0 | 0 | X > Y
3169 // 0 | 0 | 1 | X < Y
3170 // 1 | 0 | 0 | X == Y
3171 // 1 | 1 | 1 | unordered
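  // e.g. X > Y uses COND_A (CF == 0 && ZF == 0), which is also false for
  // unordered inputs, so SETOGT needs no extra parity check.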
3172 switch (SetCCOpcode) {
3173 // clang-format off
3174 default: llvm_unreachable("Condcode should be pre-legalized away");
3175 case ISD::SETUEQ:
3176 case ISD::SETEQ: return X86::COND_E;
3177 case ISD::SETOLT: // flipped
3178 case ISD::SETOGT:
3179 case ISD::SETGT: return X86::COND_A;
3180 case ISD::SETOLE: // flipped
3181 case ISD::SETOGE:
3182 case ISD::SETGE: return X86::COND_AE;
3183 case ISD::SETUGT: // flipped
3184 case ISD::SETULT:
3185 case ISD::SETLT: return X86::COND_B;
3186 case ISD::SETUGE: // flipped
3187 case ISD::SETULE:
3188 case ISD::SETLE: return X86::COND_BE;
3189 case ISD::SETONE:
3190 case ISD::SETNE: return X86::COND_NE;
3191 case ISD::SETUO: return X86::COND_P;
3192 case ISD::SETO: return X86::COND_NP;
3193 case ISD::SETOEQ:
3194 case ISD::SETUNE: return X86::COND_INVALID;
3195 // clang-format on
3196 }
3197}
3198
3199/// Is there a floating point cmov for the specific X86 condition code?
3200/// Current x86 isa includes the following FP cmov instructions:
3201 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3202static bool hasFPCMov(unsigned X86CC) {
3203 switch (X86CC) {
3204 default:
3205 return false;
3206 case X86::COND_B:
3207 case X86::COND_BE:
3208 case X86::COND_E:
3209 case X86::COND_P:
3210 case X86::COND_A:
3211 case X86::COND_AE:
3212 case X86::COND_NE:
3213 case X86::COND_NP:
3214 return true;
3215 }
3216}
3217
3218static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3219 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3220 VT.is512BitVector();
3221}
3222
3225 MachineFunction &MF, unsigned Intrinsic) const {
3226 IntrinsicInfo Info;
3228 Info.offset = 0;
3229
3230   const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic);
3231 if (!IntrData) {
3232 switch (Intrinsic) {
3233 case Intrinsic::x86_aesenc128kl:
3234 case Intrinsic::x86_aesdec128kl:
3235 Info.opc = ISD::INTRINSIC_W_CHAIN;
3236 Info.ptrVal = I.getArgOperand(1);
3237 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3238 Info.align = Align(1);
3239 Info.flags |= MachineMemOperand::MOLoad;
3240 Infos.push_back(Info);
3241 return;
3242 case Intrinsic::x86_aesenc256kl:
3243 case Intrinsic::x86_aesdec256kl:
3244 Info.opc = ISD::INTRINSIC_W_CHAIN;
3245 Info.ptrVal = I.getArgOperand(1);
3246 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3247 Info.align = Align(1);
3248 Info.flags |= MachineMemOperand::MOLoad;
3249 Infos.push_back(Info);
3250 return;
3251 case Intrinsic::x86_aesencwide128kl:
3252 case Intrinsic::x86_aesdecwide128kl:
3253 Info.opc = ISD::INTRINSIC_W_CHAIN;
3254 Info.ptrVal = I.getArgOperand(0);
3255 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3256 Info.align = Align(1);
3257 Info.flags |= MachineMemOperand::MOLoad;
3258 Infos.push_back(Info);
3259 return;
3260 case Intrinsic::x86_aesencwide256kl:
3261 case Intrinsic::x86_aesdecwide256kl:
3262 Info.opc = ISD::INTRINSIC_W_CHAIN;
3263 Info.ptrVal = I.getArgOperand(0);
3264 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3265 Info.align = Align(1);
3266 Info.flags |= MachineMemOperand::MOLoad;
3267 Infos.push_back(Info);
3268 return;
3269 case Intrinsic::x86_cmpccxadd32:
3270 case Intrinsic::x86_cmpccxadd64:
3271 case Intrinsic::x86_atomic_bts:
3272 case Intrinsic::x86_atomic_btc:
3273 case Intrinsic::x86_atomic_btr: {
3274 Info.opc = ISD::INTRINSIC_W_CHAIN;
3275 Info.ptrVal = I.getArgOperand(0);
3276 unsigned Size = I.getType()->getScalarSizeInBits();
3277 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3278 Info.align = Align(Size);
3281 Infos.push_back(Info);
3282 return;
3283 }
3284 case Intrinsic::x86_atomic_bts_rm:
3285 case Intrinsic::x86_atomic_btc_rm:
3286 case Intrinsic::x86_atomic_btr_rm: {
3287 Info.opc = ISD::INTRINSIC_W_CHAIN;
3288 Info.ptrVal = I.getArgOperand(0);
3289 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3290 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3291 Info.align = Align(Size);
3294 Infos.push_back(Info);
3295 return;
3296 }
3297 case Intrinsic::x86_aadd32:
3298 case Intrinsic::x86_aadd64:
3299 case Intrinsic::x86_aand32:
3300 case Intrinsic::x86_aand64:
3301 case Intrinsic::x86_aor32:
3302 case Intrinsic::x86_aor64:
3303 case Intrinsic::x86_axor32:
3304 case Intrinsic::x86_axor64:
3305 case Intrinsic::x86_atomic_add_cc:
3306 case Intrinsic::x86_atomic_sub_cc:
3307 case Intrinsic::x86_atomic_or_cc:
3308 case Intrinsic::x86_atomic_and_cc:
3309 case Intrinsic::x86_atomic_xor_cc: {
3310 Info.opc = ISD::INTRINSIC_W_CHAIN;
3311 Info.ptrVal = I.getArgOperand(0);
3312 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3313 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3314 Info.align = Align(Size);
3317 Infos.push_back(Info);
3318 return;
3319 }
3320 }
3321 return;
3322 }
3323
3324 switch (IntrData->Type) {
3325   case TRUNCATE_TO_MEM_VI8:
3326   case TRUNCATE_TO_MEM_VI16:
3327 case TRUNCATE_TO_MEM_VI32: {
3328 Info.opc = ISD::INTRINSIC_VOID;
3329 Info.ptrVal = I.getArgOperand(0);
3330 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3331     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
3332 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3333 ScalarVT = MVT::i8;
3334 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3335 ScalarVT = MVT::i16;
3336 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3337 ScalarVT = MVT::i32;
3338
3339 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3340 Info.align = Align(1);
3341 Info.flags |= MachineMemOperand::MOStore;
3342 Infos.push_back(Info);
3343 return;
3344 }
3345 case GATHER:
3346 case GATHER_AVX2: {
3347 Info.opc = ISD::INTRINSIC_W_CHAIN;
3348 Info.ptrVal = nullptr;
3349 MVT DataVT = MVT::getVT(I.getType());
3350 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3351 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3352 IndexVT.getVectorNumElements());
3353 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3354 Info.align = Align(1);
3355 Info.flags |= MachineMemOperand::MOLoad;
3356 Infos.push_back(Info);
3357 return;
3358 }
3359 case SCATTER: {
3360 Info.opc = ISD::INTRINSIC_VOID;
3361 Info.ptrVal = nullptr;
3362 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3363 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3364 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3365 IndexVT.getVectorNumElements());
3366 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3367 Info.align = Align(1);
3368 Info.flags |= MachineMemOperand::MOStore;
3369 Infos.push_back(Info);
3370 return;
3371 }
3372 default:
3373 return;
3374 }
3375}
3376
3377/// Returns true if the target can instruction select the
3378/// specified FP immediate natively. If false, the legalizer will
3379/// materialize the FP immediate as a load from a constant pool.
3380 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
3381 bool ForCodeSize) const {
3382 for (const APFloat &FPImm : LegalFPImmediates)
3383 if (Imm.bitwiseIsEqual(FPImm))
3384 return true;
3385 return false;
3386}
3387
3388 bool X86TargetLowering::shouldReduceLoadWidth(
3389 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3390 std::optional<unsigned> ByteOffset) const {
3391 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3392
3393 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3394 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3395 N = *N->user_begin();
3396 return N;
3397 };
3398
3399 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3400   // relocations target a movq or addq instruction: don't let the load shrink.
3401 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3402 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3403 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3404 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3405
3406   // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3407   // those uses extract directly into a store, then each extract + store can
3408   // be store-folded; or (4) some use is a legal full-width instruction that
3409   // consumes the whole load. Then it's probably not worth splitting the load.
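  // e.g. a 256-bit load whose two halves are each extracted and stored can be
  // lowered as two 128-bit extract-stores, so the wide load costs nothing extra.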
3410 EVT VT = Load->getValueType(0);
3411 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3412 !SDValue(Load, 0).hasOneUse()) {
3413 bool FullWidthUse = false;
3414 bool AllExtractStores = true;
3415 for (SDUse &Use : Load->uses()) {
3416 // Skip uses of the chain value. Result 0 of the node is the load value.
3417 if (Use.getResNo() != 0)
3418 continue;
3419
3420 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3421
3422 // If this use is an extract + store, it's probably not worth splitting.
3423 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3424 all_of(User->uses(), [&](const SDUse &U) {
3425 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3426 return Inner->getOpcode() == ISD::STORE;
3427 }))
3428 continue;
3429
3430 AllExtractStores = false;
3431
3432     // If any use is a full-width legal/target binop, then assume it's legal
3433 // and won't split.
3434 if (isBinOp(User->getOpcode()) &&
3435 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3436 User->getOpcode() > ISD::BUILTIN_OP_END))
3437 FullWidthUse = true;
3438 }
3439
3440 if (AllExtractStores)
3441 return false;
3442
3443   // If we have a user that uses the full vector width, then this load is only
3444   // worth splitting if the offset isn't 0 (to avoid an EXTRACT_SUBVECTOR) or
3445   // we're loading a scalar integer.
3446 if (FullWidthUse)
3447 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3448 }
3449
3450 return true;
3451}
3452
3453/// Returns true if it is beneficial to convert a load of a constant
3454/// to just the constant itself.
3455 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3456 Type *Ty) const {
3457 assert(Ty->isIntegerTy());
3458
3459 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3460 if (BitSize == 0 || BitSize > 64)
3461 return false;
3462 return true;
3463}
3464
3465 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
3466 // If we are using XMM registers in the ABI and the condition of the select is
3467 // a floating-point compare and we have blendv or conditional move, then it is
3468 // cheaper to select instead of doing a cross-register move and creating a
3469 // load that depends on the compare result.
3470 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3471 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3472}
3473
3474 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
3475 // TODO: It might be a win to ease or lift this restriction, but the generic
3476 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3477 if (VT.isVector() && Subtarget.hasAVX512())
3478 return false;
3479
3480 return true;
3481}
3482
3483 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
3484 SDValue C) const {
3485 // TODO: We handle scalars using custom code, but generic combining could make
3486 // that unnecessary.
3487 APInt MulC;
3488 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3489 return false;
3490
3491 if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
3492 // Check whether a vXi8 multiply can be decomposed into two shifts
3493 // (decomposing 2^m ± 2^n as 2^(a+b) ± 2^b). Similar to
3494 // DAGCombiner::visitMUL, consider the constant `2` decomposable as
3495 // (2^0 + 1).
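    // e.g. a v16i8 multiply by 6 strips one trailing zero to leave 3 = 2 + 1,
    // so it can lower as (x << 2) + (x << 1).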
3496 APInt ShiftedMulC = MulC.abs();
3497 unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
3498 ShiftedMulC.lshrInPlace(TZeros);
3499 if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
3500 return true;
3501 }
3502
3503   // Find the type this will be legalized to. Otherwise we might prematurely
3504 // convert this to shl+add/sub and then still have to type legalize those ops.
3505 // Another choice would be to defer the decision for illegal types until
3506 // after type legalization. But constant splat vectors of i64 can't make it
3507 // through type legalization on 32-bit targets so we would need to special
3508 // case vXi64.
3509 while (getTypeAction(Context, VT) != TypeLegal)
3510 VT = getTypeToTransformTo(Context, VT);
3511
3512 // If vector multiply is legal, assume that's faster than shl + add/sub.
3513 // Multiply is a complex op with higher latency and lower throughput in
3514 // most implementations, sub-vXi32 vector multiplies are always fast,
3515   // vXi32 mustn't have a slow PMULLD implementation, and anything larger (vXi64)
3516 // is always going to be slow.
3517 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3518 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3519 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3520 return false;
3521
3522 // shl+add, shl+sub, shl+add+neg
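  // e.g. 9*x -> (x << 3) + x, 7*x -> (x << 3) - x, -9*x -> -((x << 3) + x).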
3523 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3524 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3525}
3526
3527 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
3528 unsigned Index) const {
3529   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3530 return false;
3531
3532 // Mask vectors support all subregister combinations and operations that
3533 // extract half of vector.
3534 if (ResVT.getVectorElementType() == MVT::i1)
3535 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3536 (Index == ResVT.getVectorNumElements()));
3537
3538 return (Index % ResVT.getVectorNumElements()) == 0;
3539}
3540
3541 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
3542 unsigned Opc = VecOp.getOpcode();
3543
3544 // Assume target opcodes can't be scalarized.
3545 // TODO - do we have any exceptions?
3546 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3547 return false;
3548
3549 // If the vector op is not supported, try to convert to scalar.
3550 EVT VecVT = VecOp.getValueType();
3551   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3552 return true;
3553
3554 // If the vector op is supported, but the scalar op is not, the transform may
3555 // not be worthwhile.
3556 EVT ScalarVT = VecVT.getScalarType();
3557 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3558}
3559
3560 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
3561 bool) const {
3562 // TODO: Allow vectors?
3563 if (VT.isVector())
3564 return false;
3565 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3566}
3567
3568 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
3569 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3570 // i32/i64 or can rely on BSF passthrough value.
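  // (TZCNT, unlike BSF, is defined for a zero input: it returns the operand
  // width, so no zero-check branch is needed.)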
3571 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3572 Subtarget.hasBitScanPassThrough() ||
3573 (!Ty->isVectorTy() &&
3574 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3575}
3576
3577 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
3578 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3579 // passthrough value.
3580 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3581 Subtarget.hasBitScanPassThrough();
3582}
3583
3584 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
3585 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3586 // expensive than a straight movsd. On the other hand, it's important to
3587 // shrink long double fp constant since fldt is very slow.
3588 return !Subtarget.hasSSE2() || VT == MVT::f80;
3589}
3590
3591 bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
3592 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3593 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3594}
3595
3596 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
3597 const SelectionDAG &DAG,
3598 const MachineMemOperand &MMO) const {
3599 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3600 BitcastVT.getVectorElementType() == MVT::i1)
3601 return false;
3602
3603 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3604 return false;
3605
3606 if (LoadVT.isVector() && BitcastVT.isVector()) {
3607 // If both types are legal vectors, it's always ok to convert them.
3608 // Don't convert to an illegal type.
3609 if (isTypeLegal(LoadVT))
3610 return isTypeLegal(BitcastVT);
3611 }
3612
3613 // If we have a large vector type (even if illegal), don't bitcast to large
3614 // (illegal) scalar types. Better to load fewer vectors and extract.
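  // e.g. prefer not to bitcast a v4i64 load into an i256 load; loading two
  // 128-bit pieces and extracting is cheaper.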
3615 if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
3616 BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
3617 return false;
3618
3619 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3620}
3621
3622 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
3623 const MachineFunction &MF) const {
3624   // Do not merge to float value size (128 bits) if no implicit
3625   // float attribute is set.
3626 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3627
3628 if (NoFloat) {
3629 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3630 return (MemVT.getSizeInBits() <= MaxIntSize);
3631 }
3632 // Make sure we don't merge greater than our preferred vector
3633 // width.
3634 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3635 return false;
3636
3637 return true;
3638}
3639
3640 bool X86TargetLowering::isCtlzFast() const {
3641 return Subtarget.hasFastLZCNT();
3642}
3643
3644 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
3645 const Instruction &AndI) const {
3646 return true;
3647}
3648
3649 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
3650 // Scalar integer and-not compares are efficiently handled by NOT+TEST (or
3651 // BMI ANDN).
3652 return Y.getValueType().isScalarInteger();
3653}
3654
3655 bool X86TargetLowering::hasAndNot(SDValue Y) const {
3656 EVT VT = Y.getValueType();
3657
3658 if (!VT.isVector()) {
3659 if (!Subtarget.hasBMI())
3660 return false;
3661
3662 // There are only 32-bit and 64-bit forms for 'andn'.
3663 if (VT != MVT::i32 && VT != MVT::i64)
3664 return false;
3665 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3666 }
3667
3668 // Vector.
3669 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3670 return false;
3671
3672 if (VT == MVT::v4i32)
3673 return true;
3674
3675 return Subtarget.hasSSE2();
3676}
3677
3678 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
3679 return X.getValueType().isScalarInteger(); // 'bt'
3680}
3681
3682 bool X86TargetLowering::
3683     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3684         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
3685 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3686 SelectionDAG &DAG) const {
3687 // Does baseline recommend not to perform the fold by default?
3688   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3689 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3690 return false;
3691 // For scalars this transform is always beneficial.
3692 if (X.getValueType().isScalarInteger())
3693 return true;
3694 // If all the shift amounts are identical, then transform is beneficial even
3695 // with rudimentary SSE2 shifts.
3696 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3697 return true;
3698   // If we have AVX2 with its powerful shift operations, then it's also good.
3699 if (Subtarget.hasAVX2())
3700 return true;
3701 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3702 return NewShiftOpcode == ISD::SHL;
3703}
3704
3705 unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
3706 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3707 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3708 if (!VT.isInteger())
3709 return ShiftOpc;
3710
3711 bool PreferRotate = false;
3712 if (VT.isVector()) {
3713     // For vectors, if we have rotate instruction support, then it's definitely
3714     // best. Otherwise it's not clear which is best, so don't make changes.
3715 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3716 VT.getScalarType() == MVT::i64);
3717 } else {
3718     // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3719     // rotate unless we have a zext mask+shr.
3720 PreferRotate = Subtarget.hasBMI2();
3721 if (!PreferRotate) {
3722 unsigned MaskBits =
3723 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3724 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3725 }
3726 }
3727
3728 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3729 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3730
3731 if (PreferRotate && MayTransformRotate)
3732 return ISD::ROTL;
3733
3734 // If vector we don't really get much benefit swapping around constants.
3735 // Maybe we could check if the DAG has the flipped node already in the
3736 // future.
3737 if (VT.isVector())
3738 return ShiftOpc;
3739
3740     // See if it's beneficial to swap the shift type.
3741 if (ShiftOpc == ISD::SHL) {
3742 // If the current setup has imm64 mask, then inverse will have
3743 // at least imm32 mask (or be zext i32 -> i64).
3744 if (VT == MVT::i64)
3745 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3746 : ShiftOpc;
3747
3748       // We can only benefit if the mask requires at least 7 bits. We don't
3749       // want to replace shl of 1, 2 or 3, as those can be implemented with
3750       // lea/add.
3751 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3752 }
3753
3754 if (VT == MVT::i64)
3755 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3756 // extremely efficient.
3757 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3758
3759 // Keep small shifts as shl so we can generate add/lea.
3760 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3761 }
3762
3763   // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3764   // (PreferRotate will be set in the latter case).
3765 if (PreferRotate || !MayTransformRotate || VT.isVector())
3766 return ShiftOpc;
3767
3768 // Non-vector type and we have a zext mask with SRL.
3769 return ISD::SRL;
3770}
3771
3772 TargetLoweringBase::CondMergingParams
3773 X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
3774 const Value *Lhs,
3775 const Value *Rhs) const {
3776 using namespace llvm::PatternMatch;
3777 int BaseCost = BrMergingBaseCostThresh.getValue();
3778 // With CCMP, branches can be merged in a more efficient way.
3779 if (BaseCost >= 0 && Subtarget.hasCCMP())
3780 BaseCost += BrMergingCcmpBias;
3781 // a == b && a == c is a fast pattern on x86.
3782 if (BaseCost >= 0 && Opc == Instruction::And &&
3785 BaseCost += 1;
3786
3787 // For OR conditions with EQ comparisons, prefer splitting into branches
3788 // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
3789 // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
3790 // comparisons (SLT, SGT) that can be optimized.
3791 if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
3794 return {-1, -1, -1};
3795
3796 return {BaseCost, BrMergingLikelyBias.getValue(),
3797 BrMergingUnlikelyBias.getValue()};
3798}
3799
3800 bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
3801 return N->getOpcode() != ISD::FP_EXTEND;
3802}
3803
3804 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
3805 const SDNode *N) const {
3806 assert(((N->getOpcode() == ISD::SHL &&
3807 N->getOperand(0).getOpcode() == ISD::SRL) ||
3808 (N->getOpcode() == ISD::SRL &&
3809 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3810 "Expected shift-shift mask");
3811 // TODO: Should we always create i64 masks? Or only folded immediates?
3812 EVT VT = N->getValueType(0);
3813 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3814 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3815 // Only fold if the shift values are equal - so it folds to AND.
3816 // TODO - we should fold if either is a non-uniform vector but we don't do
3817 // the fold for non-splats yet.
3818 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3819 }
3820   return false;
3821}
3822
3823 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
3824 EVT VT = Y.getValueType();
3825
3826 // For vectors, we don't have a preference, but we probably want a mask.
3827 if (VT.isVector())
3828 return false;
3829
3830 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3831 return VT.getScalarSizeInBits() <= MaxWidth;
3832}
3833
3834 TargetLowering::ShiftLegalizationStrategy
3835 X86TargetLowering::preferredShiftLegalizationStrategy(
3836 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3837   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
3838 !Subtarget.isOSWindows())
3839     return ShiftLegalizationStrategy::LowerToLibcall;
3840   return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
3841 ExpansionFactor);
3842}
3843
3844 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
3845 // Any legal vector type can be splatted more efficiently than
3846 // loading/spilling from memory.
3847 return isTypeLegal(VT);
3848}
3849
3850 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
3851 MVT VT = MVT::getIntegerVT(NumBits);
3852 if (isTypeLegal(VT))
3853 return VT;
3854
3855 // PMOVMSKB can handle this.
3856 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3857 return MVT::v16i8;
3858
3859 // VPMOVMSKB can handle this.
3860 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3861 return MVT::v32i8;
3862
3863 // TODO: Allow 64-bit type for 32-bit target.
3864 // TODO: 512-bit types should be allowed, but make sure that those
3865 // cases are handled in combineVectorSizedSetCCEquality().
3866
3867   return MVT::INVALID_SIMPLE_VALUE_TYPE;
3868}
3869
3870/// Val is the undef sentinel value or equal to the specified value.
3871static bool isUndefOrEqual(int Val, int CmpVal) {
3872 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3873}
3874
3875/// Return true if every element in Mask is the undef sentinel value or equal to
3876/// the specified value.
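/// e.g. isUndefOrEqual({-1, 3, -1, 3}, 3) returns true (-1 is SM_SentinelUndef).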
3877static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3878 return llvm::all_of(Mask, [CmpVal](int M) {
3879 return (M == SM_SentinelUndef) || (M == CmpVal);
3880 });
3881}
3882
3883/// Return true if every element in Mask, beginning from position Pos and ending
3884/// in Pos+Size is the undef sentinel value or equal to the specified value.
3885static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3886 unsigned Size) {
3887 return llvm::all_of(Mask.slice(Pos, Size),
3888 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3889}
3890
3891/// Val is either the undef or zero sentinel value.
3892static bool isUndefOrZero(int Val) {
3893 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3894}
3895
3896/// Return true if every element in Mask, beginning from position Pos and ending
3897/// in Pos+Size is the undef sentinel value.
3898static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3899 return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef));
3900}
3901
3902/// Return true if the mask creates a vector whose lower half is undefined.
3904 unsigned NumElts = Mask.size();
3905 return isUndefInRange(Mask, 0, NumElts / 2);
3906}
3907
3908/// Return true if the mask creates a vector whose upper half is undefined.
3910 unsigned NumElts = Mask.size();
3911 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3912}
3913
3914 /// Return true if Val falls within the specified range [Low, Hi).
3915static bool isInRange(int Val, int Low, int Hi) {
3916 return (Val >= Low && Val < Hi);
3917}
3918
3919/// Return true if the value of any element in Mask falls within the specified
3920 /// range [Low, Hi).
3921static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3922 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3923}
3924
3925/// Return true if the value of any element in Mask is the zero sentinel value.
3926static bool isAnyZero(ArrayRef<int> Mask) {
3927 return llvm::any_of(Mask, equal_to(SM_SentinelZero));
3928}
3929
3930/// Return true if Val is undef or if its value falls within the
3931 /// specified range [Low, Hi).
3932static bool isUndefOrInRange(int Val, int Low, int Hi) {
3933 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3934}
3935
3936/// Return true if every element in Mask is undef or if its value
3937 /// falls within the specified range [Low, Hi).
3938static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3939 return llvm::all_of(
3940 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3941}
3942
3943/// Return true if Val is undef, zero or if its value falls within the
3944 /// specified range [Low, Hi).
3945static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3946 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3947}
3948
3949/// Return true if every element in Mask is undef, zero or if its value
3950 /// falls within the specified range [Low, Hi).
3951static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3952 return llvm::all_of(
3953 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3954}
3955
3956 /// Return true if every element in Mask is an in-place blend/select mask or is
3957 /// undef.
3958[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3959 unsigned NumElts = Mask.size();
3960 for (auto [I, M] : enumerate(Mask))
3961 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3962 return false;
3963 return true;
3964}
3965
3966/// Return true if every element in Mask, beginning
3967/// from position Pos and ending in Pos + Size, falls within the specified
3968/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
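/// e.g. Mask = <4, -1, 6, 7> matches Pos = 0, Size = 4, Low = 4, Step = 1.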
3969static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3970 unsigned Size, int Low, int Step = 1) {
3971 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3972 if (!isUndefOrEqual(Mask[i], Low))
3973 return false;
3974 return true;
3975}
3976
3977/// Return true if every element in Mask, beginning
3978/// from position Pos and ending in Pos+Size, falls within the specified
3979 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), is undef or zero.
3981 unsigned Size, int Low,
3982 int Step = 1) {
3983 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3984 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3985 return false;
3986 return true;
3987}
3988
3989/// Return true if every element in Mask, beginning
3990/// from position Pos and ending in Pos+Size is undef or is zero.
3991static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3992 unsigned Size) {
3993 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3994}
3995
3996/// Return true if every element of a single input is referenced by the shuffle
3997/// mask. i.e. it just permutes them all.
3999 unsigned NumElts = Mask.size();
4000 APInt DemandedElts = APInt::getZero(NumElts);
4001 for (int M : Mask)
4002 if (isInRange(M, 0, NumElts))
4003 DemandedElts.setBit(M);
4004 return DemandedElts.isAllOnes();
4005}
4006
4007/// Helper function to test whether a shuffle mask could be
4008/// simplified by widening the elements being shuffled.
4009///
4010/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4011/// leaves it in an unspecified state.
4012///
4013/// NOTE: This must handle normal vector shuffle masks and *target* vector
4014/// shuffle masks. The latter have the special property of a '-2' representing
4015/// a zero-ed lane of a vector.
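/// e.g. <0, 1, 6, 7> widens to <0, 3>, <-1, 1, -2, -2> widens to <0, -2>, and
/// <0, 2, 1, 3> cannot be widened.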
4016 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4017 SmallVectorImpl<int> &WidenedMask) {
4018 WidenedMask.assign(Mask.size() / 2, 0);
4019 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4020 int M0 = Mask[i];
4021 int M1 = Mask[i + 1];
4022
4023     // If both elements are undef, it's trivial.
4024 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4025 WidenedMask[i / 2] = SM_SentinelUndef;
4026 continue;
4027 }
4028
4029 // Check for an undef mask and a mask value properly aligned to fit with
4030 // a pair of values. If we find such a case, use the non-undef mask's value.
4031 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4032 WidenedMask[i / 2] = M1 / 2;
4033 continue;
4034 }
4035 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4036 WidenedMask[i / 2] = M0 / 2;
4037 continue;
4038 }
4039
4040 // When zeroing, we need to spread the zeroing across both lanes to widen.
4041 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4042 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4044 WidenedMask[i / 2] = SM_SentinelZero;
4045 continue;
4046 }
4047 return false;
4048 }
4049
4050 // Finally check if the two mask values are adjacent and aligned with
4051 // a pair.
4052 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4053 WidenedMask[i / 2] = M0 / 2;
4054 continue;
4055 }
4056
4057 // Otherwise we can't safely widen the elements used in this shuffle.
4058 return false;
4059 }
4060 assert(WidenedMask.size() == Mask.size() / 2 &&
4061 "Incorrect size of mask after widening the elements!");
4062
4063 return true;
4064}
4065
4066 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4067 const APInt &Zeroable,
4068 bool V2IsZero,
4069 SmallVectorImpl<int> &WidenedMask) {
4070 // Create an alternative mask with info about zeroable elements.
4071 // Here we do not set undef elements as zeroable.
4072 SmallVector<int, 64> ZeroableMask(Mask);
4073 if (V2IsZero) {
4074 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
4075 for (int i = 0, Size = Mask.size(); i != Size; ++i)
4076 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
4077 ZeroableMask[i] = SM_SentinelZero;
4078 }
4079 return canWidenShuffleElements(ZeroableMask, WidenedMask);
4080}
4081
4082 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
4083 SmallVector<int, 32> WidenedMask;
4084 return canWidenShuffleElements(Mask, WidenedMask);
4085}
4086
4087// Attempt to narrow/widen shuffle mask until it matches the target number of
4088// elements.
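// e.g. <1, 0> scales up to <2, 3, 0, 1> for NumDstElts = 4, while
// <0, 1, 2, 3> scales down to <0, 1> for NumDstElts = 2.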
4089static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
4090 SmallVectorImpl<int> &ScaledMask) {
4091 unsigned NumSrcElts = Mask.size();
4092 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4093 "Illegal shuffle scale factor");
4094
4095 // Narrowing is guaranteed to work.
4096 if (NumDstElts >= NumSrcElts) {
4097 int Scale = NumDstElts / NumSrcElts;
4098 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
4099 return true;
4100 }
4101
4102 // We have to repeat the widening until we reach the target size, but we can
4103 // split out the first widening as it sets up ScaledMask for us.
4104 if (canWidenShuffleElements(Mask, ScaledMask)) {
4105 while (ScaledMask.size() > NumDstElts) {
4106 SmallVector<int, 16> WidenedMask;
4107 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
4108 return false;
4109 ScaledMask = std::move(WidenedMask);
4110 }
4111 return true;
4112 }
4113
4114 return false;
4115}
4116
4117static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
4118 SmallVector<int, 32> ScaledMask;
4119 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
4120}
4121
4122// Helper to grow the shuffle mask for a larger value type.
4123 // NOTE: This is different from scaleShuffleElements, which keeps the total size.
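// e.g. doubling the width of a two-element mask <1, 3> gives <1, 5, -1, -1>:
// each index stays inside its (now twice as large) source operand.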
4124static void growShuffleMask(ArrayRef<int> SrcMask,
4125 SmallVectorImpl<int> &DstMask,
4126 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
4127   assert(DstMask.empty() && "Expected an empty shuffle mask");
4128 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
4129 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4130 unsigned NumSrcElts = SrcMask.size();
4131 DstMask.assign(SrcMask.begin(), SrcMask.end());
4132 for (int &M : DstMask) {
4133 if (M < 0)
4134 continue;
4135 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4136 }
4137 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
4138}
4139
4140/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4142 return isNullConstant(Elt) || isNullFPConstant(Elt);
4143}
4144
4145// Build a vector of constants.
4146// Use an UNDEF node if MaskElt == -1.
4147// Split 64-bit constants in the 32-bit mode.
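// e.g. on a 32-bit target a v2i64 constant is emitted as a v4i32 build vector
// (each 64-bit value in two 32-bit halves) and bitcast back to v2i64.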
4149 const SDLoc &dl, bool IsMask = false) {
4150
4152 bool Split = false;
4153
4154 MVT ConstVecVT = VT;
4155 unsigned NumElts = VT.getVectorNumElements();
4156 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4157 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4158 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4159 Split = true;
4160 }
4161
4162 MVT EltVT = ConstVecVT.getVectorElementType();
4163 for (unsigned i = 0; i < NumElts; ++i) {
4164 bool IsUndef = Values[i] < 0 && IsMask;
4165 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4166 DAG.getConstant(Values[i], dl, EltVT);
4167 Ops.push_back(OpNode);
4168 if (Split)
4169 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4170 DAG.getConstant(0, dl, EltVT));
4171 }
4172 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4173 if (Split)
4174 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4175 return ConstsNode;
4176}
4177
4178static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4179 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4180 assert(Bits.size() == Undefs.getBitWidth() &&
4181 "Unequal constant and undef arrays");
4183 bool Split = false;
4184
4185 MVT ConstVecVT = VT;
4186 unsigned NumElts = VT.getVectorNumElements();
4187 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4188 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4189 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4190 Split = true;
4191 }
4192
4193 MVT EltVT = ConstVecVT.getVectorElementType();
4194 MVT EltIntVT = EltVT.changeTypeToInteger();
4195 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4196 if (Undefs[i]) {
4197 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4198 continue;
4199 }
4200 const APInt &V = Bits[i];
4201 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4202 if (Split) {
4203 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4204 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4205 } else {
4206 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4207 }
4208 }
4209
4210 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4211 return DAG.getBitcast(VT, ConstsNode);
4212}
4213
4215 SelectionDAG &DAG, const SDLoc &dl) {
4216 APInt Undefs = APInt::getZero(Bits.size());
4217 return getConstVector(Bits, Undefs, VT, DAG, dl);
4218}
4219
4220/// Returns a vector of specified type with all zero elements.
4221static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4222 SelectionDAG &DAG, const SDLoc &dl) {
4223 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4224 VT.getVectorElementType() == MVT::i1) &&
4225 "Unexpected vector type");
4226
4227 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4228 // type. This ensures they get CSE'd. But if the integer type is not
4229 // available, use a floating-point +0.0 instead.
4230 SDValue Vec;
4231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4232 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4233 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4234 } else if (VT.isFloatingPoint() &&
4236 Vec = DAG.getConstantFP(+0.0, dl, VT);
4237 } else if (VT.getVectorElementType() == MVT::i1) {
4238 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4239 "Unexpected vector type");
4240 Vec = DAG.getConstant(0, dl, VT);
4241 } else {
4242 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4243 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4244 }
4245 return DAG.getBitcast(VT, Vec);
4246}
4247
4248 // Helper to determine if the ops are all extracted subvectors that come from a
4249 // single source. If we allow commute they don't have to be in order (Lo/Hi).
4250static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4251 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4252 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4253 LHS.getValueType() != RHS.getValueType() ||
4254 LHS.getOperand(0) != RHS.getOperand(0))
4255 return SDValue();
4256
4257 SDValue Src = LHS.getOperand(0);
4258 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4259 return SDValue();
4260
4261 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4262 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4263 RHS.getConstantOperandAPInt(1) == NumElts) ||
4264 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4265 LHS.getConstantOperandAPInt(1) == NumElts))
4266 return Src;
4267
4268 return SDValue();
4269}
4270
4271static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4272 const SDLoc &dl, unsigned vectorWidth) {
4273 EVT VT = Vec.getValueType();
4274 EVT ElVT = VT.getVectorElementType();
4275 unsigned ResultNumElts =
4276 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4277 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4278
4279 assert(ResultVT.getSizeInBits() == vectorWidth &&
4280 "Illegal subvector extraction");
4281
4282 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4283 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4284 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4285
4286 // This is the index of the first element of the vectorWidth-bit chunk
4287 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
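  // e.g. extracting a 128-bit chunk of v8f32 (ElemsPerChunk == 4) with
  // IdxVal == 5 rounds down to element 4.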
4288 IdxVal &= ~(ElemsPerChunk - 1);
4289
4290 // If the input is a buildvector just emit a smaller one.
4291 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4292 return DAG.getBuildVector(ResultVT, dl,
4293 Vec->ops().slice(IdxVal, ElemsPerChunk));
4294
4295 // Check if we're extracting the upper undef of a widening pattern.
4296 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4297 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4298 isNullConstant(Vec.getOperand(2)))
4299 return DAG.getUNDEF(ResultVT);
4300
4301 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4302}
4303
4304/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4305/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4306/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4307/// instructions or a simple subregister reference. Idx is an index in the
4308/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4309/// lowering EXTRACT_VECTOR_ELT operations easier.
4310static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4311 SelectionDAG &DAG, const SDLoc &dl) {
4313 Vec.getValueType().is512BitVector()) &&
4314 "Unexpected vector size!");
4315 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4316}
4317
4318/// Generate a DAG to grab 256-bits from a 512-bit vector.
4319static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4320 SelectionDAG &DAG, const SDLoc &dl) {
4321 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4322 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4323}
4324
4325static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4326 SelectionDAG &DAG, const SDLoc &dl,
4327 unsigned vectorWidth) {
4328 assert((vectorWidth == 128 || vectorWidth == 256) &&
4329 "Unsupported vector width");
4330 // Inserting UNDEF is a no-op: just return Result.
4331 if (Vec.isUndef())
4332 return Result;
4333
4334 // Insert the relevant vectorWidth bits.
4335 EVT VT = Vec.getValueType();
4336 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4337 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4338
4339 // This is the index of the first element of the vectorWidth-bit chunk
4340 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4341 IdxVal &= ~(ElemsPerChunk - 1);
4342 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4343}
4344
4345/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4346/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4347/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4348/// simple superregister reference. Idx is an index in the 128 bits
4349/// we want. It need not be aligned to a 128-bit boundary. That makes
4350/// lowering INSERT_VECTOR_ELT operations easier.
4351static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4352 SelectionDAG &DAG, const SDLoc &dl) {
4353 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4354 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4355}
4356
4357/// Widen a vector to a larger size with the same scalar type, with the new
4358/// elements either zero or undef.
4359static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4360 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4361 const SDLoc &dl) {
4362 EVT VecVT = Vec.getValueType();
4363 assert(VecVT.getSizeInBits() <= VT.getSizeInBits() &&
4364 VecVT.getScalarType() == VT.getScalarType() &&
4365 "Unsupported vector widening type");
4366 // If the upper 128-bits of a build vector are already undef/zero, then try to
4367 // widen from the lower 128-bits.
4368 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4369 unsigned NumSrcElts = VecVT.getVectorNumElements();
4370 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4371 if (all_of(Hi, [&](SDValue V) {
4372 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4373 }))
4374 Vec = extract128BitVector(Vec, 0, DAG, dl);
4375 }
4376 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4377 : DAG.getUNDEF(VT);
4378 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4379}
4380
4381/// Widen a vector to a larger size with the same scalar type, with the new
4382/// elements either zero or undef.
4383static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4384 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4385 const SDLoc &dl, unsigned WideSizeInBits) {
4386 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4387 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4388 "Unsupported vector widening type");
4389 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4390 MVT SVT = Vec.getSimpleValueType().getScalarType();
4391 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4392 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4393}
4394
4395/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4396/// and bitcast with integer types.
4397static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4398 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4399 unsigned NumElts = VT.getVectorNumElements();
4400 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4401 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4402 return VT;
4403}
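// Worked example (editor's illustration): v4i1 widens to v8i1 when the
// subtarget has DQI (byte-granularity KSHIFT/KMOV), otherwise to v16i1;
// v8i1 is likewise widened to v16i1 on non-DQI targets.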
4404
4405/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4406/// bitcast with integer types.
4407static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4408 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4409 const SDLoc &dl) {
4410 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4411 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4412}
4413
4414// Helper function to collect subvector ops that are concatenated together,
4415// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4416// The subvectors in Ops are guaranteed to be the same type.
4417static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4418 SelectionDAG &DAG) {
4419 assert(Ops.empty() && "Expected an empty ops vector");
4420
4421 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4422 Ops.append(N->op_begin(), N->op_end());
4423 return true;
4424 }
4425
4426 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4427 SDValue Src = N->getOperand(0);
4428 SDValue Sub = N->getOperand(1);
4429 const APInt &Idx = N->getConstantOperandAPInt(2);
4430 EVT VT = Src.getValueType();
4431 EVT SubVT = Sub.getValueType();
4432
4433 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4434 // insert_subvector(undef, x, lo)
4435 if (Idx == 0 && Src.isUndef()) {
4436 Ops.push_back(Sub);
4437 Ops.push_back(DAG.getUNDEF(SubVT));
4438 return true;
4439 }
4440 if (Idx == (VT.getVectorNumElements() / 2)) {
4441 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4442 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4443 Src.getOperand(1).getValueType() == SubVT &&
4444 isNullConstant(Src.getOperand(2))) {
4445 // Attempt to recurse into inner (matching) concats.
4446 SDValue Lo = Src.getOperand(1);
4447 SDValue Hi = Sub;
4448 SmallVector<SDValue, 2> LoOps, HiOps;
4449 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4450 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4451 LoOps.size() == HiOps.size()) {
4452 Ops.append(LoOps);
4453 Ops.append(HiOps);
4454 return true;
4455 }
4456 Ops.push_back(Lo);
4457 Ops.push_back(Hi);
4458 return true;
4459 }
4460 // insert_subvector(x, extract_subvector(x, lo), hi)
4461 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4462 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4463 Ops.append(2, Sub);
4464 return true;
4465 }
4466 // insert_subvector(undef, x, hi)
4467 if (Src.isUndef()) {
4468 Ops.push_back(DAG.getUNDEF(SubVT));
4469 Ops.push_back(Sub);
4470 return true;
4471 }
4472 }
4473 }
4474 }
4475
4476 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4477 EVT VT = N->getValueType(0);
4478 SDValue Src = N->getOperand(0);
4479 uint64_t Idx = N->getConstantOperandVal(1);
4480
4481 // Collect all the subvectors from the source vector and slice off the
4482 // extraction.
4483 SmallVector<SDValue> SrcOps;
4484 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4485 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4486 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4487 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4488 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4489 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4490 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4491 return true;
4492 }
4493 }
4494
4495 assert(Ops.empty() && "Expected an empty ops vector");
4496 return false;
4497}
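// Worked examples (editor's illustration), assuming matching subvector types:
//   concat_vectors(a, b)                                     -> Ops = [a, b]
//   insert_subvector(insert_subvector(undef, x, 0), y, N/2)  -> Ops = [x, y]
//   insert_subvector(undef, x, 0)                            -> Ops = [x, undef]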
4498
4499// Helper to check if \p V can be split into subvectors and the upper
4500// subvectors are all undef, in which case return the lower subvector.
4501static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4502 SelectionDAG &DAG) {
4503 SmallVector<SDValue> SubOps;
4504 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4505 return SDValue();
4506
4507 unsigned NumSubOps = SubOps.size();
4508 unsigned HalfNumSubOps = NumSubOps / 2;
4509 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4510
4511 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4512 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4513 return SDValue();
4514
4515 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4516 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4517 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4518}
4519
4520// Helper to check if we can access all the constituent subvectors without any
4521// extract ops.
4522static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4523 SmallVector<SDValue> Ops;
4524 return collectConcatOps(V.getNode(), Ops, DAG);
4525}
4526
4527static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4528 const SDLoc &dl) {
4529 EVT VT = Op.getValueType();
4530 unsigned NumElems = VT.getVectorNumElements();
4531 unsigned SizeInBits = VT.getSizeInBits();
4532 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4533 "Can't split odd sized vector");
4534
4536 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4537 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4538 unsigned HalfOps = SubOps.size() / 2;
4539 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4540 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4541 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4542 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4543 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4544 return std::make_pair(Lo, Hi);
4545 }
4546
4547 // If this is a splat value (with no-undefs) then use the lower subvector,
4548 // which should be a free extraction.
4549 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4550 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4551 return std::make_pair(Lo, Lo);
4552
4553 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4554 return std::make_pair(Lo, Hi);
4555}
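// Worked example (editor's illustration): splitting a v16i32 yields two v8i32
// halves; if the input is a splat with no undef elements, the cheap lower-half
// extraction is reused for both halves.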
4556
4557/// Break an operation into 2 half sized ops and then concatenate the results.
4558static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4559 unsigned NumOps = Op.getNumOperands();
4560 EVT VT = Op.getValueType();
4561
4562 // Extract the LHS Lo/Hi vectors
4563 SmallVector<SDValue> LoOps(NumOps);
4564 SmallVector<SDValue> HiOps(NumOps);
4565 for (unsigned I = 0; I != NumOps; ++I) {
4566 SDValue SrcOp = Op.getOperand(I);
4567 if (!SrcOp.getValueType().isVector()) {
4568 LoOps[I] = HiOps[I] = SrcOp;
4569 continue;
4570 }
4571 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4572 }
4573
4574 EVT LoVT, HiVT;
4575 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4576 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4577 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4578 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4579}
4580
4581/// Break a unary integer operation into 2 half sized ops and then
4582/// concatenate the result back.
4583static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4584 const SDLoc &dl) {
4585 // Make sure we only try to split 256/512-bit types to avoid creating
4586 // narrow vectors.
4587 [[maybe_unused]] EVT VT = Op.getValueType();
4588 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4589 Op.getOperand(0).getValueType().is512BitVector()) &&
4590 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4591 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4592 VT.getVectorNumElements() &&
4593 "Unexpected VTs!");
4594 return splitVectorOp(Op, DAG, dl);
4595}
4596
4597/// Break a binary integer operation into 2 half sized ops and then
4598/// concatenate the result back.
4599static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4600 const SDLoc &dl) {
4601 // Assert that all the types match.
4602 [[maybe_unused]] EVT VT = Op.getValueType();
4603 assert(Op.getOperand(0).getValueType() == VT &&
4604 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4605 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4606 return splitVectorOp(Op, DAG, dl);
4607}
4608
4609// Helper for splitting operands of an operation to legal target size and
4610// apply a function on each part.
4611// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4612// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4613// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4614// The argument Builder is a function that will be applied on each split part:
4615// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue>)
4616template <typename F>
4617static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4618 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4619 F Builder, bool CheckBWI = true,
4620 bool AllowAVX512 = true) {
4621 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4622 unsigned NumSubs = 1;
4623 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4624 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4625 if (VT.getSizeInBits() > 512) {
4626 NumSubs = VT.getSizeInBits() / 512;
4627 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4628 }
4629 } else if (Subtarget.hasAVX2()) {
4630 if (VT.getSizeInBits() > 256) {
4631 NumSubs = VT.getSizeInBits() / 256;
4632 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4633 }
4634 } else {
4635 if (VT.getSizeInBits() > 128) {
4636 NumSubs = VT.getSizeInBits() / 128;
4637 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4638 }
4639 }
4640
4641 if (NumSubs == 1)
4642 return Builder(DAG, DL, Ops);
4643
4645 for (unsigned i = 0; i != NumSubs; ++i) {
4646 SmallVector<SDValue, 2> SubOps;
4647 for (SDValue Op : Ops) {
4648 EVT OpVT = Op.getValueType();
4649 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4650 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4651 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4652 }
4653 Subs.push_back(Builder(DAG, DL, SubOps));
4654 }
4655 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4656}
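// Usage sketch (editor's illustration; the lambda and value names are
// hypothetical):
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS}, AddBuilder);
// On an AVX2-only target with a 512-bit VT this emits two 256-bit ADDs and
// concatenates the results.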
4657
4658// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4659// targets.
4660static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4661 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4662 const X86Subtarget &Subtarget) {
4663 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4664 MVT SVT = VT.getScalarType();
4665
4666 // If we have a 32/64 splatted constant, splat it to DstTy to
4667 // encourage a foldable broadcast'd operand.
4668 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4669 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4670 // AVX512 broadcasts 32/64-bit operands.
4671 // TODO: Support float once getAVX512Node is used by fp-ops.
4672 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4673 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4674 return SDValue();
4675 // If we're not widening, don't bother if we're not bitcasting.
4676 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4677 return SDValue();
4678 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4679 APInt SplatValue, SplatUndef;
4680 unsigned SplatBitSize;
4681 bool HasAnyUndefs;
4682 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4683 HasAnyUndefs, OpEltSizeInBits) &&
4684 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4685 return DAG.getConstant(SplatValue, DL, DstVT);
4686 }
4687 return SDValue();
4688 };
4689
4690 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4691
4692 MVT DstVT = VT;
4693 if (Widen)
4694 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4695
4696 // Canonicalize src operands.
4697 SmallVector<SDValue> SrcOps(Ops);
4698 for (SDValue &Op : SrcOps) {
4699 MVT OpVT = Op.getSimpleValueType();
4700 // Just pass through scalar operands.
4701 if (!OpVT.isVector())
4702 continue;
4703 assert(OpVT == VT && "Vector type mismatch");
4704
4705 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4706 Op = BroadcastOp;
4707 continue;
4708 }
4709
4710 // Just widen the subvector by inserting into an undef wide vector.
4711 if (Widen)
4712 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4713 }
4714
4715 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4716
4717 // Perform the 512-bit op then extract the bottom subvector.
4718 if (Widen)
4719 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4720 return Res;
4721}
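// Worked example (editor's illustration): on an AVX512F target without VLX, a
// v4i32 opcode is widened to v16i32, executed as a single 512-bit node, and
// the low 128-bit subvector is extracted as the result; splatted 32/64-bit
// constant operands are rebuilt at the wide type to stay broadcastable.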
4722
4723/// Insert an i1-subvector into an i1-vector.
4724static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4725 const X86Subtarget &Subtarget) {
4726
4727 SDLoc dl(Op);
4728 SDValue Vec = Op.getOperand(0);
4729 SDValue SubVec = Op.getOperand(1);
4730 SDValue Idx = Op.getOperand(2);
4731 unsigned IdxVal = Op.getConstantOperandVal(2);
4732
4733 // Inserting undef is a nop. We can just return the original vector.
4734 if (SubVec.isUndef())
4735 return Vec;
4736
4737 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4738 return Op;
4739
4740 MVT OpVT = Op.getSimpleValueType();
4741 unsigned NumElems = OpVT.getVectorNumElements();
4742 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4743
4744 // Extend to natively supported kshift.
4745 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4746
4747 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4748 // if necessary.
4749 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4750 // May need to promote to a legal type.
4751 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4752 DAG.getConstant(0, dl, WideOpVT),
4753 SubVec, Idx);
4754 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4755 }
4756
4757 MVT SubVecVT = SubVec.getSimpleValueType();
4758 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4759 assert(IdxVal + SubVecNumElems <= NumElems &&
4760 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4761 "Unexpected index value in INSERT_SUBVECTOR");
4762
4763 SDValue Undef = DAG.getUNDEF(WideOpVT);
4764
4765 if (IdxVal == 0) {
4766 // Zero lower bits of the Vec
4767 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4768 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4769 ZeroIdx);
4770 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4771 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4772 // Merge them together, SubVec should be zero extended.
4773 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4774 DAG.getConstant(0, dl, WideOpVT),
4775 SubVec, ZeroIdx);
4776 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4777 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4778 }
4779
4780 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4781 Undef, SubVec, ZeroIdx);
4782
4783 if (Vec.isUndef()) {
4784 assert(IdxVal != 0 && "Unexpected index");
4785 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4786 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4788 }
4789
4790 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4791 assert(IdxVal != 0 && "Unexpected index");
4792 // If upper elements of Vec are known undef, then just shift into place.
4793 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4794 [](SDValue V) { return V.isUndef(); })) {
4795 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4796 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4797 } else {
4798 NumElems = WideOpVT.getVectorNumElements();
4799 unsigned ShiftLeft = NumElems - SubVecNumElems;
4800 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4801 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4802 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4803 if (ShiftRight != 0)
4804 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4805 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4806 }
4807 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4808 }
4809
4810 // Simple case: the subvector goes into the upper part.
4811 if (IdxVal + SubVecNumElems == NumElems) {
4812 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4813 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4814 if (SubVecNumElems * 2 == NumElems) {
4815 // Special case, use legal zero extending insert_subvector. This allows
4816 // isel to optimize when bits are known zero.
4817 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4818 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4819 DAG.getConstant(0, dl, WideOpVT),
4820 Vec, ZeroIdx);
4821 } else {
4822 // Otherwise use explicit shifts to zero the bits.
4823 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4824 Undef, Vec, ZeroIdx);
4825 NumElems = WideOpVT.getVectorNumElements();
4826 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4827 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4828 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4829 }
4830 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4831 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4832 }
4833
4834 // Inserting into the middle is more complicated.
4835
4836 NumElems = WideOpVT.getVectorNumElements();
4837
4838 // Widen the vector if needed.
4839 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4840
4841 unsigned ShiftLeft = NumElems - SubVecNumElems;
4842 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4843
4844 // Do an optimization for the most frequently used types.
4845 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4846 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4847 Mask0.flipAllBits();
4848 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4849 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4850 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4851 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4852 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4853 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4854 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4855 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4856
4857 // Reduce to original width if needed.
4858 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4859 }
4860
4861 // Clear the upper bits of the subvector and move it to its insert position.
4862 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4863 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4864 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4865 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4866
4867 // Isolate the bits below the insertion point.
4868 unsigned LowShift = NumElems - IdxVal;
4869 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4870 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4871 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4872 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4873
4874 // Isolate the bits after the last inserted bit.
4875 unsigned HighShift = IdxVal + SubVecNumElems;
4876 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4877 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4878 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4879 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4880
4881 // Now OR all 3 pieces together.
4882 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4883 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4884
4885 // Reduce to original width if needed.
4886 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4887}
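// Worked example (editor's illustration): inserting a v2i1 SubVec into a v8i1
// Vec at IdxVal = 2 on a DQI target (WideOpVT stays v8i1) takes the "middle"
// path with ShiftLeft = 8 - 2 = 6 and ShiftRight = 8 - 2 - 2 = 4: the mask
// bits are shifted into positions 2-3 and OR'd into Vec, whose bits 2-3 were
// cleared via the inverted 0b00001100 mask.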
4888
4889static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4890 const SDLoc &dl) {
4891 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4892 EVT SubVT = V1.getValueType();
4893 EVT SubSVT = SubVT.getScalarType();
4894 unsigned SubNumElts = SubVT.getVectorNumElements();
4895 unsigned SubVectorWidth = SubVT.getSizeInBits();
4896 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4897 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4898 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4899}
4900
4901/// Returns a vector of specified type with all bits set.
4902/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4903/// Then bitcast to their original type, ensuring they get CSE'd.
4904static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4905 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4906 "Expected a 128/256/512-bit vector type");
4907 unsigned NumElts = VT.getSizeInBits() / 32;
4908 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4909 return DAG.getBitcast(VT, Vec);
4910}
4911
4912// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
4913static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
4914 switch (Opc) {
4915 case ISD::SHL:
4916 case X86ISD::VSHL:
4917 case X86ISD::VSHLI:
4918 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
4919 case ISD::SRL:
4920 case X86ISD::VSRL:
4921 case X86ISD::VSRLI:
4922 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
4923 case ISD::SRA:
4924 case X86ISD::VSRA:
4925 case X86ISD::VSRAI:
4926 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
4927 }
4928 llvm_unreachable("Unknown target vector shift node");
4929}
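// Worked example (editor's illustration):
//   getTargetVShiftUniformOpcode(ISD::SHL, /*IsVariable=*/false) == X86ISD::VSHLI
//   getTargetVShiftUniformOpcode(ISD::SRA, /*IsVariable=*/true)  == X86ISD::VSRA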
4930
4931/// Handle vector element shifts where the shift amount is a constant.
4932/// Takes immediate version of shift as input.
4933static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
4934 SDValue SrcOp, uint64_t ShiftAmt,
4935 SelectionDAG &DAG) {
4936 MVT ElementType = VT.getVectorElementType();
4937
4938 // Bitcast the source vector to the output type, this is mainly necessary for
4939 // vXi8/vXi64 shifts.
4940 if (VT != SrcOp.getSimpleValueType())
4941 SrcOp = DAG.getBitcast(VT, SrcOp);
4942
4943 // Fold this packed shift into its first operand if ShiftAmt is 0.
4944 if (ShiftAmt == 0)
4945 return SrcOp;
4946
4947 // Check for ShiftAmt >= element width
4948 if (ShiftAmt >= ElementType.getSizeInBits()) {
4949 if (Opc == X86ISD::VSRAI)
4950 ShiftAmt = ElementType.getSizeInBits() - 1;
4951 else
4952 return DAG.getConstant(0, dl, VT);
4953 }
4954
4955 assert(
4956 (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
4957 "Unknown target vector shift-by-constant node");
4958
4959 // Fold this packed vector shift into a build vector if SrcOp is a
4960 // vector of Constants or UNDEFs.
4961 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
4962 unsigned ShiftOpc;
4963 switch (Opc) {
4964 default:
4965 llvm_unreachable("Unknown opcode!");
4966 case X86ISD::VSHLI:
4967 ShiftOpc = ISD::SHL;
4968 break;
4969 case X86ISD::VSRLI:
4970 ShiftOpc = ISD::SRL;
4971 break;
4972 case X86ISD::VSRAI:
4973 ShiftOpc = ISD::SRA;
4974 break;
4975 }
4976
4977 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
4978 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
4979 return C;
4980 }
4981
4982 return DAG.getNode(Opc, dl, VT, SrcOp,
4983 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
4984}
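// Worked example (editor's illustration): a v4i32 VSRLI by 32 or more folds to
// an all-zeros vector, while a v4i32 VSRAI by 32 or more is clamped to a shift
// by 31, preserving the sign-fill semantics of an arithmetic shift.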
4985
4986/// Handle vector element shifts by a splat shift amount
4987static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
4988 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
4989 const X86Subtarget &Subtarget,
4990 SelectionDAG &DAG) {
4991 MVT AmtVT = ShAmt.getSimpleValueType();
4992 assert(AmtVT.isVector() && "Vector shift type mismatch");
4993 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
4994 "Illegal vector splat index");
4995
4996 // Move the splat element to the bottom element.
4997 if (ShAmtIdx != 0) {
4998 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
4999 Mask[0] = ShAmtIdx;
5000 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
5001 }
5002
5003 // Peek through any zext node if we can get back to a 128-bit source.
5004 if (AmtVT.getScalarSizeInBits() == 64 &&
5005 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
5006 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
5007 ShAmt.getOperand(0).getValueType().isSimple() &&
5008 ShAmt.getOperand(0).getValueType().is128BitVector()) {
5009 ShAmt = ShAmt.getOperand(0);
5010 AmtVT = ShAmt.getSimpleValueType();
5011 }
5012
5013 // See if we can mask off the upper elements using the existing source node.
5014 // The shift uses the entire lower 64-bits of the amount vector, so no need to
5015 // do this for vXi64 types.
5016 bool IsMasked = false;
5017 if (AmtVT.getScalarSizeInBits() < 64) {
5018 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
5019 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5020 // If the shift amount has come from a scalar, then zero-extend the scalar
5021 // before moving to the vector.
5022 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
5023 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
5024 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
5025 AmtVT = MVT::v4i32;
5026 IsMasked = true;
5027 } else if (ShAmt.getOpcode() == ISD::AND) {
5028 // See if the shift amount is already masked (e.g. for rotation modulo),
5029 // then we can zero-extend it by setting all the other mask elements to
5030 // zero.
5031 SmallVector<SDValue> MaskElts(
5032 AmtVT.getVectorNumElements(),
5033 DAG.getConstant(0, dl, AmtVT.getScalarType()));
5034 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
5035 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
5036 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
5037 {ShAmt.getOperand(1), Mask}))) {
5038 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
5039 IsMasked = true;
5040 }
5041 }
5042 }
5043
5044 // Extract if the shift amount vector is larger than 128-bits.
5045 if (AmtVT.getSizeInBits() > 128) {
5046 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
5047 AmtVT = ShAmt.getSimpleValueType();
5048 }
5049
5050 // Zero-extend bottom element to v2i64 vector type, either by extension or
5051 // shuffle masking.
5052 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
5053 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
5054 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
5055 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
5056 } else if (Subtarget.hasSSE41()) {
5057 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
5058 MVT::v2i64, ShAmt);
5059 } else {
5060 SDValue ByteShift = DAG.getTargetConstant(
5061 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
5062 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
5063 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5064 ByteShift);
5065 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5066 ByteShift);
5067 }
5068 }
5069
5070 // Change opcode to non-immediate version.
5071 Opc = getTargetVShiftUniformOpcode(Opc, true);
5072
5073 // The return type has to be a 128-bit type with the same element
5074 // type as the input type.
5075 MVT EltVT = VT.getVectorElementType();
5076 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
5077
5078 ShAmt = DAG.getBitcast(ShVT, ShAmt);
5079 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
5080}
5081
5082static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
5083 SDValue In, SelectionDAG &DAG) {
5084 EVT InVT = In.getValueType();
5085 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
5086
5087 // Canonicalize Opcode to general extension version.
5088 switch (Opcode) {
5089 case ISD::ANY_EXTEND:
5090 case ISD::ANY_EXTEND_VECTOR_INREG:
5091 Opcode = ISD::ANY_EXTEND;
5092 break;
5093 case ISD::SIGN_EXTEND:
5094 case ISD::SIGN_EXTEND_VECTOR_INREG:
5095 Opcode = ISD::SIGN_EXTEND;
5096 break;
5097 case ISD::ZERO_EXTEND:
5098 case ISD::ZERO_EXTEND_VECTOR_INREG:
5099 Opcode = ISD::ZERO_EXTEND;
5100 break;
5101 default:
5102 llvm_unreachable("Unknown extension opcode");
5103 }
5104
5105 // For 256-bit vectors, we only need the lower (128-bit) input half.
5106 // For 512-bit vectors, we only need the lower input half or quarter.
5107 if (InVT.getSizeInBits() > 128) {
5108 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
5109 "Expected VTs to be the same size!");
5110 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5111 In = extractSubVector(In, 0, DAG, DL,
5112 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
5113 InVT = In.getValueType();
5114 }
5115
5116 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
5117 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
5118
5119 return DAG.getNode(Opcode, DL, VT, In);
5120}
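// Worked example (editor's illustration): for VT = v8i32 and a 256-bit v32i8
// input, Scale = 4, so only the low 128-bit v16i8 half is kept; since the
// element counts then differ (8 vs 16), the opcode is switched to the
// *_EXTEND_VECTOR_INREG form.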
5121
5122// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
5123static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
5124 SDValue Mask, SelectionDAG &DAG) {
5125 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
5126 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
5127 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
5128}
5129
5130static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
5131 bool Lo, bool Unary) {
5132 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
5133 "Illegal vector type to unpack");
5134 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5135 int NumElts = VT.getVectorNumElements();
5136 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5137 for (int i = 0; i < NumElts; ++i) {
5138 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5139 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5140 Pos += (Unary ? 0 : NumElts * (i % 2));
5141 Pos += (Lo ? 0 : NumEltsInLane / 2);
5142 Mask.push_back(Pos);
5143 }
5144}
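// Worked example (editor's illustration): for v8i16 with Lo = true and
// Unary = false this produces <0,8,1,9,2,10,3,11>, the PUNPCKLWD interleave of
// the low halves of both sources.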
5145
5146/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
5147/// imposed by AVX and specific to the unary pattern. Example:
5148/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
5149/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
5150static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5151 bool Lo) {
5152 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5153 int NumElts = VT.getVectorNumElements();
5154 for (int i = 0; i < NumElts; ++i) {
5155 int Pos = i / 2;
5156 Pos += (Lo ? 0 : NumElts / 2);
5157 Mask.push_back(Pos);
5158 }
5159}
5160
5161// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
5162static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
5163 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
5164 if ((V1.isUndef() || ISD::isBuildVectorOfConstantSDNodes(V1.getNode())) &&
5165 (V2.isUndef() || ISD::isBuildVectorOfConstantSDNodes(V2.getNode()))) {
5166 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
5167 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
5168 int M = Mask[I];
5169 if (M < 0)
5170 continue;
5171 SDValue V = (M < NumElts) ? V1 : V2;
5172 if (V.isUndef())
5173 continue;
5174 Ops[I] = V.getOperand(M % NumElts);
5175 }
5176 return DAG.getBuildVector(VT, dl, Ops);
5177 }
5178
5179 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5180}
5181
5182/// Returns a vector_shuffle node for an unpackl operation.
5183static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5184 SDValue V1, SDValue V2) {
5185 SmallVector<int, 8> Mask;
5186 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5187 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5188}
5189
5190/// Returns a vector_shuffle node for an unpackh operation.
5191static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5192 SDValue V1, SDValue V2) {
5193 SmallVector<int, 8> Mask;
5194 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5195 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5196}
5197
5198/// Returns a node that packs the LHS + RHS nodes together at half width.
5199/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
5200/// TODO: Add subvector splitting if/when we have a need for it.
5201static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5202 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
5203 bool PackHiHalf = false) {
5204 MVT OpVT = LHS.getSimpleValueType();
5205 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5206 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
5207 assert(OpVT == RHS.getSimpleValueType() &&
5208 VT.getSizeInBits() == OpVT.getSizeInBits() &&
5209 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
5210 "Unexpected PACK operand types");
5211 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
5212 "Unexpected PACK result type");
5213
5214 // Rely on vector shuffles for vXi64 -> vXi32 packing.
5215 if (EltSizeInBits == 32) {
5216 SmallVector<int> PackMask;
5217 int Offset = PackHiHalf ? 1 : 0;
5218 int NumElts = VT.getVectorNumElements();
5219 for (int I = 0; I != NumElts; I += 4) {
5220 PackMask.push_back(I + Offset);
5221 PackMask.push_back(I + Offset + 2);
5222 PackMask.push_back(I + Offset + NumElts);
5223 PackMask.push_back(I + Offset + NumElts + 2);
5224 }
5225 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
5226 DAG.getBitcast(VT, RHS), PackMask);
5227 }
5228
5229 // See if we already have sufficient leading bits for PACKSS/PACKUS.
5230 if (!PackHiHalf) {
5231 if (UsePackUS &&
5232 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
5233 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
5234 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5235
5236 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
5237 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
5238 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5239 }
5240
5241 // Fallback to sign/zero extending the requested half and pack.
5242 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
5243 if (UsePackUS) {
5244 if (PackHiHalf) {
5245 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
5246 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
5247 } else {
5248 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
5249 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
5250 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
5251 }
5252 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5253 }
5254
5255 if (!PackHiHalf) {
5256 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
5257 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
5258 }
5259 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
5260 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
5261 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5262}
5263
5264/// Return a vector_shuffle of the specified vector merged with a zero or undef vector.
5265/// This produces a shuffle where the low element of V2 is swizzled into the
5266/// zero/undef vector, landing at element Idx.
5267/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5268static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5269 bool IsZero,
5270 const X86Subtarget &Subtarget,
5271 SelectionDAG &DAG) {
5272 MVT VT = V2.getSimpleValueType();
5273 SDValue V1 = IsZero
5274 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5275 int NumElems = VT.getVectorNumElements();
5276 SmallVector<int, 16> MaskVec(NumElems);
5277 for (int i = 0; i != NumElems; ++i)
5278 // If this is the insertion idx, put the low elt of V2 here.
5279 MaskVec[i] = (i == Idx) ? NumElems : i;
5280 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5281}
5282
5283static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
5284 if (Ptr.getOpcode() == X86ISD::Wrapper ||
5285 Ptr.getOpcode() == X86ISD::WrapperRIP)
5286 Ptr = Ptr.getOperand(0);
5287 return dyn_cast<ConstantPoolSDNode>(Ptr);
5288}
5289
5290// TODO: Add support for non-zero offsets.
5291static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
5292 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
5293 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5294 return nullptr;
5295 return CNode->getConstVal();
5296}
5297
5298static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
5299 if (!Load || !ISD::isNormalLoad(Load))
5300 return nullptr;
5301 return getTargetConstantFromBasePtr(Load->getBasePtr());
5302}
5303
5304static const Constant *getTargetConstantFromNode(SDValue Op) {
5305 Op = peekThroughBitcasts(Op);
5306 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
5307}
5308
5309const Constant *
5310X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
5311 assert(LD && "Unexpected null LoadSDNode");
5312 return getTargetConstantFromNode(LD);
5313}
5314
5316 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
5317 SDValue Cond = N->getOperand(0);
5318 SDValue RHS = N->getOperand(2);
5319 EVT CondVT = Cond.getValueType();
5320 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
5321 CondVT.getVectorElementType() == MVT::i1 &&
5322 ISD::isBuildVectorAllZeros(RHS.getNode());
5323}
5324
5325// Extract raw constant bits from constant pools.
5326static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5327 APInt &UndefElts,
5328 SmallVectorImpl<APInt> &EltBits,
5329 bool AllowWholeUndefs = true,
5330 bool AllowPartialUndefs = false) {
5331 assert(EltBits.empty() && "Expected an empty EltBits vector");
5332
5333 Op = peekThroughBitcasts(Op);
5334
5335 EVT VT = Op.getValueType();
5336 unsigned SizeInBits = VT.getSizeInBits();
5337 unsigned NumElts = SizeInBits / EltSizeInBits;
5338
5339 // Can't split constant.
5340 if ((SizeInBits % EltSizeInBits) != 0)
5341 return false;
5342
5343 // Bitcast a source array of element bits to the target size.
5344 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5345 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5346 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5347 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5348 "Constant bit sizes don't match");
5349
5350 // Don't split if we don't allow undef bits.
5351 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5352 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5353 return false;
5354
5355 // If we're already the right size, don't bother bitcasting.
5356 if (NumSrcElts == NumElts) {
5357 UndefElts = UndefSrcElts;
5358 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5359 return true;
5360 }
5361
5362 // Extract all the undef/constant element data and pack into single bitsets.
5363 APInt UndefBits(SizeInBits, 0);
5364 APInt MaskBits(SizeInBits, 0);
5365
5366 for (unsigned i = 0; i != NumSrcElts; ++i) {
5367 unsigned BitOffset = i * SrcEltSizeInBits;
5368 if (UndefSrcElts[i])
5369 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5370 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5371 }
5372
5373 // Split the undef/constant single bitset data into the target elements.
5374 UndefElts = APInt(NumElts, 0);
5375 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5376
5377 for (unsigned i = 0; i != NumElts; ++i) {
5378 unsigned BitOffset = i * EltSizeInBits;
5379 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5380
5381 // Only treat an element as UNDEF if all bits are UNDEF.
5382 if (UndefEltBits.isAllOnes()) {
5383 if (!AllowWholeUndefs)
5384 return false;
5385 UndefElts.setBit(i);
5386 continue;
5387 }
5388
5389 // If only some bits are UNDEF then treat them as zero (or bail if not
5390 // supported).
5391 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5392 return false;
5393
5394 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5395 }
5396 return true;
5397 };
5398
5399 // Collect constant bits and insert into mask/undef bit masks.
5400 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5401 unsigned UndefBitIndex) {
5402 if (!Cst)
5403 return false;
5404 if (isa<UndefValue>(Cst)) {
5405 Undefs.setBit(UndefBitIndex);
5406 return true;
5407 }
5408 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5409 Mask = CInt->getValue();
5410 return true;
5411 }
5412 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5413 Mask = CFP->getValueAPF().bitcastToAPInt();
5414 return true;
5415 }
5416 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5417 Type *Ty = CDS->getType();
5418 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5419 Type *EltTy = CDS->getElementType();
5420 bool IsInteger = EltTy->isIntegerTy();
5421 bool IsFP =
5422 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5423 if (!IsInteger && !IsFP)
5424 return false;
5425 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5426 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5427 if (IsInteger)
5428 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5429 else
5430 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5431 I * EltBits);
5432 return true;
5433 }
5434 return false;
5435 };
5436
5437 // Handle UNDEFs.
5438 if (Op.isUndef()) {
5439 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5440 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5441 return CastBitData(UndefSrcElts, SrcEltBits);
5442 }
5443
5444 // Extract scalar constant bits.
5445 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5446 APInt UndefSrcElts = APInt::getZero(1);
5447 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5448 return CastBitData(UndefSrcElts, SrcEltBits);
5449 }
5450 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5451 APInt UndefSrcElts = APInt::getZero(1);
5452 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5453 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5454 return CastBitData(UndefSrcElts, SrcEltBits);
5455 }
5456
5457 // Extract constant bits from build vector.
5458 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5459 BitVector Undefs;
5460 SmallVector<APInt> SrcEltBits;
5461 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5462 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5463 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5464 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5465 if (Undefs[I])
5466 UndefSrcElts.setBit(I);
5467 return CastBitData(UndefSrcElts, SrcEltBits);
5468 }
5469 }
5470
5471 // Extract constant bits from constant pool vector.
5472 if (auto *Cst = getTargetConstantFromNode(Op)) {
5473 Type *CstTy = Cst->getType();
5474 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5475 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5476 return false;
5477
5478 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5479 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5480 if ((SizeInBits % SrcEltSizeInBits) != 0)
5481 return false;
5482
5483 APInt UndefSrcElts(NumSrcElts, 0);
5484 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5485 for (unsigned i = 0; i != NumSrcElts; ++i)
5486 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5487 UndefSrcElts, i))
5488 return false;
5489
5490 return CastBitData(UndefSrcElts, SrcEltBits);
5491 }
5492
5493 // Extract constant bits from a broadcasted constant pool scalar.
5494 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5495 EltSizeInBits <= VT.getScalarSizeInBits()) {
5496 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5497 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5498 return false;
5499
5500 SDValue Ptr = MemIntr->getBasePtr();
5501 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5502 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5503 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5504
5505 APInt UndefSrcElts(NumSrcElts, 0);
5506 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5507 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5508 if (UndefSrcElts[0])
5509 UndefSrcElts.setBits(0, NumSrcElts);
5510 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5511 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5512 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5513 return CastBitData(UndefSrcElts, SrcEltBits);
5514 }
5515 }
5516 }
5517
5518 // Extract constant bits from a subvector broadcast.
5519 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5520 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5521 SDValue Ptr = MemIntr->getBasePtr();
5522 // The source constant may be larger than the subvector broadcast;
5523 // ensure we extract the correct subvector constants.
5524 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5525 Type *CstTy = Cst->getType();
5526 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5527 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5528 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5529 (SizeInBits % SubVecSizeInBits) != 0)
5530 return false;
5531 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5532 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5533 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5534 APInt UndefSubElts(NumSubElts, 0);
5535 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5536 APInt(CstEltSizeInBits, 0));
5537 for (unsigned i = 0; i != NumSubElts; ++i) {
5538 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5539 UndefSubElts, i))
5540 return false;
5541 for (unsigned j = 1; j != NumSubVecs; ++j)
5542 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5543 }
5544 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5545 UndefSubElts);
5546 return CastBitData(UndefSubElts, SubEltBits);
5547 }
5548 }
5549
5550 // Extract a rematerialized scalar constant insertion.
5551 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5552 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5553 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5554 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5555 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5556
5557 APInt UndefSrcElts(NumSrcElts, 0);
5558 SmallVector<APInt, 64> SrcEltBits;
5559 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5560 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5561 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5562 return CastBitData(UndefSrcElts, SrcEltBits);
5563 }
5564
5565 // Insert constant bits from a base and sub vector sources.
5566 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5567 // If we bitcast to larger elements we might lose track of undefs - don't
5568 // allow any to be safe.
5569 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5570 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5571
5572 APInt UndefSrcElts, UndefSubElts;
5573 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5574 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5575 UndefSubElts, EltSubBits,
5576 AllowWholeUndefs && AllowUndefs,
5577 AllowPartialUndefs && AllowUndefs) &&
5578 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5579 UndefSrcElts, EltSrcBits,
5580 AllowWholeUndefs && AllowUndefs,
5581 AllowPartialUndefs && AllowUndefs)) {
5582 unsigned BaseIdx = Op.getConstantOperandVal(2);
5583 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5584 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5585 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5586 return CastBitData(UndefSrcElts, EltSrcBits);
5587 }
5588 }
5589
5590 // Extract constant bits from a subvector's source.
5591 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5592 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5593 EltBits, AllowWholeUndefs,
5594 AllowPartialUndefs)) {
5595 EVT SrcVT = Op.getOperand(0).getValueType();
5596 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5597 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5598 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5599 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5600 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5601 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5602 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5603
5604 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5605 if ((BaseIdx + NumSubElts) != NumSrcElts)
5606 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5607 if (BaseIdx != 0)
5608 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5609 return true;
5610 }
5611
5612 // Extract constant bits from shuffle node sources.
5613 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5614 // TODO - support shuffle through bitcasts.
5615 if (EltSizeInBits != VT.getScalarSizeInBits())
5616 return false;
5617
5618 ArrayRef<int> Mask = SVN->getMask();
5619 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5620 llvm::any_of(Mask, [](int M) { return M < 0; }))
5621 return false;
5622
5623 APInt UndefElts0, UndefElts1;
5624 SmallVector<APInt, 32> EltBits0, EltBits1;
5625 if (isAnyInRange(Mask, 0, NumElts) &&
5626 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5627 UndefElts0, EltBits0, AllowWholeUndefs,
5628 AllowPartialUndefs))
5629 return false;
5630 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5631 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5632 UndefElts1, EltBits1, AllowWholeUndefs,
5633 AllowPartialUndefs))
5634 return false;
5635
5636 UndefElts = APInt::getZero(NumElts);
5637 for (int i = 0; i != (int)NumElts; ++i) {
5638 int M = Mask[i];
5639 if (M < 0) {
5640 UndefElts.setBit(i);
5641 EltBits.push_back(APInt::getZero(EltSizeInBits));
5642 } else if (M < (int)NumElts) {
5643 if (UndefElts0[M])
5644 UndefElts.setBit(i);
5645 EltBits.push_back(EltBits0[M]);
5646 } else {
5647 if (UndefElts1[M - NumElts])
5648 UndefElts.setBit(i);
5649 EltBits.push_back(EltBits1[M - NumElts]);
5650 }
5651 }
5652 return true;
5653 }
5654
5655 return false;
5656}
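// Worked example (editor's illustration): querying a v2i64 constant
// build_vector <0x0000000100000002, 0x0000000300000004> with EltSizeInBits = 32
// returns EltBits = <2, 1, 4, 3> (little-endian order within each i64) and an
// all-zero UndefElts mask.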
5657
5658namespace llvm {
5659namespace X86 {
5660bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5661 APInt UndefElts;
5662 SmallVector<APInt, 16> EltBits;
5663 if (getTargetConstantBitsFromNode(
5664 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5665 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5666 int SplatIndex = -1;
5667 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5668 if (UndefElts[i])
5669 continue;
5670 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5671 SplatIndex = -1;
5672 break;
5673 }
5674 SplatIndex = i;
5675 }
5676 if (0 <= SplatIndex) {
5677 SplatVal = EltBits[SplatIndex];
5678 return true;
5679 }
5680 }
5681
5682 return false;
5683}
5684
5685int getRoundingModeX86(unsigned RM) {
5686 switch (static_cast<::llvm::RoundingMode>(RM)) {
5687 // clang-format off
5688 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5689 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5690 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5691 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5692 default: return X86::rmInvalid;
5693 // clang-format on
5694 }
5695}
5696
5697} // namespace X86
5698} // namespace llvm
5699
5700static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5701 unsigned MaskEltSizeInBits,
5702 SmallVectorImpl<uint64_t> &RawMask,
5703 APInt &UndefElts) {
5704 // Extract the raw target constant bits.
5705 SmallVector<APInt, 64> EltBits;
5706 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5707 EltBits, /* AllowWholeUndefs */ true,
5708 /* AllowPartialUndefs */ false))
5709 return false;
5710
5711 // Insert the extracted elements into the mask.
5712 for (const APInt &Elt : EltBits)
5713 RawMask.push_back(Elt.getZExtValue());
5714
5715 return true;
5716}
5717
5718static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5719 bool AllowUndefs) {
5720 APInt UndefElts;
5721 SmallVector<APInt, 64> EltBits;
5722 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5723 /*AllowWholeUndefs*/ AllowUndefs,
5724 /*AllowPartialUndefs*/ false))
5725 return false;
5726
5727 bool IsPow2OrUndef = true;
5728 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5729 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5730 return IsPow2OrUndef;
5731}
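// Worked example (editor's illustration): a v4i32 splat of 8 returns true;
// <8, undef, 2, 16> returns true only when AllowUndefs is set, since the
// constant-bit extraction otherwise rejects the undef element.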
5732
5733// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5734static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5735 // TODO: don't always ignore oneuse constraints.
5736 V = peekThroughBitcasts(V);
5737 EVT VT = V.getValueType();
5738
5739 // Match not(xor X, -1) -> X.
5740 if (V.getOpcode() == ISD::XOR &&
5741 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5742 isAllOnesConstant(V.getOperand(1))))
5743 return V.getOperand(0);
5744
5745 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5746 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5747 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5748 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5749 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5750 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5751 V.getOperand(1));
5752 }
5753 }
5754
5755 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5756 if (V.getOpcode() == X86ISD::PCMPGT &&
5757 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5758 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5759 V.getOperand(0).hasOneUse()) {
5760 APInt UndefElts;
5761 SmallVector<APInt> EltBits;
5762 if (getTargetConstantBitsFromNode(V.getOperand(0),
5763 V.getScalarValueSizeInBits(), UndefElts,
5764 EltBits) &&
5765 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5766 // Don't fold min_signed_value -> (min_signed_value - 1)
5767 bool MinSigned = false;
5768 for (APInt &Elt : EltBits) {
5769 MinSigned |= Elt.isMinSignedValue();
5770 Elt -= 1;
5771 }
5772 if (!MinSigned) {
5773 SDLoc DL(V);
5774 MVT VT = V.getSimpleValueType();
5775 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5776 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5777 }
5778 }
5779 }
5780
5781 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5782 SmallVector<SDValue, 2> CatOps;
5783 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5784 for (SDValue &CatOp : CatOps) {
5785 SDValue NotCat = IsNOT(CatOp, DAG);
5786 if (!NotCat)
5787 return SDValue();
5788 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5789 }
5790 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5791 }
5792
5793 // Match not(or(not(X),not(Y))) -> and(X, Y).
5794 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5795 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5796 // TODO: Handle cases with single NOT operand -> ANDNP
5797 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5798 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5799 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5800 DAG.getBitcast(VT, Op1));
5801 }
5802
5803 return SDValue();
5804}
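// Worked example (editor's illustration): IsNOT(xor X, all-ones) returns X,
// and not(pcmpgt(C, X)) becomes pcmpgt(X, C - 1) provided no element of C is
// the minimum signed value.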
5805
5806/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5807/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5808/// Note: This ignores saturation, so inputs must be checked first.
5809static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5810 bool Unary, unsigned NumStages = 1) {
5811 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5812 unsigned NumElts = VT.getVectorNumElements();
5813 unsigned NumLanes = VT.getSizeInBits() / 128;
5814 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5815 unsigned Offset = Unary ? 0 : NumElts;
5816 unsigned Repetitions = 1u << (NumStages - 1);
5817 unsigned Increment = 1u << NumStages;
5818 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5819
5820 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5821 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5822 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5823 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5824 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5825 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5826 }
5827 }
5828}
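// Illustrative example (editorial; values assumed, not in the source): a
// binary single-stage pack to MVT::v16i8 (e.g. PACKUSWB of two v8i16 inputs
// viewed as v16i8) produces:
//   Mask = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }
// i.e. the low byte of each LHS element, then of each RHS element; a unary
// pack repeats the LHS indices instead (Offset == 0).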
5829
5830// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5831static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5832 APInt &DemandedLHS, APInt &DemandedRHS) {
5833 int NumLanes = VT.getSizeInBits() / 128;
5834 int NumElts = DemandedElts.getBitWidth();
5835 int NumInnerElts = NumElts / 2;
5836 int NumEltsPerLane = NumElts / NumLanes;
5837 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5838
5839 DemandedLHS = APInt::getZero(NumInnerElts);
5840 DemandedRHS = APInt::getZero(NumInnerElts);
5841
5842 // Map DemandedElts to the packed operands.
5843 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5844 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5845 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5846 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5847 if (DemandedElts[OuterIdx])
5848 DemandedLHS.setBit(InnerIdx);
5849 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5850 DemandedRHS.setBit(InnerIdx);
5851 }
5852 }
5853}
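// Illustrative example (editorial; values assumed, not in the source): for a
// 128-bit v16i8 pack result, demanding result element 3 sets DemandedLHS[3],
// while demanding result element 9 sets DemandedRHS[1].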
5854
5855// Split the demanded elts of a HADD/HSUB node between its operands.
5856static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5857 APInt &DemandedLHS, APInt &DemandedRHS) {
5858  getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5859                                      DemandedLHS, DemandedRHS);
5860 DemandedLHS |= DemandedLHS << 1;
5861 DemandedRHS |= DemandedRHS << 1;
5862}
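// Illustrative example (editorial; values assumed, not in the source): for a
// v4i32 HADD, result element 1 is LHS[2] + LHS[3], so demanding it sets
// DemandedLHS bits {2, 3}; demanding result element 2 sets DemandedRHS
// bits {0, 1}.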
5863
5864/// Calculates the shuffle mask corresponding to the target-specific opcode.
5865/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5866/// operands in \p Ops, and returns true.
5867/// Sets \p IsUnary to true if only one source is used. Note that this will set
5868/// IsUnary for shuffles which use a single input multiple times, and in those
5869/// cases it will adjust the mask to only have indices within that single input.
5870/// It is an error to call this with non-empty Mask/Ops vectors.
5871static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5872                                 SmallVectorImpl<SDValue> &Ops,
5873                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5874 if (!isTargetShuffle(N.getOpcode()))
5875 return false;
5876
5877 MVT VT = N.getSimpleValueType();
5878 unsigned NumElems = VT.getVectorNumElements();
5879 unsigned MaskEltSize = VT.getScalarSizeInBits();
5880  SmallVector<uint64_t, 32> RawMask;
5881  APInt RawUndefs;
5882 uint64_t ImmN;
5883
5884 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5885 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5886
5887 IsUnary = false;
5888 bool IsFakeUnary = false;
5889 switch (N.getOpcode()) {
5890 case X86ISD::BLENDI:
5891 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5892 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5893 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5894 DecodeBLENDMask(NumElems, ImmN, Mask);
5895 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5896 break;
5897 case X86ISD::SHUFP:
5898 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5899 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5900 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5901 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5902 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5903 break;
5904 case X86ISD::INSERTPS:
5905 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5906 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5907 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5908 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5909 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5910 break;
5911 case X86ISD::EXTRQI:
5912 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5913 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5914 isa<ConstantSDNode>(N.getOperand(2))) {
5915 int BitLen = N.getConstantOperandVal(1);
5916 int BitIdx = N.getConstantOperandVal(2);
5917 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5918 IsUnary = true;
5919 }
5920 break;
5921 case X86ISD::INSERTQI:
5922 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5923 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5924 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5925 isa<ConstantSDNode>(N.getOperand(3))) {
5926 int BitLen = N.getConstantOperandVal(2);
5927 int BitIdx = N.getConstantOperandVal(3);
5928 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5929 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5930 }
5931 break;
5932 case X86ISD::UNPCKH:
5933 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5934 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5935 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5936 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5937 break;
5938 case X86ISD::UNPCKL:
5939 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5940 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5941 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5942 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5943 break;
5944 case X86ISD::MOVHLPS:
5945 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5946 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5947 DecodeMOVHLPSMask(NumElems, Mask);
5948 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5949 break;
5950 case X86ISD::MOVLHPS:
5951 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5952 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5953 DecodeMOVLHPSMask(NumElems, Mask);
5954 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5955 break;
5956 case X86ISD::VALIGN:
5957 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5958 "Only 32-bit and 64-bit elements are supported!");
5959 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5960 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5961 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5962 DecodeVALIGNMask(NumElems, ImmN, Mask);
5963 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5964 Ops.push_back(N.getOperand(1));
5965 Ops.push_back(N.getOperand(0));
5966 break;
5967 case X86ISD::PALIGNR:
5968 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5969 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5970 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5971 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5972 DecodePALIGNRMask(NumElems, ImmN, Mask);
5973 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5974 Ops.push_back(N.getOperand(1));
5975 Ops.push_back(N.getOperand(0));
5976 break;
5977 case X86ISD::VSHLDQ:
5978 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5979 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5980 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5981 DecodePSLLDQMask(NumElems, ImmN, Mask);
5982 IsUnary = true;
5983 break;
5984 case X86ISD::VSRLDQ:
5985 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5986 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5987 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5988 DecodePSRLDQMask(NumElems, ImmN, Mask);
5989 IsUnary = true;
5990 break;
5991 case X86ISD::PSHUFD:
5992 case X86ISD::VPERMILPI:
5993 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5994 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5995 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5996 IsUnary = true;
5997 break;
5998 case X86ISD::PSHUFHW:
5999 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6000 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6001 DecodePSHUFHWMask(NumElems, ImmN, Mask);
6002 IsUnary = true;
6003 break;
6004 case X86ISD::PSHUFLW:
6005 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6006 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6007 DecodePSHUFLWMask(NumElems, ImmN, Mask);
6008 IsUnary = true;
6009 break;
6010 case X86ISD::VZEXT_MOVL:
6011 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6012 DecodeZeroMoveLowMask(NumElems, Mask);
6013 IsUnary = true;
6014 break;
6015 case X86ISD::VBROADCAST:
6016 // We only decode broadcasts of same-sized vectors, peeking through to
6017 // extracted subvectors is likely to cause hasOneUse issues with
6018 // SimplifyDemandedBits etc.
6019 if (N.getOperand(0).getValueType() == VT) {
6020 DecodeVectorBroadcast(NumElems, Mask);
6021 IsUnary = true;
6022 break;
6023 }
6024 return false;
6025 case X86ISD::VPERMILPV: {
6026 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6027 IsUnary = true;
6028 SDValue MaskNode = N.getOperand(1);
6029 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6030 RawUndefs)) {
6031 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6032 break;
6033 }
6034 return false;
6035 }
6036 case X86ISD::PSHUFB: {
6037 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6038 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6039 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6040 IsUnary = true;
6041 SDValue MaskNode = N.getOperand(1);
6042 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6043 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6044 break;
6045 }
6046 return false;
6047 }
6048 case X86ISD::VPERMI:
6049 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6050 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6051 DecodeVPERMMask(NumElems, ImmN, Mask);
6052 IsUnary = true;
6053 break;
6054 case X86ISD::MOVSS:
6055 case X86ISD::MOVSD:
6056 case X86ISD::MOVSH:
6057 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6058 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6059 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6060 break;
6061 case X86ISD::VPERM2X128:
6062 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6063 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6064 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6065 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
6066 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6067 break;
6068 case X86ISD::SHUF128:
6069 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6070 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6071 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6072 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
6073 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6074 break;
6075 case X86ISD::MOVSLDUP:
6076 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6077 DecodeMOVSLDUPMask(NumElems, Mask);
6078 IsUnary = true;
6079 break;
6080 case X86ISD::MOVSHDUP:
6081 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6082 DecodeMOVSHDUPMask(NumElems, Mask);
6083 IsUnary = true;
6084 break;
6085 case X86ISD::MOVDDUP:
6086 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6087 DecodeMOVDDUPMask(NumElems, Mask);
6088 IsUnary = true;
6089 break;
6090 case X86ISD::VPERMIL2: {
6091 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6092 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6093 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6094 SDValue MaskNode = N.getOperand(2);
6095 SDValue CtrlNode = N.getOperand(3);
6096 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6097 unsigned CtrlImm = CtrlOp->getZExtValue();
6098 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6099 RawUndefs)) {
6100 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6101 Mask);
6102 break;
6103 }
6104 }
6105 return false;
6106 }
6107 case X86ISD::VPPERM: {
6108 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6109 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6110 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6111 SDValue MaskNode = N.getOperand(2);
6112 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6113 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6114 break;
6115 }
6116 return false;
6117 }
6118 case X86ISD::VPERMV: {
6119 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6120 IsUnary = true;
6121 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6122 Ops.push_back(N.getOperand(1));
6123 SDValue MaskNode = N.getOperand(0);
6124 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6125 RawUndefs)) {
6126 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6127 break;
6128 }
6129 return false;
6130 }
6131 case X86ISD::VPERMV3: {
6132 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6133 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
6134 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
6135 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6136 Ops.push_back(N.getOperand(0));
6137 Ops.push_back(N.getOperand(2));
6138 SDValue MaskNode = N.getOperand(1);
6139 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6140 RawUndefs)) {
6141 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6142 break;
6143 }
6144 return false;
6145 }
6146 case X86ISD::COMPRESS: {
6147 SDValue CmpVec = N.getOperand(0);
6148 SDValue PassThru = N.getOperand(1);
6149 SDValue CmpMask = N.getOperand(2);
6150 APInt UndefElts;
6151 SmallVector<APInt> EltBits;
6152 if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
6153 return false;
6154 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6155 "Illegal compression mask");
6156 for (unsigned I = 0; I != NumElems; ++I) {
6157 if (!EltBits[I].isZero())
6158 Mask.push_back(I);
6159 }
6160 while (Mask.size() != NumElems) {
6161 Mask.push_back(NumElems + Mask.size());
6162 }
6163 Ops.push_back(CmpVec);
6164 Ops.push_back(PassThru);
6165 return true;
6166 }
6167 case X86ISD::EXPAND: {
6168 SDValue ExpVec = N.getOperand(0);
6169 SDValue PassThru = N.getOperand(1);
6170 SDValue ExpMask = N.getOperand(2);
6171 APInt UndefElts;
6172 SmallVector<APInt> EltBits;
6173 if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
6174 return false;
6175 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6176 "Illegal expansion mask");
6177 unsigned ExpIndex = 0;
6178 for (unsigned I = 0; I != NumElems; ++I) {
6179 if (EltBits[I].isZero())
6180 Mask.push_back(I + NumElems);
6181 else
6182 Mask.push_back(ExpIndex++);
6183 }
6184 Ops.push_back(ExpVec);
6185 Ops.push_back(PassThru);
6186 return true;
6187 }
6188 default:
6189 llvm_unreachable("unknown target shuffle node");
6190 }
6191
6192 // Empty mask indicates the decode failed.
6193 if (Mask.empty())
6194 return false;
6195
6196 // Check if we're getting a shuffle mask with zero'd elements.
6197 if (!AllowSentinelZero && isAnyZero(Mask))
6198 return false;
6199
6200 // If we have a fake unary shuffle, the shuffle mask is spread across two
6201 // inputs that are actually the same node. Re-map the mask to always point
6202 // into the first input.
6203 if (IsFakeUnary)
6204 for (int &M : Mask)
6205 if (M >= (int)Mask.size())
6206 M -= Mask.size();
6207
6208 // If we didn't already add operands in the opcode-specific code, default to
6209 // adding 1 or 2 operands starting at 0.
6210 if (Ops.empty()) {
6211 Ops.push_back(N.getOperand(0));
6212 if (!IsUnary || IsFakeUnary)
6213 Ops.push_back(N.getOperand(1));
6214 }
6215
6216 return true;
6217}
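// Illustrative example (editorial; values assumed, not in the source): for
// N = (X86ISD::UNPCKL v4i32 A, B) this returns Ops = {A, B} and
// Mask = {0, 4, 1, 5}; if A == B the node is "fake unary" and the mask is
// remapped into the single input as {0, 0, 1, 1}.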
6218
6219// Wrapper for getTargetShuffleMask that discards the IsUnary result.
6220static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
6221                                 SmallVectorImpl<SDValue> &Ops,
6222                                 SmallVectorImpl<int> &Mask) {
6223 bool IsUnary;
6224 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
6225}
6226
6227/// Compute whether each element of a shuffle is zeroable.
6228///
6229/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6230/// Either it is an undef element in the shuffle mask, the element of the input
6231/// referenced is undef, or the element of the input referenced is known to be
6232/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6233/// as many lanes with this technique as possible to simplify the remaining
6234/// shuffle.
6235static void computeZeroableShuffleElements(ArrayRef<int> Mask,
6236                                           SDValue V1, SDValue V2,
6237 APInt &KnownUndef, APInt &KnownZero) {
6238 int Size = Mask.size();
6239 KnownUndef = KnownZero = APInt::getZero(Size);
6240
6241 V1 = peekThroughBitcasts(V1);
6242 V2 = peekThroughBitcasts(V2);
6243
6244 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6245 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6246
6247 int VectorSizeInBits = V1.getValueSizeInBits();
6248 int ScalarSizeInBits = VectorSizeInBits / Size;
6249 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6250
6251 for (int i = 0; i < Size; ++i) {
6252 int M = Mask[i];
6253 // Handle the easy cases.
6254 if (M < 0) {
6255 KnownUndef.setBit(i);
6256 continue;
6257 }
6258 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6259 KnownZero.setBit(i);
6260 continue;
6261 }
6262
6263 // Determine shuffle input and normalize the mask.
6264 SDValue V = M < Size ? V1 : V2;
6265 M %= Size;
6266
6267 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6268 if (V.getOpcode() != ISD::BUILD_VECTOR)
6269 continue;
6270
6271    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
6272 // the (larger) source element must be UNDEF/ZERO.
6273 if ((Size % V.getNumOperands()) == 0) {
6274 int Scale = Size / V->getNumOperands();
6275 SDValue Op = V.getOperand(M / Scale);
6276 if (Op.isUndef())
6277 KnownUndef.setBit(i);
6278 if (X86::isZeroNode(Op))
6279 KnownZero.setBit(i);
6280 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6281 APInt Val = Cst->getAPIntValue();
6282 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6283 if (Val == 0)
6284 KnownZero.setBit(i);
6285 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6286 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6287 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6288 if (Val == 0)
6289 KnownZero.setBit(i);
6290 }
6291 continue;
6292 }
6293
6294    // If the BUILD_VECTOR has more elements, then all the (smaller) source
6295 // elements must be UNDEF or ZERO.
6296 if ((V.getNumOperands() % Size) == 0) {
6297 int Scale = V->getNumOperands() / Size;
6298 bool AllUndef = true;
6299 bool AllZero = true;
6300 for (int j = 0; j < Scale; ++j) {
6301 SDValue Op = V.getOperand((M * Scale) + j);
6302 AllUndef &= Op.isUndef();
6303 AllZero &= X86::isZeroNode(Op);
6304 }
6305 if (AllUndef)
6306 KnownUndef.setBit(i);
6307 if (AllZero)
6308 KnownZero.setBit(i);
6309 continue;
6310 }
6311 }
6312}
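// Illustrative example (editorial; values assumed, not in the source): for
// Mask = {0, 5, -1, 2} with V2 an all-zeros build_vector, element 1 reads a
// known-zero lane (KnownZero[1]) and element 2 is undef in the mask itself
// (KnownUndef[2]).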
6313
6314/// Decode a target shuffle mask and inputs and see if any values are
6315/// known to be undef or zero from their inputs.
6316/// Returns true if the target shuffle mask was decoded.
6317/// FIXME: Merge this with computeZeroableShuffleElements?
6318static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
6319                                         SmallVectorImpl<SDValue> &Ops,
6320                                         APInt &KnownUndef, APInt &KnownZero) {
6321 bool IsUnary;
6322 if (!isTargetShuffle(N.getOpcode()))
6323 return false;
6324
6325 MVT VT = N.getSimpleValueType();
6326 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
6327 return false;
6328
6329 int Size = Mask.size();
6330 SDValue V1 = Ops[0];
6331 SDValue V2 = IsUnary ? V1 : Ops[1];
6332 KnownUndef = KnownZero = APInt::getZero(Size);
6333
6334 V1 = peekThroughBitcasts(V1);
6335 V2 = peekThroughBitcasts(V2);
6336
6337 assert((VT.getSizeInBits() % Size) == 0 &&
6338 "Illegal split of shuffle value type");
6339 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
6340
6341 // Extract known constant input data.
6342 APInt UndefSrcElts[2];
6343 SmallVector<APInt, 32> SrcEltBits[2];
6344 bool IsSrcConstant[2] = {
6345 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6346 SrcEltBits[0], /*AllowWholeUndefs*/ true,
6347 /*AllowPartialUndefs*/ false),
6348 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6349 SrcEltBits[1], /*AllowWholeUndefs*/ true,
6350 /*AllowPartialUndefs*/ false)};
6351
6352 for (int i = 0; i < Size; ++i) {
6353 int M = Mask[i];
6354
6355 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6356 if (M < 0) {
6357 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
6358 if (SM_SentinelUndef == M)
6359 KnownUndef.setBit(i);
6360 if (SM_SentinelZero == M)
6361 KnownZero.setBit(i);
6362 continue;
6363 }
6364
6365 // Determine shuffle input and normalize the mask.
6366 unsigned SrcIdx = M / Size;
6367 SDValue V = M < Size ? V1 : V2;
6368 M %= Size;
6369
6370 // We are referencing an UNDEF input.
6371 if (V.isUndef()) {
6372 KnownUndef.setBit(i);
6373 continue;
6374 }
6375
6376 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6377 // TODO: We currently only set UNDEF for integer types - floats use the same
6378 // registers as vectors and many of the scalar folded loads rely on the
6379 // SCALAR_TO_VECTOR pattern.
6380 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6381 (Size % V.getValueType().getVectorNumElements()) == 0) {
6382 int Scale = Size / V.getValueType().getVectorNumElements();
6383 int Idx = M / Scale;
6384 if (Idx != 0 && !VT.isFloatingPoint())
6385 KnownUndef.setBit(i);
6386 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6387 KnownZero.setBit(i);
6388 continue;
6389 }
6390
6391 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6392 // base vectors.
6393 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6394 SDValue Vec = V.getOperand(0);
6395 int NumVecElts = Vec.getValueType().getVectorNumElements();
6396 if (Vec.isUndef() && Size == NumVecElts) {
6397 int Idx = V.getConstantOperandVal(2);
6398 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6399 if (M < Idx || (Idx + NumSubElts) <= M)
6400 KnownUndef.setBit(i);
6401 }
6402 continue;
6403 }
6404
6405 // Attempt to extract from the source's constant bits.
6406 if (IsSrcConstant[SrcIdx]) {
6407 if (UndefSrcElts[SrcIdx][M])
6408 KnownUndef.setBit(i);
6409 else if (SrcEltBits[SrcIdx][M] == 0)
6410 KnownZero.setBit(i);
6411 }
6412 }
6413
6414 assert(VT.getVectorNumElements() == (unsigned)Size &&
6415 "Different mask size from vector size!");
6416 return true;
6417}
6418
6419// Replace target shuffle mask elements with known undef/zero sentinels.
6420static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6421                                              const APInt &KnownUndef,
6422 const APInt &KnownZero,
6423                                              bool ResolveKnownZeros = true) {
6424 unsigned NumElts = Mask.size();
6425 assert(KnownUndef.getBitWidth() == NumElts &&
6426 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6427
6428 for (unsigned i = 0; i != NumElts; ++i) {
6429 if (KnownUndef[i])
6430 Mask[i] = SM_SentinelUndef;
6431 else if (ResolveKnownZeros && KnownZero[i])
6432 Mask[i] = SM_SentinelZero;
6433 }
6434}
6435
6436// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6437static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6438                                              APInt &KnownUndef,
6439 APInt &KnownZero) {
6440 unsigned NumElts = Mask.size();
6441 KnownUndef = KnownZero = APInt::getZero(NumElts);
6442
6443 for (unsigned i = 0; i != NumElts; ++i) {
6444 int M = Mask[i];
6445 if (SM_SentinelUndef == M)
6446 KnownUndef.setBit(i);
6447 if (SM_SentinelZero == M)
6448 KnownZero.setBit(i);
6449 }
6450}
6451
6452// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6453static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6454                                         SDValue Cond, bool IsBLENDV = false) {
6455 EVT CondVT = Cond.getValueType();
6456 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6457 unsigned NumElts = CondVT.getVectorNumElements();
6458
6459 APInt UndefElts;
6460 SmallVector<APInt, 32> EltBits;
6461 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6462 /*AllowWholeUndefs*/ true,
6463 /*AllowPartialUndefs*/ false))
6464 return false;
6465
6466 Mask.resize(NumElts, SM_SentinelUndef);
6467
6468 for (int i = 0; i != (int)NumElts; ++i) {
6469 Mask[i] = i;
6470 // Arbitrarily choose from the 2nd operand if the select condition element
6471 // is undef.
6472 // TODO: Can we do better by matching patterns such as even/odd?
6473 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6474 (IsBLENDV && EltBits[i].isNonNegative()))
6475 Mask[i] += NumElts;
6476 }
6477
6478 return true;
6479}
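// Illustrative example (editorial; values assumed, not in the source): a
// v4i32 VSELECT condition constant <-1, 0, -1, 0> yields Mask = {0, 5, 2, 7}:
// true lanes select from the first value operand (indices 0..3) and false
// lanes from the second (indices 4..7).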
6480
6481// Forward declaration (for getFauxShuffleMask recursive check).
6482static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6483                                    SmallVectorImpl<SDValue> &Inputs,
6484                                    SmallVectorImpl<int> &Mask,
6485                                    const SelectionDAG &DAG, unsigned Depth,
6486 bool ResolveKnownElts);
6487
6488// Attempt to decode ops that could be represented as a shuffle mask.
6489// The decoded shuffle mask may contain a different number of elements than
6490// the destination value type.
6491// TODO: Merge into getTargetShuffleInputs()
6492static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6493                               SmallVectorImpl<int> &Mask,
6494                               SmallVectorImpl<SDValue> &Ops,
6495                               const SelectionDAG &DAG, unsigned Depth,
6496 bool ResolveKnownElts) {
6497 Mask.clear();
6498 Ops.clear();
6499
6500 MVT VT = N.getSimpleValueType();
6501 unsigned NumElts = VT.getVectorNumElements();
6502 unsigned NumSizeInBits = VT.getSizeInBits();
6503 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6504 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6505 return false;
6506 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6507 unsigned NumSizeInBytes = NumSizeInBits / 8;
6508 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6509
6510 unsigned Opcode = N.getOpcode();
6511 switch (Opcode) {
6512 case ISD::VECTOR_SHUFFLE: {
6513 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6514 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6515 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6516 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6517 Ops.push_back(N.getOperand(0));
6518 Ops.push_back(N.getOperand(1));
6519 return true;
6520 }
6521 return false;
6522 }
6523 case ISD::AND:
6524 case X86ISD::ANDNP: {
6525 // Attempt to decode as a per-byte mask.
6526 APInt UndefElts;
6527 SmallVector<APInt, 32> EltBits;
6528 SDValue N0 = N.getOperand(0);
6529 SDValue N1 = N.getOperand(1);
6530 bool IsAndN = (X86ISD::ANDNP == Opcode);
6531 uint64_t ZeroMask = IsAndN ? 255 : 0;
6532 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6533 /*AllowWholeUndefs*/ false,
6534 /*AllowPartialUndefs*/ false))
6535 return false;
6536 // We can't assume an undef src element gives an undef dst - the other src
6537 // might be zero.
6538 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6539 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6540 const APInt &ByteBits = EltBits[i];
6541 if (ByteBits != 0 && ByteBits != 255)
6542 return false;
6543 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6544 }
6545 Ops.push_back(IsAndN ? N1 : N0);
6546 return true;
6547 }
6548 case ISD::OR: {
6549 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6550 // is a valid shuffle index.
6551 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6552 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6553 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6554 return false;
6555
6556 SmallVector<int, 64> SrcMask0, SrcMask1;
6557 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6558    APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6559    APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6560    if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6561 Depth + 1, true) ||
6562 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6563 Depth + 1, true))
6564 return false;
6565
6566 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6567 SmallVector<int, 64> Mask0, Mask1;
6568 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6569 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6570 for (int i = 0; i != (int)MaskSize; ++i) {
6571 // NOTE: Don't handle demanded SM_SentinelUndef, as we can end up in
6572 // infinite loops converting between OR and BLEND shuffles due to
6573 // canWidenShuffleElements merging away undef elements, meaning we
6574 // fail to recognise the OR as the undef element isn't known zero.
6575 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6576 Mask.push_back(SM_SentinelZero);
6577 else if (Mask1[i] == SM_SentinelZero)
6578 Mask.push_back(i);
6579 else if (Mask0[i] == SM_SentinelZero)
6580 Mask.push_back(i + MaskSize);
6581 else if (MaskSize == NumElts && !DemandedElts[i])
6582 Mask.push_back(SM_SentinelUndef);
6583 else
6584 return false;
6585 }
6586 Ops.push_back(N.getOperand(0));
6587 Ops.push_back(N.getOperand(1));
6588 return true;
6589 }
6590 case ISD::CONCAT_VECTORS: {
6591 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6592 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6593 if (NumBitsPerElt == 64) {
6594 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6595 for (unsigned M = 0; M != NumSubElts; ++M)
6596 Mask.push_back((I * NumElts) + M);
6597 Ops.push_back(N.getOperand(I));
6598 }
6599 return true;
6600 }
6601 return false;
6602 }
6603 case ISD::INSERT_SUBVECTOR: {
6604 SDValue Src = N.getOperand(0);
6605 SDValue Sub = N.getOperand(1);
6606 EVT SubVT = Sub.getValueType();
6607 unsigned NumSubElts = SubVT.getVectorNumElements();
6608 uint64_t InsertIdx = N.getConstantOperandVal(2);
6609 // Subvector isn't demanded - just return the base vector.
6610 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6611 Mask.resize(NumElts);
6612 std::iota(Mask.begin(), Mask.end(), 0);
6613 Ops.push_back(Src);
6614 return true;
6615 }
6616 // Handle CONCAT(SUB0, SUB1).
6617 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6618 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6619 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6620 Src.getOperand(0).isUndef() &&
6621 Src.getOperand(1).getValueType() == SubVT &&
6622 Src.getConstantOperandVal(2) == 0 &&
6623 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6624 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6625 Mask.resize(NumElts);
6626 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6627 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6628 Ops.push_back(Src.getOperand(1));
6629 Ops.push_back(Sub);
6630 return true;
6631 }
6632 // Handle INSERT_SUBVECTOR(UNDEF, SUB, IDX) iff IDX != 0
6633    if (InsertIdx != 0 && Src.isUndef() &&
6634        N->isOnlyUserOf(Sub.getNode())) {
6635 Mask.assign(NumElts, SM_SentinelUndef);
6636 std::iota(Mask.begin() + InsertIdx, Mask.begin() + InsertIdx + NumSubElts,
6637 0);
6638 Ops.push_back(Sub);
6639 return true;
6640 }
6641 if (!N->isOnlyUserOf(Sub.getNode()))
6642 return false;
6643
6644 SmallVector<int, 64> SubMask;
6645 SmallVector<SDValue, 2> SubInputs;
6646    SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6647    EVT SubSrcVT = SubSrc.getValueType();
6648 if (!SubSrcVT.isVector())
6649 return false;
6650
6651 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6652 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6653 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6654 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6655 SDValue SubSrcSrc = SubSrc.getOperand(0);
6656 unsigned NumSubSrcSrcElts =
6657 SubSrcSrc.getValueType().getVectorNumElements();
6658 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6659 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6660 "Subvector valuetype mismatch");
6661 InsertIdx *= (MaxElts / NumElts);
6662 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6663 NumSubElts *= (MaxElts / NumElts);
6664 bool SrcIsUndef = Src.isUndef();
6665 for (int i = 0; i != (int)MaxElts; ++i)
6666 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6667 for (int i = 0; i != (int)NumSubElts; ++i)
6668 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6669 if (!SrcIsUndef)
6670 Ops.push_back(Src);
6671 Ops.push_back(SubSrcSrc);
6672 return true;
6673 }
6674
6675 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6676 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6677 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6678 Depth + 1, ResolveKnownElts))
6679 return false;
6680
6681 // Subvector shuffle inputs must not be larger than the subvector.
6682 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6683 return SubVT.getFixedSizeInBits() <
6684 SubInput.getValueSizeInBits().getFixedValue();
6685 }))
6686 return false;
6687
6688 if (SubMask.size() != NumSubElts) {
6689 assert(((SubMask.size() % NumSubElts) == 0 ||
6690 (NumSubElts % SubMask.size()) == 0) &&
6691 "Illegal submask scale");
6692 if ((NumSubElts % SubMask.size()) == 0) {
6693 int Scale = NumSubElts / SubMask.size();
6694 SmallVector<int, 64> ScaledSubMask;
6695 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6696 SubMask = ScaledSubMask;
6697 } else {
6698 int Scale = SubMask.size() / NumSubElts;
6699 NumSubElts = SubMask.size();
6700 NumElts *= Scale;
6701 InsertIdx *= Scale;
6702 }
6703 }
6704 Ops.push_back(Src);
6705 Ops.append(SubInputs.begin(), SubInputs.end());
6706 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6707 Mask.append(NumElts, SM_SentinelZero);
6708 else
6709 for (int i = 0; i != (int)NumElts; ++i)
6710 Mask.push_back(i);
6711 for (int i = 0; i != (int)NumSubElts; ++i) {
6712 int M = SubMask[i];
6713 if (0 <= M) {
6714 int InputIdx = M / NumSubElts;
6715 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6716 }
6717 Mask[i + InsertIdx] = M;
6718 }
6719 return true;
6720 }
6721 case X86ISD::PINSRB:
6722  case X86ISD::PINSRW:
6723  case ISD::SCALAR_TO_VECTOR:
6724  case ISD::INSERT_VECTOR_ELT: {
6725    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6726 // vector, for matching src/dst vector types.
6727 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6728
6729 unsigned DstIdx = 0;
6730 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6731 // Check we have an in-range constant insertion index.
6732 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6733 N.getConstantOperandAPInt(2).uge(NumElts))
6734 return false;
6735 DstIdx = N.getConstantOperandVal(2);
6736
6737 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6738 if (X86::isZeroNode(Scl)) {
6739 Ops.push_back(N.getOperand(0));
6740 for (unsigned i = 0; i != NumElts; ++i)
6741 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6742 return true;
6743 }
6744 }
6745
6746 // Peek through trunc/aext/zext/bitcast.
6747 // TODO: aext shouldn't require SM_SentinelZero padding.
6748 // TODO: handle shift of scalars.
6749 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6750 while (Scl.getOpcode() == ISD::TRUNCATE ||
6751 Scl.getOpcode() == ISD::ANY_EXTEND ||
6752 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6753           (Scl.getOpcode() == ISD::BITCAST &&
6754            Scl.getScalarValueSizeInBits() ==
6755                Scl.getOperand(0).getScalarValueSizeInBits())) {
6756      Scl = Scl.getOperand(0);
6757 MinBitsPerElt =
6758 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6759 }
6760 if ((MinBitsPerElt % 8) != 0)
6761 return false;
6762
6763 // Attempt to find the source vector the scalar was extracted from.
6764 SDValue SrcExtract;
6765 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6766 Scl.getOpcode() == X86ISD::PEXTRW ||
6767 Scl.getOpcode() == X86ISD::PEXTRB) &&
6768 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6769 SrcExtract = Scl;
6770 }
6771 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6772 return false;
6773
6774 SDValue SrcVec = SrcExtract.getOperand(0);
6775 EVT SrcVT = SrcVec.getValueType();
6776 if (!SrcVT.getScalarType().isByteSized())
6777 return false;
6778 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6779 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6780 unsigned DstByte = DstIdx * NumBytesPerElt;
6781 MinBitsPerElt =
6782 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6783
6784 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6785 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6786 Ops.push_back(SrcVec);
6787 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6788 } else {
6789 Ops.push_back(SrcVec);
6790 Ops.push_back(N.getOperand(0));
6791 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6792 Mask.push_back(NumSizeInBytes + i);
6793 }
6794
6795 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6796 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6797 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6798 Mask[DstByte + i] = SrcByte + i;
6799 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6800 Mask[DstByte + i] = SM_SentinelZero;
6801 return true;
6802 }
6803 case X86ISD::PACKSS:
6804 case X86ISD::PACKUS: {
6805 SDValue N0 = N.getOperand(0);
6806 SDValue N1 = N.getOperand(1);
6807 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6808 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6809 "Unexpected input value type");
6810
6811 APInt EltsLHS, EltsRHS;
6812 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6813
6814    // If we know input saturation won't happen (or we don't care about
6815    // particular lanes), we can treat this as a truncation shuffle.
6816 bool Offset0 = false, Offset1 = false;
6817 if (Opcode == X86ISD::PACKSS) {
6818 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6819 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6820 (!(N1.isUndef() || EltsRHS.isZero()) &&
6821 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6822 return false;
6823 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6824 // PACKSS then it was likely being used for sign-extension for a
6825 // truncation, so just peek through and adjust the mask accordingly.
6826 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6827 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6828 Offset0 = true;
6829 N0 = N0.getOperand(0);
6830 }
6831 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6832 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6833 Offset1 = true;
6834 N1 = N1.getOperand(0);
6835 }
6836 } else {
6837 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6838 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6839 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6840 (!(N1.isUndef() || EltsRHS.isZero()) &&
6841 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6842 return false;
6843 }
6844
6845 bool IsUnary = (N0 == N1);
6846
6847 Ops.push_back(N0);
6848 if (!IsUnary)
6849 Ops.push_back(N1);
6850
6851 createPackShuffleMask(VT, Mask, IsUnary);
6852
6853 if (Offset0 || Offset1) {
6854 for (int &M : Mask)
6855 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6856 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6857 ++M;
6858 }
6859 return true;
6860 }
6861 case ISD::VSELECT:
6862 case X86ISD::BLENDV: {
6863 SDValue Cond = N.getOperand(0);
6864 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6865 Ops.push_back(N.getOperand(1));
6866 Ops.push_back(N.getOperand(2));
6867 return true;
6868 }
6869 return false;
6870 }
6871 case X86ISD::VTRUNC: {
6872 SDValue Src = N.getOperand(0);
6873 EVT SrcVT = Src.getValueType();
6874 if (SrcVT.getSizeInBits() != NumSizeInBits)
6875 return false;
6876 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6877 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6878 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6879 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6880 for (unsigned i = 0; i != NumSrcElts; ++i)
6881 Mask.push_back(i * Scale);
6882 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6883 Ops.push_back(Src);
6884 return true;
6885 }
6886 case ISD::SHL:
6887 case ISD::SRL: {
6888 APInt UndefElts;
6889 SmallVector<APInt, 32> EltBits;
6890 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6891 UndefElts, EltBits,
6892 /*AllowWholeUndefs*/ true,
6893 /*AllowPartialUndefs*/ false))
6894 return false;
6895
6896 // We can only decode 'whole byte' bit shifts as shuffles.
6897 for (unsigned I = 0; I != NumElts; ++I)
6898 if (DemandedElts[I] && !UndefElts[I] &&
6899 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6900 return false;
6901
6902 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6903 Ops.push_back(N.getOperand(0));
6904
6905 for (unsigned I = 0; I != NumElts; ++I) {
6906 if (!DemandedElts[I] || UndefElts[I])
6907 continue;
6908 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6909 unsigned Lo = I * NumBytesPerElt;
6910 unsigned Hi = Lo + NumBytesPerElt;
6911 // Clear mask to all zeros and insert the shifted byte indices.
6912 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6913 if (ISD::SHL == Opcode)
6914 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6915 else
6916 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6917 Lo + ByteShift);
6918 }
6919 return true;
6920 }
6921 case X86ISD::VSHLI:
6922 case X86ISD::VSRLI: {
6923 uint64_t ShiftVal = N.getConstantOperandVal(1);
6924 // Out of range bit shifts are guaranteed to be zero.
6925 if (NumBitsPerElt <= ShiftVal) {
6926 Mask.append(NumElts, SM_SentinelZero);
6927 return true;
6928 }
6929
6930 // We can only decode 'whole byte' bit shifts as shuffles.
6931 if ((ShiftVal % 8) != 0)
6932 break;
6933
6934 uint64_t ByteShift = ShiftVal / 8;
6935 Ops.push_back(N.getOperand(0));
6936
6937 // Clear mask to all zeros and insert the shifted byte indices.
6938 Mask.append(NumSizeInBytes, SM_SentinelZero);
6939
6940 if (X86ISD::VSHLI == Opcode) {
6941 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6942 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6943 Mask[i + j] = i + j - ByteShift;
6944 } else {
6945 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6946 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6947 Mask[i + j - ByteShift] = i + j;
6948 }
6949 return true;
6950 }
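  // Illustrative example (editorial; values assumed, not in the source):
  // (X86ISD::VSHLI v2i64 X, 8) shifts each element left by one whole byte,
  // so it decodes to the v16i8 mask
  //   { Z, 0, 1, 2, 3, 4, 5, 6, Z, 8, 9, 10, 11, 12, 13, 14 }
  // where Z == SM_SentinelZero marks the vacated low byte of each element.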
6951 case ISD::ROTL:
6952 case ISD::ROTR: {
6953 APInt UndefElts;
6954 SmallVector<APInt, 32> EltBits;
6955 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6956 UndefElts, EltBits,
6957 /*AllowWholeUndefs*/ true,
6958 /*AllowPartialUndefs*/ false))
6959 return false;
6960
6961 // We can only decode 'whole byte' bit rotates as shuffles.
6962 for (unsigned I = 0; I != NumElts; ++I)
6963 if (DemandedElts[I] && !UndefElts[I] &&
6964 (EltBits[I].urem(NumBitsPerElt) % 8) != 0)
6965 return false;
6966
6967 Ops.push_back(N.getOperand(0));
6968 for (unsigned I = 0; I != NumElts; ++I) {
6969 if (!DemandedElts[I] || UndefElts[I]) {
6970 Mask.append(NumBytesPerElt, SM_SentinelUndef);
6971 continue;
6972 }
6973 int Offset = EltBits[I].urem(NumBitsPerElt) / 8;
6974 Offset = (ISD::ROTL == Opcode ? NumBytesPerElt - Offset : Offset);
6975 int BaseIdx = I * NumBytesPerElt;
6976 for (int J = 0; J != (int)NumBytesPerElt; ++J) {
6977 Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt));
6978 }
6979 }
6980 return true;
6981 }
6982 case X86ISD::VROTLI:
6983 case X86ISD::VROTRI: {
6984 // We can only decode 'whole byte' bit rotates as shuffles.
6985 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6986 if ((RotateVal % 8) != 0)
6987 return false;
6988 Ops.push_back(N.getOperand(0));
6989 int Offset = RotateVal / 8;
6990 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6991 for (int i = 0; i != (int)NumElts; ++i) {
6992 int BaseIdx = i * NumBytesPerElt;
6993 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6994 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6995 }
6996 }
6997 return true;
6998 }
6999 case X86ISD::VBROADCAST: {
7000 SDValue Src = N.getOperand(0);
7001 if (!Src.getSimpleValueType().isVector()) {
7002 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7003 !isNullConstant(Src.getOperand(1)) ||
7004 Src.getOperand(0).getValueType().getScalarType() !=
7005 VT.getScalarType())
7006 return false;
7007 Src = Src.getOperand(0);
7008 }
7009 Ops.push_back(Src);
7010 Mask.append(NumElts, 0);
7011 return true;
7012 }
7013  case ISD::SIGN_EXTEND_VECTOR_INREG: {
7014    SDValue Src = N.getOperand(0);
7015 EVT SrcVT = Src.getValueType();
7016 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7017
7018 // Extended source must be a simple vector.
7019 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7020 (NumBitsPerSrcElt % 8) != 0)
7021 return false;
7022
7023 // We can only handle all-signbits extensions.
7024 APInt DemandedSrcElts =
7025 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
7026 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
7027 return false;
7028
7029 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
7030 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
7031 for (unsigned I = 0; I != NumElts; ++I)
7032 Mask.append(Scale, I);
7033 Ops.push_back(Src);
7034 return true;
7035 }
7036 case ISD::ZERO_EXTEND:
7037  case ISD::ANY_EXTEND:
7038  case ISD::ZERO_EXTEND_VECTOR_INREG:
7039  case ISD::ANY_EXTEND_VECTOR_INREG: {
7040 SDValue Src = N.getOperand(0);
7041 EVT SrcVT = Src.getValueType();
7042
7043 // Extended source must be a simple vector.
7044 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7045 (SrcVT.getScalarSizeInBits() % 8) != 0)
7046 return false;
7047
7048 bool IsAnyExtend =
7049 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7050 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7051 IsAnyExtend, Mask);
7052 Ops.push_back(Src);
7053 return true;
7054 }
7055 }
7056
7057 return false;
7058}
7059
7060/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7061static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7062                                              SmallVectorImpl<int> &Mask) {
7063 int MaskWidth = Mask.size();
7064 SmallVector<SDValue, 16> UsedInputs;
7065 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7066 int lo = UsedInputs.size() * MaskWidth;
7067 int hi = lo + MaskWidth;
7068
7069 // Strip UNDEF input usage.
7070 if (Inputs[i].isUndef())
7071 for (int &M : Mask)
7072 if ((lo <= M) && (M < hi))
7073 M = SM_SentinelUndef;
7074
7075 // Check for unused inputs.
7076 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7077 for (int &M : Mask)
7078 if (lo <= M)
7079 M -= MaskWidth;
7080 continue;
7081 }
7082
7083 // Check for repeated inputs.
7084 bool IsRepeat = false;
7085 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7086 if (UsedInputs[j] != Inputs[i])
7087 continue;
7088 for (int &M : Mask)
7089 if (lo <= M)
7090 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7091 IsRepeat = true;
7092 break;
7093 }
7094 if (IsRepeat)
7095 continue;
7096
7097 UsedInputs.push_back(Inputs[i]);
7098 }
7099 Inputs = std::move(UsedInputs);
7100}
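// Illustrative example (editorial; values assumed, not in the source): with
// Inputs = {A, A} and Mask = {0, 4, 1, 5}, the repeated input is dropped and
// the result is Inputs = {A}, Mask = {0, 0, 1, 1}.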
7101
7102/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7103/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7104/// Returns true if the target shuffle mask was decoded.
7105static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7106                                   SmallVectorImpl<SDValue> &Inputs,
7107                                   SmallVectorImpl<int> &Mask,
7108                                   APInt &KnownUndef, APInt &KnownZero,
7109 const SelectionDAG &DAG, unsigned Depth,
7110 bool ResolveKnownElts) {
7111  if (Depth >= SelectionDAG::MaxRecursionDepth)
7112    return false; // Limit search depth.
7113
7114 EVT VT = Op.getValueType();
7115 if (!VT.isSimple() || !VT.isVector())
7116 return false;
7117
7118 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7119 if (ResolveKnownElts)
7120 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7121 return true;
7122 }
7123 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7124 ResolveKnownElts)) {
7125 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7126 return true;
7127 }
7128 return false;
7129}
7130
7131static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7132                                   SmallVectorImpl<SDValue> &Inputs,
7133                                   SmallVectorImpl<int> &Mask,
7134                                   const SelectionDAG &DAG, unsigned Depth,
7135 bool ResolveKnownElts) {
7136 APInt KnownUndef, KnownZero;
7137 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7138 KnownZero, DAG, Depth, ResolveKnownElts);
7139}
7140
7141static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7142                                   SmallVectorImpl<int> &Mask,
7143                                   const SelectionDAG &DAG, unsigned Depth = 0,
7144 bool ResolveKnownElts = true) {
7145 EVT VT = Op.getValueType();
7146 if (!VT.isSimple() || !VT.isVector())
7147 return false;
7148
7149 unsigned NumElts = Op.getValueType().getVectorNumElements();
7150 APInt DemandedElts = APInt::getAllOnes(NumElts);
7151 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
7152 ResolveKnownElts);
7153}
7154
7155// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
7156static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7157 EVT MemVT, MemSDNode *Mem, unsigned Offset,
7158 SelectionDAG &DAG) {
7159 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
7160 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
7161 "Unknown broadcast load type");
7162
7163  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
7164 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
7165 return SDValue();
7166
7167  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
7168                                         TypeSize::getFixed(Offset), DL);
7169 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7170 SDValue Ops[] = {Mem->getChain(), Ptr};
7171 SDValue BcstLd = DAG.getMemIntrinsicNode(
7172 Opcode, DL, Tys, Ops, MemVT,
7173      DAG.getMachineFunction().getMachineMemOperand(
7174          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
7175 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
7176 return BcstLd;
7177}
7178
7179/// Returns the scalar element that will make up the i'th
7180/// element of the result of the vector shuffle.
7181static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7182 SelectionDAG &DAG, unsigned Depth) {
7183  if (Depth >= SelectionDAG::MaxRecursionDepth)
7184    return SDValue(); // Limit search depth.
7185
7186 EVT VT = Op.getValueType();
7187 unsigned Opcode = Op.getOpcode();
7188 unsigned NumElems = VT.getVectorNumElements();
7189
7190 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7191 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
7192 int Elt = SV->getMaskElt(Index);
7193
7194 if (Elt < 0)
7195 return DAG.getUNDEF(VT.getVectorElementType());
7196
7197 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
7198 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7199 }
7200
7201 // Recurse into target specific vector shuffles to find scalars.
7202 if (isTargetShuffle(Opcode)) {
7203 MVT ShufVT = VT.getSimpleVT();
7204 MVT ShufSVT = ShufVT.getVectorElementType();
7205 int NumElems = (int)ShufVT.getVectorNumElements();
7206 SmallVector<int, 16> ShuffleMask;
7207    SmallVector<SDValue, 16> ShuffleOps;
7208    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
7209 return SDValue();
7210
7211 int Elt = ShuffleMask[Index];
7212 if (Elt == SM_SentinelZero)
7213 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
7214 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
7215 if (Elt == SM_SentinelUndef)
7216 return DAG.getUNDEF(ShufSVT);
7217
7218 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
7219 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7220 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7221 }
7222
7223 // Recurse into insert_subvector base/sub vector to find scalars.
7224 if (Opcode == ISD::INSERT_SUBVECTOR) {
7225 SDValue Vec = Op.getOperand(0);
7226 SDValue Sub = Op.getOperand(1);
7227 uint64_t SubIdx = Op.getConstantOperandVal(2);
7228 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
7229
7230 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7231 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
7232 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
7233 }
7234
7235 // Recurse into concat_vectors sub vector to find scalars.
7236 if (Opcode == ISD::CONCAT_VECTORS) {
7237 EVT SubVT = Op.getOperand(0).getValueType();
7238 unsigned NumSubElts = SubVT.getVectorNumElements();
7239 uint64_t SubIdx = Index / NumSubElts;
7240 uint64_t SubElt = Index % NumSubElts;
7241 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
7242 }
7243
7244 // Recurse into extract_subvector src vector to find scalars.
7245 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
7246 SDValue Src = Op.getOperand(0);
7247 uint64_t SrcIdx = Op.getConstantOperandVal(1);
7248 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
7249 }
7250
7251 // We only peek through bitcasts of the same vector width.
7252 if (Opcode == ISD::BITCAST) {
7253 SDValue Src = Op.getOperand(0);
7254 EVT SrcVT = Src.getValueType();
7255 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
7256 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
7257 return SDValue();
7258 }
7259
7260 // Actual nodes that may contain scalar elements
7261
7262 // For insert_vector_elt - either return the index matching scalar or recurse
7263 // into the base vector.
7264 if (Opcode == ISD::INSERT_VECTOR_ELT &&
7265 isa<ConstantSDNode>(Op.getOperand(2))) {
7266 if (Op.getConstantOperandAPInt(2) == Index)
7267 return Op.getOperand(1);
7268 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
7269 }
7270
7271 if (Opcode == ISD::SCALAR_TO_VECTOR)
7272 return (Index == 0) ? Op.getOperand(0)
7273 : DAG.getUNDEF(VT.getVectorElementType());
7274
7275 if (Opcode == ISD::BUILD_VECTOR)
7276 return Op.getOperand(Index);
7277
7278 return SDValue();
7279}
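// Illustrative example (editorial; values assumed, not in the source):
// requesting element 5 of (insert_subvector undef, (v4i32 Sub), 4) recurses
// into Sub at index 1; requesting element 0 of (scalar_to_vector X) returns
// X itself.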
7280
7281// Use PINSRB/PINSRW/PINSRD to create a build vector.
7282static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
7283                                        const APInt &NonZeroMask,
7284 unsigned NumNonZero, unsigned NumZero,
7285 SelectionDAG &DAG,
7286 const X86Subtarget &Subtarget) {
7287 MVT VT = Op.getSimpleValueType();
7288 unsigned NumElts = VT.getVectorNumElements();
7289 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7290 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7291 "Illegal vector insertion");
7292
7293 SDValue V;
7294 bool First = true;
7295
7296 for (unsigned i = 0; i < NumElts; ++i) {
7297 bool IsNonZero = NonZeroMask[i];
7298 if (!IsNonZero)
7299 continue;
7300
7301 // If the build vector contains zeros or our first insertion is not the
7302    // first index, then insert into a zero vector to break any register
7303    // dependency; else use SCALAR_TO_VECTOR.
7304 if (First) {
7305 First = false;
7306 if (NumZero || 0 != i)
7307 V = getZeroVector(VT, Subtarget, DAG, DL);
7308 else {
7309 assert(0 == i && "Expected insertion into zero-index");
7310 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7311 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7312 V = DAG.getBitcast(VT, V);
7313 continue;
7314 }
7315 }
7316 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
7317 DAG.getVectorIdxConstant(i, DL));
7318 }
7319
7320 return V;
7321}
7322
7323/// Custom lower build_vector of v16i8.
7324static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
7325                                     const APInt &NonZeroMask,
7326 unsigned NumNonZero, unsigned NumZero,
7327 SelectionDAG &DAG,
7328 const X86Subtarget &Subtarget) {
7329 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7330 return SDValue();
7331
7332 // SSE4.1 - use PINSRB to insert each byte directly.
7333 if (Subtarget.hasSSE41())
7334 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
7335 DAG, Subtarget);
7336
7337 SDValue V;
7338
7339 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7340 // If both the lowest 16-bits are non-zero, then convert to MOVD.
7341 if (!NonZeroMask.extractBits(2, 0).isZero() &&
7342 !NonZeroMask.extractBits(2, 2).isZero()) {
7343 for (unsigned I = 0; I != 4; ++I) {
7344 if (!NonZeroMask[I])
7345 continue;
7346 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
7347 if (I != 0)
7348 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
7349 DAG.getConstant(I * 8, DL, MVT::i8));
7350 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
7351 }
7352 assert(V && "Failed to fold v16i8 vector to zero");
7353 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7354 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
7355 V = DAG.getBitcast(MVT::v8i16, V);
7356 }
7357 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
7358 bool ThisIsNonZero = NonZeroMask[i];
7359 bool NextIsNonZero = NonZeroMask[i + 1];
7360 if (!ThisIsNonZero && !NextIsNonZero)
7361 continue;
7362
7363 SDValue Elt;
7364 if (ThisIsNonZero) {
7365 if (NumZero || NextIsNonZero)
7366 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7367 else
7368 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7369 }
7370
7371 if (NextIsNonZero) {
7372 SDValue NextElt = Op.getOperand(i + 1);
7373 if (i == 0 && NumZero)
7374 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
7375 else
7376 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
7377 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
7378 DAG.getConstant(8, DL, MVT::i8));
7379 if (ThisIsNonZero)
7380 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
7381 else
7382 Elt = NextElt;
7383 }
7384
7385 // If our first insertion is not the first index or zeros are needed, then
7386 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
7387 // elements undefined).
7388 if (!V) {
7389 if (i != 0 || NumZero)
7390 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
7391 else {
7392 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
7393 V = DAG.getBitcast(MVT::v8i16, V);
7394 continue;
7395 }
7396 }
7397 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7398 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
7399 DAG.getVectorIdxConstant(i / 2, DL));
7400 }
7401
7402 return DAG.getBitcast(MVT::v16i8, V);
7403}
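// For example, without SSE4.1 a non-zero byte pair (b0, b1) at indices
// (i, i+1) is merged as ((zext b1) << 8) | (zext b0) and inserted as a
// single 16-bit element at word index i/2 with PINSRW.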
7404
7405/// Custom lower build_vector of v8i16.
7406static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
7407 const APInt &NonZeroMask,
7408 unsigned NumNonZero, unsigned NumZero,
7409 SelectionDAG &DAG,
7410 const X86Subtarget &Subtarget) {
7411 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7412 return SDValue();
7413
7414  // Use PINSRW to insert each 16-bit element directly.
7415 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
7416 Subtarget);
7417}
7418
7419/// Custom lower build_vector of v4i32 or v4f32.
7420static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7421 SelectionDAG &DAG,
7422 const X86Subtarget &Subtarget) {
7423 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7424 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7425 // Because we're creating a less complicated build vector here, we may enable
7426 // further folding of the MOVDDUP via shuffle transforms.
7427 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7428 Op.getOperand(0) == Op.getOperand(2) &&
7429 Op.getOperand(1) == Op.getOperand(3) &&
7430 Op.getOperand(0) != Op.getOperand(1)) {
7431 MVT VT = Op.getSimpleValueType();
7432 MVT EltVT = VT.getVectorElementType();
7433 // Create a new build vector with the first 2 elements followed by undef
7434 // padding, bitcast to v2f64, duplicate, and bitcast back.
7435 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7436 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7437 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7438 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7439 return DAG.getBitcast(VT, Dup);
7440 }
7441
7442 // Find all zeroable elements.
7443 std::bitset<4> Zeroable, Undefs;
7444 for (int i = 0; i < 4; ++i) {
7445 SDValue Elt = Op.getOperand(i);
7446 Undefs[i] = Elt.isUndef();
7447 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7448 }
7449 assert(Zeroable.size() - Zeroable.count() > 1 &&
7450 "We expect at least two non-zero elements!");
7451
7452 // We only know how to deal with build_vector nodes where elements are either
7453 // zeroable or extract_vector_elt with constant index.
7454 SDValue FirstNonZero;
7455 unsigned FirstNonZeroIdx;
7456 for (unsigned i = 0; i < 4; ++i) {
7457 if (Zeroable[i])
7458 continue;
7459 SDValue Elt = Op.getOperand(i);
7460 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7461        !isa<ConstantSDNode>(Elt.getOperand(1)))
7462 return SDValue();
7463 // Make sure that this node is extracting from a 128-bit vector.
7464 MVT VT = Elt.getOperand(0).getSimpleValueType();
7465 if (!VT.is128BitVector())
7466 return SDValue();
7467 if (!FirstNonZero.getNode()) {
7468 FirstNonZero = Elt;
7469 FirstNonZeroIdx = i;
7470 }
7471 }
7472
7473 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7474 SDValue V1 = FirstNonZero.getOperand(0);
7475 MVT VT = V1.getSimpleValueType();
7476
7477 // See if this build_vector can be lowered as a blend with zero.
7478 SDValue Elt;
7479 unsigned EltMaskIdx, EltIdx;
7480 int Mask[4];
7481 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7482 if (Zeroable[EltIdx]) {
7483 // The zero vector will be on the right hand side.
7484 Mask[EltIdx] = EltIdx+4;
7485 continue;
7486 }
7487
7488 Elt = Op->getOperand(EltIdx);
7489    // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
7490 EltMaskIdx = Elt.getConstantOperandVal(1);
7491 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7492 break;
7493 Mask[EltIdx] = EltIdx;
7494 }
7495
7496 if (EltIdx == 4) {
7497 // Let the shuffle legalizer deal with blend operations.
7498 SDValue VZeroOrUndef = (Zeroable == Undefs)
7499 ? DAG.getUNDEF(VT)
7500 : getZeroVector(VT, Subtarget, DAG, DL);
7501 if (V1.getSimpleValueType() != VT)
7502 V1 = DAG.getBitcast(VT, V1);
7503 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7504 }
7505
7506  // See if we can lower this build_vector to an INSERTPS.
7507 if (!Subtarget.hasSSE41())
7508 return SDValue();
7509
7510 SDValue V2 = Elt.getOperand(0);
7511 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7512 V1 = SDValue();
7513
7514 bool CanFold = true;
7515 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7516 if (Zeroable[i])
7517 continue;
7518
7519 SDValue Current = Op->getOperand(i);
7520 SDValue SrcVector = Current->getOperand(0);
7521 if (!V1.getNode())
7522 V1 = SrcVector;
7523 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7524 }
7525
7526 if (!CanFold)
7527 return SDValue();
7528
7529 assert(V1.getNode() && "Expected at least two non-zero elements!");
7530 if (V1.getSimpleValueType() != MVT::v4f32)
7531 V1 = DAG.getBitcast(MVT::v4f32, V1);
7532 if (V2.getSimpleValueType() != MVT::v4f32)
7533 V2 = DAG.getBitcast(MVT::v4f32, V2);
7534
7535 // Ok, we can emit an INSERTPS instruction.
7536 unsigned ZMask = Zeroable.to_ulong();
7537
7538 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7539 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7540 SDValue Result =
7541 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7542 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7543 return DAG.getBitcast(VT, Result);
7544}
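// The INSERTPS immediate encodes: bits [7:6] = source element of V2
// (EltMaskIdx), bits [5:4] = destination lane in V1 (EltIdx), bits [3:0] =
// zero mask. For example, inserting V2[1] into lane 2 while zeroing lane 3
// gives (1 << 6) | (2 << 4) | 0b1000 = 0x68.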
7545
7546/// Return a vector logical shift node.
7547static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7548 SelectionDAG &DAG, const TargetLowering &TLI,
7549 const SDLoc &dl) {
7550 assert(VT.is128BitVector() && "Unknown type for VShift");
7551 MVT ShVT = MVT::v16i8;
7552 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7553 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7554 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7555 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7556 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7557}
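// For example, getVShift(/*isLeft=*/true, MVT::v2i64, X, 64, ...) bitcasts
// X to v16i8, emits (X86ISD::VSHLDQ X, 8), i.e. a whole-register byte shift
// (PSLLDQ) by 64/8 = 8 bytes, and bitcasts the result back to v2i64.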
7558
7559static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7560 SelectionDAG &DAG) {
7561
7562  // Check if the scalar load can be widened into a vector load, and if
7563  // the address is "base + cst", see if the cst can be "absorbed" into
7564  // the shuffle mask.
7565  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7566 SDValue Ptr = LD->getBasePtr();
7567 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7568 return SDValue();
7569 EVT PVT = LD->getValueType(0);
7570 if (PVT != MVT::i32 && PVT != MVT::f32)
7571 return SDValue();
7572
7573 int FI = -1;
7574 int64_t Offset = 0;
7575 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7576 FI = FINode->getIndex();
7577 Offset = 0;
7578 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7579             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7580 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7581      Offset = Ptr.getConstantOperandVal(1);
7582 Ptr = Ptr.getOperand(0);
7583 } else {
7584 return SDValue();
7585 }
7586
7587 // FIXME: 256-bit vector instructions don't require a strict alignment,
7588 // improve this code to support it better.
7589 Align RequiredAlign(VT.getSizeInBits() / 8);
7590 SDValue Chain = LD->getChain();
7591 // Make sure the stack object alignment is at least 16 or 32.
7592    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7593 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7594 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7595 if (MFI.isFixedObjectIndex(FI)) {
7596      // Can't change the alignment. FIXME: It's possible to compute
7597      // the exact stack offset and reference FI + adjusted offset instead,
7598      // if someone *really* cares about this; that's the way to implement it.
7599 return SDValue();
7600 } else {
7601 MFI.setObjectAlignment(FI, RequiredAlign);
7602 }
7603 }
7604
7605    // (Offset % 16 or 32) must be a multiple of 4. The address is then
7606    // Ptr + (Offset & ~15).
7607 if (Offset < 0)
7608 return SDValue();
7609 if ((Offset % RequiredAlign.value()) & 3)
7610 return SDValue();
7611 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7612 if (StartOffset) {
7613 SDLoc DL(Ptr);
7614 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7615 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7616 }
7617
7618 int EltNo = (Offset - StartOffset) >> 2;
7619 unsigned NumElems = VT.getVectorNumElements();
7620
7621 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7622 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7623 LD->getPointerInfo().getWithOffset(StartOffset));
7624
7625 SmallVector<int, 8> Mask(NumElems, EltNo);
7626
7627 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7628 }
7629
7630 return SDValue();
7631}
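// Worked example: a 4-byte scalar load from FI+20 requested as a v4f32
// splat has RequiredAlign = 16, so StartOffset = 20 & ~15 = 16 and
// EltNo = (20 - 16) >> 2 = 1; the result is a v4f32 load from FI+16
// followed by a shuffle splatting element 1.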
7632
7633// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7634static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7635 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7636 auto *BaseLd = cast<LoadSDNode>(Elt);
7637 if (!BaseLd->isSimple())
7638 return false;
7639 Ld = BaseLd;
7640 ByteOffset = 0;
7641 return true;
7642 }
7643
7644 switch (Elt.getOpcode()) {
7645 case ISD::BITCAST:
7646 case ISD::TRUNCATE:
7647  case ISD::SCALAR_TO_VECTOR:
7648 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7649 case ISD::SRL:
7650 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7651 uint64_t Amt = AmtC->getZExtValue();
7652 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7653 ByteOffset += Amt / 8;
7654 return true;
7655 }
7656 }
7657 break;
7658  case ISD::EXTRACT_VECTOR_ELT:
7659 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7660 SDValue Src = Elt.getOperand(0);
7661 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7662 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7663 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7664 findEltLoadSrc(Src, Ld, ByteOffset)) {
7665 uint64_t Idx = IdxC->getZExtValue();
7666 ByteOffset += Idx * (SrcSizeInBits / 8);
7667 return true;
7668 }
7669 }
7670 break;
7671 }
7672
7673 return false;
7674}
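// For example, (srl (load i32 p), 16) reports the i32 load with ByteOffset
// == 2, and (extract_vector_elt (load v4i32 p), 3), where the source and
// destination scalar sizes match, reports the vector load with ByteOffset
// == 12.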
7675
7676/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7677/// elements can be replaced by a single large load which has the same value as
7678/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7679///
7680/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7681static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7682 const SDLoc &DL, SelectionDAG &DAG,
7683 const X86Subtarget &Subtarget,
7684 bool IsAfterLegalize,
7685 unsigned Depth = 0) {
7686  if (Depth >= SelectionDAG::MaxRecursionDepth)
7687 return SDValue(); // Limit search depth.
7688 if ((VT.getScalarSizeInBits() % 8) != 0)
7689 return SDValue();
7690
7691 unsigned NumElems = Elts.size();
7692
7693 int LastLoadedElt = -1;
7694 APInt LoadMask = APInt::getZero(NumElems);
7695 APInt ZeroMask = APInt::getZero(NumElems);
7696 APInt UndefMask = APInt::getZero(NumElems);
7697
7698 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7699 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7700
7701 // For each element in the initializer, see if we've found a load, zero or an
7702 // undef.
7703 for (unsigned i = 0; i < NumElems; ++i) {
7704 SDValue Elt = peekThroughBitcasts(Elts[i]);
7705 if (!Elt.getNode())
7706 return SDValue();
7707 if (Elt.isUndef()) {
7708 UndefMask.setBit(i);
7709 continue;
7710 }
7711    if (X86::isZeroNode(Elt)) {
7712 ZeroMask.setBit(i);
7713 continue;
7714 }
7715
7716 // Each loaded element must be the correct fractional portion of the
7717 // requested vector load.
7718 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7719 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7720 return SDValue();
7721
7722 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7723 return SDValue();
7724 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7725 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7726 return SDValue();
7727
7728 LoadMask.setBit(i);
7729 LastLoadedElt = i;
7730 }
7731 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7732 NumElems &&
7733 "Incomplete element masks");
7734
7735 // Handle Special Cases - all undef or undef/zero.
7736 if (UndefMask.popcount() == NumElems)
7737 return DAG.getUNDEF(VT);
7738 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7739 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7740 : DAG.getConstantFP(0.0, DL, VT);
7741
7742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7743 int FirstLoadedElt = LoadMask.countr_zero();
7744 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7745 EVT EltBaseVT = EltBase.getValueType();
7746 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7747 "Register/Memory size mismatch");
7748 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7749 assert(LDBase && "Did not find base load for merging consecutive loads");
7750 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7751 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7752 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7753 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7754 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7755
7756 // TODO: Support offsetting the base load.
7757 if (ByteOffsets[FirstLoadedElt] != 0)
7758 return SDValue();
7759
7760 // Check to see if the element's load is consecutive to the base load
7761 // or offset from a previous (already checked) load.
7762 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7763 LoadSDNode *Ld = Loads[EltIdx];
7764 int64_t ByteOffset = ByteOffsets[EltIdx];
7765 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7766 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7767 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7768 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7769 }
7770 int Stride = EltIdx - FirstLoadedElt;
7771 if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
7772 return true;
7773    // Try again using the memory load size (we might have broken a large load
7774    // into smaller elements); ensure the stride is a whole number of memory
7775    // loads apart and that a whole number of elements fits in each memory load.
7776 unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7777 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7778 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7779 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7780 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
7781 Stride / Scale);
7782 }
7783 return false;
7784 };
7785
7786  // Consecutive loads can contain UNDEFs but not ZERO elements.
7787  // Consecutive loads with UNDEF and ZERO elements require an
7788  // additional shuffle stage to clear the ZERO elements.
7789 bool IsConsecutiveLoad = true;
7790 bool IsConsecutiveLoadWithZeros = true;
7791 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7792 if (LoadMask[i]) {
7793 if (!CheckConsecutiveLoad(LDBase, i)) {
7794 IsConsecutiveLoad = false;
7795 IsConsecutiveLoadWithZeros = false;
7796 break;
7797 }
7798 } else if (ZeroMask[i]) {
7799 IsConsecutiveLoad = false;
7800 }
7801 }
7802
7803 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7804 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7805 assert(LDBase->isSimple() &&
7806 "Cannot merge volatile or atomic loads.");
7807 SDValue NewLd =
7808 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7809 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7810 for (auto *LD : Loads)
7811 if (LD)
7812 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7813 return NewLd;
7814 };
7815
7816 // Check if the base load is entirely dereferenceable.
7817 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7818 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7819
7820 // LOAD - all consecutive load/undefs (must start/end with a load or be
7821 // entirely dereferenceable). If we have found an entire vector of loads and
7822 // undefs, then return a large load of the entire vector width starting at the
7823 // base pointer. If the vector contains zeros, then attempt to shuffle those
7824 // elements.
7825 if (FirstLoadedElt == 0 &&
7826 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7827 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7828 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7829 return SDValue();
7830
7831 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7832 // will lower to regular temporal loads and use the cache.
7833 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7834 VT.is256BitVector() && !Subtarget.hasInt256())
7835 return SDValue();
7836
7837 if (NumElems == 1)
7838 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7839
7840 if (!ZeroMask)
7841 return CreateLoad(VT, LDBase);
7842
7843 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7844 // vector and a zero vector to clear out the zero elements.
7845 if (!IsAfterLegalize && VT.isVector()) {
7846 unsigned NumMaskElts = VT.getVectorNumElements();
7847 if ((NumMaskElts % NumElems) == 0) {
7848 unsigned Scale = NumMaskElts / NumElems;
7849 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7850 for (unsigned i = 0; i < NumElems; ++i) {
7851 if (UndefMask[i])
7852 continue;
7853 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7854 for (unsigned j = 0; j != Scale; ++j)
7855 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7856 }
7857 SDValue V = CreateLoad(VT, LDBase);
7858 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7859 : DAG.getConstantFP(0.0, DL, VT);
7860 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7861 }
7862 }
7863 }
7864
7865 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7866 if (VT.is256BitVector() || VT.is512BitVector()) {
7867 unsigned HalfNumElems = NumElems / 2;
7868 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7869 EVT HalfVT =
7870 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7871 SDValue HalfLD =
7872 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7873 DAG, Subtarget, IsAfterLegalize, Depth + 1);
7874 if (HalfLD)
7875 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7876 HalfLD, DAG.getVectorIdxConstant(0, DL));
7877 }
7878 }
7879
7880 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7881 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7882 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7883 LoadSizeInBits == 64) &&
7884 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7885 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7886 : MVT::getIntegerVT(LoadSizeInBits);
7887 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7888 // Allow v4f32 on SSE1 only targets.
7889 // FIXME: Add more isel patterns so we can just use VT directly.
7890 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7891 VecVT = MVT::v4f32;
7892 if (TLI.isTypeLegal(VecVT)) {
7893 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7894 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7895 SDValue ResNode = DAG.getMemIntrinsicNode(
7896 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7897          LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7898 for (auto *LD : Loads)
7899 if (LD)
7900 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7901 return DAG.getBitcast(VT, ResNode);
7902 }
7903 }
7904
7905 // BROADCAST - match the smallest possible repetition pattern, load that
7906 // scalar/subvector element and then broadcast to the entire vector.
7907 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7908 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7909 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7910 unsigned RepeatSize = SubElems * BaseSizeInBits;
7911 unsigned ScalarSize = std::min(RepeatSize, 64u);
7912 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7913 continue;
7914
7915      // Don't attempt a 1:N subvector broadcast - it should be caught by
7916      // combineConcatVectorOps, else it will cause infinite loops.
7917 if (RepeatSize > ScalarSize && SubElems == 1)
7918 continue;
7919
7920 bool Match = true;
7921 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7922 for (unsigned i = 0; i != NumElems && Match; ++i) {
7923 if (!LoadMask[i])
7924 continue;
7925 SDValue Elt = peekThroughBitcasts(Elts[i]);
7926 if (RepeatedLoads[i % SubElems].isUndef())
7927 RepeatedLoads[i % SubElems] = Elt;
7928 else
7929 Match &= (RepeatedLoads[i % SubElems] == Elt);
7930 }
7931
7932 // We must have loads at both ends of the repetition.
7933 Match &= !RepeatedLoads.front().isUndef();
7934 Match &= !RepeatedLoads.back().isUndef();
7935 if (!Match)
7936 continue;
7937
7938 EVT RepeatVT =
7939 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7940 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7941 : EVT::getFloatingPointVT(ScalarSize);
7942 if (RepeatSize > ScalarSize)
7943 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7944 RepeatSize / ScalarSize);
7945 EVT BroadcastVT =
7946 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7947 VT.getSizeInBits() / ScalarSize);
7948 if (TLI.isTypeLegal(BroadcastVT)) {
7949 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7950 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
7951 Depth + 1)) {
7952 SDValue Broadcast = RepeatLoad;
7953 if (RepeatSize > ScalarSize) {
7954 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7955 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7956 } else {
7957 if (!Subtarget.hasAVX2() &&
7958                !X86::mayFoldLoadIntoBroadcastFromMem(
7959 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7960 Subtarget,
7961 /*AssumeSingleUse=*/true))
7962 return SDValue();
7963 Broadcast =
7964 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7965 }
7966 return DAG.getBitcast(VT, Broadcast);
7967 }
7968 }
7969 }
7970 }
7971
7972 // REVERSE - attempt to match the loads in reverse and then shuffle back.
7973 // TODO: Do this for any permute or mismatching element counts.
7974 if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
7975 TLI.isTypeLegal(VT) && VT.isVector() &&
7976 NumElems == VT.getVectorNumElements()) {
7977 SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
7978    if (SDValue RevLd = EltsFromConsecutiveLoads(
7979 VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
7980 SmallVector<int, 16> ReverseMask(NumElems);
7981 std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
7982 return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
7983 }
7984 }
7985
7986 return SDValue();
7987}
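// Broadcast matching sketch: a v8i32 build vector <a,b,a,b,a,b,a,b>, where
// a and b are adjacent i32 loads, repeats with SubElems = 2, so RepeatSize
// = 64; the pair may be reloaded as a single 64-bit scalar and broadcast
// with X86ISD::VBROADCAST to fill all 256 bits.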
7988
7989// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7990// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7991// are consecutive, non-overlapping, and in the right order.
7992static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7993 SelectionDAG &DAG,
7994 const X86Subtarget &Subtarget,
7995 bool IsAfterLegalize) {
7996  SmallVector<SDValue, 64> Elts;
7997 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7998 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7999 Elts.push_back(Elt);
8000 continue;
8001 }
8002 return SDValue();
8003 }
8004 assert(Elts.size() == VT.getVectorNumElements());
8005 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8006 IsAfterLegalize);
8007}
8008
8009static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
8010 const APInt &Undefs, LLVMContext &C) {
8011 unsigned ScalarSize = VT.getScalarSizeInBits();
8012 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
8013
8014 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8015 if (VT.isFloatingPoint()) {
8016 if (ScalarSize == 16)
8017 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8018 if (ScalarSize == 32)
8019 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8020 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8021 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8022 }
8023 return Constant::getIntegerValue(Ty, Val);
8024 };
8025
8026 SmallVector<Constant *, 32> ConstantVec;
8027 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
8028 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
8029 : getConstantScalar(Bits[I]));
8030
8031 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8032}
8033
8034static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8035 unsigned SplatBitSize, LLVMContext &C) {
8036 unsigned ScalarSize = VT.getScalarSizeInBits();
8037
8038 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8039 if (VT.isFloatingPoint()) {
8040 if (ScalarSize == 16)
8041 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8042 if (ScalarSize == 32)
8043 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8044 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8045 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8046 }
8047 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8048 };
8049
8050 if (ScalarSize == SplatBitSize)
8051 return getConstantScalar(SplatValue);
8052
8053 unsigned NumElm = SplatBitSize / ScalarSize;
8054 SmallVector<Constant *, 32> ConstantVec;
8055 for (unsigned I = 0; I != NumElm; ++I) {
8056 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
8057 ConstantVec.push_back(getConstantScalar(Val));
8058 }
8059 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8060}
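// For example, VT = v8i32 with SplatValue = 0x0000000100000000 and
// SplatBitSize = 64 produces the two-element pattern <i32 0, i32 1>;
// callers then broadcast-load that pattern to fill the full vector.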
8061
8062static bool isFoldableUseOfShuffle(SDNode *N) {
8063 for (auto *U : N->users()) {
8064 unsigned Opc = U->getOpcode();
8065 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8066 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8067 return false;
8068 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8069 return false;
8070 if (isTargetShuffle(Opc))
8071 return true;
8072 if (Opc == ISD::BITCAST) // Ignore bitcasts
8073 return isFoldableUseOfShuffle(U);
8074 if (N->hasOneUse()) {
8075      // TODO: there may be some general way to know if an SDNode can
8076      // be folded. We currently only know whether an MI is foldable.
8077 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
8078 return false;
8079 return true;
8080 }
8081 }
8082 return false;
8083}
8084
8085// If the node has a single use by a VSELECT then AVX512 targets may be able
8086// to fold it as a predicated instruction.
8087static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
8088 unsigned SizeInBits = V.getValueSizeInBits();
8089 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
8090 (SizeInBits >= 128 && Subtarget.hasVLX())) {
8091 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
8092 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
8093 return true;
8094 }
8095 }
8096 return false;
8097}
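// For example, (vselect (v16i1 K), (add (v16i32 X), Y), Z) on AVX512 can
// select to a masked VPADDD, so the ADD is considered maskable here.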
8098
8099/// Attempt to use the vbroadcast instruction to generate a splat value
8100/// from a splat BUILD_VECTOR which uses:
8101/// a. A single scalar load, or a constant.
8102/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8103///
8104/// The VBROADCAST node is returned when a pattern is found,
8105/// or SDValue() otherwise.
8106static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8107 const SDLoc &dl,
8108 const X86Subtarget &Subtarget,
8109 SelectionDAG &DAG) {
8110 // VBROADCAST requires AVX.
8111 // TODO: Splats could be generated for non-AVX CPUs using SSE
8112 // instructions, but there's less potential gain for only 128-bit vectors.
8113 if (!Subtarget.hasAVX())
8114 return SDValue();
8115
8116 MVT VT = BVOp->getSimpleValueType(0);
8117 unsigned NumElts = VT.getVectorNumElements();
8118 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8119 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8120 "Unsupported vector type for broadcast.");
8121
8122 // See if the build vector is a repeating sequence of scalars (inc. splat).
8123 SDValue Ld;
8124 BitVector UndefElements;
8125 SmallVector<SDValue, 16> Sequence;
8126 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8127 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8128 if (Sequence.size() == 1)
8129 Ld = Sequence[0];
8130 }
8131
8132 // Attempt to use VBROADCASTM
8133 // From this pattern:
8134 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8135 // b. t1 = (build_vector t0 t0)
8136 //
8137 // Create (VBROADCASTM v2i1 X)
8138 if (!Sequence.empty() && Subtarget.hasCDI()) {
8139 // If not a splat, are the upper sequence values zeroable?
8140 unsigned SeqLen = Sequence.size();
8141 bool UpperZeroOrUndef =
8142 SeqLen == 1 ||
8143 llvm::all_of(ArrayRef(Sequence).drop_front(),
8144 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
8145 SDValue Op0 = Sequence[0];
8146 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8147 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8148 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8149 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8150 ? Op0.getOperand(0)
8151 : Op0.getOperand(0).getOperand(0);
8152 MVT MaskVT = BOperand.getSimpleValueType();
8153 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8154 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8155 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8156 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8157 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8158 unsigned Scale = 512 / VT.getSizeInBits();
8159 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8160 }
8161 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8162 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8163 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8164 return DAG.getBitcast(VT, Bcst);
8165 }
8166 }
8167 }
8168
8169 unsigned NumUndefElts = UndefElements.count();
8170 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8171 APInt SplatValue, Undef;
8172 unsigned SplatBitSize;
8173 bool HasUndef;
8174 // Check if this is a repeated constant pattern suitable for broadcasting.
8175 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8176 SplatBitSize > VT.getScalarSizeInBits() &&
8177 SplatBitSize < VT.getSizeInBits()) {
8178 // Avoid replacing with broadcast when it's a use of a shuffle
8179 // instruction to preserve the present custom lowering of shuffles.
8180 if (isFoldableUseOfShuffle(BVOp))
8181 return SDValue();
8182      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
8183 LLVMContext *Ctx = DAG.getContext();
8184 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8185 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8186 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8187 // Load the constant scalar/subvector and broadcast it.
8188 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8189 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8190 SDValue CP = DAG.getConstantPool(C, PVT);
8191 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8192
8193 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8194 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8195 SDValue Ops[] = {DAG.getEntryNode(), CP};
8196 MachinePointerInfo MPI =
8197            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8198 SDValue Brdcst =
8199 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8200 MPI, Alignment, MachineMemOperand::MOLoad);
8201 return DAG.getBitcast(VT, Brdcst);
8202 }
8203 if (SplatBitSize > 64) {
8204 // Load the vector of constants and broadcast it.
8205 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8206 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8207 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8208 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8209 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8210 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8211 SDValue Ops[] = {DAG.getEntryNode(), VCP};
8212 MachinePointerInfo MPI =
8213            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8214 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
8215 Ops, VVT, MPI, Alignment,
8216                                       MachineMemOperand::MOLoad);
8217 }
8218 }
8219
8220 // If we are moving a scalar into a vector (Ld must be set and all elements
8221 // but 1 are undef) and that operation is not obviously supported by
8222 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8223 // That's better than general shuffling and may eliminate a load to GPR and
8224 // move from scalar to vector register.
8225 if (!Ld || NumElts - NumUndefElts != 1)
8226 return SDValue();
8227 unsigned ScalarSize = Ld.getValueSizeInBits();
8228 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8229 return SDValue();
8230 }
8231
8232 bool ConstSplatVal =
8233 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8234 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8235
8236 // TODO: Handle broadcasts of non-constant sequences.
8237
8238 // Make sure that all of the users of a non-constant load are from the
8239 // BUILD_VECTOR node.
8240 // FIXME: Is the use count needed for non-constant, non-load case?
8241 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8242 return SDValue();
8243
8244 unsigned ScalarSize = Ld.getValueSizeInBits();
8245 bool IsGE256 = (VT.getSizeInBits() >= 256);
8246
8247 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8248 // instruction to save 8 or more bytes of constant pool data.
8249 // TODO: If multiple splats are generated to load the same constant,
8250 // it may be detrimental to overall size. There needs to be a way to detect
8251 // that condition to know if this is truly a size win.
8252 bool OptForSize = DAG.shouldOptForSize();
8253
8254 // Handle broadcasting a single constant scalar from the constant pool
8255 // into a vector.
8256 // On Sandybridge (no AVX2), it is still better to load a constant vector
8257 // from the constant pool and not to broadcast it from a scalar.
8258 // But override that restriction when optimizing for size.
8259 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8260 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8261 EVT CVT = Ld.getValueType();
8262 assert(!CVT.isVector() && "Must not broadcast a vector type");
8263
8264 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
8265 // For size optimization, also splat v2f64 and v2i64, and for size opt
8266 // with AVX2, also splat i8 and i16.
8267 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8268 if (ScalarSize == 32 ||
8269 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
8270 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
8271 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8272 const Constant *C = nullptr;
8273      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8274 C = CI->getConstantIntValue();
8275      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8276 C = CF->getConstantFPValue();
8277
8278 assert(C && "Invalid constant type");
8279
8280 SDValue CP =
8281          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
8282 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8283
8284 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8285 SDValue Ops[] = {DAG.getEntryNode(), CP};
8286 MachinePointerInfo MPI =
8287          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8288 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8289 MPI, Alignment, MachineMemOperand::MOLoad);
8290 }
8291 }
8292
8293 // Handle AVX2 in-register broadcasts.
8294 if (!IsLoad && Subtarget.hasInt256() &&
8295 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8296 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8297
8298 // The scalar source must be a normal load.
8299 if (!IsLoad)
8300 return SDValue();
8301
8302 // Make sure the non-chain result is only used by this build vector.
8303 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8304 return SDValue();
8305
8306 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8307 (Subtarget.hasVLX() && ScalarSize == 64)) {
8308 auto *LN = cast<LoadSDNode>(Ld);
8309 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8310 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8311 SDValue BCast =
8312 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8313 LN->getMemoryVT(), LN->getMemOperand());
8314 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8315 return BCast;
8316 }
8317
8318  // The integer check is needed for the 64-bit into 128-bit case so that it
8319  // doesn't match double, since there is no vbroadcastsd xmm.
8320 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8321 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8322 auto *LN = cast<LoadSDNode>(Ld);
8323 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8324 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8325 SDValue BCast =
8326 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8327 LN->getMemoryVT(), LN->getMemOperand());
8328 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8329 return BCast;
8330 }
8331
8332 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8333 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8334
8335 // Unsupported broadcast.
8336 return SDValue();
8337}
8338
8339/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8340/// underlying vector and index.
8341///
8342/// Modifies \p ExtractedFromVec to the real vector and returns the real
8343/// index.
8344static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8345 SDValue ExtIdx) {
8346 int Idx = ExtIdx->getAsZExtVal();
8347 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8348 return Idx;
8349
8350 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8351 // lowered this:
8352 // (extract_vector_elt (v8f32 %1), Constant<6>)
8353 // to:
8354 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8355 // (extract_subvector (v8f32 %0), Constant<4>),
8356 // undef)
8357 // Constant<0>)
8358 // In this case the vector is the extract_subvector expression and the index
8359 // is 2, as specified by the shuffle.
8360 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8361 SDValue ShuffleVec = SVOp->getOperand(0);
8362 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8363 assert(ShuffleVecVT.getVectorElementType() ==
8364 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8365
8366 int ShuffleIdx = SVOp->getMaskElt(Idx);
8367 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8368 ExtractedFromVec = ShuffleVec;
8369 return ShuffleIdx;
8370 }
8371 return Idx;
8372}
8373
8374static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
8375 SelectionDAG &DAG) {
8376 MVT VT = Op.getSimpleValueType();
8377
8378 // Skip if insert_vec_elt is not supported.
8379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8380  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
8381 return SDValue();
8382
8383 unsigned NumElems = Op.getNumOperands();
8384 SDValue VecIn1;
8385 SDValue VecIn2;
8386 SmallVector<unsigned, 4> InsertIndices;
8387 SmallVector<int, 8> Mask(NumElems, -1);
8388
8389 for (unsigned i = 0; i != NumElems; ++i) {
8390 unsigned Opc = Op.getOperand(i).getOpcode();
8391
8392 if (Opc == ISD::POISON || Opc == ISD::UNDEF)
8393 continue;
8394
8395    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
8396      // Quit if more than 1 element needs inserting.
8397 if (InsertIndices.size() > 1)
8398 return SDValue();
8399
8400 InsertIndices.push_back(i);
8401 continue;
8402 }
8403
8404 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8405 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8406
8407 // Quit if non-constant index.
8408 if (!isa<ConstantSDNode>(ExtIdx))
8409 return SDValue();
8410 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8411
8412 // Quit if extracted from vector of different type.
8413 if (ExtractedFromVec.getValueType() != VT)
8414 return SDValue();
8415
8416 if (!VecIn1.getNode())
8417 VecIn1 = ExtractedFromVec;
8418 else if (VecIn1 != ExtractedFromVec) {
8419 if (!VecIn2.getNode())
8420 VecIn2 = ExtractedFromVec;
8421 else if (VecIn2 != ExtractedFromVec)
8422        // Quit if there are more than 2 vectors to shuffle.
8423 return SDValue();
8424 }
8425
8426 if (ExtractedFromVec == VecIn1)
8427 Mask[i] = Idx;
8428 else if (ExtractedFromVec == VecIn2)
8429 Mask[i] = Idx + NumElems;
8430 }
8431
8432 if (!VecIn1.getNode())
8433 return SDValue();
8434
8435 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
8436 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8437
8438 for (unsigned Idx : InsertIndices)
8439 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8440 DAG.getVectorIdxConstant(Idx, DL));
8441
8442 return NV;
8443}
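// For example, (v4i32 build_vector (extractelt V, 0), s, (extractelt V, 2),
// (extractelt V, 3)) becomes a shuffle of V with mask <0,-1,2,3> followed
// by a single insert_vector_elt of s at index 1.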
8444
8445// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
8446static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
8447 const X86Subtarget &Subtarget) {
8448 MVT VT = Op.getSimpleValueType();
8449 MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
8450 MVT IVT = VT.changeVectorElementType(SVT);
8451  SmallVector<SDValue, 16> NewOps;
8452 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8453 NewOps.push_back(DAG.getBitcast(SVT, Op.getOperand(I)));
8454 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8455 return DAG.getBitcast(VT, Res);
8456}
8457
8458// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8459static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8460 SelectionDAG &DAG,
8461 const X86Subtarget &Subtarget) {
8462
8463 MVT VT = Op.getSimpleValueType();
8464 assert((VT.getVectorElementType() == MVT::i1) &&
8465 "Unexpected type in LowerBUILD_VECTORvXi1!");
8466 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8467 ISD::isBuildVectorAllOnes(Op.getNode()))
8468 return Op;
8469
8470 uint64_t Immediate = 0;
8471 SmallVector<unsigned, 16> NonConstIdx;
8472 bool IsSplat = true;
8473 bool HasConstElts = false;
8474 int SplatIdx = -1;
8475 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8476 SDValue In = Op.getOperand(idx);
8477 if (In.isUndef())
8478 continue;
8479 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8480 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8481 HasConstElts = true;
8482 } else {
8483 NonConstIdx.push_back(idx);
8484 }
8485 if (SplatIdx < 0)
8486 SplatIdx = idx;
8487 else if (In != Op.getOperand(SplatIdx))
8488 IsSplat = false;
8489 }
8490
8491  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8492 if (IsSplat) {
8493 // The build_vector allows the scalar element to be larger than the vector
8494 // element type. We need to mask it to use as a condition unless we know
8495 // the upper bits are zero.
8496 // FIXME: Use computeKnownBits instead of checking specific opcode?
8497 SDValue Cond = Op.getOperand(SplatIdx);
8498 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8499 if (Cond.getOpcode() != ISD::SETCC)
8500 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8501 DAG.getConstant(1, dl, MVT::i8));
8502
8503 // Perform the select in the scalar domain so we can use cmov.
8504 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8505 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8506 DAG.getAllOnesConstant(dl, MVT::i32),
8507 DAG.getConstant(0, dl, MVT::i32));
8508 Select = DAG.getBitcast(MVT::v32i1, Select);
8509 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8510 } else {
8511 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8512 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8513 DAG.getAllOnesConstant(dl, ImmVT),
8514 DAG.getConstant(0, dl, ImmVT));
8515 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8516 Select = DAG.getBitcast(VecVT, Select);
8517 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8518 DAG.getVectorIdxConstant(0, dl));
8519 }
8520 }
8521
8522  // Insert elements one by one.
8523 SDValue DstVec;
8524 if (HasConstElts) {
8525 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8526 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8527 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8528 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8529 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8530 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8531 } else {
8532 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8533 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8534 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8535 DstVec = DAG.getBitcast(VecVT, Imm);
8536 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8537 DAG.getVectorIdxConstant(0, dl));
8538 }
8539 } else
8540 DstVec = DAG.getUNDEF(VT);
8541
8542 for (unsigned InsertIdx : NonConstIdx) {
8543 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8544 Op.getOperand(InsertIdx),
8545 DAG.getVectorIdxConstant(InsertIdx, dl));
8546 }
8547 return DstVec;
8548}
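// For example, an all-constant (v8i1 build_vector 1,0,1,1,0,0,0,0) folds to
// the i8 immediate 0b00001101 = 0x0D bitcast to v8i1.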
8549
8550[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8551 switch (Opcode) {
8552 case X86ISD::PACKSS:
8553 case X86ISD::PACKUS:
8554 case X86ISD::FHADD:
8555 case X86ISD::FHSUB:
8556 case X86ISD::HADD:
8557 case X86ISD::HSUB:
8558 case X86ISD::HADDS:
8559 case X86ISD::HSUBS:
8560 return true;
8561 }
8562 return false;
8563}
8564
8565/// This is a helper function of LowerToHorizontalOp().
8566/// This function checks that the build_vector \p N in input implements a
8567/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8568/// may not match the layout of an x86 256-bit horizontal instruction.
8569/// In other words, if this returns true, then some extraction/insertion will
8570/// be required to produce a valid horizontal instruction.
8571///
8572/// Parameter \p Opcode defines the kind of horizontal operation to match.
8573/// For example, if \p Opcode is equal to ISD::ADD, then this function
8574/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8575/// is equal to ISD::SUB, then this function checks if this is a horizontal
8576/// arithmetic sub.
8577///
8578/// This function only analyzes elements of \p N whose indices are
8579/// in range [BaseIdx, LastIdx).
8580///
8581/// TODO: This function was originally used to match both real and fake partial
8582/// horizontal operations, but the index-matching logic is incorrect for that.
8583/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8584/// code because it is only used for partial h-op matching now?
8585static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8586 const SDLoc &DL, SelectionDAG &DAG,
8587 unsigned BaseIdx, unsigned LastIdx,
8588 SDValue &V0, SDValue &V1) {
8589 EVT VT = N->getValueType(0);
8590 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8591 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8592 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8593 "Invalid Vector in input!");
8594
8595 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8596 bool CanFold = true;
8597 unsigned ExpectedVExtractIdx = BaseIdx;
8598 unsigned NumElts = LastIdx - BaseIdx;
8599 V0 = DAG.getUNDEF(VT);
8600 V1 = DAG.getUNDEF(VT);
8601
8602 // Check if N implements a horizontal binop.
8603 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8604 SDValue Op = N->getOperand(i + BaseIdx);
8605
8606 // Skip UNDEFs.
8607 if (Op->isUndef()) {
8608 // Update the expected vector extract index.
8609 if (i * 2 == NumElts)
8610 ExpectedVExtractIdx = BaseIdx;
8611 ExpectedVExtractIdx += 2;
8612 continue;
8613 }
8614
8615 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8616
8617 if (!CanFold)
8618 break;
8619
8620 SDValue Op0 = Op.getOperand(0);
8621 SDValue Op1 = Op.getOperand(1);
8622
8623 // Try to match the following pattern:
8624 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8625 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8626               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8627 Op0.getOperand(0) == Op1.getOperand(0) &&
8628               isa<ConstantSDNode>(Op0.getOperand(1)) &&
8629               isa<ConstantSDNode>(Op1.getOperand(1)));
8630 if (!CanFold)
8631 break;
8632
8633 unsigned I0 = Op0.getConstantOperandVal(1);
8634 unsigned I1 = Op1.getConstantOperandVal(1);
8635
8636 if (i * 2 < NumElts) {
8637 if (V0.isUndef()) {
8638 V0 = Op0.getOperand(0);
8639 if (V0.getValueType() != VT)
8640 return false;
8641 }
8642 } else {
8643 if (V1.isUndef()) {
8644 V1 = Op0.getOperand(0);
8645 if (V1.getValueType() != VT)
8646 return false;
8647 }
8648 if (i * 2 == NumElts)
8649 ExpectedVExtractIdx = BaseIdx;
8650 }
8651
8652 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8653 if (I0 == ExpectedVExtractIdx)
8654 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8655 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8656 // Try to match the following dag sequence:
8657 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8658 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8659 } else
8660 CanFold = false;
8661
8662 ExpectedVExtractIdx += 2;
8663 }
8664
8665 return CanFold;
8666}
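// For example, with Opcode == ISD::ADD over the range [0, 4) this matches:
//   (BINOP (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
//   (BINOP (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
// i.e. one 128-bit half of a horizontal add of A.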
8667
8668/// Emit a sequence of two 128-bit horizontal add/sub followed by
8669/// a concat_vector.
8670///
8671/// This is a helper function of LowerToHorizontalOp().
8672/// This function expects two 256-bit vectors called V0 and V1.
8673/// At first, each vector is split into two separate 128-bit vectors.
8674/// Then, the resulting 128-bit vectors are used to implement two
8675/// horizontal binary operations.
8676///
8677/// The kind of horizontal binary operation is defined by \p X86Opcode.
8678///
8679/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8680/// the two new horizontal binop.
8681/// When Mode is set, the first horizontal binop dag node would take as input
8682/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8683/// horizontal binop dag node would take as input the lower 128-bit of V1
8684/// and the upper 128-bit of V1.
8685/// Example:
8686/// HADD V0_LO, V0_HI
8687/// HADD V1_LO, V1_HI
8688///
8689/// Otherwise, the first horizontal binop dag node takes as input the lower
8690/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8691/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8692/// Example:
8693/// HADD V0_LO, V1_LO
8694/// HADD V0_HI, V1_HI
8695///
8696/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8697/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8698/// the upper 128-bits of the result.
8699static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8700 const SDLoc &DL, SelectionDAG &DAG,
8701 unsigned X86Opcode, bool Mode,
8702 bool isUndefLO, bool isUndefHI) {
8703 MVT VT = V0.getSimpleValueType();
8704 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8705 "Invalid nodes in input!");
8706
8707 unsigned NumElts = VT.getVectorNumElements();
8708 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8709 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8710 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8711 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8712 MVT NewVT = V0_LO.getSimpleValueType();
8713
8714 SDValue LO = DAG.getUNDEF(NewVT);
8715 SDValue HI = DAG.getUNDEF(NewVT);
8716
8717 if (Mode) {
8718 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8719 if (!isUndefLO && !V0->isUndef())
8720 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8721 if (!isUndefHI && !V1->isUndef())
8722 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8723 } else {
8724 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8725 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8726 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8727
8728 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8729 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8730 }
8731
8732 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8733}
8734
8735/// Returns true iff \p BV builds a vector with the result equivalent to
8736/// the result of an ADDSUB/SUBADD operation.
8737/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8738/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8739/// \p Opnd0 and \p Opnd1.
8740static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8741 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8742 SDValue &Opnd0, SDValue &Opnd1,
8743 unsigned &NumExtracts, bool &IsSubAdd,
8744 bool &HasAllowContract) {
8745 using namespace SDPatternMatch;
8746
8747 MVT VT = BV->getSimpleValueType(0);
8748 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8749 return false;
8750
8751 unsigned NumElts = VT.getVectorNumElements();
8752 SDValue InVec0 = DAG.getUNDEF(VT);
8753 SDValue InVec1 = DAG.getUNDEF(VT);
8754
8755 NumExtracts = 0;
8756 HasAllowContract = NumElts != 0;
8757
8758 // Odd-numbered elements in the input build vector are obtained from
8759 // adding/subtracting two integer/float elements.
8760 // Even-numbered elements in the input build vector are obtained from
8761 // subtracting/adding two integer/float elements.
8762 unsigned Opc[2] = {0, 0};
8763 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8764 SDValue Op = BV->getOperand(i);
8765
8766 // Skip 'undef' values.
8767 unsigned Opcode = Op.getOpcode();
8768 if (Opcode == ISD::UNDEF)
8769 continue;
8770
8771 // Early exit if we found an unexpected opcode.
8772 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8773 return false;
8774
8775 SDValue Op0 = Op.getOperand(0);
8776 SDValue Op1 = Op.getOperand(1);
8777
8778 // Try to match the following pattern:
8779 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8780 // Early exit if we cannot match that sequence.
8781 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8782 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8783 return false;
8784
8785    // We found a valid add/sub node; make sure it's the same opcode as
8786    // previous elements for this parity.
8787 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8788 return false;
8789 Opc[i % 2] = Opcode;
8790
8791 // Update InVec0 and InVec1.
8792 if (InVec0.isUndef())
8793 InVec0 = Op0.getOperand(0);
8794 if (InVec1.isUndef())
8795 InVec1 = Op1.getOperand(0);
8796
8797    // Make sure that the operands of each add/sub node always
8798    // come from the same pair of vectors.
8799 if (InVec0 != Op0.getOperand(0)) {
8800 if (Opcode == ISD::FSUB)
8801 return false;
8802
8803 // FADD is commutable. Try to commute the operands
8804 // and then test again.
8805 std::swap(Op0, Op1);
8806 if (InVec0 != Op0.getOperand(0))
8807 return false;
8808 }
8809
8810 if (InVec1 != Op1.getOperand(0))
8811 return false;
8812
8813 // Increment the number of extractions done.
8814 ++NumExtracts;
8815 HasAllowContract &= Op->getFlags().hasAllowContract();
8816 }
8817
8818 // Ensure we have found an opcode for both parities and that they are
8819 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8820 // inputs are undef.
8821 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8822 InVec0.isUndef() || InVec1.isUndef())
8823 return false;
8824
8825 IsSubAdd = Opc[0] == ISD::FADD;
8826
8827 Opnd0 = InVec0;
8828 Opnd1 = InVec1;
8829 return true;
8830}
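// Matched shape (ADDSUB form; even lanes subtract, odd lanes add):
//   (build_vector (fsub (extractelt A, 0), (extractelt B, 0)),
//                 (fadd (extractelt A, 1), (extractelt B, 1)), ...)
// which yields Opnd0 = A, Opnd1 = B and IsSubAdd == false.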
8831
8832/// Returns true if it is possible to fold MUL and an idiom that has already been
8833/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8834/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8835/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8836///
8837/// Prior to calling this function it should be known that there is some
8838/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8839/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8840/// before replacement of such SDNode with ADDSUB operation. Thus the number
8841/// of \p Opnd0 uses is expected to be equal to 2.
8842/// For example, this function may be called for the following IR:
8843/// %AB = fmul fast <2 x double> %A, %B
8844/// %Sub = fsub fast <2 x double> %AB, %C
8845/// %Add = fadd fast <2 x double> %AB, %C
8846/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8847/// <2 x i32> <i32 0, i32 3>
8848/// There is a def for %Addsub here, which potentially can be replaced by
8849/// X86ISD::ADDSUB operation:
8850/// %Addsub = X86ISD::ADDSUB %AB, %C
8851/// and such ADDSUB can further be replaced with FMADDSUB:
8852/// %Addsub = FMADDSUB %A, %B, %C.
8853///
8854/// The main reason why this method is called before the replacement of the
8855/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8856/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8857/// FMADDSUB is.
8858static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8859 SelectionDAG &DAG, SDValue &Opnd0,
8860 SDValue &Opnd1, SDValue &Opnd2,
8861 unsigned ExpectedUses,
8862 bool AllowSubAddOrAddSubContract) {
8863 if (Opnd0.getOpcode() != ISD::FMUL ||
8864 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8865 return false;
8866
8867 // FIXME: These checks must match the similar ones in
8868 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8869 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8870 // or MUL + ADDSUB to FMADDSUB.
8871 bool AllowFusion =
8872 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8873 if (!AllowFusion)
8874 return false;
8875
8876 Opnd2 = Opnd1;
8877 Opnd1 = Opnd0.getOperand(1);
8878 Opnd0 = Opnd0.getOperand(0);
8879
8880 return true;
8881}
8882
8883 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8884 /// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8885 /// X86ISD::FMSUBADD node accordingly.
8886 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8887 const SDLoc &DL,
8888 const X86Subtarget &Subtarget,
8889 SelectionDAG &DAG) {
8890 SDValue Opnd0, Opnd1;
8891 unsigned NumExtracts;
8892 bool IsSubAdd;
8893 bool HasAllowContract;
8894 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8895 HasAllowContract))
8896 return SDValue();
8897
8898 MVT VT = BV->getSimpleValueType(0);
8899
8900 // Try to generate X86ISD::FMADDSUB node here.
8901 SDValue Opnd2;
8902 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8903 HasAllowContract)) {
8904 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8905 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8906 }
8907
8908 // We only support ADDSUB.
8909 if (IsSubAdd)
8910 return SDValue();
8911
8912 // There are no known X86 targets with 512-bit ADDSUB instructions!
8913 // Convert to blend(fsub,fadd).
8914 if (VT.is512BitVector()) {
8915 SmallVector<int> Mask;
8916 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8917 Mask.push_back(I);
8918 Mask.push_back(I + E + 1);
8919 }
8920 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8921 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8922 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8923 }
8924
8925 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8926}
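// Worked example (annotation, not in the upstream source): for v16f32 the
// blend(fsub,fadd) fallback above emits the shuffle mask
//   <0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31>
// so even result lanes come from the FSUB node and odd result lanes come
// from the FADD node, matching ADDSUB semantics.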
8927
8928 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8929 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8930 // Initialize outputs to known values.
8931 MVT VT = BV->getSimpleValueType(0);
8932 HOpcode = ISD::DELETED_NODE;
8933 V0 = DAG.getUNDEF(VT);
8934 V1 = DAG.getUNDEF(VT);
8935
8936 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8937 // half of the result is calculated independently from the 128-bit halves of
8938 // the inputs, so that makes the index-checking logic below more complicated.
8939 unsigned NumElts = VT.getVectorNumElements();
8940 unsigned GenericOpcode = ISD::DELETED_NODE;
8941 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8942 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8943 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8944 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8945 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8946 // Ignore undef elements.
8947 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8948 if (Op.isUndef())
8949 continue;
8950
8951 // If there's an opcode mismatch, we're done.
8952 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8953 return false;
8954
8955 // Initialize horizontal opcode.
8956 if (HOpcode == ISD::DELETED_NODE) {
8957 GenericOpcode = Op.getOpcode();
8958 switch (GenericOpcode) {
8959 // clang-format off
8960 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8961 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8962 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8963 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8964 default: return false;
8965 // clang-format on
8966 }
8967 }
8968
8969 SDValue Op0 = Op.getOperand(0);
8970 SDValue Op1 = Op.getOperand(1);
8971 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8972 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8973 Op0.getOperand(0) != Op1.getOperand(0) ||
8974 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8975 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8976 return false;
8977
8978 // The source vector is chosen based on which 64-bit half of the
8979 // destination vector is being calculated.
8980 if (j < NumEltsIn64Bits) {
8981 if (V0.isUndef())
8982 V0 = Op0.getOperand(0);
8983 } else {
8984 if (V1.isUndef())
8985 V1 = Op0.getOperand(0);
8986 }
8987
8988 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8989 if (SourceVec != Op0.getOperand(0))
8990 return false;
8991
8992 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8993 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8994 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8995 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8996 (j % NumEltsIn64Bits) * 2;
8997 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8998 continue;
8999
9000 // If this is not a commutative op, this does not match.
9001 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9002 return false;
9003
9004 // Addition is commutative, so try swapping the extract indexes.
9005 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9006 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9007 continue;
9008
9009 // Extract indexes do not match horizontal requirement.
9010 return false;
9011 }
9012 }
9013 // We matched. Opcode and operands are returned by reference as arguments.
9014 return true;
9015}
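// Worked example (annotation): for v8i32 HADD, result element j of 128-bit
// chunk i must consume source elements i*4 + (j%2)*2 and i*4 + (j%2)*2 + 1,
// with j < 2 drawn from V0 and j >= 2 from V1, so the accepted pattern is
//   <a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7>
// matching the per-128-bit-half layout described above.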
9016
9017 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9018 const SDLoc &DL, SelectionDAG &DAG,
9019 unsigned HOpcode, SDValue V0, SDValue V1) {
9020 // If either input vector is not the same size as the build vector,
9021 // extract/insert the low bits to the correct size.
9022 // This is free (examples: zmm --> xmm, xmm --> ymm).
9023 MVT VT = BV->getSimpleValueType(0);
9024 unsigned Width = VT.getSizeInBits();
9025 if (V0.getValueSizeInBits() > Width)
9026 V0 = extractSubVector(V0, 0, DAG, DL, Width);
9027 else if (V0.getValueSizeInBits() < Width)
9028 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
9029
9030 if (V1.getValueSizeInBits() > Width)
9031 V1 = extractSubVector(V1, 0, DAG, DL, Width);
9032 else if (V1.getValueSizeInBits() < Width)
9033 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
9034
9035 unsigned NumElts = VT.getVectorNumElements();
9036 APInt DemandedElts = APInt::getAllOnes(NumElts);
9037 for (unsigned i = 0; i != NumElts; ++i)
9038 if (BV->getOperand(i).isUndef())
9039 DemandedElts.clearBit(i);
9040
9041 // If we don't need the upper xmm, then perform as an xmm hop.
9042 unsigned HalfNumElts = NumElts / 2;
9043 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9044 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9045 V0 = extractSubVector(V0, 0, DAG, DL, 128);
9046 V1 = extractSubVector(V1, 0, DAG, DL, 128);
9047 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
9048 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
9049 }
9050
9051 return DAG.getNode(HOpcode, DL, VT, V0, V1);
9052}
9053
9054/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9055 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
9056 const X86Subtarget &Subtarget,
9057 SelectionDAG &DAG) {
9058 // We need at least 2 non-undef elements to make this worthwhile by default.
9059 unsigned NumNonUndefs =
9060 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9061 if (NumNonUndefs < 2)
9062 return SDValue();
9063
9064 // There are 4 sets of horizontal math operations distinguished by type:
9065 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9066 // subtarget feature. Try to match those "native" patterns first.
9067 MVT VT = BV->getSimpleValueType(0);
9068 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9069 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9070 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9071 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9072 unsigned HOpcode;
9073 SDValue V0, V1;
9074 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9075 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
9076 }
9077
9078 // Try harder to match 256-bit ops by using extract/concat.
9079 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9080 return SDValue();
9081
9082 // Count the number of UNDEF operands in each half of the input build_vector.
9083 unsigned NumElts = VT.getVectorNumElements();
9084 unsigned Half = NumElts / 2;
9085 unsigned NumUndefsLO = 0;
9086 unsigned NumUndefsHI = 0;
9087 for (unsigned i = 0, e = Half; i != e; ++i)
9088 if (BV->getOperand(i)->isUndef())
9089 NumUndefsLO++;
9090
9091 for (unsigned i = Half, e = NumElts; i != e; ++i)
9092 if (BV->getOperand(i)->isUndef())
9093 NumUndefsHI++;
9094
9095 SDValue InVec0, InVec1;
9096 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9097 SDValue InVec2, InVec3;
9098 unsigned X86Opcode;
9099 bool CanFold = true;
9100
9101 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
9102 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
9103 InVec3) &&
9104 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9105 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9106 X86Opcode = X86ISD::HADD;
9107 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
9108 InVec1) &&
9109 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
9110 InVec3) &&
9111 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9112 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9113 X86Opcode = X86ISD::HSUB;
9114 else
9115 CanFold = false;
9116
9117 if (CanFold) {
9118 // Do not try to expand this build_vector into a pair of horizontal
9119 // add/sub if we can emit a pair of scalar add/sub.
9120 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9121 return SDValue();
9122
9123 // Convert this build_vector into a pair of horizontal binops followed by
9124 // a concat vector. We must adjust the outputs from the partial horizontal
9125 // matching calls above to account for undefined vector halves.
9126 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9127 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9128 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9129 bool isUndefLO = NumUndefsLO == Half;
9130 bool isUndefHI = NumUndefsHI == Half;
9131 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9132 isUndefHI);
9133 }
9134 }
9135
9136 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9137 VT == MVT::v16i16) {
9138 unsigned X86Opcode;
9139 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
9140 InVec1))
9141 X86Opcode = X86ISD::HADD;
9142 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
9143 InVec1))
9144 X86Opcode = X86ISD::HSUB;
9145 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
9146 InVec1))
9147 X86Opcode = X86ISD::FHADD;
9148 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
9149 InVec1))
9150 X86Opcode = X86ISD::FHSUB;
9151 else
9152 return SDValue();
9153
9154 // Don't try to expand this build_vector into a pair of horizontal add/sub
9155 // if we can simply emit a pair of scalar add/sub.
9156 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9157 return SDValue();
9158
9159 // Convert this build_vector into two horizontal add/sub followed by
9160 // a concat vector.
9161 bool isUndefLO = NumUndefsLO == Half;
9162 bool isUndefHI = NumUndefsHI == Half;
9163 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9164 isUndefLO, isUndefHI);
9165 }
9166
9167 return SDValue();
9168}
9169
9170static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9171 SelectionDAG &DAG);
9172
9173/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9174/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9175/// just apply the bit to the vectors.
9176 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
9177 /// from this, but enough scalar bit operations are created by the later
9178 /// legalization + scalarization stages to need basic support.
9179 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
9180 const X86Subtarget &Subtarget,
9181 SelectionDAG &DAG) {
9182 MVT VT = Op->getSimpleValueType(0);
9183 unsigned NumElems = VT.getVectorNumElements();
9184 unsigned ElemSize = VT.getScalarSizeInBits();
9185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9186
9187 // Check that all elements have the same opcode.
9188 // TODO: Should we allow UNDEFS and if so how many?
9189 unsigned Opcode = Op->getOperand(0).getOpcode();
9190 for (unsigned i = 1; i < NumElems; ++i)
9191 if (Opcode != Op->getOperand(i).getOpcode())
9192 return SDValue();
9193
9194 // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
9195 bool IsShift = false;
9196 switch (Opcode) {
9197 default:
9198 return SDValue();
9199 case ISD::SHL:
9200 case ISD::SRL:
9201 case ISD::SRA:
9202 IsShift = true;
9203 break;
9204 case ISD::AND:
9205 case ISD::XOR:
9206 case ISD::OR:
9207 // Don't do this if the buildvector is a splat - we'd replace one
9208 // constant with an entire vector.
9209 if (Op->getSplatValue())
9210 return SDValue();
9211 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9212 return SDValue();
9213 break;
9214 }
9215
9216 // Collect elements.
9217 bool RHSAllConst = true;
9218 SmallVector<SDValue, 4> LHSElts, RHSElts;
9219 for (SDValue Elt : Op->ops()) {
9220 SDValue LHS = Elt.getOperand(0);
9221 SDValue RHS = Elt.getOperand(1);
9222 RHSAllConst &= isa<ConstantSDNode>(RHS);
9223 LHSElts.push_back(LHS);
9224 RHSElts.push_back(RHS);
9225 }
9226
9227 // Canonicalize shift amounts.
9228 if (IsShift) {
9229 // We expect the canonicalized RHS operand to be the constant.
9230 // TODO: Permit non-constant XOP/AVX2 cases?
9231 if (!RHSAllConst)
9232 return SDValue();
9233
9234 // Extend shift amounts.
9235 for (SDValue &Op1 : RHSElts)
9236 if (Op1.getValueSizeInBits() != ElemSize)
9237 Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType());
9238
9239 // Limit to shifts by uniform immediates.
9240 // TODO: Only accept vXi8/vXi64 special cases?
9241 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9242 if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9243 return SDValue();
9244 }
9245 assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
9246 [ElemSize](SDValue V) {
9247 return V.getValueSizeInBits() == ElemSize;
9248 }) &&
9249 "Element size mismatch");
9250
9251 // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
9252 // a load or RHS must be constant.
9253 SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
9254 /*IsAfterLegalize=*/true);
9255 SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
9256 /*IsAfterLegalize=*/true);
9257 if (!LHS && !RHS && !RHSAllConst)
9258 return SDValue();
9259
9260 if (!LHS)
9261 LHS = DAG.getBuildVector(VT, DL, LHSElts);
9262 if (!RHS)
9263 RHS = DAG.getBuildVector(VT, DL, RHSElts);
9264 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9265
9266 if (!IsShift)
9267 return Res;
9268
9269 // Immediately lower the shift to ensure the constant build vector doesn't
9270 // get converted to a constant pool before the shift is lowered.
9271 return LowerShift(Res, Subtarget, DAG);
9272}
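// Worked example (annotation): with an all-constant RHS, a build vector like
//   (build_vector (and a, 15), (and b, 15), (and c, 15), (and d, 15))
// is rebuilt as the single vector op
//   (and (build_vector a, b, c, d), (build_vector 15, 15, 15, 15))
// so only one vector AND survives once the scalars are packed.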
9273
9274static bool isShuffleFoldableLoad(SDValue);
9275
9276/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
9277/// representing a blend.
9278 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
9279 X86Subtarget const &Subtarget,
9280 SelectionDAG &DAG) {
9281 MVT VT = BVOp->getSimpleValueType(0u);
9282
9283 if (VT != MVT::v4f64)
9284 return SDValue();
9285
9286 // Collect unique operands.
9287 auto UniqueOps = SmallSet<SDValue, 16u>();
9288 for (SDValue Op : BVOp->ops()) {
9289 if (isIntOrFPConstant(Op) || Op.isUndef())
9290 return SDValue();
9291 UniqueOps.insert(Op);
9292 }
9293
9294 // Candidate BUILD_VECTOR must have 2 unique operands.
9295 if (UniqueOps.size() != 2u)
9296 return SDValue();
9297
9298 SDValue Op0 = BVOp->getOperand(0u);
9299 UniqueOps.erase(Op0);
9300 SDValue Op1 = *UniqueOps.begin();
9301
9302 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
9303 isShuffleFoldableLoad(Op1)) {
9304 // Create shuffle mask.
9305 auto const NumElems = VT.getVectorNumElements();
9306 SmallVector<int, 16u> Mask(NumElems);
9307 for (auto I = 0u; I < NumElems; ++I) {
9308 SDValue Op = BVOp->getOperand(I);
9309 Mask[I] = Op == Op0 ? I : I + NumElems;
9310 }
9311 // Create shuffle of splats.
9312 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
9313 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
9314 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
9315 }
9316
9317 return SDValue();
9318}
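// Worked example (annotation): a v4f64 (build_vector X, Y, Y, X) becomes
//   shuffle(splat(X), splat(Y), <0, 5, 6, 3>)
// where each splat is cheap to materialize (a broadcast on AVX2, or a
// shuffle that folds a load), leaving one blend-style shuffle.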
9319
9320/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
9321 static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL,
9322 X86Subtarget const &Subtarget,
9323 SelectionDAG &DAG) {
9324 using namespace SDPatternMatch;
9325 MVT VT = BVOp->getSimpleValueType(0);
9326 MVT SVT = VT.getScalarType();
9327 unsigned NumElts = VT.getVectorNumElements();
9328 unsigned EltBits = SVT.getSizeInBits();
9329
9330 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9331 return SDValue();
9332
9333 unsigned WideBits = 2 * EltBits;
9334 MVT WideSVT = MVT::getIntegerVT(WideBits);
9335 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
9336 if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
9337 return SDValue();
9338
9340 for (unsigned I = 0; I != NumElts; I += 2) {
9341 SDValue Op0 = BVOp->getOperand(I + 0);
9342 SDValue Op1 = BVOp->getOperand(I + 1);
9343
9344 if (Op0.isUndef() && Op1.isUndef()) {
9345 WideOps.push_back(DAG.getUNDEF(WideSVT));
9346 continue;
9347 }
9348
9349 // TODO: Constant repacking?
9350
9351 // Merge scalars that have been split from the same source.
9352 SDValue X, Y;
9353 if (sd_match(Op0, m_Trunc(m_Value(X))) &&
9354 sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
9355 X == Y &&
9356 X.getValueType().bitsGE(WideSVT)) {
9357 if (X.getValueType().bitsGT(WideSVT))
9358 X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
9359 WideOps.push_back(X);
9360 continue;
9361 }
9362
9363 return SDValue();
9364 }
9365
9366 assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
9367 return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
9368}
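// Worked example (annotation, hypothetical values X0/X1): for v4i16 with a
// legal i32, operands that arrive as
//   (trunc X0), (trunc (srl X0, 16)), (trunc X1), (trunc (srl X1, 16))
// re-merge into the i32 scalars X0 and X1, so the node is rebuilt as
//   bitcast v4i16 (build_vector v2i32 X0, X1)
// with no shifts or shuffles left over.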
9369
9370/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9371/// functionality to do this, so it's all zeros, all ones, or some derivation
9372/// that is cheap to calculate.
9373 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
9374 SelectionDAG &DAG,
9375 const X86Subtarget &Subtarget) {
9376 MVT VT = Op.getSimpleValueType();
9377
9378 // Vectors containing all zeros can be matched by pxor and xorps.
9379 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9380 return Op;
9381
9382 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9383 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9384 // vpcmpeqd on 256-bit vectors.
9385 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9386 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9387 return Op;
9388
9389 return getOnesVector(VT, DAG, DL);
9390 }
9391
9392 return SDValue();
9393}
9394
9395/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9396/// from a vector of source values and a vector of extraction indices.
9397/// The vectors might be manipulated to match the type of the permute op.
9398static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9399 const SDLoc &DL, SelectionDAG &DAG,
9400 const X86Subtarget &Subtarget) {
9401 MVT ShuffleVT = VT;
9402 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9403 unsigned NumElts = VT.getVectorNumElements();
9404 unsigned SizeInBits = VT.getSizeInBits();
9405
9406 // Adjust IndicesVec to match VT size.
9407 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9408 "Illegal variable permute mask size");
9409 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
9410 // Narrow/widen the indices vector to the correct size.
9411 if (IndicesVec.getValueSizeInBits() > SizeInBits)
9412 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9413 NumElts * VT.getScalarSizeInBits());
9414 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
9415 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
9416 SDLoc(IndicesVec), SizeInBits);
9417 // Zero-extend the index elements within the vector.
9418 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9419 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
9420 IndicesVT, IndicesVec);
9421 }
9422 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9423
9424 // Handle a SrcVec that doesn't match the VT size.
9425 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9426 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9427 // Handle larger SrcVec by treating it as a larger permute.
9428 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9429 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9430 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9431 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9432 Subtarget, DAG, SDLoc(IndicesVec));
9433 SDValue NewSrcVec =
9434 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9435 if (NewSrcVec)
9436 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9437 return SDValue();
9438 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9439 // Widen smaller SrcVec to match VT.
9440 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9441 } else
9442 return SDValue();
9443 }
9444
9445 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9446 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9447 EVT SrcVT = Idx.getValueType();
9448 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9449 uint64_t IndexScale = 0;
9450 uint64_t IndexOffset = 0;
9451
9452 // If we're scaling a smaller permute op, then we need to repeat the
9453 // indices, scaling and offsetting them as well.
9454 // e.g. v4i32 -> v16i8 (Scale = 4)
9455 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9456 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9457 for (uint64_t i = 0; i != Scale; ++i) {
9458 IndexScale |= Scale << (i * NumDstBits);
9459 IndexOffset |= i << (i * NumDstBits);
9460 }
9461
9462 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9463 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9464 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9465 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9466 return Idx;
9467 };
9468
9469 unsigned Opcode = 0;
9470 switch (VT.SimpleTy) {
9471 default:
9472 break;
9473 case MVT::v16i8:
9474 if (Subtarget.hasSSSE3())
9475 Opcode = X86ISD::PSHUFB;
9476 break;
9477 case MVT::v8i16:
9478 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9479 Opcode = X86ISD::VPERMV;
9480 else if (Subtarget.hasSSSE3()) {
9481 Opcode = X86ISD::PSHUFB;
9482 ShuffleVT = MVT::v16i8;
9483 }
9484 break;
9485 case MVT::v4f32:
9486 case MVT::v4i32:
9487 if (Subtarget.hasAVX()) {
9488 Opcode = X86ISD::VPERMILPV;
9489 ShuffleVT = MVT::v4f32;
9490 } else if (Subtarget.hasSSSE3()) {
9491 Opcode = X86ISD::PSHUFB;
9492 ShuffleVT = MVT::v16i8;
9493 }
9494 break;
9495 case MVT::v2f64:
9496 case MVT::v2i64:
9497 if (Subtarget.hasAVX()) {
9498 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9499 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9500 Opcode = X86ISD::VPERMILPV;
9501 ShuffleVT = MVT::v2f64;
9502 } else if (Subtarget.hasSSE41()) {
9503 // SSE41 can compare v2i64 - select between indices 0 and 1.
9504 return DAG.getSelectCC(
9505 DL, IndicesVec,
9506 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9507 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9508 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9509 ISD::CondCode::SETEQ);
9510 }
9511 break;
9512 case MVT::v32i8:
9513 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9514 Opcode = X86ISD::VPERMV;
9515 else if (Subtarget.hasXOP()) {
9516 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9517 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9518 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9519 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9520 return DAG.getNode(
9521 ISD::CONCAT_VECTORS, DL, VT,
9522 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9523 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9524 } else if (Subtarget.hasAVX()) {
9525 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9526 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9527 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9528 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9529 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9530 ArrayRef<SDValue> Ops) {
9531 // Permute Lo and Hi and then select based on index range.
9532 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9533 // care about bit[7] as it's just an index vector.
9534 SDValue Idx = Ops[2];
9535 EVT VT = Idx.getValueType();
9536 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9537 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9538 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9539 ISD::CondCode::SETUGT);
9540 };
9541 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9542 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9543 PSHUFBBuilder);
9544 }
9545 break;
9546 case MVT::v16i16:
9547 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9548 Opcode = X86ISD::VPERMV;
9549 else if (Subtarget.hasAVX()) {
9550 // Scale to v32i8 and perform as v32i8.
9551 IndicesVec = ScaleIndices(IndicesVec, 2);
9552 return DAG.getBitcast(
9553 VT, createVariablePermute(
9554 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9555 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9556 }
9557 break;
9558 case MVT::v8f32:
9559 case MVT::v8i32:
9560 if (Subtarget.hasAVX2())
9561 Opcode = X86ISD::VPERMV;
9562 else if (Subtarget.hasAVX()) {
9563 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9564 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9565 {0, 1, 2, 3, 0, 1, 2, 3});
9566 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9567 {4, 5, 6, 7, 4, 5, 6, 7});
9568 if (Subtarget.hasXOP())
9569 return DAG.getBitcast(
9570 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9571 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9572 // Permute Lo and Hi and then select based on index range.
9573 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9574 SDValue Res = DAG.getSelectCC(
9575 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9576 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9577 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9578 ISD::CondCode::SETUGT);
9579 return DAG.getBitcast(VT, Res);
9580 }
9581 break;
9582 case MVT::v4i64:
9583 case MVT::v4f64:
9584 if (Subtarget.hasAVX512()) {
9585 if (!Subtarget.hasVLX()) {
9586 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9587 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9588 SDLoc(SrcVec));
9589 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9590 DAG, SDLoc(IndicesVec));
9591 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9592 DAG, Subtarget);
9593 return extract256BitVector(Res, 0, DAG, DL);
9594 }
9595 Opcode = X86ISD::VPERMV;
9596 } else if (Subtarget.hasAVX()) {
9597 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9598 SDValue LoLo =
9599 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9600 SDValue HiHi =
9601 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9602 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9603 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9604 if (Subtarget.hasXOP())
9605 return DAG.getBitcast(
9606 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9607 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9608 // Permute Lo and Hi and then select based on index range.
9609 // This works as VPERMILPD only uses index bit[1] to permute elements.
9610 SDValue Res = DAG.getSelectCC(
9611 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9612 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9613 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9614 ISD::CondCode::SETUGT);
9615 return DAG.getBitcast(VT, Res);
9616 }
9617 break;
9618 case MVT::v64i8:
9619 if (Subtarget.hasVBMI())
9620 Opcode = X86ISD::VPERMV;
9621 break;
9622 case MVT::v32i16:
9623 if (Subtarget.hasBWI())
9624 Opcode = X86ISD::VPERMV;
9625 break;
9626 case MVT::v16f32:
9627 case MVT::v16i32:
9628 case MVT::v8f64:
9629 case MVT::v8i64:
9630 if (Subtarget.hasAVX512())
9631 Opcode = X86ISD::VPERMV;
9632 break;
9633 }
9634 if (!Opcode)
9635 return SDValue();
9636
9637 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9638 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9639 "Illegal variable permute shuffle type");
9640
9641 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9642 if (Scale > 1)
9643 IndicesVec = ScaleIndices(IndicesVec, Scale);
9644
9645 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9646 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9647
9648 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9649 SDValue Res = Opcode == X86ISD::VPERMV
9650 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9651 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9652 return DAG.getBitcast(VT, Res);
9653}
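// Note (annotation): the operand order in the final node differs by opcode:
// X86ISD::VPERMV takes (mask, source) while VPERMILPV/PSHUFB take
// (source, mask), hence the select on Opcode above.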
9654
9655 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9656 // recognized as a permutation of a vector by indices in a non-constant vector.
9657// (build_vector (extract_elt V, (extract_elt I, 0)),
9658// (extract_elt V, (extract_elt I, 1)),
9659// ...
9660// ->
9661// (vpermv I, V)
9662//
9663// TODO: Handle undefs
9664// TODO: Utilize pshufb and zero mask blending to support more efficient
9665// construction of vectors with constant-0 elements.
9666static SDValue
9667 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9668 SelectionDAG &DAG,
9669 const X86Subtarget &Subtarget) {
9670 SDValue SrcVec, IndicesVec;
9671
9672 auto PeekThroughFreeze = [](SDValue N) {
9673 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9674 return N->getOperand(0);
9675 return N;
9676 };
9677 // Check for a match of the permute source vector and permute index elements.
9678 // This is done by checking that the i-th build_vector operand is of the form:
9679 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9680 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9681 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9682 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9683 return SDValue();
9684
9685 // If this is the first extract encountered in V, set the source vector,
9686 // otherwise verify the extract is from the previously defined source
9687 // vector.
9688 if (!SrcVec)
9689 SrcVec = Op.getOperand(0);
9690 else if (SrcVec != Op.getOperand(0))
9691 return SDValue();
9692 SDValue ExtractedIndex = Op->getOperand(1);
9693 // Peek through extends.
9694 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9695 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9696 ExtractedIndex = ExtractedIndex.getOperand(0);
9697 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9698 return SDValue();
9699
9700 // If this is the first extract from the index vector candidate, set the
9701 // indices vector, otherwise verify the extract is from the previously
9702 // defined indices vector.
9703 if (!IndicesVec)
9704 IndicesVec = ExtractedIndex.getOperand(0);
9705 else if (IndicesVec != ExtractedIndex.getOperand(0))
9706 return SDValue();
9707
9708 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9709 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9710 return SDValue();
9711 }
9712
9713 MVT VT = V.getSimpleValueType();
9714 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9715}
9716
9717SDValue
9718X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9719 SDLoc dl(Op);
9720
9721 MVT VT = Op.getSimpleValueType();
9722 MVT EltVT = VT.getVectorElementType();
9723 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9724 unsigned NumElems = Op.getNumOperands();
9725
9726 // Generate vectors for predicate vectors.
9727 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9728 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9729
9730 if (VT.getVectorElementType() == MVT::bf16 &&
9731 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9732 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9733
9734 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9735 return VectorCst;
9736
9737 unsigned EVTBits = EltVT.getSizeInBits();
9738 APInt UndefMask = APInt::getZero(NumElems);
9739 APInt FrozenUndefMask = APInt::getZero(NumElems);
9740 APInt ZeroMask = APInt::getZero(NumElems);
9741 APInt NonZeroMask = APInt::getZero(NumElems);
9742 bool IsAllConstants = true;
9743 bool OneUseFrozenUndefs = true;
9744 SmallSet<SDValue, 8> Values;
9745 unsigned NumConstants = NumElems;
9746 for (unsigned i = 0; i < NumElems; ++i) {
9747 SDValue Elt = Op.getOperand(i);
9748 if (Elt.isUndef()) {
9749 UndefMask.setBit(i);
9750 continue;
9751 }
9752 if (ISD::isFreezeUndef(Elt.getNode())) {
9753 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9754 FrozenUndefMask.setBit(i);
9755 continue;
9756 }
9757 Values.insert(Elt);
9758 if (!isIntOrFPConstant(Elt)) {
9759 IsAllConstants = false;
9760 NumConstants--;
9761 }
9762 if (X86::isZeroNode(Elt)) {
9763 ZeroMask.setBit(i);
9764 } else {
9765 NonZeroMask.setBit(i);
9766 }
9767 }
9768
9769 // All undef vector. Return an UNDEF.
9770 if (UndefMask.isAllOnes())
9771 return DAG.getUNDEF(VT);
9772
9773 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9774 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9775 return DAG.getFreeze(DAG.getUNDEF(VT));
9776
9777 // All undef/freeze(undef)/zero vector. Return a zero vector.
9778 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9779 return getZeroVector(VT, Subtarget, DAG, dl);
9780
9781 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9782 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9783 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9784 // and blend the FREEZE-UNDEF operands back in.
9785 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9786 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9787 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9788 SmallVector<int, 16> BlendMask(NumElems, -1);
9789 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9790 for (unsigned i = 0; i < NumElems; ++i) {
9791 if (UndefMask[i]) {
9792 BlendMask[i] = -1;
9793 continue;
9794 }
9795 BlendMask[i] = i;
9796 if (!FrozenUndefMask[i])
9797 Elts[i] = Op.getOperand(i);
9798 else
9799 BlendMask[i] += NumElems;
9800 }
9801 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9802 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9803 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9804 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9805 }
9806
9807 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9808
9809 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9810 // be better off lowering to a smaller build vector and padding with
9811 // undef/zero.
9812 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9813 !isFoldableUseOfShuffle(BV)) {
9814 unsigned UpperElems = NumElems / 2;
9815 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9816 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9817 if (NumUpperUndefsOrZeros >= UpperElems) {
9818 if (VT.is512BitVector() &&
9819 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9820 UpperElems = NumElems - (NumElems / 4);
9821 // If freeze(undef) is in any upper elements, force to zero.
9822 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9823 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9824 SDValue NewBV =
9825 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9826 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9827 }
9828 }
9829
9830 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9831 return AddSub;
9832 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9833 return HorizontalOp;
9834 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9835 return Broadcast;
9836 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9837 return BitOp;
9838 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9839 return Blend;
9840 if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
9841 return WideBV;
9842
9843 unsigned NumZero = ZeroMask.popcount();
9844 unsigned NumNonZero = NonZeroMask.popcount();
9845
9846 // If we are inserting one variable into a vector of non-zero constants, try
9847 // to avoid loading each constant element as a scalar. Load the constants as a
9848 // vector and then insert the variable scalar element. If insertion is not
9849 // supported, fall back to a shuffle to get the scalar blended with the
9850 // constants. Insertion into a zero vector is handled as a special-case
9851 // somewhere below here.
9852 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9853 FrozenUndefMask.isZero() &&
9854 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9855 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9856 // Create an all-constant vector. The variable element in the old
9857 // build vector is replaced by undef in the constant vector. Save the
9858 // variable scalar element and its index for use in the insertelement.
9859 LLVMContext &Context = *DAG.getContext();
9860 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9861 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9862 SDValue VarElt;
9863 SDValue InsIndex;
9864 for (unsigned i = 0; i != NumElems; ++i) {
9865 SDValue Elt = Op.getOperand(i);
9866 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9867 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9868 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9869 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9870 else if (!Elt.isUndef()) {
9871 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9872 "Expected one variable element in this vector");
9873 VarElt = Elt;
9874 InsIndex = DAG.getVectorIdxConstant(i, dl);
9875 }
9876 }
9877 Constant *CV = ConstantVector::get(ConstVecOps);
9878 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9879
9880 // The constants we just created may not be legal (eg, floating point). We
9881 // must lower the vector right here because we can not guarantee that we'll
9882 // legalize it before loading it. This is also why we could not just create
9883 // a new build vector here. If the build vector contains illegal constants,
9884 // it could get split back up into a series of insert elements.
9885 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9886 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9887 MachineFunction &MF = DAG.getMachineFunction();
9888 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9889 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9890 unsigned InsertC = InsIndex->getAsZExtVal();
9891 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9892 if (InsertC < NumEltsInLow128Bits)
9893 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9894
9895 // There's no good way to insert into the high elements of a >128-bit
9896 // vector, so use shuffles to avoid an extract/insert sequence.
9897 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9898 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9899 SmallVector<int, 8> ShuffleMask;
9900 unsigned NumElts = VT.getVectorNumElements();
9901 for (unsigned i = 0; i != NumElts; ++i)
9902 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9903 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9904 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9905 }
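// Worked example (annotation): v4i32 (build_vector 1, 2, x, 4) loads the
// constant vector <1, 2, undef, 4> from the pool and emits one
// insert_vector_elt of x at index 2, rather than scalarizing all four lanes.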
9906
9907 // Special case for single non-zero, non-undef, element.
9908 if (NumNonZero == 1) {
9909 unsigned Idx = NonZeroMask.countr_zero();
9910 SDValue Item = Op.getOperand(Idx);
9911
9912 // If we have a constant or non-constant insertion into the low element of
9913 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9914 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9915 // depending on what the source datatype is.
9916 if (Idx == 0) {
9917 if (NumZero == 0)
9918 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9919
9920 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9921 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9922 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9923 assert((VT.is128BitVector() || VT.is256BitVector() ||
9924 VT.is512BitVector()) &&
9925 "Expected an SSE value type!");
9926 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9927 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9928 // zero vector.
9929 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9930 }
9931
9932 // We can't directly insert an i8 or i16 into a vector, so zero extend
9933 // it to i32 first.
9934 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9935 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9936 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9937 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9938 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9939 return DAG.getBitcast(VT, Item);
9940 }
9941 }
9942
9943 // Is it a vector logical left shift?
9944 if (NumElems == 2 && Idx == 1 &&
9945 X86::isZeroNode(Op.getOperand(0)) &&
9946 !X86::isZeroNode(Op.getOperand(1))) {
9947 unsigned NumBits = VT.getSizeInBits();
9948 return getVShift(true, VT,
9949 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9950 VT, Op.getOperand(1)),
9951 NumBits/2, DAG, *this, dl);
9952 }
9953
9954 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9955 return SDValue();
9956
9957 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9958 // is a non-constant being inserted into an element other than the low one,
9959 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9960 // movd/movss) to move this into the low element, then shuffle it into
9961 // place.
9962 if (EVTBits == 32) {
9963 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9964 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9965 }
9966 }
9967
9968 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9969 if (Values.size() == 1) {
9970 if (EVTBits == 32) {
9971 // Instead of a shuffle like this:
9972 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9973 // Check if it's possible to issue this instead.
9974 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9975 unsigned Idx = NonZeroMask.countr_zero();
9976 SDValue Item = Op.getOperand(Idx);
9977 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9978 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9979 }
9980 return SDValue();
9981 }
9982
9983 // A vector full of immediates; various special cases are already
9984 // handled, so this is best done with a single constant-pool load.
9985 if (IsAllConstants)
9986 return SDValue();
9987
9988 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9989 return V;
9990
9991 // See if we can use a vector load to get all of the elements.
9992 {
9993 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9994 if (SDValue LD =
9995 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9996 return LD;
9997 }
9998
9999 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10000 // build_vector and broadcast it.
10001 // TODO: We could probably generalize this more.
10002 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10003 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10004 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10005 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10006 // Make sure all the even/odd operands match.
10007 for (unsigned i = 2; i != NumElems; ++i)
10008 if (Ops[i % 2] != Op.getOperand(i))
10009 return false;
10010 return true;
10011 };
10012 if (CanSplat(Op, NumElems, Ops)) {
10013 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10014 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10015 // Create a new build vector and cast to v2i64/v2f64.
10016 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10017 DAG.getBuildVector(NarrowVT, dl, Ops));
10018 // Broadcast from v2i64/v2f64 and cast to final VT.
10019 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10020 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10021 NewBV));
10022 }
10023 }
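// Worked example (annotation): v8i32 <a, b, a, b, a, b, a, b> builds
// v4i32 <a, b, undef, undef>, bitcasts it to v2i64, broadcasts lane 0 to
// v4i64, then bitcasts back to v8i32: one narrow build plus a VPBROADCASTQ.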
10024
10025 // For AVX-length vectors, build the individual 128-bit pieces and use
10026 // shuffles to put them in place.
10027 if (VT.getSizeInBits() > 128) {
10028 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10029
10030 // Build both the lower and upper subvector.
10031 SDValue Lower =
10032 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10033 SDValue Upper = DAG.getBuildVector(
10034 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
10035
10036 // Recreate the wider vector with the lower and upper part.
10037 return concatSubVectors(Lower, Upper, DAG, dl);
10038 }
10039
10040 // Let legalizer expand 2-wide build_vectors.
10041 if (EVTBits == 64) {
10042 if (NumNonZero == 1) {
10043 // One half is zero or undef.
10044 unsigned Idx = NonZeroMask.countr_zero();
10045 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10046 Op.getOperand(Idx));
10047 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10048 }
10049 return SDValue();
10050 }
10051
10052 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10053 if (EVTBits == 8 && NumElems == 16)
10054 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
10055 NumZero, DAG, Subtarget))
10056 return V;
10057
10058 if (EltVT == MVT::i16 && NumElems == 8)
10059 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
10060 NumZero, DAG, Subtarget))
10061 return V;
10062
10063 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10064 if (EVTBits == 32 && NumElems == 4)
10065 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
10066 return V;
10067
10068 // If element VT is == 32 bits, turn it into a number of shuffles.
10069 if (NumElems == 4 && NumZero > 0) {
10070 SmallVector<SDValue, 8> Ops(NumElems);
10071 for (unsigned i = 0; i < 4; ++i) {
10072 bool isZero = !NonZeroMask[i];
10073 if (isZero)
10074 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10075 else
10076 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10077 }
10078
10079 for (unsigned i = 0; i < 2; ++i) {
10080 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10081 default: llvm_unreachable("Unexpected NonZero count");
10082 case 0:
10083 Ops[i] = Ops[i*2]; // Must be a zero vector.
10084 break;
10085 case 1:
10086 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10087 break;
10088 case 2:
10089 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10090 break;
10091 case 3:
10092 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10093 break;
10094 }
10095 }
10096
10097 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10098 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10099 int MaskVec[] = {
10100 Reverse1 ? 1 : 0,
10101 Reverse1 ? 0 : 1,
10102 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10103 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10104 };
10105 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10106 }
10107
10108 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10109
10110 // Check for a build vector from mostly shuffle plus few inserting.
10111 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
10112 return Sh;
10113
10114 // For SSE 4.1, use insertps to put the high elements into the low element.
10115 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10116 SDValue Result;
10117 if (!Op.getOperand(0).isUndef())
10118 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10119 else
10120 Result = DAG.getUNDEF(VT);
10121
10122 for (unsigned i = 1; i < NumElems; ++i) {
10123 if (Op.getOperand(i).isUndef()) continue;
10124 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10125 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
10126 }
10127 return Result;
10128 }
10129
10130 // Otherwise, expand into a number of unpckl*, start by extending each of
10131 // our (non-undef) elements to the full vector width with the element in the
10132 // bottom slot of the vector (which generates no code for SSE).
10133 SmallVector<SDValue, 8> Ops(NumElems);
10134 for (unsigned i = 0; i < NumElems; ++i) {
10135 if (!Op.getOperand(i).isUndef())
10136 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10137 else
10138 Ops[i] = DAG.getUNDEF(VT);
10139 }
10140
10141 // Next, we iteratively mix elements, e.g. for v4f32:
10142 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10143 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10144 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10145 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10146 // Generate scaled UNPCKL shuffle mask.
10147 SmallVector<int, 16> Mask;
10148 for(unsigned i = 0; i != Scale; ++i)
10149 Mask.push_back(i);
10150 for (unsigned i = 0; i != Scale; ++i)
10151 Mask.push_back(NumElems+i);
10152 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10153
10154 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10155 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10156 }
10157 return Ops[0];
10158}
10159
10160// 256-bit AVX can use the vinsertf128 instruction
10161// to create 256-bit vectors from two other 128-bit ones.
10162// TODO: Detect subvector broadcast here instead of DAG combine?
10163 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
10164 SelectionDAG &DAG,
10165 const X86Subtarget &Subtarget) {
10166 MVT ResVT = Op.getSimpleValueType();
10167 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
10168 "Value type must be 256-/512-bit wide");
10169
10170 unsigned NumOperands = Op.getNumOperands();
10171 unsigned NumFreezeUndef = 0;
10172 unsigned NumZero = 0;
10173 unsigned NumNonZero = 0;
10174 unsigned NonZeros = 0;
10175 SmallSet<SDValue, 4> Undefs;
10176 for (unsigned i = 0; i != NumOperands; ++i) {
10177 SDValue SubVec = Op.getOperand(i);
10178 if (SubVec.isUndef())
10179 continue;
10180 if (ISD::isFreezeUndef(SubVec.getNode())) {
10181 // If the freeze(undef) has multiple uses then we must fold to zero.
10182 if (SubVec.hasOneUse()) {
10183 ++NumFreezeUndef;
10184 } else {
10185 ++NumZero;
10186 Undefs.insert(SubVec);
10187 }
10188 }
10189 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10190 ++NumZero;
10191 else {
10192 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10193 NonZeros |= 1 << i;
10194 ++NumNonZero;
10195 }
10196 }
10197
10198 // If we have more than 2 non-zeros, build each half separately.
10199 if (NumNonZero > 2) {
10200 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10201 ArrayRef<SDUse> Ops = Op->ops();
10202 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10203 Ops.slice(0, NumOperands/2));
10204 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10205 Ops.slice(NumOperands/2));
10206 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10207 }
10208
10209 // Otherwise, build it up through insert_subvectors.
10210 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10211 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
10212 : DAG.getUNDEF(ResVT));
10213
10214 // Replace Undef operands with ZeroVector.
10215 for (SDValue U : Undefs)
10216 DAG.ReplaceAllUsesOfValueWith(
10217 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
10218
10219 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10220 unsigned NumSubElems = SubVT.getVectorNumElements();
10221 for (unsigned i = 0; i != NumOperands; ++i) {
10222 if ((NonZeros & (1 << i)) == 0)
10223 continue;
10224
10225 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
10226 DAG.getVectorIdxConstant(i * NumSubElems, dl));
10227 }
10228
10229 return Vec;
10230}
10231
10232// Returns true if the given node is a type promotion (by concatenating i1
10233// zeros) of the result of a node that already zeros all upper bits of
10234// k-register.
10235// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10236 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
10237 const X86Subtarget &Subtarget,
10238 SelectionDAG & DAG) {
10239 MVT ResVT = Op.getSimpleValueType();
10240 unsigned NumOperands = Op.getNumOperands();
10241 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10242 "Unexpected number of operands in CONCAT_VECTORS");
10243
10244 uint64_t Zeros = 0;
10245 uint64_t NonZeros = 0;
10246 for (unsigned i = 0; i != NumOperands; ++i) {
10247 SDValue SubVec = Op.getOperand(i);
10248 if (SubVec.isUndef())
10249 continue;
10250 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10251 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10252 Zeros |= (uint64_t)1 << i;
10253 else
10254 NonZeros |= (uint64_t)1 << i;
10255 }
10256
10257 unsigned NumElems = ResVT.getVectorNumElements();
10258
10259 // If we are inserting non-zero vector and there are zeros in LSBs and undef
10260 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
10261 // insert_subvector will give us two kshifts.
10262 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10263 Log2_64(NonZeros) != NumOperands - 1) {
10264 unsigned Idx = Log2_64(NonZeros);
10265 SDValue SubVec = Op.getOperand(Idx);
10266 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10267 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
10268 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
10269 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
10270 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10271 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10272 DAG.getVectorIdxConstant(0, dl));
10273 }
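// Worked example (annotation): a v16i1 concat of (zero, X, undef, undef)
// v4i1 parts takes this path: widen X, emit one KSHIFTL by 4, then extract
// the v16i1 result, instead of the two kshifts that generic
// insert_subvector lowering would produce.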
10274
10275 // If there are zero or one non-zeros we can handle this very simply.
10276 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10277 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10278 if (!NonZeros)
10279 return Vec;
10280 unsigned Idx = Log2_64(NonZeros);
10281 SDValue SubVec = Op.getOperand(Idx);
10282 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10283 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10284 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
10285 }
10286
10287 if (NumOperands > 2) {
10288 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10289 ArrayRef<SDUse> Ops = Op->ops();
10290 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10291 Ops.slice(0, NumOperands / 2));
10292 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10293 Ops.slice(NumOperands / 2));
10294 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10295 }
10296
10297 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
10298
10299 if (ResVT.getVectorNumElements() >= 16)
10300 return Op; // The operation is legal with KUNPCK
10301
10302 SDValue Vec =
10303 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
10304 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
10305 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10306 DAG.getVectorIdxConstant(NumElems / 2, dl));
10307}
10308
10309 static SDValue LowerCONCAT_VECTORS(SDValue Op,
10310 const X86Subtarget &Subtarget,
10311 SelectionDAG &DAG) {
10312 SDLoc DL(Op);
10313 MVT VT = Op.getSimpleValueType();
10314 if (VT.getVectorElementType() == MVT::i1)
10315 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
10316
10317 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10318 // from two other 128-bit ones.
10319 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10320 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10321 (VT.is512BitVector() &&
10322 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
10323 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
10324}
10325
10326//===----------------------------------------------------------------------===//
10327// Vector shuffle lowering
10328//
10329// This is an experimental code path for lowering vector shuffles on x86. It is
10330// designed to handle arbitrary vector shuffles and blends, gracefully
10331// degrading performance as necessary. It works hard to recognize idiomatic
10332// shuffles and lower them to optimal instruction patterns without leaving
10333// a framework that allows reasonably efficient handling of all vector shuffle
10334// patterns.
10335//===----------------------------------------------------------------------===//
10336
10337/// Checks whether the vector elements referenced by two shuffle masks are
10338/// equivalent.
10339static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10340 int Idx, int ExpectedIdx) {
10341 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10342 ExpectedIdx < MaskSize && "Out of range element index");
10343 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10344 return false;
10345
10346 EVT VT = Op.getValueType();
10347 EVT ExpectedVT = ExpectedOp.getValueType();
10348
10349 // Sources must be vectors and match the mask's element count.
10350 if (!VT.isVector() || !ExpectedVT.isVector() ||
10351 (int)VT.getVectorNumElements() != MaskSize ||
10352 (int)ExpectedVT.getVectorNumElements() != MaskSize)
10353 return false;
10354
10355 // Exact match.
10356 if (Idx == ExpectedIdx && Op == ExpectedOp)
10357 return true;
10358
10359 switch (Op.getOpcode()) {
10360 case ISD::BUILD_VECTOR:
10361 // If the values are build vectors, we can look through them to find
10362 // equivalent inputs that make the shuffles equivalent.
10363 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10364 case ISD::BITCAST: {
10365 SDValue Src = Op.getOperand(0);
10366 EVT SrcVT = Src.getValueType();
10367 if (Op == ExpectedOp && SrcVT.isVector()) {
10368 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
10369 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
10370 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10371 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10372 Idx / Scale, ExpectedIdx / Scale);
10373 }
10374 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
10375 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
10376 for (unsigned I = 0; I != Scale; ++I)
10377 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10378 (Idx * Scale) + I,
10379 (ExpectedIdx * Scale) + I))
10380 return false;
10381 return true;
10382 }
10383 }
10384 break;
10385 }
10386 case ISD::VECTOR_SHUFFLE: {
10387 auto *SVN = cast<ShuffleVectorSDNode>(Op);
10388 return Op == ExpectedOp &&
10389 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10390 }
10391 case X86ISD::VBROADCAST:
10392 case X86ISD::VBROADCAST_LOAD:
10393 return Op == ExpectedOp;
10394 case X86ISD::SUBV_BROADCAST_LOAD:
10395 if (Op == ExpectedOp) {
10396 auto *MemOp = cast<MemSDNode>(Op);
10397 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
10398 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10399 }
10400 break;
10401 case X86ISD::VPERMI: {
10402 if (Op == ExpectedOp) {
10403 SmallVector<int, 8> Mask;
10404 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
10405 SDValue Src = Op.getOperand(0);
10406 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
10407 Mask[ExpectedIdx]);
10408 }
10409 break;
10410 }
10411 case X86ISD::HADD:
10412 case X86ISD::HSUB:
10413 case X86ISD::FHADD:
10414 case X86ISD::FHSUB:
10415 case X86ISD::PACKSS:
10416 case X86ISD::PACKUS:
10417 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10418 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10419 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10420 int NumElts = VT.getVectorNumElements();
10421 int NumLanes = VT.getSizeInBits() / 128;
10422 int NumEltsPerLane = NumElts / NumLanes;
10423 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10424 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10425 bool SameElt =
10426 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10427 return SameLane && SameElt;
10428 }
10429 break;
10430 }
10431
10432 return false;
10433}
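// Worked example (an illustrative sketch, not from the original source):
// if Op == ExpectedOp is a bitcast of a v2i64 splat to v4i32, then
// Scale = 64 / 32 = 2 and i32 indices 0 and 2 are equivalent: they agree
// modulo Scale (both select the low half of an i64) and the underlying
// i64 elements 0 and 1 are themselves equivalent through the splat.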
10434
10435/// Tiny helper function to identify a no-op mask.
10436///
10437/// This is a somewhat boring predicate function. It checks whether the mask
10438/// array input, which is assumed to be a single-input shuffle mask of the kind
10439/// used by the X86 shuffle instructions (not a fully general
10440/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10441/// in-place shuffle are 'no-op's.
10442static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10443 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10444 assert(Mask[i] >= -1 && "Out of bound mask element!");
10445 if (Mask[i] >= 0 && Mask[i] != i)
10446 return false;
10447 }
10448 return true;
10449}
10450
10451/// Test whether there are elements crossing LaneSizeInBits lanes in this
10452/// shuffle mask.
10453///
10454/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10455/// and we routinely test for these.
10456static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10457 unsigned ScalarSizeInBits,
10458 ArrayRef<int> Mask) {
10459 assert(LaneSizeInBits && ScalarSizeInBits &&
10460 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10461 "Illegal shuffle lane size");
10462 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10463 int Size = Mask.size();
10464 for (int i = 0; i < Size; ++i)
10465 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10466 return true;
10467 return false;
10468}
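// Worked example (illustrative, not from the original source): for v8f32
// (two 128-bit lanes, LaneSize = 4) the mask {1,0,3,2, 5,4,7,6} stays
// in-lane, while {4,5,6,7, 0,1,2,3} is lane-crossing because element 0
// reads from lane 1 (Mask[0] / LaneSize == 1 but i / LaneSize == 0).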
10469
10470/// Test whether there are elements crossing 128-bit lanes in this
10471/// shuffle mask.
10472static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10473 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10474}
10475
10476/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10477/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10478/// better support 'repeated mask + lane permute' style shuffles.
10479static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10480 unsigned ScalarSizeInBits,
10481 ArrayRef<int> Mask) {
10482 assert(LaneSizeInBits && ScalarSizeInBits &&
10483 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10484 "Illegal shuffle lane size");
10485 int NumElts = Mask.size();
10486 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10487 int NumLanes = NumElts / NumEltsPerLane;
10488 if (NumLanes > 1) {
10489 for (int i = 0; i != NumLanes; ++i) {
10490 int SrcLane = -1;
10491 for (int j = 0; j != NumEltsPerLane; ++j) {
10492 int M = Mask[(i * NumEltsPerLane) + j];
10493 if (M < 0)
10494 continue;
10495 int Lane = (M % NumElts) / NumEltsPerLane;
10496 if (SrcLane >= 0 && SrcLane != Lane)
10497 return true;
10498 SrcLane = Lane;
10499 }
10500 }
10501 }
10502 return false;
10503}
10504
10505/// Test whether a shuffle mask is equivalent within each sub-lane.
10506///
10507/// This checks a shuffle mask to see if it is performing the same
10508/// lane-relative shuffle in each sub-lane. This trivially implies
10509/// that it is also not lane-crossing. It may however involve a blend from the
10510/// same lane of a second vector.
10511///
10512/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10513/// non-trivial to compute in the face of undef lanes. The representation is
10514/// suitable for use with existing 128-bit shuffles as entries from the second
10515/// vector have been remapped to [LaneSize, 2*LaneSize).
10516static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10517 ArrayRef<int> Mask,
10518 SmallVectorImpl<int> &RepeatedMask) {
10519 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10520 RepeatedMask.assign(LaneSize, -1);
10521 int Size = Mask.size();
10522 for (int i = 0; i < Size; ++i) {
10523 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10524 if (Mask[i] < 0)
10525 continue;
10526 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10527 // This entry crosses lanes, so there is no way to model this shuffle.
10528 return false;
10529
10530 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10531 // Adjust second vector indices to start at LaneSize instead of Size.
10532 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10533 : Mask[i] % LaneSize + LaneSize;
10534 if (RepeatedMask[i % LaneSize] < 0)
10535 // This is the first non-undef entry in this slot of a 128-bit lane.
10536 RepeatedMask[i % LaneSize] = LocalM;
10537 else if (RepeatedMask[i % LaneSize] != LocalM)
10538 // Found a mismatch with the repeated mask.
10539 return false;
10540 }
10541 return true;
10542}
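// Worked example (illustrative, not from the original source): for v8i32,
// Mask = {0,9,2,11, 4,13,6,15} repeats per 128-bit lane: after remapping
// second-vector indices into [4,8), both lanes perform {0,5,2,7}, so
// RepeatedMask = {0,5,2,7}. Changing element 6 from 6 to 7 would fail,
// since slot 2 would want lane-element 2 in lane 0 but 3 in lane 1.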
10543
10544/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10545static bool
10546is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10547 SmallVectorImpl<int> &RepeatedMask) {
10548 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10549}
10550
10551static bool
10552is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10553 SmallVector<int, 32> RepeatedMask;
10554 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10555}
10556
10557/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10558static bool
10559is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10560 SmallVectorImpl<int> &RepeatedMask) {
10561 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10562}
10563
10564/// Test whether a target shuffle mask is equivalent within each sub-lane.
10565/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10566static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10567 unsigned EltSizeInBits,
10568 ArrayRef<int> Mask,
10569 SmallVectorImpl<int> &RepeatedMask) {
10570 int LaneSize = LaneSizeInBits / EltSizeInBits;
10571 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10572 int Size = Mask.size();
10573 for (int i = 0; i < Size; ++i) {
10574 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10575 if (Mask[i] == SM_SentinelUndef)
10576 continue;
10577 if (Mask[i] == SM_SentinelZero) {
10578 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10579 return false;
10580 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10581 continue;
10582 }
10583 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10584 // This entry crosses lanes, so there is no way to model this shuffle.
10585 return false;
10586
10587 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10588 // later vector indices to start at multiples of LaneSize instead of Size.
10589 int LaneM = Mask[i] / Size;
10590 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10591 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10592 // This is the first non-undef entry in this slot of a 128-bit lane.
10593 RepeatedMask[i % LaneSize] = LocalM;
10594 else if (RepeatedMask[i % LaneSize] != LocalM)
10595 // Found a mismatch with the repeated mask.
10596 return false;
10597 }
10598 return true;
10599}
10600
10601/// Test whether a target shuffle mask is equivalent within each sub-lane.
10602/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10603static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10604 ArrayRef<int> Mask,
10605 SmallVectorImpl<int> &RepeatedMask) {
10606 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10607 Mask, RepeatedMask);
10608}
10609
10610/// Checks whether a shuffle mask is equivalent to an explicit list of
10611/// arguments.
10612///
10613/// This is a fast way to test a shuffle mask against a fixed pattern:
10614///
10615///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10616///
10617/// It returns true if the mask is exactly as wide as the argument list, and
10618/// each element of the mask is either -1 (signifying undef) or the value given
10619/// in the argument.
10620static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10621 SDValue V1 = SDValue(),
10622 SDValue V2 = SDValue()) {
10623 int Size = Mask.size();
10624 if (Size != (int)ExpectedMask.size())
10625 return false;
10626
10627 for (int i = 0; i < Size; ++i) {
10628 assert(Mask[i] >= -1 && "Out of bound mask element!");
10629 int MaskIdx = Mask[i];
10630 int ExpectedIdx = ExpectedMask[i];
10631 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10632 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10633 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10634 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10635 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10636 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10637 return false;
10638 }
10639 }
10640 return true;
10641}
10642
10643/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10644///
10645/// The masks must be exactly the same width.
10646///
10647/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10648/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10649///
10650/// SM_SentinelZero is accepted as a valid negative index but must match in
10651/// both, or via a known bits test.
10652static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10653 ArrayRef<int> ExpectedMask,
10654 const SelectionDAG &DAG,
10655 SDValue V1 = SDValue(),
10656 SDValue V2 = SDValue()) {
10657 int Size = Mask.size();
10658 if (Size != (int)ExpectedMask.size())
10659 return false;
10660 assert(llvm::all_of(ExpectedMask,
10661 [Size](int M) {
10662 return M == SM_SentinelZero ||
10663 isInRange(M, 0, 2 * Size);
10664 }) &&
10665 "Illegal target shuffle mask");
10666
10667 // Check for out-of-range target shuffle mask indices.
10668 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10669 return false;
10670
10671 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10672 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10673 !V1.getValueType().isVector()))
10674 V1 = SDValue();
10675 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10676 !V2.getValueType().isVector()))
10677 V2 = SDValue();
10678
10679 APInt ZeroV1 = APInt::getZero(Size);
10680 APInt ZeroV2 = APInt::getZero(Size);
10681
10682 for (int i = 0; i < Size; ++i) {
10683 int MaskIdx = Mask[i];
10684 int ExpectedIdx = ExpectedMask[i];
10685 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10686 continue;
10687 // If we failed to match an expected SM_SentinelZero then early out.
10688 if (ExpectedIdx < 0)
10689 return false;
10690 if (MaskIdx == SM_SentinelZero) {
10691 // If we need this expected index to be a zero element, then update the
10692 // relevant zero mask and perform the known bits at the end to minimize
10693 // repeated computes.
10694 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10695 if (ExpectedV &&
10696 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10697 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10698 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10699 ZeroMask.setBit(BitIdx);
10700 continue;
10701 }
10702 }
10703 if (MaskIdx >= 0) {
10704 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10705 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10706 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10707 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10708 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10709 continue;
10710 }
10711 return false;
10712 }
10713 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10714 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10715}
10716
10717// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10718// instructions.
10719static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10720 const SelectionDAG &DAG) {
10721 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10722 return false;
10723
10724 SmallVector<int, 8> Unpcklwd;
10725 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10726 /* Unary = */ false);
10727 SmallVector<int, 8> Unpckhwd;
10728 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10729 /* Unary = */ false);
10730 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10731 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10732 return IsUnpackwdMask;
10733}
10734
10735static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10736 const SelectionDAG &DAG) {
10737 // Create 128-bit vector type based on mask size.
10738 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10739 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10740
10741 // We can't assume a canonical shuffle mask, so try the commuted version too.
10742 SmallVector<int, 4> CommutedMask(Mask);
10743 ShuffleVectorSDNode::commuteMask(CommutedMask);
10744
10745 // Match any of unary/binary or low/high.
10746 for (unsigned i = 0; i != 4; ++i) {
10747 SmallVector<int, 16> UnpackMask;
10748 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10749 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10750 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10751 return true;
10752 }
10753 return false;
10754}
10755
10756/// Return true if a shuffle mask chooses elements identically in its top and
10757/// bottom halves. For example, any splat mask has the same top and bottom
10758/// halves. If an element is undefined in only one half of the mask, the halves
10759/// are not considered identical.
10760static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10761 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10762 unsigned HalfSize = Mask.size() / 2;
10763 for (unsigned i = 0; i != HalfSize; ++i) {
10764 if (Mask[i] != Mask[i + HalfSize])
10765 return false;
10766 }
10767 return true;
10768}
10769
10770/// Get a 4-lane 8-bit shuffle immediate for a mask.
10771///
10772/// This helper function produces an 8-bit shuffle immediate corresponding to
10773/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10774/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10775/// example.
10776///
10777/// NB: We rely heavily on "undef" masks preserving the input lane.
10778static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10779 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10780 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10781 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10782 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10783 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10784
10785 // If the mask only uses one non-undef element, then fully 'splat' it to
10786 // improve later broadcast matching.
10787 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10788 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10789
10790 int FirstElt = Mask[FirstIndex];
10791 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10792 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10793
10794 unsigned Imm = 0;
10795 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10796 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10797 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10798 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10799 return Imm;
10800}
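// Illustrative check (not part of the original file): the reversal mask
// {3,2,1,0} packs low-to-high into four 2-bit fields, and the splat path
// replicates a lone defined element such as {-1,2,-1,-1} into every field.
static_assert(((3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)) == 0x1B,
              "getV4X86ShuffleImm({3, 2, 1, 0}) == 0x1B");
static_assert(((2 << 6) | (2 << 4) | (2 << 2) | 2) == 0xAA,
              "getV4X86ShuffleImm({-1, 2, -1, -1}) splats to 0xAA");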
10801
10802static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10803 SelectionDAG &DAG) {
10804 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10805}
10806
10807// Canonicalize SHUFPD mask to improve chances of further folding.
10808// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10809static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10810 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10811 "Unexpected SHUFPD mask size");
10812 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10813 "Unexpected SHUFPD mask elements");
10814
10815 // If the mask only uses one non-undef element, then fully 'splat' it to
10816 // improve later broadcast matching.
10817 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10818 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10819 "All undef shuffle mask");
10820
10821 int FirstElt = Mask[FirstIndex];
10822 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10823 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10824 unsigned Imm = 0;
10825 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10826 Imm |= FirstElt << I;
10827 return Imm;
10828 }
10829
10830 // Attempt to keep any undef elements in place to improve chances of the
10831 // shuffle becoming a (commutative) blend.
10832 unsigned Imm = 0;
10833 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10834 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10835
10836 return Imm;
10837}
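// Illustrative check (not part of the original file): for the SHUFPD mask
// {1,-1,0,-1} the undef elements keep their natural lo/hi position (I & 1),
// so the immediate stays blend-friendly.
static_assert(((1 << 0) | (1 << 1) | (0 << 2) | (1 << 3)) == 0xB,
              "getSHUFPDImm({1, -1, 0, -1}) == 0xB");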
10838
10839static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10840 SelectionDAG &DAG) {
10841 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10842}
10843
10844// The shuffle result is as follows:
10845// 0*a[0]0*a[1]...0*a[n], n >= 0, where a[] elements are in ascending order.
10846// Each Zeroable element corresponds to a particular Mask element, as
10847// described in the computeZeroableShuffleElements function.
10848//
10849// The function looks for a sub-mask whose nonzero elements are in increasing
10850// order. If such a sub-mask exists, the function returns true.
10851static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10852 ArrayRef<int> Mask, const EVT &VectorType,
10853 bool &IsZeroSideLeft) {
10854 int NextElement = -1;
10855 // Check if the Mask's nonzero elements are in increasing order.
10856 for (int i = 0, e = Mask.size(); i < e; i++) {
10857 // Check if the mask's zeroable elements are built from only zeros.
10858 assert(Mask[i] >= -1 && "Out of bound mask element!");
10859 if (Mask[i] < 0)
10860 return false;
10861 if (Zeroable[i])
10862 continue;
10863 // Find the lowest non-zero element.
10864 if (NextElement < 0) {
10865 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10866 IsZeroSideLeft = NextElement != 0;
10867 }
10868 // Exit if the mask's non-zero elements are not in increasing order.
10869 if (NextElement != Mask[i])
10870 return false;
10871 NextElement++;
10872 }
10873 return true;
10874}
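// Worked example (illustrative, not from the original source): for v8i32
// with Mask = {Z,8,Z,9, 10,Z,Z,11}, where Z marks zeroable elements, the
// nonzero mask values 8,9,10,11 start at NumElts and increase, so the
// routine returns true with IsZeroSideLeft = true and the shuffle is a
// candidate for expanding V2 over a zero background (VEXPAND).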
10875
10876static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10877 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10878 const X86Subtarget &Subtarget,
10879 unsigned Depth = 0);
10880
10881/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10882static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10883 ArrayRef<int> Mask, SDValue V1,
10884 SDValue V2, const APInt &Zeroable,
10885 const X86Subtarget &Subtarget,
10886 SelectionDAG &DAG) {
10887 int Size = Mask.size();
10888 int LaneSize = 128 / VT.getScalarSizeInBits();
10889 const int NumBytes = VT.getSizeInBits() / 8;
10890 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10891
10892 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10893 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10894 (Subtarget.hasBWI() && VT.is512BitVector()));
10895
10896 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10897 // Sign bit set in i8 mask means zero element.
10898 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10899
10900 SDValue V;
10901 for (int i = 0; i < NumBytes; ++i) {
10902 int M = Mask[i / NumEltBytes];
10903 if (M < 0) {
10904 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10905 continue;
10906 }
10907 if (Zeroable[i / NumEltBytes]) {
10908 PSHUFBMask[i] = ZeroMask;
10909 continue;
10910 }
10911
10912 // We can only use a single input of V1 or V2.
10913 SDValue SrcV = (M >= Size ? V2 : V1);
10914 if (V && V != SrcV)
10915 return SDValue();
10916 V = SrcV;
10917 M %= Size;
10918
10919 // PSHUFB can't cross lanes, ensure this doesn't happen.
10920 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10921 return SDValue();
10922
10923 M = M % LaneSize;
10924 M = M * NumEltBytes + (i % NumEltBytes);
10925 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10926 }
10927 assert(V && "Failed to find a source input");
10928
10929 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10930 return DAG.getBitcast(
10931 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10932 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10933}
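// Worked example (illustrative, not from the original source): lowering a
// v4i32 shuffle {1,0,3,2} of V1 with PSHUFB expands each i32 lane into
// NumEltBytes = 4 byte selectors, giving the byte mask
// {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; a zeroable lane would take
// four 0x80 bytes instead, since a set sign bit selects zero.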
10934
10935/// Return Mask with the necessary casting or extending
10936/// for \p Mask according to \p MaskVT when lowering masking intrinsics
10937static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10938 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10939 const SDLoc &dl) {
10940 MVT SrcVT = Mask.getSimpleValueType();
10941 assert(SrcVT.isScalarInteger() && "Expected scalar integer mask source!");
10942 assert(MaskVT.bitsLE(SrcVT) && "Unexpected mask size!");
10943 assert(MaskVT.getVectorElementType() == MVT::i1 && "Bool vector expected!");
10944
10945 if (isAllOnesConstant(Mask))
10946 return DAG.getConstant(1, dl, MaskVT);
10947 if (X86::isZeroNode(Mask))
10948 return DAG.getConstant(0, dl, MaskVT);
10949
10950 // Attempt to pre-truncate the mask source (to a minimum of i8).
10951 if (SrcVT.getSizeInBits() > MaskVT.getVectorNumElements()) {
10952 SrcVT = MVT::getIntegerVT(std::max((int)MaskVT.getVectorNumElements(), 8));
10953 Mask = DAG.getNode(ISD::TRUNCATE, dl, SrcVT, Mask);
10954 }
10955
10956 if (SrcVT == MVT::i64 && Subtarget.is32Bit()) {
10957 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
10958 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
10959 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
10960 SDValue Lo, Hi;
10961 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
10962 Lo = DAG.getBitcast(MVT::v32i1, Lo);
10963 Hi = DAG.getBitcast(MVT::v32i1, Hi);
10964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
10965 }
10966
10967 MVT BitcastVT = MVT::getVectorVT(MVT::i1, SrcVT.getSizeInBits());
10968 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
10969 // are extracted by EXTRACT_SUBVECTOR.
10970 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
10971 DAG.getBitcast(BitcastVT, Mask),
10972 DAG.getVectorIdxConstant(0, dl));
10973}
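// Worked example (illustrative, not from the original source): an i16 mask
// lowered for MaskVT = v8i1 is first truncated to i8 (the minimum source
// width), bitcast to v8i1 and returned whole; for MaskVT = v4i1 the i8
// source is bitcast to v8i1 and the low four elements are taken with
// EXTRACT_SUBVECTOR.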
10974
10975// X86 has dedicated shuffle that can be lowered to VEXPAND
10976static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10977 SDValue V2, ArrayRef<int> Mask,
10978 const APInt &Zeroable,
10979 const X86Subtarget &Subtarget,
10980 SelectionDAG &DAG) {
10981 bool IsLeftZeroSide = true;
10982 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10983 IsLeftZeroSide))
10984 return SDValue();
10985 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10986 MVT IntegerType =
10987 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10988 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10989 unsigned NumElts = VT.getVectorNumElements();
10990 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10991 "Unexpected number of vector elements");
10992 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10993 Subtarget, DAG, DL);
10994 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10995 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10996 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10997}
10998
10999static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11000 unsigned &UnpackOpcode, bool IsUnary,
11001 ArrayRef<int> TargetMask, const SDLoc &DL,
11002 SelectionDAG &DAG,
11003 const X86Subtarget &Subtarget) {
11004 int NumElts = VT.getVectorNumElements();
11005
11006 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11007 for (int i = 0; i != NumElts; i += 2) {
11008 int M1 = TargetMask[i + 0];
11009 int M2 = TargetMask[i + 1];
11010 Undef1 &= (SM_SentinelUndef == M1);
11011 Undef2 &= (SM_SentinelUndef == M2);
11012 Zero1 &= isUndefOrZero(M1);
11013 Zero2 &= isUndefOrZero(M2);
11014 }
11015 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11016 "Zeroable shuffle detected");
11017
11018 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11019 SmallVector<int, 64> Unpckl, Unpckh;
11020 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11021 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
11022 (IsUnary ? V1 : V2))) {
11023 UnpackOpcode = X86ISD::UNPCKL;
11024 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11025 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11026 return true;
11027 }
11028
11029 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11030 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
11031 (IsUnary ? V1 : V2))) {
11032 UnpackOpcode = X86ISD::UNPCKH;
11033 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11034 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11035 return true;
11036 }
11037
11038 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
11039 if (IsUnary && (Zero1 || Zero2)) {
11040 // Don't bother if we can blend instead.
11041 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11042 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11043 return false;
11044
11045 bool MatchLo = true, MatchHi = true;
11046 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11047 int M = TargetMask[i];
11048
11049 // Ignore if the input is known to be zero or the index is undef.
11050 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11051 (M == SM_SentinelUndef))
11052 continue;
11053
11054 MatchLo &= (M == Unpckl[i]);
11055 MatchHi &= (M == Unpckh[i]);
11056 }
11057
11058 if (MatchLo || MatchHi) {
11059 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11060 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11061 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11062 return true;
11063 }
11064 }
11065
11066 // If a binary shuffle, commute and try again.
11067 if (!IsUnary) {
11068 ShuffleVectorSDNode::commuteMask(Unpckl);
11069 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
11070 UnpackOpcode = X86ISD::UNPCKL;
11071 std::swap(V1, V2);
11072 return true;
11073 }
11074
11075 ShuffleVectorSDNode::commuteMask(Unpckh);
11076 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
11077 UnpackOpcode = X86ISD::UNPCKH;
11078 std::swap(V1, V2);
11079 return true;
11080 }
11081 }
11082
11083 return false;
11084}
11085
11086// X86 has dedicated unpack instructions that can handle specific blend
11087// operations: UNPCKH and UNPCKL.
11088static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
11089 SDValue V2, ArrayRef<int> Mask,
11090 SelectionDAG &DAG) {
11091 SmallVector<int, 8> Unpckl;
11092 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11093 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11094 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11095
11096 SmallVector<int, 8> Unpckh;
11097 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11098 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11099 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11100
11101 // Commute and try again.
11102 ShuffleVectorSDNode::commuteMask(Unpckl);
11103 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11104 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11105
11106 ShuffleVectorSDNode::commuteMask(Unpckh);
11107 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11108 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11109
11110 return SDValue();
11111}
11112
11113/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11114/// followed by unpack 256-bit.
11115static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
11116 SDValue V2, ArrayRef<int> Mask,
11117 SelectionDAG &DAG) {
11118 SmallVector<int, 32> Unpckl, Unpckh;
11119 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11120 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11121
11122 unsigned UnpackOpcode;
11123 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11124 UnpackOpcode = X86ISD::UNPCKL;
11125 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11126 UnpackOpcode = X86ISD::UNPCKH;
11127 else
11128 return SDValue();
11129
11130 // This is a "natural" unpack operation (rather than the 128-bit sectored
11131 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11132 // input in order to use the x86 instruction.
11133 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11134 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11135 V1 = DAG.getBitcast(VT, V1);
11136 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11137}
11138
11139// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11140// source into the lower elements and zeroing the upper elements.
11141static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11142 ArrayRef<int> Mask, const APInt &Zeroable,
11143 const X86Subtarget &Subtarget) {
11144 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11145 return false;
11146
11147 unsigned NumElts = Mask.size();
11148 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11149 unsigned MaxScale = 64 / EltSizeInBits;
11150
11151 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11152 unsigned SrcEltBits = EltSizeInBits * Scale;
11153 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11154 continue;
11155 unsigned NumSrcElts = NumElts / Scale;
11156 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11157 continue;
11158 unsigned UpperElts = NumElts - NumSrcElts;
11159 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11160 continue;
11161 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11162 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11163 DstVT = MVT::getIntegerVT(EltSizeInBits);
11164 if ((NumSrcElts * EltSizeInBits) >= 128) {
11165 // ISD::TRUNCATE
11166 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11167 } else {
11168 // X86ISD::VTRUNC
11169 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11170 }
11171 return true;
11172 }
11173
11174 return false;
11175}
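// Worked example (illustrative, assuming an AVX512 BW+VL target): for
// VT = v16i8 with Mask = {0,2,4,6,8,10,12,14, Z,Z,Z,Z,Z,Z,Z,Z} (upper half
// zeroable), Scale = 2 matches: SrcVT = v8i16, and since the 8 truncated
// bytes span less than 128 bits, DstVT is padded to the 128-bit
// X86ISD::VTRUNC form (v16i8).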
11176
11177// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11178// element padding to the final DstVT.
11179static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11180 const X86Subtarget &Subtarget,
11181 SelectionDAG &DAG, bool ZeroUppers) {
11182 MVT SrcVT = Src.getSimpleValueType();
11183 MVT DstSVT = DstVT.getScalarType();
11184 unsigned NumDstElts = DstVT.getVectorNumElements();
11185 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11186 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11187
11188 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11189 return SDValue();
11190
11191 // Perform a direct ISD::TRUNCATE if possible.
11192 if (NumSrcElts == NumDstElts)
11193 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11194
11195 if (NumSrcElts > NumDstElts) {
11196 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11197 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11198 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11199 }
11200
11201 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11202 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11203 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11204 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11205 DstVT.getSizeInBits());
11206 }
11207
11208 // Non-VLX targets must truncate from a 512-bit type, so we need to
11209 // widen, truncate and then possibly extract the original subvector.
11210 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11211 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11212 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11213 }
11214
11215 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11216 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11217 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11218 if (DstVT != TruncVT)
11219 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11220 DstVT.getSizeInBits());
11221 return Trunc;
11222}
11223
11224// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11225//
11226// An example is the following:
11227//
11228// t0: ch = EntryToken
11229// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11230// t25: v4i32 = truncate t2
11231// t41: v8i16 = bitcast t25
11232// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11233// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11234// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11235// t18: v2i64 = bitcast t51
11236//
11237// One can just use a single vpmovdw instruction; without avx512vl we need to
11238// use the zmm variant and extract the lower subvector, padding with zeroes.
11239// TODO: Merge with lowerShuffleAsVTRUNC.
11240static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11241 SDValue V2, ArrayRef<int> Mask,
11242 const APInt &Zeroable,
11243 const X86Subtarget &Subtarget,
11244 SelectionDAG &DAG) {
11245 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11246 if (!Subtarget.hasAVX512())
11247 return SDValue();
11248
11249 unsigned NumElts = VT.getVectorNumElements();
11250 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11251 unsigned MaxScale = 64 / EltSizeInBits;
11252 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11253 unsigned SrcEltBits = EltSizeInBits * Scale;
11254 unsigned NumSrcElts = NumElts / Scale;
11255 unsigned UpperElts = NumElts - NumSrcElts;
11256 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11257 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11258 continue;
11259
11260 // Attempt to find a matching source truncation, but as a fallback, VLX
11261 // cases can use the VPMOV directly.
11262 SDValue Src = peekThroughBitcasts(V1);
11263 if (Src.getOpcode() == ISD::TRUNCATE &&
11264 Src.getScalarValueSizeInBits() == SrcEltBits) {
11265 Src = Src.getOperand(0);
11266 } else if (Subtarget.hasVLX()) {
11267 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11268 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11269 Src = DAG.getBitcast(SrcVT, Src);
11270 // Don't do this if PACKSS/PACKUS could perform it cheaper.
11271 if (Scale == 2 &&
11272 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
11273 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
11274 return SDValue();
11275 } else
11276 return SDValue();
11277
11278 // VPMOVWB is only available with avx512bw.
11279 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
11280 return SDValue();
11281
11282 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11283 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11284 }
11285
11286 return SDValue();
11287}
11288
11289// Attempt to match binary shuffle patterns as a truncate.
11290static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11291 SDValue V2, ArrayRef<int> Mask,
11292 const APInt &Zeroable,
11293 const X86Subtarget &Subtarget,
11294 SelectionDAG &DAG) {
11295 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11296 "Unexpected VTRUNC type");
11297 if (!Subtarget.hasAVX512() ||
11298 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
11299 return SDValue();
11300
11301 unsigned NumElts = VT.getVectorNumElements();
11302 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11303 unsigned MaxScale = 64 / EltSizeInBits;
11304 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11305 // TODO: Support non-BWI VPMOVWB truncations?
11306 unsigned SrcEltBits = EltSizeInBits * Scale;
11307 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11308 continue;
11309
11310 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
11311 // Bail if the V2 elements are undef.
11312 unsigned NumHalfSrcElts = NumElts / Scale;
11313 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11314 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
11315 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
11316 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11317 continue;
11318
11319 // The elements beyond the truncation must be undef/zero.
11320 unsigned UpperElts = NumElts - NumSrcElts;
11321 if (UpperElts > 0 &&
11322 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11323 continue;
11324 bool UndefUppers =
11325 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11326
11327 // As we're using both sources, we need to concat them together
11328 // and truncate from the double-sized src.
11329 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
11330
11331 // For offset truncations, ensure that the concat is cheap.
11332 SDValue Src =
11333 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
11334 if (!Src) {
11335 if (Offset)
11336 continue;
11337 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11338 }
11339
11340 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11341 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11342 Src = DAG.getBitcast(SrcVT, Src);
11343
11344 // Shift the offset'd elements into place for the truncation.
11345 // TODO: Use getTargetVShiftByConstNode.
11346 if (Offset)
11347 Src = DAG.getNode(
11348 X86ISD::VSRLI, DL, SrcVT, Src,
11349 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
11350
11351 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11352 }
11353 }
11354
11355 return SDValue();
11356}
11357
11358/// Check whether a compaction lowering can be done by dropping even/odd
11359/// elements and compute how many times even/odd elements must be dropped.
11360///
11361/// This handles shuffles which take every Nth element where N is a power of
11362/// two. Example shuffle masks:
11363///
11364/// (even)
11365/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11366/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11367/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11368/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11369/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11370/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11371///
11372/// (odd)
11373/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
11374/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
11375///
11376/// Any of these lanes can of course be undef.
11377///
11378/// This routine only supports N <= 3.
11379/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11380/// for larger N.
11381///
11382/// \returns N above, or the number of times even/odd elements must be dropped
11383/// if there is such a number. Otherwise returns zero.
11384static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
11385 bool IsSingleInput) {
11386 // The modulus for the shuffle vector entries is based on whether this is
11387 // a single input or not.
11388 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11389 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11390 "We should only be called with masks with a power-of-2 size!");
11391
11392 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11393 int Offset = MatchEven ? 0 : 1;
11394
11395 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11396 // and 2^3 simultaneously. This is because we may have ambiguity with
11397 // partially undef inputs.
11398 bool ViableForN[3] = {true, true, true};
11399
11400 for (int i = 0, e = Mask.size(); i < e; ++i) {
11401 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11402 // want.
11403 if (Mask[i] < 0)
11404 continue;
11405
11406 bool IsAnyViable = false;
11407 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11408 if (ViableForN[j]) {
11409 uint64_t N = j + 1;
11410
11411 // The shuffle mask must be equal to (i * 2^N) % M.
11412 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
11413 IsAnyViable = true;
11414 else
11415 ViableForN[j] = false;
11416 }
11417 // Early exit if we exhaust the possible powers of two.
11418 if (!IsAnyViable)
11419 break;
11420 }
11421
11422 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11423 if (ViableForN[j])
11424 return j + 1;
11425
11426 // Return 0 as there is no viable power of two.
11427 return 0;
11428}
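// Worked example (illustrative, not from the original source): for a
// single-input width-16 mask {0,2,4,6,8,10,12,14, 0,2,4,6,8,10,12,14},
// every entry equals (i << 1) % 16, so only N = 1 survives and the routine
// returns 1 (drop the odd elements once); a stride-4 variant such as
// {0,4,8,12, 0,4,8,12, ...} would return 2, and anything else returns 0.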
11429
11430// X86 has dedicated pack instructions that can handle specific truncation
11431// operations: PACKSS and PACKUS.
11432// Checks for compaction shuffle masks if MaxStages > 1.
11433// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11434static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11435 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11436 const SelectionDAG &DAG,
11437 const X86Subtarget &Subtarget,
11438 unsigned MaxStages = 1) {
11439 unsigned NumElts = VT.getVectorNumElements();
11440 unsigned BitSize = VT.getScalarSizeInBits();
11441 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11442 "Illegal maximum compaction");
11443
11444 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11445 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11446 unsigned NumPackedBits = NumSrcBits - BitSize;
11447 N1 = peekThroughBitcasts(N1);
11448 N2 = peekThroughBitcasts(N2);
11449 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11450 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11451 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11452 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11453 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11454 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11455 return false;
11456 if (Subtarget.hasSSE41() || BitSize == 8) {
11457 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11458 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11459 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11460 V1 = N1;
11461 V2 = N2;
11462 SrcVT = PackVT;
11463 PackOpcode = X86ISD::PACKUS;
11464 return true;
11465 }
11466 }
11467 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11468 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11469 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11470 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11471 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11472 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11473 V1 = N1;
11474 V2 = N2;
11475 SrcVT = PackVT;
11476 PackOpcode = X86ISD::PACKSS;
11477 return true;
11478 }
11479 return false;
11480 };
11481
11482 // Attempt to match against wider and wider compaction patterns.
11483 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11484 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11485 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11486
11487 // Try binary shuffle.
11488 SmallVector<int, 32> BinaryMask;
11489 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11490 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
11491 if (MatchPACK(V1, V2, PackVT))
11492 return true;
11493
11494 // Try unary shuffle.
11495 SmallVector<int, 32> UnaryMask;
11496 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11497 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
11498 if (MatchPACK(V1, V1, PackVT))
11499 return true;
11500 }
11501
11502 return false;
11503}
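// Worked example (illustrative, not from the original source): the v8i16
// compaction mask {0,2,4,6,8,10,12,14} matches a single binary pack stage
// with PackVT = v4i32; if both sources have their upper 16 bits known zero
// the match is PACKUS, and if each has at least 17 sign bits it is PACKSS,
// because the packed values then survive saturation unchanged.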
11504
11505static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
11506 SDValue V2, ArrayRef<int> Mask,
11507 const X86Subtarget &Subtarget,
11508 SelectionDAG &DAG) {
11509 MVT PackVT;
11510 unsigned PackOpcode;
11511 unsigned SizeBits = VT.getSizeInBits();
11512 unsigned EltBits = VT.getScalarSizeInBits();
11513 unsigned MaxStages = Log2_32(64 / EltBits);
11514 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11515 Subtarget, MaxStages))
11516 return SDValue();
11517
11518 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11519 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11520
11521 // Don't lower multi-stage packs on AVX512, truncation is better.
11522 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11523 return SDValue();
11524
11525 // Pack to the largest type possible:
11526 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11527 unsigned MaxPackBits = 16;
11528 if (CurrentEltBits > 16 &&
11529 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11530 MaxPackBits = 32;
11531
11532 // Repeatedly pack down to the target size.
11533 SDValue Res;
11534 for (unsigned i = 0; i != NumStages; ++i) {
11535 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11536 unsigned NumSrcElts = SizeBits / SrcEltBits;
11537 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11538 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11539 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11540 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11541 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11542 DAG.getBitcast(SrcVT, V2));
11543 V1 = V2 = Res;
11544 CurrentEltBits /= 2;
11545 }
11546 assert(Res && Res.getValueType() == VT &&
11547 "Failed to lower compaction shuffle");
11548 return Res;
11549}
11550
11551/// Try to emit a bitmask instruction for a shuffle.
11552///
11553/// This handles cases where we can model a blend exactly as a bitmask due to
11554/// one of the inputs being zeroable.
11555static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11556 SDValue V2, ArrayRef<int> Mask,
11557 const APInt &Zeroable,
11558 const X86Subtarget &Subtarget,
11559 SelectionDAG &DAG) {
11560 MVT MaskVT = VT;
11561 MVT EltVT = VT.getVectorElementType();
11562 SDValue Zero, AllOnes;
11563 // Use f64 if i64 isn't legal.
11564 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11565 EltVT = MVT::f64;
11566 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11567 }
11568
11569 MVT LogicVT = VT;
11570 if (EltVT.isFloatingPoint()) {
11571 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11572 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11573 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11574 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11575 } else {
11576 Zero = DAG.getConstant(0, DL, EltVT);
11577 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11578 }
11579
11580 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11581 SDValue V;
11582 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11583 if (Zeroable[i])
11584 continue;
11585 if (Mask[i] % Size != i)
11586 return SDValue(); // Not a blend.
11587 if (!V)
11588 V = Mask[i] < Size ? V1 : V2;
11589 else if (V != (Mask[i] < Size ? V1 : V2))
11590 return SDValue(); // Can only let one input through the mask.
11591
11592 VMaskOps[i] = AllOnes;
11593 }
11594 if (!V)
11595 return SDValue(); // No non-zeroable elements!
11596
11597 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11598 VMask = DAG.getBitcast(LogicVT, VMask);
11599 V = DAG.getBitcast(LogicVT, V);
11600 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11601 return DAG.getBitcast(VT, And);
11602}
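// Worked example (illustrative, not from the original source): for v4i32
// with Mask = {0,Z,2,Z}, where Z marks zeroable elements, every referenced
// element is already in place from V1, so the shuffle becomes
// V1 & {-1,0,-1,0}: a single AND instead of a variable blend.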
11603
11604/// Try to emit a blend instruction for a shuffle using bit math.
11605///
11606/// This is used as a fallback approach when first class blend instructions are
11607/// unavailable. Currently it is only suitable for integer vectors, but could
11608/// be generalized for floating point vectors if desirable.
11609static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11610 SDValue V2, ArrayRef<int> Mask,
11611 SelectionDAG &DAG) {
11612 assert(VT.isInteger() && "Only supports integer vector types!");
11613 MVT EltVT = VT.getVectorElementType();
11614 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11615 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11616 SmallVector<SDValue, 16> MaskOps;
11617 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11618 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11619 return SDValue(); // Shuffled input!
11620 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11621 }
11622
11623 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11624 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11625}
11626
11627static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11628 SDValue PreservedSrc,
11629 const X86Subtarget &Subtarget,
11630 SelectionDAG &DAG);
11631
11632static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11633 MutableArrayRef<int> Mask,
11634 const APInt &Zeroable, bool &ForceV1Zero,
11635 bool &ForceV2Zero, uint64_t &BlendMask) {
11636 bool V1IsZeroOrUndef =
11637 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11638 bool V2IsZeroOrUndef =
11639 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11640
11641 BlendMask = 0;
11642 ForceV1Zero = false, ForceV2Zero = false;
11643 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11644
11645 int NumElts = Mask.size();
11646 int NumLanes = VT.getSizeInBits() / 128;
11647 int NumEltsPerLane = NumElts / NumLanes;
11648 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11649
11650 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11651 // then ensure the blend mask part for that lane just references that input.
11652 bool ForceWholeLaneMasks =
11653 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11654
11655 // Attempt to generate the binary blend mask. If an input is zero then
11656 // we can use any lane.
11657 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11658 // Keep track of the inputs used per lane.
11659 bool LaneV1InUse = false;
11660 bool LaneV2InUse = false;
11661 uint64_t LaneBlendMask = 0;
11662 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11663 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11664 int M = Mask[Elt];
11665 if (M == SM_SentinelUndef)
11666 continue;
11667 if (M == Elt || (0 <= M && M < NumElts &&
11668 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11669 Mask[Elt] = Elt;
11670 LaneV1InUse = true;
11671 continue;
11672 }
11673 if (M == (Elt + NumElts) ||
11674 (NumElts <= M &&
11675 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11676 LaneBlendMask |= 1ull << LaneElt;
11677 Mask[Elt] = Elt + NumElts;
11678 LaneV2InUse = true;
11679 continue;
11680 }
11681 if (Zeroable[Elt]) {
11682 if (V1IsZeroOrUndef) {
11683 ForceV1Zero = true;
11684 Mask[Elt] = Elt;
11685 LaneV1InUse = true;
11686 continue;
11687 }
11688 if (V2IsZeroOrUndef) {
11689 ForceV2Zero = true;
11690 LaneBlendMask |= 1ull << LaneElt;
11691 Mask[Elt] = Elt + NumElts;
11692 LaneV2InUse = true;
11693 continue;
11694 }
11695 }
11696 return false;
11697 }
11698
11699 // If we only used V2 then splat the lane blend mask to avoid any demanded
11700 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11701 // blend mask bit).
11702 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11703 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11704
11705 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11706 }
11707 return true;
11708}
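// Worked example (illustrative, not part of the original file): for v4i32
// with Mask = {0,5,2,7}, elements 1 and 3 come from V2, so the blend mask
// sets bits 1 and 3; a zeroable element can be kept blendable by forcing
// the zeroable input (ForceV1Zero/ForceV2Zero) to a real zero vector.
static_assert(((1u << 1) | (1u << 3)) == 0xA,
              "matchShuffleAsBlend({0,5,2,7}) yields BlendMask == 0xA");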
11709
11710/// Try to emit a blend instruction for a shuffle.
11711///
11712/// This doesn't do any checks for the availability of instructions for blending
11713/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11714/// be matched in the backend with the type given. What it does check for is
11715/// that the shuffle mask is a blend, or convertible into a blend with zero.
11716static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11717 SDValue V2, ArrayRef<int> Original,
11718 const APInt &Zeroable,
11719 const X86Subtarget &Subtarget,
11720 SelectionDAG &DAG) {
11721 uint64_t BlendMask = 0;
11722 bool ForceV1Zero = false, ForceV2Zero = false;
11723 SmallVector<int, 64> Mask(Original);
11724 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11725 BlendMask))
11726 return SDValue();
11727
11728 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11729 if (ForceV1Zero)
11730 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11731 if (ForceV2Zero)
11732 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11733
11734 unsigned NumElts = VT.getVectorNumElements();
11735
11736 switch (VT.SimpleTy) {
11737 case MVT::v4i64:
11738 case MVT::v8i32:
11739 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11740 [[fallthrough]];
11741 case MVT::v4f64:
11742 case MVT::v8f32:
11743 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11744 [[fallthrough]];
11745 case MVT::v2f64:
11746 case MVT::v2i64:
11747 case MVT::v4f32:
11748 case MVT::v4i32:
11749 case MVT::v8i16:
11750 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11751 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11752 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11753 case MVT::v16i16: {
11754 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11755 SmallVector<int, 8> RepeatedMask;
11756 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11757 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11758 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11759 BlendMask = 0;
11760 for (int i = 0; i < 8; ++i)
11761 if (RepeatedMask[i] >= 8)
11762 BlendMask |= 1ull << i;
11763 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11764 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11765 }
11766 // Use PBLENDW for lower/upper lanes and then blend lanes.
11767 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11768 // merge to VSELECT where useful.
11769 uint64_t LoMask = BlendMask & 0xFF;
11770 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11771 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11772 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11773 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11774 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11775 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11776 return DAG.getVectorShuffle(
11777 MVT::v16i16, DL, Lo, Hi,
11778 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11779 }
11780 [[fallthrough]];
11781 }
11782 case MVT::v32i8:
11783 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11784 [[fallthrough]];
11785 case MVT::v16i8: {
11786 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11787
11788 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11789 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11790 Subtarget, DAG))
11791 return Masked;
11792
11793 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11794 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11795 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11796 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11797 }
11798
11799 // If we have VPTERNLOG, we can use that as a bit blend.
11800 if (Subtarget.hasVLX())
11801 if (SDValue BitBlend =
11802 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11803 return BitBlend;
11804
11805 // Scale the blend by the number of bytes per element.
11806 int Scale = VT.getScalarSizeInBits() / 8;
11807
11808 // This form of blend is always done on bytes. Compute the byte vector
11809 // type.
11810 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11811
11812 // x86 allows load folding with blendvb from the 2nd source operand. But
11813 // we are still using LLVM select here (see comment below), so that's V1.
11814 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11815 // allow that load-folding possibility.
11816 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11817    ShuffleVectorSDNode::commuteMask(Mask);
11818    std::swap(V1, V2);
11819 }
11820
11821 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11822 // mix of LLVM's code generator and the x86 backend. We tell the code
11823 // generator that boolean values in the elements of an x86 vector register
11824 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11825 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11826 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11827 // of the element (the remaining are ignored) and 0 in that high bit would
11828 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11829 // the LLVM model for boolean values in vector elements gets the relevant
11830 // bit set, it is set backwards and over constrained relative to x86's
11831 // actual model.
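    // For example, under these semantics a byte blended from V1 gets the
    // constant -1 (true selects operand #1, i.e. V1), while a byte taken
    // from V2 gets the constant 0.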
11832 SmallVector<SDValue, 32> VSELECTMask;
11833 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11834 for (int j = 0; j < Scale; ++j)
11835 VSELECTMask.push_back(
11836 Mask[i] < 0
11837 ? DAG.getUNDEF(MVT::i8)
11838 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11839
11840 V1 = DAG.getBitcast(BlendVT, V1);
11841 V2 = DAG.getBitcast(BlendVT, V2);
11842 return DAG.getBitcast(
11843 VT,
11844 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11845 V1, V2));
11846 }
11847 case MVT::v16f32:
11848 case MVT::v8f64:
11849 case MVT::v8i64:
11850 case MVT::v16i32:
11851 case MVT::v32i16:
11852 case MVT::v64i8: {
11853 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11854 bool OptForSize = DAG.shouldOptForSize();
11855 if (!OptForSize) {
11856 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11857 Subtarget, DAG))
11858 return Masked;
11859 }
11860
11861 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11862 // masked move.
11863 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11864 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11865 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11866 }
11867 default:
11868 llvm_unreachable("Not a supported integer vector type!");
11869 }
11870}
11871
11872/// Try to lower as a blend of elements from two inputs followed by
11873/// a single-input permutation.
11874///
11875/// This matches the pattern where we can blend elements from two inputs and
11876/// then reduce the shuffle to a single-input permutation.
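/// For example, the v4i32 mask <1,4,3,6> blends to <4,1,6,3> and then
/// applies the single-input permute <1,0,3,2>.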
11877static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11878                                             SDValue V1, SDValue V2,
11879 ArrayRef<int> Mask,
11880 SelectionDAG &DAG,
11881 bool ImmBlends = false) {
11882 // We build up the blend mask while checking whether a blend is a viable way
11883 // to reduce the shuffle.
11884 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11885 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11886
11887 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11888 if (Mask[i] < 0)
11889 continue;
11890
11891 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11892
11893 if (BlendMask[Mask[i] % Size] < 0)
11894 BlendMask[Mask[i] % Size] = Mask[i];
11895 else if (BlendMask[Mask[i] % Size] != Mask[i])
11896 return SDValue(); // Can't blend in the needed input!
11897
11898 PermuteMask[i] = Mask[i] % Size;
11899 }
11900
11901 // If only immediate blends, then bail if the blend mask can't be widened to
11902 // i16.
11903 unsigned EltSize = VT.getScalarSizeInBits();
11904 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11905 return SDValue();
11906
11907 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11908 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11909}
11910
11911/// Try to lower as an unpack of elements from two inputs followed by
11912/// a single-input permutation.
11913///
11914/// This matches the pattern where we can unpack elements from two inputs and
11915/// then reduce the shuffle to a single-input (wider) permutation.
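/// For example, the v4i32 mask <1,5,0,4> unpacks as UNPCKL(V1,V2) to give
/// <0,4,1,5>, which the single-input permute <2,3,0,1> then reorders.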
11916static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11917                                             SDValue V1, SDValue V2,
11918 ArrayRef<int> Mask,
11919 SelectionDAG &DAG) {
11920 int NumElts = Mask.size();
11921 int NumLanes = VT.getSizeInBits() / 128;
11922 int NumLaneElts = NumElts / NumLanes;
11923 int NumHalfLaneElts = NumLaneElts / 2;
11924
11925 bool MatchLo = true, MatchHi = true;
11926 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11927
11928 // Determine UNPCKL/UNPCKH type and operand order.
11929 for (int Elt = 0; Elt != NumElts; ++Elt) {
11930 int M = Mask[Elt];
11931 if (M < 0)
11932 continue;
11933
11934 // Normalize the mask value depending on whether it's V1 or V2.
11935 int NormM = M;
11936 SDValue &Op = Ops[Elt & 1];
11937 if (M < NumElts && (Op.isUndef() || Op == V1))
11938 Op = V1;
11939 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11940 Op = V2;
11941 NormM -= NumElts;
11942 } else
11943 return SDValue();
11944
11945 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11946 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11947 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11948 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11949 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11950 if (MatchLoAnyLane || MatchHiAnyLane) {
11951 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11952 "Failed to match UNPCKLO/UNPCKHI");
11953 break;
11954 }
11955 }
11956 MatchLo &= MatchLoAnyLane;
11957 MatchHi &= MatchHiAnyLane;
11958 if (!MatchLo && !MatchHi)
11959 return SDValue();
11960 }
11961 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11962
11963 // Element indices have changed after unpacking. Calculate permute mask
11964 // so that they will be put back to the position as dictated by the
11965 // original shuffle mask indices.
11966 SmallVector<int, 32> PermuteMask(NumElts, -1);
11967 for (int Elt = 0; Elt != NumElts; ++Elt) {
11968 int M = Mask[Elt];
11969 if (M < 0)
11970 continue;
11971 int NormM = M;
11972 if (NumElts <= M)
11973 NormM -= NumElts;
11974 bool IsFirstOp = M < NumElts;
11975 int BaseMaskElt =
11976 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11977 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11978 PermuteMask[Elt] = BaseMaskElt;
11979 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11980 PermuteMask[Elt] = BaseMaskElt + 1;
11981 assert(PermuteMask[Elt] != -1 &&
11982 "Input mask element is defined but failed to assign permute mask");
11983 }
11984
11985 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11986 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11987 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11988}
11989
11990/// Try to lower a shuffle as a permute of the inputs followed by an
11991/// UNPCK instruction.
11992///
11993/// This specifically targets cases where we end up with alternating between
11994/// the two inputs, and so can permute them into something that feeds a single
11995/// UNPCK instruction. Note that this routine only targets integer vectors
11996/// because for floating point vectors we have a generalized SHUFPS lowering
11997/// strategy that handles everything that doesn't *exactly* match an unpack,
11998/// making this clever lowering unnecessary.
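/// For example, the v8i16 mask <0,9,2,11,4,13,6,15> permutes V1 to
/// <0,2,4,6,-1,-1,-1,-1> and V2 to <1,3,5,7,-1,-1,-1,-1>, after which a
/// single UNPCKL produces the result.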
11999static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
12000                                              SDValue V1, SDValue V2,
12001 ArrayRef<int> Mask,
12002 const X86Subtarget &Subtarget,
12003 SelectionDAG &DAG) {
12004 int Size = Mask.size();
12005 assert(Mask.size() >= 2 && "Single element masks are invalid.");
12006
12007 // This routine only supports 128-bit integer dual input vectors.
12008 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
12009 return SDValue();
12010
12011 int NumLoInputs =
12012 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
12013 int NumHiInputs =
12014 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
12015
12016 bool UnpackLo = NumLoInputs >= NumHiInputs;
12017
12018 auto TryUnpack = [&](int ScalarSize, int Scale) {
12019 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
12020 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
12021
12022 for (int i = 0; i < Size; ++i) {
12023 if (Mask[i] < 0)
12024 continue;
12025
12026 // Each element of the unpack contains Scale elements from this mask.
12027 int UnpackIdx = i / Scale;
12028
12029 // We only handle the case where V1 feeds the first slots of the unpack.
12030 // We rely on canonicalization to ensure this is the case.
12031 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
12032 return SDValue();
12033
12034 // Setup the mask for this input. The indexing is tricky as we have to
12035 // handle the unpack stride.
12036 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
12037 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
12038 Mask[i] % Size;
12039 }
12040
12041 // If we will have to shuffle both inputs to use the unpack, check whether
12042 // we can just unpack first and shuffle the result. If so, skip this unpack.
12043 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
12044 !isNoopShuffleMask(V2Mask))
12045 return SDValue();
12046
12047 // Shuffle the inputs into place.
12048 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12049 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12050
12051 // Cast the inputs to the type we will use to unpack them.
12052 MVT UnpackVT =
12053 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
12054 V1 = DAG.getBitcast(UnpackVT, V1);
12055 V2 = DAG.getBitcast(UnpackVT, V2);
12056
12057 // Unpack the inputs and cast the result back to the desired type.
12058 return DAG.getBitcast(
12059 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12060 UnpackVT, V1, V2));
12061 };
12062
12063 // We try each unpack from the largest to the smallest to try and find one
12064 // that fits this mask.
12065 int OrigScalarSize = VT.getScalarSizeInBits();
12066 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12067 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12068 return Unpack;
12069
12070 // If we're shuffling with a zero vector then we're better off not doing
12071 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
12072  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
12073      ISD::isBuildVectorAllZeros(V2.getNode()))
12074    return SDValue();
12075
12076 // If none of the unpack-rooted lowerings worked (or were profitable) try an
12077 // initial unpack.
12078 if (NumLoInputs == 0 || NumHiInputs == 0) {
12079 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12080 "We have to have *some* inputs!");
12081 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
12082
12083 // FIXME: We could consider the total complexity of the permute of each
12084 // possible unpacking. Or at the least we should consider how many
12085 // half-crossings are created.
12086 // FIXME: We could consider commuting the unpacks.
12087
12088 SmallVector<int, 32> PermMask((unsigned)Size, -1);
12089 for (int i = 0; i < Size; ++i) {
12090 if (Mask[i] < 0)
12091 continue;
12092
12093 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
12094
12095 PermMask[i] =
12096 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
12097 }
12098 return DAG.getVectorShuffle(
12099 VT, DL,
12100 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
12101 V1, V2),
12102 DAG.getUNDEF(VT), PermMask);
12103 }
12104
12105 return SDValue();
12106}
12107
12108/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12109/// permuting the elements of the result in place.
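/// For example, a v16i8 shuffle that interleaves the high half of V1 with the
/// low half of V2 can form PALIGNR(V2, V1, 8) to gather both halves into one
/// register and then permute the result in place.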
12110static SDValue lowerShuffleAsByteRotateAndPermute(
12111    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12112 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12113 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12114 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12115 (VT.is512BitVector() && !Subtarget.hasBWI()))
12116 return SDValue();
12117
12118 // We don't currently support lane crossing permutes.
12119 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12120 return SDValue();
12121
12122 int Scale = VT.getScalarSizeInBits() / 8;
12123 int NumLanes = VT.getSizeInBits() / 128;
12124 int NumElts = VT.getVectorNumElements();
12125 int NumEltsPerLane = NumElts / NumLanes;
12126
12127 // Determine range of mask elts.
12128 bool Blend1 = true;
12129 bool Blend2 = true;
12130 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12131 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12132 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12133 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12134 int M = Mask[Lane + Elt];
12135 if (M < 0)
12136 continue;
12137 if (M < NumElts) {
12138 Blend1 &= (M == (Lane + Elt));
12139 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12140 M = M % NumEltsPerLane;
12141 Range1.first = std::min(Range1.first, M);
12142 Range1.second = std::max(Range1.second, M);
12143 } else {
12144 M -= NumElts;
12145 Blend2 &= (M == (Lane + Elt));
12146 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12147 M = M % NumEltsPerLane;
12148 Range2.first = std::min(Range2.first, M);
12149 Range2.second = std::max(Range2.second, M);
12150 }
12151 }
12152 }
12153
12154 // Bail if we don't need both elements.
12155 // TODO - it might be worth doing this for unary shuffles if the permute
12156 // can be widened.
12157 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12158 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12159 return SDValue();
12160
12161 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12162 return SDValue();
12163
12164 // Rotate the 2 ops so we can access both ranges, then permute the result.
12165 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12166 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12167 SDValue Rotate = DAG.getBitcast(
12168 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12169 DAG.getBitcast(ByteVT, Lo),
12170 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12171 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12172 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12173 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12174 int M = Mask[Lane + Elt];
12175 if (M < 0)
12176 continue;
12177 if (M < NumElts)
12178 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12179 else
12180 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12181 }
12182 }
12183 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12184 };
12185
12186 // Check if the ranges are small enough to rotate from either direction.
12187 if (Range2.second < Range1.first)
12188 return RotateAndPermute(V1, V2, Range1.first, 0);
12189 if (Range1.second < Range2.first)
12190 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12191 return SDValue();
12192}
12193
12194static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
12195  return isUndefOrEqual(Mask, 0);
12196}
12197
12198static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
12199  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12200}
12201
12202/// Check if the Mask consists of the same element repeated multiple times.
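/// For example, <5,5,-1,5> qualifies (element 5 repeated, one undef), while
/// <5,-1,-1,-1> does not since more than half the elements are undef.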
12203static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
12204  size_t NumUndefs = 0;
12205 std::optional<int> UniqueElt;
12206 for (int Elt : Mask) {
12207 if (Elt == SM_SentinelUndef) {
12208 NumUndefs++;
12209 continue;
12210 }
12211 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
12212 return false;
12213 UniqueElt = Elt;
12214 }
12215 // Make sure the element is repeated enough times by checking the number of
12216 // undefs is small.
12217 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
12218}
12219
12220/// Generic routine to decompose a shuffle and blend into independent
12221/// blends and permutes.
12222///
12223/// This matches the extremely common pattern for handling combined
12224/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12225/// operations. It will try to pick the best arrangement of shuffles and
12226/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
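/// For example, the v4i32 mask <0,6,1,7> becomes a shuffle of V1 by
/// <0,-1,1,-1>, a shuffle of V2 by <-1,2,-1,3>, and a final blend <0,5,2,7>.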
12227static SDValue lowerShuffleAsDecomposedShuffleMerge(
12228    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12229 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12230 int NumElts = Mask.size();
12231 int NumLanes = VT.getSizeInBits() / 128;
12232 int NumEltsPerLane = NumElts / NumLanes;
12233
12234 // Shuffle the input elements into the desired positions in V1 and V2 and
12235 // unpack/blend them together.
12236 bool IsAlternating = true;
12237 bool V1Zero = true, V2Zero = true;
12238 SmallVector<int, 32> V1Mask(NumElts, -1);
12239 SmallVector<int, 32> V2Mask(NumElts, -1);
12240 SmallVector<int, 32> FinalMask(NumElts, -1);
12241 for (int i = 0; i < NumElts; ++i) {
12242 int M = Mask[i];
12243 if (M >= 0 && M < NumElts) {
12244 V1Mask[i] = M;
12245 FinalMask[i] = i;
12246 V1Zero &= Zeroable[i];
12247 IsAlternating &= (i & 1) == 0;
12248 } else if (M >= NumElts) {
12249 V2Mask[i] = M - NumElts;
12250 FinalMask[i] = i + NumElts;
12251 V2Zero &= Zeroable[i];
12252 IsAlternating &= (i & 1) == 1;
12253 }
12254 }
12255
12256  // If we effectively demand only the 0'th element of \p Input, and not
12257  // solely in the 0'th position, then broadcast said input
12258  // and change \p InputMask to be a no-op (identity) mask.
12259 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12260 &DAG](SDValue &Input,
12261 MutableArrayRef<int> InputMask) {
12262 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
12263 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12264 !X86::mayFoldLoad(Input, Subtarget)))
12265 return;
12266 if (isNoopShuffleMask(InputMask))
12267 return;
12268 assert(isBroadcastShuffleMask(InputMask) &&
12269 "Expected to demand only the 0'th element.");
12270 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
12271 for (auto I : enumerate(InputMask)) {
12272 int &InputMaskElt = I.value();
12273 if (InputMaskElt >= 0)
12274 InputMaskElt = I.index();
12275 }
12276 };
12277
12278 // Currently, we may need to produce one shuffle per input, and blend results.
12279 // It is possible that the shuffle for one of the inputs is already a no-op.
12280 // See if we can simplify non-no-op shuffles into broadcasts,
12281 // which we consider to be strictly better than an arbitrary shuffle.
12282 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
12283      isNoopOrBroadcastShuffleMask(V2Mask)) {
12284    canonicalizeBroadcastableInput(V1, V1Mask);
12285 canonicalizeBroadcastableInput(V2, V2Mask);
12286 }
12287
12288 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12289 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12290 // the shuffle may be able to fold with a load or other benefit. However, when
12291 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12292 // pre-shuffle first is a better strategy.
12293 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12294 // If we don't have blends, see if we can create a cheap unpack.
12295 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
12296 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
12297 is128BitUnpackShuffleMask(V2Mask, DAG)))
12298 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12299 DL, VT, V1, V2, Mask, Subtarget, DAG))
12300 return PermUnpack;
12301
12302 // Only prefer immediate blends to unpack/rotate.
12303 if (SDValue BlendPerm =
12304 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
12305 return BlendPerm;
12306
12307 // If either input vector provides only a single element which is repeated
12308 // multiple times, unpacking from both input vectors would generate worse
12309 // code. e.g. for
12310 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
12311 // it is better to process t4 first to create a vector of t4[0], then unpack
12312 // that vector with t2.
12313 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
12314        !isSingleElementRepeatedMask(V2Mask))
12315      if (SDValue UnpackPerm =
12316 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
12317 return UnpackPerm;
12318
12319    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12320            DL, VT, V1, V2, Mask, Subtarget, DAG))
12321 return RotatePerm;
12322
12323 // Unpack/rotate failed - try again with variable blends.
12324 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12325 DAG))
12326 return BlendPerm;
12327
12328 if (VT.getScalarSizeInBits() >= 32)
12329 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12330 DL, VT, V1, V2, Mask, Subtarget, DAG))
12331 return PermUnpack;
12332 }
12333
12334 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12335 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12336 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12337 // than half the elements coming from each source.
12338 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12339 V1Mask.assign(NumElts, -1);
12340 V2Mask.assign(NumElts, -1);
12341 FinalMask.assign(NumElts, -1);
12342 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12343 for (int j = 0; j != NumEltsPerLane; ++j) {
12344 int M = Mask[i + j];
12345 if (M >= 0 && M < NumElts) {
12346 V1Mask[i + (j / 2)] = M;
12347 FinalMask[i + j] = i + (j / 2);
12348 } else if (M >= NumElts) {
12349 V2Mask[i + (j / 2)] = M - NumElts;
12350 FinalMask[i + j] = i + (j / 2) + NumElts;
12351 }
12352 }
12353 }
12354
12355 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12356 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12357 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12358}
12359
12360static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12361 const X86Subtarget &Subtarget,
12362 ArrayRef<int> Mask) {
12363 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12364 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12365
12366 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12367 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12368 int MaxSubElts = 64 / EltSizeInBits;
12369 unsigned RotateAmt, NumSubElts;
12370 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
12371 MaxSubElts, NumSubElts, RotateAmt))
12372 return -1;
12373 unsigned NumElts = Mask.size();
12374 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12375 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12376 return RotateAmt;
12377}
12378
12379/// Lower shuffle using X86ISD::VROTLI rotations.
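/// For example, on XOP targets (which have vXi16 rotates) the v16i8 mask
/// <1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14> swaps the bytes of each i16
/// element and matches a v8i16 VROTLI by 8 bits.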
12380static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12381                                       ArrayRef<int> Mask,
12382 const X86Subtarget &Subtarget,
12383 SelectionDAG &DAG) {
12384 // Only XOP + AVX512 targets have bit rotation instructions.
12385 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12386 bool IsLegal =
12387 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12388  if (!IsLegal && Subtarget.hasSSSE3())
12389 return SDValue();
12390
12391 MVT RotateVT;
12392 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12393 Subtarget, Mask);
12394 if (RotateAmt < 0)
12395 return SDValue();
12396
12397 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12398 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12399  // widen to vXi16 or more then the existing lowering will be better.
12400 if (!IsLegal) {
12401 if ((RotateAmt % 16) == 0)
12402 return SDValue();
12403 // TODO: Use getTargetVShiftByConstNode.
12404 unsigned ShlAmt = RotateAmt;
12405 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12406 V1 = DAG.getBitcast(RotateVT, V1);
12407 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12408 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12409 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12410 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12411 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12412 return DAG.getBitcast(VT, Rot);
12413 }
12414
12415 SDValue Rot =
12416 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12417 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12418 return DAG.getBitcast(VT, Rot);
12419}
12420
12421/// Try to match a vector shuffle as an element rotation.
12422///
12423/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12424static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12425                                       ArrayRef<int> Mask) {
12426 int NumElts = Mask.size();
12427
12428 // We need to detect various ways of spelling a rotation:
12429 // [11, 12, 13, 14, 15, 0, 1, 2]
12430 // [-1, 12, 13, 14, -1, -1, 1, -1]
12431 // [-1, -1, -1, -1, -1, -1, 1, 2]
12432 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12433 // [-1, 4, 5, 6, -1, -1, 9, -1]
12434 // [-1, 4, 5, 6, -1, -1, -1, -1]
12435 int Rotation = 0;
12436 SDValue Lo, Hi;
12437 for (int i = 0; i < NumElts; ++i) {
12438 int M = Mask[i];
12439 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12440 "Unexpected mask index.");
12441 if (M < 0)
12442 continue;
12443
12444 // Determine where a rotated vector would have started.
12445 int StartIdx = i - (M % NumElts);
12446 if (StartIdx == 0)
12447 // The identity rotation isn't interesting, stop.
12448 return -1;
12449
12450 // If we found the tail of a vector the rotation must be the missing
12451 // front. If we found the head of a vector, it must be how much of the
12452 // head.
12453 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12454
12455 if (Rotation == 0)
12456 Rotation = CandidateRotation;
12457 else if (Rotation != CandidateRotation)
12458 // The rotations don't match, so we can't match this mask.
12459 return -1;
12460
12461 // Compute which value this mask is pointing at.
12462 SDValue MaskV = M < NumElts ? V1 : V2;
12463
12464 // Compute which of the two target values this index should be assigned
12465 // to. This reflects whether the high elements are remaining or the low
12466 // elements are remaining.
12467 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12468
12469 // Either set up this value if we've not encountered it before, or check
12470 // that it remains consistent.
12471 if (!TargetV)
12472 TargetV = MaskV;
12473 else if (TargetV != MaskV)
12474 // This may be a rotation, but it pulls from the inputs in some
12475 // unsupported interleaving.
12476 return -1;
12477 }
12478
12479 // Check that we successfully analyzed the mask, and normalize the results.
12480 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12481 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12482 if (!Lo)
12483 Lo = Hi;
12484 else if (!Hi)
12485 Hi = Lo;
12486
12487 V1 = Lo;
12488 V2 = Hi;
12489
12490 return Rotation;
12491}
12492
12493/// Try to lower a vector shuffle as a byte rotation.
12494///
12495/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12496/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12497/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12498/// try to generically lower a vector shuffle through such a pattern. It
12499/// does not check for the profitability of lowering either as PALIGNR or
12500/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12501/// This matches shuffle vectors that look like:
12502///
12503/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12504///
12505/// Essentially it concatenates V1 and V2, shifts right by some number of
12506/// elements, and takes the low elements as the result. Note that while this is
12507/// specified as a *right shift* because x86 is little-endian, it is a *left
12508/// rotate* of the vector lanes.
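/// For the v8i16 example above, the element rotation is 3 and each element is
/// 2 bytes, so the PALIGNR immediate is 6.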
12509static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12510                                    ArrayRef<int> Mask) {
12511 // Don't accept any shuffles with zero elements.
12512 if (isAnyZero(Mask))
12513 return -1;
12514
12515 // PALIGNR works on 128-bit lanes.
12516 SmallVector<int, 16> RepeatedMask;
12517 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12518 return -1;
12519
12520 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12521 if (Rotation <= 0)
12522 return -1;
12523
12524 // PALIGNR rotates bytes, so we need to scale the
12525 // rotation based on how many bytes are in the vector lane.
12526 int NumElts = RepeatedMask.size();
12527 int Scale = 16 / NumElts;
12528 return Rotation * Scale;
12529}
12530
12531static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12532                                        SDValue V2, ArrayRef<int> Mask,
12533 const X86Subtarget &Subtarget,
12534 SelectionDAG &DAG) {
12535 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12536
12537 SDValue Lo = V1, Hi = V2;
12538 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12539 if (ByteRotation <= 0)
12540 return SDValue();
12541
12542 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12543 // PSLLDQ/PSRLDQ.
12544 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12545 Lo = DAG.getBitcast(ByteVT, Lo);
12546 Hi = DAG.getBitcast(ByteVT, Hi);
12547
12548 // SSSE3 targets can use the palignr instruction.
12549 if (Subtarget.hasSSSE3()) {
12550 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12551 "512-bit PALIGNR requires BWI instructions");
12552 return DAG.getBitcast(
12553 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12554 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12555 }
12556
12557 assert(VT.is128BitVector() &&
12558 "Rotate-based lowering only supports 128-bit lowering!");
12559 assert(Mask.size() <= 16 &&
12560 "Can shuffle at most 16 bytes in a 128-bit vector!");
12561 assert(ByteVT == MVT::v16i8 &&
12562 "SSE2 rotate lowering only needed for v16i8!");
12563
12564 // Default SSE2 implementation
12565 int LoByteShift = 16 - ByteRotation;
12566 int HiByteShift = ByteRotation;
12567
12568 SDValue LoShift =
12569 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12570 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12571 SDValue HiShift =
12572 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12573 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12574 return DAG.getBitcast(VT,
12575 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12576}
12577
12578/// Try to lower a vector shuffle as a dword/qword rotation.
12579///
12580/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
12581/// rotation of the concatenation of two vectors; This routine will
12582/// try to generically lower a vector shuffle through such a pattern.
12583///
12584/// Essentially it concatenates V1 and V2, shifts right by some number of
12585/// elements, and takes the low elements as the result. Note that while this is
12586/// specified as a *right shift* because x86 is little-endian, it is a *left
12587/// rotate* of the vector lanes.
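/// For example, the v8i32 mask <3,4,5,6,7,8,9,10> takes a contiguous window
/// from the concatenation of the two sources and lowers to VALIGND with an
/// immediate of 3.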
12588static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12589                                    SDValue V2, ArrayRef<int> Mask,
12590 const APInt &Zeroable,
12591 const X86Subtarget &Subtarget,
12592 SelectionDAG &DAG) {
12593 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12594 "Only 32-bit and 64-bit elements are supported!");
12595
12596 // 128/256-bit vectors are only supported with VLX.
12597 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12598 && "VLX required for 128/256-bit vectors");
12599
12600 SDValue Lo = V1, Hi = V2;
12601 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12602 if (0 < Rotation)
12603 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12604 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12605
12606 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12607 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12608 // TODO: We can probably make this more aggressive and use shift-pairs like
12609 // lowerShuffleAsByteShiftMask.
12610 unsigned NumElts = Mask.size();
12611 unsigned ZeroLo = Zeroable.countr_one();
12612 unsigned ZeroHi = Zeroable.countl_one();
12613 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12614 if (!ZeroLo && !ZeroHi)
12615 return SDValue();
12616
12617 if (ZeroLo) {
12618 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12619 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12620 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12621 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12622 getZeroVector(VT, Subtarget, DAG, DL),
12623 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12624 }
12625
12626 if (ZeroHi) {
12627 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12628 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12629 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12630 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12631 getZeroVector(VT, Subtarget, DAG, DL), Src,
12632 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12633 }
12634
12635 return SDValue();
12636}
12637
12638/// Try to lower a vector shuffle as a byte shift sequence.
12639static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12640                                           SDValue V2, ArrayRef<int> Mask,
12641 const APInt &Zeroable,
12642 const X86Subtarget &Subtarget,
12643 SelectionDAG &DAG) {
12644 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12645 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12646
12647 // We need a shuffle that has zeros at one/both ends and a sequential
12648 // shuffle from one source within.
12649 unsigned ZeroLo = Zeroable.countr_one();
12650 unsigned ZeroHi = Zeroable.countl_one();
12651 if (!ZeroLo && !ZeroHi)
12652 return SDValue();
12653
12654 unsigned NumElts = Mask.size();
12655 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12656 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12657 return SDValue();
12658
12659 unsigned Scale = VT.getScalarSizeInBits() / 8;
12660 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12661 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12662 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12663 return SDValue();
12664
12665 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12666 Res = DAG.getBitcast(MVT::v16i8, Res);
12667
12668 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12669 // inner sequential set of elements, possibly offset:
12670 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12671 // 01234567 --> 4567zzzz --> zzzzz456
12672 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12673 if (ZeroLo == 0) {
12674 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12675 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12676 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12677 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12678 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12679 } else if (ZeroHi == 0) {
12680 unsigned Shift = Mask[ZeroLo] % NumElts;
12681 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12682 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12683 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12684 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12685 } else if (!Subtarget.hasSSSE3()) {
12686    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12687 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12688 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12689 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12690 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12691 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12692 Shift += Mask[ZeroLo] % NumElts;
12693 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12694 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12695 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12696 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12697 } else
12698 return SDValue();
12699
12700 return DAG.getBitcast(VT, Res);
12701}
12702
12703/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12704///
12705/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12706/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12707/// matches elements from one of the input vectors shuffled to the left or
12708/// right with zeroable elements 'shifted in'. It handles both the strictly
12709/// bit-wise element shifts and the byte shift across an entire 128-bit double
12710/// quad word lane.
12711///
12712/// PSHL : (little-endian) left bit shift.
12713/// [ zz, 0, zz, 2 ]
12714/// [ -1, 4, zz, -1 ]
12715/// PSRL : (little-endian) right bit shift.
12716/// [ 1, zz, 3, zz]
12717/// [ -1, -1, 7, zz]
12718/// PSLLDQ : (little-endian) left byte shift
12719/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12720/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12721/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12722/// PSRLDQ : (little-endian) right byte shift
12723/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12724/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12725/// [ 1, 2, -1, -1, -1, -1, zz, zz]
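/// For example, the first PSHL mask above, [ zz, 0, zz, 2 ] for v4i32,
/// matches a v2i64 VSHLI by 32 bits.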
12726static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12727 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12728 int MaskOffset, const APInt &Zeroable,
12729 const X86Subtarget &Subtarget) {
12730 int Size = Mask.size();
12731 unsigned SizeInBits = Size * ScalarSizeInBits;
12732
12733 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12734 for (int i = 0; i < Size; i += Scale)
12735 for (int j = 0; j < Shift; ++j)
12736 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12737 return false;
12738
12739 return true;
12740 };
12741
12742 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12743 for (int i = 0; i != Size; i += Scale) {
12744 unsigned Pos = Left ? i + Shift : i;
12745 unsigned Low = Left ? i : i + Shift;
12746 unsigned Len = Scale - Shift;
12747 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12748 return -1;
12749 }
12750
12751 int ShiftEltBits = ScalarSizeInBits * Scale;
12752 bool ByteShift = ShiftEltBits > 64;
12753 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12754 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12755 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12756
12757 // Normalize the scale for byte shifts to still produce an i64 element
12758 // type.
12759 Scale = ByteShift ? Scale / 2 : Scale;
12760
12761 // We need to round trip through the appropriate type for the shift.
12762 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12763 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12764 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12765 return ShiftAmt;
12766 };
12767
12768 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12769 // keep doubling the size of the integer elements up to that. We can
12770 // then shift the elements of the integer vector by whole multiples of
12771 // their width within the elements of the larger integer vector. Test each
12772 // multiple to see if we can find a match with the moved element indices
12773 // and that the shifted in elements are all zeroable.
12774 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12775 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12776 for (int Shift = 1; Shift != Scale; ++Shift)
12777 for (bool Left : {true, false})
12778 if (CheckZeros(Shift, Scale, Left)) {
12779 int ShiftAmt = MatchShift(Shift, Scale, Left);
12780 if (0 < ShiftAmt)
12781 return ShiftAmt;
12782 }
12783
12784 // no match
12785 return -1;
12786}
12787
12788static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12789                                   SDValue V2, ArrayRef<int> Mask,
12790 const APInt &Zeroable,
12791 const X86Subtarget &Subtarget,
12792 SelectionDAG &DAG, bool BitwiseOnly) {
12793 int Size = Mask.size();
12794 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12795
12796 MVT ShiftVT;
12797 SDValue V = V1;
12798 unsigned Opcode;
12799
12800 // Try to match shuffle against V1 shift.
12801 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12802 Mask, 0, Zeroable, Subtarget);
12803
12804 // If V1 failed, try to match shuffle against V2 shift.
12805 if (ShiftAmt < 0) {
12806 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12807 Mask, Size, Zeroable, Subtarget);
12808 V = V2;
12809 }
12810
12811 if (ShiftAmt < 0)
12812 return SDValue();
12813
12814 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12815 return SDValue();
12816
12817 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12818 "Illegal integer vector type");
12819 V = DAG.getBitcast(ShiftVT, V);
12820 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12821 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12822 return DAG.getBitcast(VT, V);
12823}
12824
12825// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12826// Remainder of lower half result is zero and upper half is all undef.
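// For example, a v8i16 mask of [ 1, 2, 3, zz, -1, -1, -1, -1 ] extracts
// Len = 3 elements starting at Idx = 1, encoding BitLen = 48 and BitIdx = 16.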
12827static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12828 ArrayRef<int> Mask, uint64_t &BitLen,
12829 uint64_t &BitIdx, const APInt &Zeroable) {
12830 int Size = Mask.size();
12831 int HalfSize = Size / 2;
12832 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12833 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12834
12835 // Upper half must be undefined.
12836 if (!isUndefUpperHalf(Mask))
12837 return false;
12838
12839 // Determine the extraction length from the part of the
12840 // lower half that isn't zeroable.
12841 int Len = HalfSize;
12842 for (; Len > 0; --Len)
12843 if (!Zeroable[Len - 1])
12844 break;
12845 assert(Len > 0 && "Zeroable shuffle mask");
12846
12847 // Attempt to match first Len sequential elements from the lower half.
12848 SDValue Src;
12849 int Idx = -1;
12850 for (int i = 0; i != Len; ++i) {
12851 int M = Mask[i];
12852 if (M == SM_SentinelUndef)
12853 continue;
12854 SDValue &V = (M < Size ? V1 : V2);
12855 M = M % Size;
12856
12857 // The extracted elements must start at a valid index and all mask
12858 // elements must be in the lower half.
12859 if (i > M || M >= HalfSize)
12860 return false;
12861
12862 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12863 Src = V;
12864 Idx = M - i;
12865 continue;
12866 }
12867 return false;
12868 }
12869
12870 if (!Src || Idx < 0)
12871 return false;
12872
12873 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12874 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12875 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12876 V1 = Src;
12877 return true;
12878}
12879
12880// INSERTQ: Extract lowest Len elements from lower half of second source and
12881// insert over first source, starting at Idx.
12882// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12883static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12884 ArrayRef<int> Mask, uint64_t &BitLen,
12885 uint64_t &BitIdx) {
12886 int Size = Mask.size();
12887 int HalfSize = Size / 2;
12888 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12889
12890 // Upper half must be undefined.
12891 if (!isUndefUpperHalf(Mask))
12892 return false;
12893
12894 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12895 SDValue Base;
12896
12897 // Attempt to match first source from mask before insertion point.
12898 if (isUndefInRange(Mask, 0, Idx)) {
12899 /* EMPTY */
12900 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12901 Base = V1;
12902 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12903 Base = V2;
12904 } else {
12905 continue;
12906 }
12907
12908 // Extend the extraction length looking to match both the insertion of
12909 // the second source and the remaining elements of the first.
12910 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12911 SDValue Insert;
12912 int Len = Hi - Idx;
12913
12914 // Match insertion.
12915 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12916 Insert = V1;
12917 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12918 Insert = V2;
12919 } else {
12920 continue;
12921 }
12922
12923 // Match the remaining elements of the lower half.
12924 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12925 /* EMPTY */
12926 } else if ((!Base || (Base == V1)) &&
12927 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12928 Base = V1;
12929 } else if ((!Base || (Base == V2)) &&
12930 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12931 Size + Hi)) {
12932 Base = V2;
12933 } else {
12934 continue;
12935 }
12936
12937 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12938 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12939 V1 = Base;
12940 V2 = Insert;
12941 return true;
12942 }
12943 }
12944
12945 return false;
12946}
12947
12948/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12949static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12950                                      SDValue V2, ArrayRef<int> Mask,
12951 const APInt &Zeroable, SelectionDAG &DAG) {
12952 uint64_t BitLen, BitIdx;
12953 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12954 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12955 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12956 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12957
12958 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12959 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12960 V2 ? V2 : DAG.getUNDEF(VT),
12961 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12962 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12963
12964 return SDValue();
12965}
12966
12967/// Lower a vector shuffle as an any/signed/zero extension.
12968///
12969/// Given a specific number of elements, element bit width, and extension
12970/// stride, produce an extension of the requested kind based on the available
12971/// features of the subtarget. The extended elements are consecutive and can
12972/// start from an offset element index in the input; to avoid excess
12973/// shuffling, the offset must either be in the bottom lane or at the start
12974/// of a higher lane. All extended elements must be from the same lane.
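/// For example, the v16i8 mask [ 0, zz, zz, zz, 1, zz, zz, zz, ... ] with all
/// non-base elements zeroable matches Scale == 4 and, on SSE4.1 targets,
/// lowers to PMOVZXBD.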
12976static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12977                                               int Scale, int Offset,
12978 unsigned ExtOpc, SDValue InputV,
12979 ArrayRef<int> Mask,
12980 const X86Subtarget &Subtarget,
12981 SelectionDAG &DAG) {
12982 assert(Scale > 1 && "Need a scale to extend.");
12983 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12984 int EltBits = VT.getScalarSizeInBits();
12985 int NumElements = VT.getVectorNumElements();
12986 int NumEltsPerLane = 128 / EltBits;
12987 int OffsetLane = Offset / NumEltsPerLane;
12988 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12989 "Only 8, 16, and 32 bit elements can be extended.");
12990 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12991 assert(0 <= Offset && "Extension offset must be positive.");
12992 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12993 "Extension offset must be in the first lane or start an upper lane.");
12994
12995 // Check that an index is in same lane as the base offset.
12996 auto SafeOffset = [&](int Idx) {
12997 return OffsetLane == (Idx / NumEltsPerLane);
12998 };
12999
13000 // Shift along an input so that the offset base moves to the first element.
13001 auto ShuffleOffset = [&](SDValue V) {
13002 if (!Offset)
13003 return V;
13004
13005 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13006 for (int i = 0; i * Scale < NumElements; ++i) {
13007 int SrcIdx = i + Offset;
13008 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13009 }
13010 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13011 };
13012
13013 // Found a valid a/zext mask! Try various lowering strategies based on the
13014 // input type and available ISA extensions.
13015 if (Subtarget.hasSSE41()) {
13016 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13017 // PUNPCK will catch this in a later shuffle match.
13018 if (Offset && Scale == 2 && VT.is128BitVector())
13019 return SDValue();
13020 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13021 NumElements / Scale);
13022 InputV = DAG.getBitcast(VT, InputV);
13023 InputV = ShuffleOffset(InputV);
13024 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
13025 return DAG.getBitcast(VT, InputV);
13026 }
13027
13028 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13029 InputV = DAG.getBitcast(VT, InputV);
13030 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
13031
13032 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
13033 if (ExtOpc == ISD::SIGN_EXTEND)
13034 return SDValue();
13035
13036 // For any extends we can cheat for larger element sizes and use shuffle
13037 // instructions that can fold with a load and/or copy.
13038 if (AnyExt && EltBits == 32) {
13039 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13040 -1};
13041 return DAG.getBitcast(
13042 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13043 DAG.getBitcast(MVT::v4i32, InputV),
13044 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13045 }
13046 if (AnyExt && EltBits == 16 && Scale > 2) {
13047 int PSHUFDMask[4] = {Offset / 2, -1,
13048 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13049 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13050 DAG.getBitcast(MVT::v4i32, InputV),
13051 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13052 int PSHUFWMask[4] = {1, -1, -1, -1};
13053 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13054 return DAG.getBitcast(
13055 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13056 DAG.getBitcast(MVT::v8i16, InputV),
13057 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13058 }
13059
13060 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13061 // to 64-bits.
13062 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13063 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13064 assert(VT.is128BitVector() && "Unexpected vector width!");
13065
13066 int LoIdx = Offset * EltBits;
13067 SDValue Lo = DAG.getBitcast(
13068 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13069 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13070 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13071
13072 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13073 return DAG.getBitcast(VT, Lo);
13074
13075 int HiIdx = (Offset + 1) * EltBits;
13076 SDValue Hi = DAG.getBitcast(
13077 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13078 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13079 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13080 return DAG.getBitcast(VT,
13081 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13082 }
13083
13084 // If this would require more than 2 unpack instructions to expand, use
13085 // pshufb when available. We can only use more than 2 unpack instructions
13086 // when zero extending i8 elements which also makes it easier to use pshufb.
13087 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13088 assert(NumElements == 16 && "Unexpected byte vector width!");
13089 SDValue PSHUFBMask[16];
13090 for (int i = 0; i < 16; ++i) {
13091 int Idx = Offset + (i / Scale);
13092 if ((i % Scale == 0 && SafeOffset(Idx))) {
13093 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13094 continue;
13095 }
13096 PSHUFBMask[i] =
13097 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13098 }
13099 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13100 return DAG.getBitcast(
13101 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13102 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13103 }
13104
13105 // If we are extending from an offset, ensure we start on a boundary that
13106 // we can unpack from.
13107 int AlignToUnpack = Offset % (NumElements / Scale);
13108 if (AlignToUnpack) {
13109 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13110 for (int i = AlignToUnpack; i < NumElements; ++i)
13111 ShMask[i - AlignToUnpack] = i;
13112 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13113 Offset -= AlignToUnpack;
13114 }
13115
13116 // Otherwise emit a sequence of unpacks.
13117 do {
13118 unsigned UnpackLoHi = X86ISD::UNPCKL;
13119 if (Offset >= (NumElements / 2)) {
13120 UnpackLoHi = X86ISD::UNPCKH;
13121 Offset -= (NumElements / 2);
13122 }
13123
13124 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13125 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13126 : getZeroVector(InputVT, Subtarget, DAG, DL);
13127 InputV = DAG.getBitcast(InputVT, InputV);
13128 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13129 Scale /= 2;
13130 EltBits *= 2;
13131 NumElements /= 2;
13132 } while (Scale > 1);
13133 return DAG.getBitcast(VT, InputV);
13134}
13135
13136/// Try to lower a vector shuffle as a zero extension on any microarch.
13137///
13138/// This routine will try to do everything in its power to cleverly lower
13139/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13140/// check for the profitability of this lowering, it tries to aggressively
13141/// match this pattern. It will use all of the micro-architectural details it
13142/// can to emit an efficient lowering. It handles both blends with all-zero
13143/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13144/// masking out later).
13145///
13146/// The reason we have dedicated lowering for zext-style shuffles is that they
13147/// are both incredibly common and often quite performance sensitive.
13148static SDValue lowerShuffleAsZeroOrAnyExtend(
13149    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13150 const APInt &Zeroable, const X86Subtarget &Subtarget,
13151 SelectionDAG &DAG) {
13152 int Bits = VT.getSizeInBits();
13153 int NumLanes = Bits / 128;
13154 int NumElements = VT.getVectorNumElements();
13155 int NumEltsPerLane = NumElements / NumLanes;
13156 assert(VT.getScalarSizeInBits() <= 32 &&
13157 "Exceeds 32-bit integer zero extension limit");
13158 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13159
13160 // Define a helper function to check a particular ext-scale and lower to it if
13161 // valid.
13162 auto Lower = [&](int Scale) -> SDValue {
13163 SDValue InputV;
13164 bool AnyExt = true;
13165 int Offset = 0;
13166 int Matches = 0;
13167 for (int i = 0; i < NumElements; ++i) {
13168 int M = Mask[i];
13169 if (M < 0)
13170 continue; // Valid anywhere but doesn't tell us anything.
13171 if (i % Scale != 0) {
13172 // Each of the extended elements need to be zeroable.
13173 if (!Zeroable[i])
13174 return SDValue();
13175
13176 // We no longer are in the anyext case.
13177 AnyExt = false;
13178 continue;
13179 }
13180
13181 // Each of the base elements needs to be consecutive indices into the
13182 // same input vector.
13183 SDValue V = M < NumElements ? V1 : V2;
13184 M = M % NumElements;
13185 if (!InputV) {
13186 InputV = V;
13187 Offset = M - (i / Scale);
13188 } else if (InputV != V)
13189 return SDValue(); // Flip-flopping inputs.
13190
13191 // Offset must start in the lowest 128-bit lane or at the start of an
13192 // upper lane.
13193 // FIXME: Is it ever worth allowing a negative base offset?
13194 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13195 (Offset % NumEltsPerLane) == 0))
13196 return SDValue();
13197
13198 // If we are offsetting, all referenced entries must come from the same
13199 // lane.
13200 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13201 return SDValue();
13202
13203 if ((M % NumElements) != (Offset + (i / Scale)))
13204 return SDValue(); // Non-consecutive strided elements.
13205 Matches++;
13206 }
13207
13208 // If we fail to find an input, we have a zero-shuffle which should always
13209 // have already been handled.
13210 // FIXME: Maybe handle this here in case we end up with one during blending?
13211 if (!InputV)
13212 return SDValue();
13213
13214 // If we are offsetting, don't extend if we only match a single input; we
13215 // can always do better by using a basic PSHUF or PUNPCK.
13216 if (Offset != 0 && Matches < 2)
13217 return SDValue();
13218
13219 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
13220 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
13221 InputV, Mask, Subtarget, DAG);
13222 };
13223
13224 // The widest scale possible for extending is to a 64-bit integer.
13225 assert(Bits % 64 == 0 &&
13226 "The number of bits in a vector must be divisible by 64 on x86!");
13227 int NumExtElements = Bits / 64;
13228
13229 // Each iteration, try extending the elements half as much, but into twice as
13230 // many elements.
13231 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13232 assert(NumElements % NumExtElements == 0 &&
13233 "The input vector size must be divisible by the extended size.");
13234 if (SDValue V = Lower(NumElements / NumExtElements))
13235 return V;
13236 }
13237
13238 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13239 if (Bits != 128)
13240 return SDValue();
13241
13242 // Returns one of the source operands if the shuffle can be reduced to a
13243 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13244 auto CanZExtLowHalf = [&]() {
13245 for (int i = NumElements / 2; i != NumElements; ++i)
13246 if (!Zeroable[i])
13247 return SDValue();
13248 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13249 return V1;
13250 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13251 return V2;
13252 return SDValue();
13253 };
13254
13255 if (SDValue V = CanZExtLowHalf()) {
13256 V = DAG.getBitcast(MVT::v2i64, V);
13257 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13258 return DAG.getBitcast(VT, V);
13259 }
13260
13261 // No viable ext lowering found.
13262 return SDValue();
13263}
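// Worked example (illustrative, not from the original source): for a v8i16
// shuffle with mask <0,Z,1,Z,2,Z,3,Z> where every Z lane is known zeroable,
// the Scale = 4 attempt fails (element 4 would need mask value 1, but has 2),
// while Scale = 2 matches with Offset = 0 and AnyExt = false, so the shuffle
// becomes a ZERO_EXTEND of the low four i16 elements, e.g. a single PMOVZXWD
// where SSE4.1 is available.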
13264
13265/// Try to get a scalar value for a specific element of a vector.
13266///
13267/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13268static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13269 SelectionDAG &DAG) {
13270 MVT VT = V.getSimpleValueType();
13271 MVT EltVT = VT.getVectorElementType();
13272 V = peekThroughBitcasts(V);
13273
13274 // If the bitcasts change the element size, we can't extract an equivalent
13275 // element from it.
13276 MVT NewVT = V.getSimpleValueType();
13277 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13278 return SDValue();
13279
13280 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13281 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13282 // Ensure the scalar operand is the same size as the destination.
13283 // FIXME: Add support for scalar truncation where possible.
13284 SDValue S = V.getOperand(Idx);
13285 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13286 return DAG.getBitcast(EltVT, S);
13287 }
13288
13289 return SDValue();
13290}
13291
13292/// Helper to test for a load that can be folded with x86 shuffles.
13293///
13294/// This is particularly important because the set of instructions varies
13295/// significantly based on whether the operand is a load or not.
13296static bool isShuffleFoldableLoad(SDValue V) {
13297 return V.hasOneUse() &&
13298 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
13299}
13300
13301template<typename T>
13302static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
13303 T EltVT = VT.getScalarType();
13304 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
13305 (EltVT == MVT::f16 && !Subtarget.hasFP16());
13306}
13307
13308/// Try to lower insertion of a single element into a zero vector.
13309///
13310/// This is a common pattern for which we have especially efficient lowering
13311/// patterns across all subtarget feature sets.
13312static SDValue lowerShuffleAsElementInsertion(
13313 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13314 const APInt &Zeroable, const X86Subtarget &Subtarget,
13315 SelectionDAG &DAG) {
13316 MVT ExtVT = VT;
13317 MVT EltVT = VT.getVectorElementType();
13318 unsigned NumElts = VT.getVectorNumElements();
13319 unsigned EltBits = VT.getScalarSizeInBits();
13320
13321 if (isSoftF16(EltVT, Subtarget))
13322 return SDValue();
13323
13324 int V2Index =
13325 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13326 Mask.begin();
13327 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
13328 bool IsV1Zeroable = true;
13329 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13330 if (i != V2Index && !Zeroable[i]) {
13331 IsV1Zeroable = false;
13332 break;
13333 }
13334
13335 // Bail if a non-zero V1 isn't used in place.
13336 if (!IsV1Zeroable) {
13337 SmallVector<int, 8> V1Mask(Mask);
13338 V1Mask[V2Index] = -1;
13339 if (!isNoopShuffleMask(V1Mask))
13340 return SDValue();
13341 }
13342
13343 // Check for a single input from a SCALAR_TO_VECTOR node.
13344 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13345 // all the smarts here sunk into that routine. However, the current
13346 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13347 // vector shuffle lowering is dead.
13348 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13349 DAG);
13350 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13351 // We need to zext the scalar if it is smaller than an i32.
13352 V2S = DAG.getBitcast(EltVT, V2S);
13353 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13354 // Using zext to expand a narrow element won't work for non-zero
13355 // insertions. But we can use a masked constant vector if we're
13356 // inserting V2 into the bottom of V1.
13357 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
13358 return SDValue();
13359
13360 // Zero-extend directly to i32.
13361 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13362 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13363
13364 // If we're inserting into a constant, mask off the inserted index
13365 // and OR with the zero-extended scalar.
13366 if (!IsV1Zeroable) {
13367 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
13368 Bits[V2Index] = APInt::getZero(EltBits);
13369 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
13370 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
13371 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13372 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
13373 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13374 }
13375 }
13376 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13377 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13378 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
13379 // Either not inserting from the low element of the input or the input
13380 // element size is too small to use VZEXT_MOVL to clear the high bits.
13381 return SDValue();
13382 }
13383
13384 if (!IsV1Zeroable) {
13385 // If V1 can't be treated as a zero vector we have fewer options to lower
13386 // this. We can't support integer vectors or non-zero targets cheaply.
13387 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13388 if (!VT.isFloatingPoint() || V2Index != 0)
13389 return SDValue();
13390 if (!VT.is128BitVector())
13391 return SDValue();
13392
13393 // Otherwise, use MOVSD, MOVSS or MOVSH.
13394 unsigned MovOpc = 0;
13395 if (EltVT == MVT::f16)
13396 MovOpc = X86ISD::MOVSH;
13397 else if (EltVT == MVT::f32)
13398 MovOpc = X86ISD::MOVSS;
13399 else if (EltVT == MVT::f64)
13400 MovOpc = X86ISD::MOVSD;
13401 else
13402 llvm_unreachable("Unsupported floating point element type to handle!");
13403 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13404 }
13405
13406 // This lowering only works for the low element with floating point vectors.
13407 if (VT.isFloatingPoint() && V2Index != 0)
13408 return SDValue();
13409
13410 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13411 if (ExtVT != VT)
13412 V2 = DAG.getBitcast(VT, V2);
13413
13414 if (V2Index != 0) {
13415 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13416 // the desired position. Otherwise it is more efficient to do a vector
13417 // shift left. We know that we can do a vector shift left because all
13418 // the inputs are zero.
13419 if (VT.isFloatingPoint() || NumElts <= 4) {
13420 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13421 V2Shuffle[V2Index] = 0;
13422 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13423 } else {
13424 V2 = DAG.getBitcast(MVT::v16i8, V2);
13425 V2 = DAG.getNode(
13426 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13427 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
13428 V2 = DAG.getBitcast(VT, V2);
13429 }
13430 }
13431 return V2;
13432}
13433
13434/// Try to lower a broadcast of a single (truncated) integer element coming
13435/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
13436///
13437/// This assumes we have AVX2.
13438static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13439 int BroadcastIdx,
13440 const X86Subtarget &Subtarget,
13441 SelectionDAG &DAG) {
13442 assert(Subtarget.hasAVX2() &&
13443 "We can only lower integer broadcasts with AVX2!");
13444
13445 MVT EltVT = VT.getVectorElementType();
13446 MVT V0VT = V0.getSimpleValueType();
13447
13448 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13449 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13450
13451 MVT V0EltVT = V0VT.getVectorElementType();
13452 if (!V0EltVT.isInteger())
13453 return SDValue();
13454
13455 const unsigned EltSize = EltVT.getSizeInBits();
13456 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13457
13458 // This is only a truncation if the original element type is larger.
13459 if (V0EltSize <= EltSize)
13460 return SDValue();
13461
13462 assert(((V0EltSize % EltSize) == 0) &&
13463 "Scalar type sizes must all be powers of 2 on x86!");
13464
13465 const unsigned V0Opc = V0.getOpcode();
13466 const unsigned Scale = V0EltSize / EltSize;
13467 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13468
13469 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13470 V0Opc != ISD::BUILD_VECTOR)
13471 return SDValue();
13472
13473 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13474
13475 // If we're extracting non-least-significant bits, shift so we can truncate.
13476 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13477 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13478 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13479 if (const int OffsetIdx = BroadcastIdx % Scale)
13480 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13481 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13482
13483 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13484 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13485}
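// Worked example (illustrative, not from the original source): broadcasting
// element 2 of a v16i8 shuffle whose source \p V0 is a BUILD_VECTOR of i32
// scalars gives Scale = 4, V0BroadcastIdx = 0 and OffsetIdx = 2, so the i32
// scalar is shifted right by 16 bits, truncated to i8, and broadcast, e.g.
// with VPBROADCASTB.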
13486
13487/// Test whether this can be lowered with a single SHUFPS instruction.
13488///
13489/// This is used to disable more specialized lowerings when the shufps lowering
13490/// will happen to be efficient.
13491static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13492 // This routine only handles 128-bit shufps.
13493 assert(Mask.size() == 4 && "Unsupported mask size!");
13494 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13495 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13496 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13497 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13498
13499 // To lower with a single SHUFPS we need to have the low half and high half
13500 // each requiring a single input.
13501 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13502 return false;
13503 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13504 return false;
13505
13506 return true;
13507}
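// Worked example (illustrative, not from the original source): mask <0,2,4,6>
// passes, since both low elements come from V1 and both high elements from
// V2, while mask <0,4,1,5> fails because its low half mixes inputs and would
// therefore need more than one SHUFPS.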
13508
13509/// Test whether the specified input (0 or 1) is in-place blended by the
13510/// given mask.
13511///
13512/// This returns true if the elements from a particular input are already in
13513/// the slots required by the given mask and require no permutation.
13514static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13515 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13516 int Size = Mask.size();
13517 for (int i = 0; i < Size; ++i)
13518 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13519 return false;
13520
13521 return true;
13522}
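// Worked example (illustrative, not from the original source): for the
// 4-element mask <0,5,2,7>, input 0 is in place (elements 0 and 2 sit at
// positions 0 and 2), and so is input 1 (5 % 4 == 1 and 7 % 4 == 3 match
// their positions), so the shuffle is a pure blend with no permutation.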
13523
13524/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
13525/// the given mask.
13526///
13527static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
13528 int BroadcastableElement = 0) {
13529 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13530 int Size = Mask.size();
13531 for (int i = 0; i < Size; ++i)
13532 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
13533 Mask[i] % Size != BroadcastableElement)
13534 return false;
13535 return true;
13536}
13537
13538/// If we are extracting two 128-bit halves of a vector and shuffling the
13539/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13540/// multi-shuffle lowering.
13541static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13542 SDValue N1, ArrayRef<int> Mask,
13543 SelectionDAG &DAG) {
13544 MVT VT = N0.getSimpleValueType();
13545 assert((VT.is128BitVector() &&
13546 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13547 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13548
13549 // Check that both sources are extracts of the same source vector.
13550 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13551 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13552 N0.getOperand(0) != N1.getOperand(0) ||
13553 !N0.hasOneUse() || !N1.hasOneUse())
13554 return SDValue();
13555
13556 SDValue WideVec = N0.getOperand(0);
13557 MVT WideVT = WideVec.getSimpleValueType();
13558 if (!WideVT.is256BitVector())
13559 return SDValue();
13560
13561 // Match extracts of each half of the wide source vector. Commute the shuffle
13562 // if the extract of the low half is N1.
13563 unsigned NumElts = VT.getVectorNumElements();
13564 SmallVector<int, 4> NewMask(Mask);
13565 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13566 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13567 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13568 ShuffleVectorSDNode::commuteMask(NewMask);
13569 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13570 return SDValue();
13571
13572 // Final bailout: if the mask is simple, we are better off using an extract
13573 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13574 // because that avoids a constant load from memory.
13575 if (NumElts == 4 &&
13576 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13577 return SDValue();
13578
13579 // Extend the shuffle mask with undef elements.
13580 NewMask.append(NumElts, -1);
13581
13582 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13583 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13584 NewMask);
13585 // This is free: ymm -> xmm.
13586 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13587 DAG.getVectorIdxConstant(0, DL));
13588}
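// Worked example (illustrative, not from the original source): with
// N0 = extract_subvector(X:v8f32, 0), N1 = extract_subvector(X:v8f32, 4) and
// mask <0,7,2,5>, neither the SHUFPS nor the unpack bailout fires, so the
// mask is widened to <0,7,2,5,-1,-1,-1,-1>, applied to X as one VPERMPS-style
// shuffle, and the low 128 bits of the result are extracted for free.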
13589
13590/// Try to lower broadcast of a single element.
13591///
13592/// For convenience, this code also bundles all of the subtarget feature set
13593/// filtering. While a little annoying to re-dispatch on type here, there isn't
13594/// a convenient way to factor it out.
13595static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13596 SDValue V2, ArrayRef<int> Mask,
13597 const X86Subtarget &Subtarget,
13598 SelectionDAG &DAG) {
13599 MVT EltVT = VT.getVectorElementType();
13600 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13601 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13602 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13603 return SDValue();
13604
13605 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13606 // we can only broadcast from a register with AVX2.
13607 unsigned NumEltBits = VT.getScalarSizeInBits();
13608 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13609 ? X86ISD::MOVDDUP
13610 : X86ISD::VBROADCAST;
13611 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13612
13613 // Check that the mask is a broadcast.
13614 int BroadcastIdx = getSplatIndex(Mask);
13615 if (BroadcastIdx < 0) {
13616 // Check for hidden broadcast.
13617 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13618 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13619 return SDValue();
13620 BroadcastIdx = 0;
13621 }
13622 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13623 "a sorted mask where the broadcast "
13624 "comes from V1.");
13625 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13626
13627 // Go up the chain of (vector) values to find a scalar load that we can
13628 // combine with the broadcast.
13629 // TODO: Combine this logic with findEltLoadSrc() used by
13630 // EltsFromConsecutiveLoads().
13631 int BitOffset = BroadcastIdx * NumEltBits;
13632 SDValue V = V1;
13633 for (;;) {
13634 switch (V.getOpcode()) {
13635 case ISD::BITCAST: {
13636 V = V.getOperand(0);
13637 continue;
13638 }
13639 case ISD::CONCAT_VECTORS: {
13640 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13641 int OpIdx = BitOffset / OpBitWidth;
13642 V = V.getOperand(OpIdx);
13643 BitOffset %= OpBitWidth;
13644 continue;
13645 }
13646 case ISD::EXTRACT_SUBVECTOR: {
13647 // The extraction index adds to the existing offset.
13648 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13649 unsigned Idx = V.getConstantOperandVal(1);
13650 unsigned BeginOffset = Idx * EltBitWidth;
13651 BitOffset += BeginOffset;
13652 V = V.getOperand(0);
13653 continue;
13654 }
13655 case ISD::INSERT_SUBVECTOR: {
13656 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13657 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13658 int Idx = (int)V.getConstantOperandVal(2);
13659 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13660 int BeginOffset = Idx * EltBitWidth;
13661 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13662 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13663 BitOffset -= BeginOffset;
13664 V = VInner;
13665 } else {
13666 V = VOuter;
13667 }
13668 continue;
13669 }
13670 }
13671 break;
13672 }
13673 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13674 BroadcastIdx = BitOffset / NumEltBits;
13675
13676 // Do we need to bitcast the source to retrieve the original broadcast index?
13677 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13678
13679 // Check if this is a broadcast of a scalar. We special case lowering
13680 // for scalars so that we can more effectively fold with loads.
13681 // If the original value has a larger element type than the shuffle, the
13682 // broadcast element is in essence truncated. Make that explicit to ease
13683 // folding.
13684 if (BitCastSrc && VT.isInteger())
13685 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13686 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13687 return TruncBroadcast;
13688
13689 // Also check the simpler case, where we can directly reuse the scalar.
13690 if (!BitCastSrc &&
13691 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13692 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13693 V = V.getOperand(BroadcastIdx);
13694
13695 // If we can't broadcast from a register, check that the input is a load.
13696 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13697 return SDValue();
13698 } else if (ISD::isNormalLoad(V.getNode()) &&
13699 cast<LoadSDNode>(V)->isSimple()) {
13700 // We do not check for one-use of the vector load because a broadcast load
13701 // is expected to be a win for code size, register pressure, and possibly
13702 // uops even if the original vector load is not eliminated.
13703
13704 // Reduce the vector load and shuffle to a broadcasted scalar load.
13705 auto *Ld = cast<LoadSDNode>(V);
13706 SDValue BaseAddr = Ld->getBasePtr();
13707 MVT SVT = VT.getScalarType();
13708 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13709 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13710 SDValue NewAddr =
13711 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13712
13713 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13714 // than MOVDDUP.
13715 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13716 if (Opcode == X86ISD::VBROADCAST) {
13717 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13718 SDValue Ops[] = {Ld->getChain(), NewAddr};
13719 V = DAG.getMemIntrinsicNode(
13720 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13721 DAG.getMachineFunction().getMachineMemOperand(
13722 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13723 DAG.makeEquivalentMemoryOrdering(Ld, V);
13724 return DAG.getBitcast(VT, V);
13725 }
13726 assert(SVT == MVT::f64 && "Unexpected VT!");
13727 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13728 DAG.getMachineFunction().getMachineMemOperand(
13729 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13730 DAG.makeEquivalentMemoryOrdering(Ld, V);
13731 } else if (!BroadcastFromReg) {
13732 // We can't broadcast from a vector register.
13733 return SDValue();
13734 } else if (BitOffset != 0) {
13735 // We can only broadcast from the zero-element of a vector register,
13736 // but it can be advantageous to broadcast from the zero-element of a
13737 // subvector.
13738 if (!VT.is256BitVector() && !VT.is512BitVector())
13739 return SDValue();
13740
13741 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13742 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13743 return SDValue();
13744
13745 // If we are broadcasting an element from the lowest 128-bit subvector, try
13746 // to move the element in position.
13747 if (BitOffset < 128 && NumActiveElts > 1 &&
13748 V.getScalarValueSizeInBits() == NumEltBits) {
13749 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13750 "Unexpected bit-offset");
13751 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13752 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13753 V = extractSubVector(V, 0, DAG, DL, 128);
13754 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13755 } else {
13756 // Only broadcast the zero-element of a 128-bit subvector.
13757 if ((BitOffset % 128) != 0)
13758 return SDValue();
13759
13760 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13761 "Unexpected bit-offset");
13762 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13763 "Unexpected vector size");
13764 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13765 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13766 }
13767 }
13768
13769 // On AVX we can use VBROADCAST directly for scalar sources.
13770 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13771 V = DAG.getBitcast(MVT::f64, V);
13772 if (Subtarget.hasAVX()) {
13773 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13774 return DAG.getBitcast(VT, V);
13775 }
13776 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13777 }
13778
13779 // If this is a scalar, do the broadcast on this type and bitcast.
13780 if (!V.getValueType().isVector()) {
13781 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13782 "Unexpected scalar size");
13783 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13784 VT.getSizeInBits() / NumEltBits);
13785 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13786 }
13787
13788 // We only support broadcasting from 128-bit vectors to minimize the
13789 // number of patterns we need to deal with in isel. So extract down to
13790 // 128-bits, removing as many bitcasts as possible.
13791 if (V.getValueSizeInBits() > 128)
13792 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13793
13794 // Otherwise cast V to a vector with the same element type as VT, but
13795 // possibly narrower than VT. Then perform the broadcast.
13796 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13797 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13798 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13799}
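// Worked example (illustrative, not from the original source): on AVX2,
// splatting element 5 of V1 = concat_vectors(A:v4i32, B:v4i32) walks into B
// (BitOffset 160 -> 32), leaving a broadcast of element 1 of B; if B is a
// simple vector load, this is further reduced to a VBROADCAST_LOAD of the
// i32 at byte offset 4 from B's base address.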
13800
13801// Check for whether we can use INSERTPS to perform the shuffle. We only use
13802// INSERTPS when the V1 elements are already in the correct locations
13803// because otherwise we can just always use two SHUFPS instructions which
13804// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13805// perform INSERTPS if a single V1 element is out of place and all V2
13806// elements are zeroable.
13807static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13808 unsigned &InsertPSMask,
13809 const APInt &Zeroable,
13810 ArrayRef<int> Mask, SelectionDAG &DAG) {
13811 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13812 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13813 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13814
13815 // Attempt to match INSERTPS with one element from VA or VB being
13816 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13817 // are updated.
13818 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13819 ArrayRef<int> CandidateMask) {
13820 unsigned ZMask = 0;
13821 int VADstIndex = -1;
13822 int VBDstIndex = -1;
13823 bool VAUsedInPlace = false;
13824
13825 for (int i = 0; i < 4; ++i) {
13826 // Synthesize a zero mask from the zeroable elements (includes undefs).
13827 if (Zeroable[i]) {
13828 ZMask |= 1 << i;
13829 continue;
13830 }
13831
13832 // Flag if we use any VA inputs in place.
13833 if (i == CandidateMask[i]) {
13834 VAUsedInPlace = true;
13835 continue;
13836 }
13837
13838 // We can only insert a single non-zeroable element.
13839 if (VADstIndex >= 0 || VBDstIndex >= 0)
13840 return false;
13841
13842 if (CandidateMask[i] < 4) {
13843 // VA input out of place for insertion.
13844 VADstIndex = i;
13845 } else {
13846 // VB input for insertion.
13847 VBDstIndex = i;
13848 }
13849 }
13850
13851 // Don't bother if we have no (non-zeroable) element for insertion.
13852 if (VADstIndex < 0 && VBDstIndex < 0)
13853 return false;
13854
13855 // Determine element insertion src/dst indices. The src index is from the
13856 // start of the inserted vector, not the start of the concatenated vector.
13857 unsigned VBSrcIndex = 0;
13858 if (VADstIndex >= 0) {
13859 // If we have a VA input out of place, we use VA as the V2 element
13860 // insertion and don't use the original V2 at all.
13861 VBSrcIndex = CandidateMask[VADstIndex];
13862 VBDstIndex = VADstIndex;
13863 VB = VA;
13864 } else {
13865 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13866 }
13867
13868 // If no V1 inputs are used in place, then the result is created only from
13869 // the zero mask and the V2 insertion - so remove V1 dependency.
13870 if (!VAUsedInPlace)
13871 VA = DAG.getUNDEF(MVT::v4f32);
13872
13873 // Update V1, V2 and InsertPSMask accordingly.
13874 V1 = VA;
13875 V2 = VB;
13876
13877 // Insert the V2 element into the desired position.
13878 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13879 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13880 return true;
13881 };
13882
13883 if (matchAsInsertPS(V1, V2, Mask))
13884 return true;
13885
13886 // Commute and try again.
13887 SmallVector<int, 4> CommutedMask(Mask);
13888 ShuffleVectorSDNode::commuteMask(CommutedMask);
13889 if (matchAsInsertPS(V2, V1, CommutedMask))
13890 return true;
13891
13892 return false;
13893}
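// Worked example (illustrative, not from the original source): for mask
// <0,5,2,3> with no zeroable lanes, only element 1 comes from V2 (source
// element 5 - 4 = 1), so InsertPSMask = (1 << 6) | (1 << 4) | 0 = 0x50; if
// lane 3 were also zeroable, the zero mask would add bit 3, giving 0x58.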
13894
13895static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13896 ArrayRef<int> Mask, const APInt &Zeroable,
13897 SelectionDAG &DAG) {
13898 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13899 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13900
13901 // Attempt to match the insertps pattern.
13902 unsigned InsertPSMask = 0;
13903 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13904 return SDValue();
13905
13906 // Insert the V2 element into the desired position.
13907 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13908 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13909}
13910
13911/// Handle lowering of 2-lane 64-bit floating point shuffles.
13912///
13913/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13914/// support for floating point shuffles but not integer shuffles. These
13915/// instructions will incur a domain crossing penalty on some chips though so
13916/// it is better to avoid lowering through this for integer vectors where
13917/// possible.
13918static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13919 const APInt &Zeroable, SDValue V1, SDValue V2,
13920 const X86Subtarget &Subtarget,
13921 SelectionDAG &DAG) {
13922 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13923 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13924 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13925
13926 if (V2.isUndef()) {
13927 // Check for being able to broadcast a single element.
13928 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13929 Mask, Subtarget, DAG))
13930 return Broadcast;
13931
13932 // Straight shuffle of a single input vector. Simulate this by using the
13933 // single input as both of the "inputs" to this instruction.
13934 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13935
13936 if (Subtarget.hasAVX()) {
13937 // If we have AVX, we can use VPERMILPS which will allow folding a load
13938 // into the shuffle.
13939 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13940 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13941 }
13942
13943 return DAG.getNode(
13944 X86ISD::SHUFP, DL, MVT::v2f64,
13945 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13946 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13947 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13948 }
13949 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13950 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13951 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13952 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13953
13954 if (Subtarget.hasAVX2())
13955 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13956 return Extract;
13957
13958 // When loading a scalar and then shuffling it into a vector we can often do
13959 // the insertion cheaply.
13960 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13961 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13962 return Insertion;
13963 // Try inverting the insertion since for v2 masks it is easy to do and we
13964 // can't reliably sort the mask one way or the other.
13965 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13966 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13967 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13968 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13969 return Insertion;
13970
13971 // Try to use one of the special instruction patterns to handle two common
13972 // blend patterns if a zero-blend above didn't work.
13973 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13974 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13975 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13976 // We can either use a special instruction to load over the low double or
13977 // to move just the low double.
13978 return DAG.getNode(
13979 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13980 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13981
13982 if (Subtarget.hasSSE41())
13983 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13984 Zeroable, Subtarget, DAG))
13985 return Blend;
13986
13987 // Use dedicated unpack instructions for masks that match their pattern.
13988 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13989 return V;
13990
13991 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13992 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13993 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13994}
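// Worked example (illustrative, not from the original source): if none of the
// earlier patterns match, the two-input v2f64 mask <1,2> reaches the final
// SHUFPD, with SHUFPDMask = (1 == 1) | ((2 - 2 == 1) << 1) = 1, selecting
// element 1 of V1 into lane 0 and element 0 of V2 into lane 1.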
13995
13996/// Handle lowering of 2-lane 64-bit integer shuffles.
13997///
13998/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13999/// the integer unit to minimize domain crossing penalties. However, for blends
14000/// it falls back to the floating point shuffle operation with appropriate bit
14001/// casting.
14002static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14003 const APInt &Zeroable, SDValue V1, SDValue V2,
14004 const X86Subtarget &Subtarget,
14005 SelectionDAG &DAG) {
14006 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14007 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14008 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14009
14010 if (V2.isUndef()) {
14011 // Check for being able to broadcast a single element.
14012 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14013 Mask, Subtarget, DAG))
14014 return Broadcast;
14015
14016 // Straight shuffle of a single input vector. For everything from SSE2
14017 // onward this has a single fast instruction with no scary immediates.
14018 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14019 V1 = DAG.getBitcast(MVT::v4i32, V1);
14020 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14021 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14022 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14023 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14024 return DAG.getBitcast(
14025 MVT::v2i64,
14026 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14027 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14028 }
14029 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14030 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14031 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14032 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14033
14034 if (Subtarget.hasAVX2())
14035 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14036 return Extract;
14037
14038 // Try to use shift instructions.
14039 if (SDValue Shift =
14040 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
14041 DAG, /*BitwiseOnly*/ false))
14042 return Shift;
14043
14044 // When loading a scalar and then shuffling it into a vector we can often do
14045 // the insertion cheaply.
14046 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14047 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14048 return Insertion;
14049 // Try inverting the insertion since for v2 masks it is easy to do and we
14050 // can't reliably sort the mask one way or the other.
14051 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14052 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14053 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14054 return Insertion;
14055
14056 // We have different paths for blend lowering, but they all must use the
14057 // *exact* same predicate.
14058 bool IsBlendSupported = Subtarget.hasSSE41();
14059 if (IsBlendSupported)
14060 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14061 Zeroable, Subtarget, DAG))
14062 return Blend;
14063
14064 // Use dedicated unpack instructions for masks that match their pattern.
14065 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
14066 return V;
14067
14068 // Try to use byte rotation instructions.
14069 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14070 if (Subtarget.hasSSSE3()) {
14071 if (Subtarget.hasVLX())
14072 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14073 Zeroable, Subtarget, DAG))
14074 return Rotate;
14075
14076 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14077 Subtarget, DAG))
14078 return Rotate;
14079 }
14080
14081 // If we have direct support for blends, we should lower by decomposing into
14082 // a permute. That will be faster than the domain cross.
14083 if (IsBlendSupported)
14084 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14085 Zeroable, Subtarget, DAG);
14086
14087 // We implement this with SHUFPD which is pretty lame because it will likely
14088 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14089 // However, all the alternatives are still more cycles and newer chips don't
14090 // have this problem. It would be really nice if x86 had better shuffles here.
14091 V1 = DAG.getBitcast(MVT::v2f64, V1);
14092 V2 = DAG.getBitcast(MVT::v2f64, V2);
14093 return DAG.getBitcast(MVT::v2i64,
14094 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14095}
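// Worked example (illustrative, not from the original source): the
// single-input v2i64 mask <1,0> widens to the v4i32 mask <2,3,0,1>, so the
// swap of the two 64-bit halves is a single PSHUFD with immediate 0x4E.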
14096
14097/// Lower a vector shuffle using the SHUFPS instruction.
14098///
14099/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14100/// It makes no assumptions about whether this is the *best* lowering; it
14101/// simply uses it.
14102static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14103 ArrayRef<int> Mask, SDValue V1,
14104 SDValue V2, SelectionDAG &DAG) {
14105 SDValue LowV = V1, HighV = V2;
14106 SmallVector<int, 4> NewMask(Mask);
14107 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14108
14109 if (NumV2Elements == 1) {
14110 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14111
14112 // Compute the index adjacent to V2Index and in the same half by toggling
14113 // the low bit.
14114 int V2AdjIndex = V2Index ^ 1;
14115
14116 if (Mask[V2AdjIndex] < 0) {
14117 // Handles all the cases where we have a single V2 element and an undef.
14118 // This will only ever happen in the high lanes because we commute the
14119 // vector otherwise.
14120 if (V2Index < 2)
14121 std::swap(LowV, HighV);
14122 NewMask[V2Index] -= 4;
14123 } else {
14124 // Handle the case where the V2 element ends up adjacent to a V1 element.
14125 // To make this work, blend them together as the first step.
14126 int V1Index = V2AdjIndex;
14127 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14128 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14129 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14130
14131 // Now proceed to reconstruct the final blend as we have the necessary
14132 // high or low half formed.
14133 if (V2Index < 2) {
14134 LowV = V2;
14135 HighV = V1;
14136 } else {
14137 HighV = V2;
14138 }
14139 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14140 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14141 }
14142 } else if (NumV2Elements == 2) {
14143 if (Mask[0] < 4 && Mask[1] < 4) {
14144 // Handle the easy case where we have V1 in the low lanes and V2 in the
14145 // high lanes.
14146 NewMask[2] -= 4;
14147 NewMask[3] -= 4;
14148 } else if (Mask[2] < 4 && Mask[3] < 4) {
14149 // We also handle the reversed case because this utility may get called
14150 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14151 // arrange things in the right direction.
14152 NewMask[0] -= 4;
14153 NewMask[1] -= 4;
14154 HighV = V1;
14155 LowV = V2;
14156 } else {
14157 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14158 // trying to place elements directly, just blend them and set up the final
14159 // shuffle to place them.
14160
14161 // The first two blend mask elements are for V1, the second two are for
14162 // V2.
14163 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14164 Mask[2] < 4 ? Mask[2] : Mask[3],
14165 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14166 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14167 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14168 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14169
14170 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14171 // a blend.
14172 LowV = HighV = V1;
14173 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14174 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14175 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14176 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14177 }
14178 } else if (NumV2Elements == 3) {
14179 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14180 // we can get here via other paths (e.g. repeated mask matching) where we
14181 // don't want to do another round of lowerVECTOR_SHUFFLE.
14182 ShuffleVectorSDNode::commuteMask(NewMask);
14183 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14184 }
14185 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14186 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14187}
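// Worked example (illustrative, not from the original source): for the easy
// NumV2Elements == 2 case with mask <0,1,6,7>, NewMask becomes <0,1,2,3> with
// LowV = V1 and HighV = V2, producing one SHUFPS with immediate 0xE4 that
// takes its low half from V1 and its high half from V2.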
14188
14189/// Lower 4-lane 32-bit floating point shuffles.
14190///
14191/// Uses instructions exclusively from the floating point unit to minimize
14192/// domain crossing penalties, as these are sufficient to implement all v4f32
14193/// shuffles.
14194static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14195 const APInt &Zeroable, SDValue V1, SDValue V2,
14196 const X86Subtarget &Subtarget,
14197 SelectionDAG &DAG) {
14198 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14199 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14200 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14201
14202 if (Subtarget.hasSSE41())
14203 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14204 Zeroable, Subtarget, DAG))
14205 return Blend;
14206
14207 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14208
14209 if (NumV2Elements == 0) {
14210 // Check for being able to broadcast a single element.
14211 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14212 Mask, Subtarget, DAG))
14213 return Broadcast;
14214
14215 // Use even/odd duplicate instructions for masks that match their pattern.
14216 if (Subtarget.hasSSE3()) {
14217 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14218 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14219 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14220 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14221 }
14222
14223 if (Subtarget.hasAVX()) {
14224 // If we have AVX, we can use VPERMILPS which will allow folding a load
14225 // into the shuffle.
14226 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14227 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14228 }
14229
14230 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14231 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14232 if (!Subtarget.hasSSE2()) {
14233 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14234 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14235 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14236 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14237 }
14238
14239 // Otherwise, use a straight shuffle of a single input vector. We pass the
14240 // input vector to both operands to simulate this with a SHUFPS.
14241 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14242 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14243 }
14244
14245 if (Subtarget.hasSSE2())
14246 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
14247 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
14248 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
14249 return ZExt;
14250 }
14251
14252 if (Subtarget.hasAVX2())
14253 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14254 return Extract;
14255
14256 // There are special ways we can lower some single-element blends. However, we
14257 // have custom ways we can lower more complex single-element blends below that
14258 // we defer to if both this and BLENDPS fail to match, so restrict this to
14259 // when the V2 input is targeting element 0 of the mask -- that is the fast
14260 // case here.
14261 if (NumV2Elements == 1 && Mask[0] >= 4)
14262 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
14263 Zeroable, Subtarget, DAG))
14264 return V;
14265
14266 if (Subtarget.hasSSE41()) {
14267 bool MatchesShufPS = isSingleSHUFPSMask(Mask);
14268
14269 // Use INSERTPS if we can complete the shuffle efficiently.
14270 if (!MatchesShufPS || Zeroable == 0x3 || Zeroable == 0xC)
14271 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14272 return V;
14273
14274 if (!MatchesShufPS)
14275 if (SDValue BlendPerm =
14276 lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
14277 return BlendPerm;
14278 }
14279
14280 // Use low/high mov instructions. These are only valid in SSE1 because
14281 // otherwise they are widened to v2f64 and never get here.
14282 if (!Subtarget.hasSSE2()) {
14283 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14284 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14285 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14286 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14287 }
14288
14289 // Use dedicated unpack instructions for masks that match their pattern.
14290 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
14291 return V;
14292
14293 // Otherwise fall back to a SHUFPS lowering strategy.
14294 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14295}
14296
14297/// Lower 4-lane i32 vector shuffles.
14298///
14299/// We try to handle these with integer-domain shuffles where we can, but for
14300/// blends we use the floating point domain blend instructions.
14301static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14302 const APInt &Zeroable, SDValue V1, SDValue V2,
14303 const X86Subtarget &Subtarget,
14304 SelectionDAG &DAG) {
14305 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14306 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14307 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14308
14309 // Whenever we can lower this as a zext, that instruction is strictly faster
14310 // than any alternative. It also allows us to fold memory operands into the
14311 // shuffle in many cases.
14312 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14313 Zeroable, Subtarget, DAG))
14314 return ZExt;
14315
14316 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14317
14318 // Try to use shift instructions if fast.
14319 if (Subtarget.preferLowerShuffleAsShift()) {
14320 if (SDValue Shift =
14321 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
14322 Subtarget, DAG, /*BitwiseOnly*/ true))
14323 return Shift;
14324 if (NumV2Elements == 0)
14325 if (SDValue Rotate =
14326 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
14327 return Rotate;
14328 }
14329
14330 if (NumV2Elements == 0) {
14331 // Try to use broadcast unless the mask only has one non-undef element.
14332 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14333 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14334 Mask, Subtarget, DAG))
14335 return Broadcast;
14336 }
14337
14338 // Straight shuffle of a single input vector. For everything from SSE2
14339 // onward this has a single fast instruction with no scary immediates.
14340 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14341 // but we aren't actually going to use the UNPCK instruction because doing
14342 // so prevents folding a load into this instruction or making a copy.
14343 const int UnpackLoMask[] = {0, 0, 1, 1};
14344 const int UnpackHiMask[] = {2, 2, 3, 3};
14345 if (!isSingleElementRepeatedMask(Mask)) {
14346 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14347 Mask = UnpackLoMask;
14348 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14349 Mask = UnpackHiMask;
14350 }
14351
14352 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14353 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14354 }
14355
14356 if (Subtarget.hasAVX2())
14357 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14358 return Extract;
14359
14360 // Try to use shift instructions.
14361 if (SDValue Shift =
14362 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
14363 DAG, /*BitwiseOnly*/ false))
14364 return Shift;
14365
14366 // There are special ways we can lower some single-element blends.
14367 if (NumV2Elements == 1)
14368 if (SDValue V = lowerShuffleAsElementInsertion(
14369 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14370 return V;
14371
14372 // We have different paths for blend lowering, but they all must use the
14373 // *exact* same predicate.
14374 bool IsBlendSupported = Subtarget.hasSSE41();
14375 if (IsBlendSupported)
14376 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14377 Zeroable, Subtarget, DAG))
14378 return Blend;
14379
14380 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14381 Zeroable, Subtarget, DAG))
14382 return Masked;
14383
14384 // Use dedicated unpack instructions for masks that match their pattern.
14385 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
14386 return V;
14387
14388 // Try to use byte rotation instructions.
14389 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14390 if (Subtarget.hasSSSE3()) {
14391 if (Subtarget.hasVLX())
14392 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14393 Zeroable, Subtarget, DAG))
14394 return Rotate;
14395
14396 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14397 Subtarget, DAG))
14398 return Rotate;
14399 }
14400
14401 // Assume that a single SHUFPS is faster than an alternative sequence of
14402 // multiple instructions (even if the CPU has a domain penalty).
14403 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14404 if (!isSingleSHUFPSMask(Mask)) {
14405 // If we have direct support for blends, we should lower by decomposing into
14406 // a permute. That will be faster than the domain cross.
14407 if (IsBlendSupported)
14408 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14409 Zeroable, Subtarget, DAG);
14410
14411 // Try to lower by permuting the inputs into an unpack instruction.
14412 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14413 Mask, Subtarget, DAG))
14414 return Unpack;
14415 }
14416
14417 // We implement this with SHUFPS because it can blend from two vectors.
14418 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14419 // up the inputs, bypassing domain shift penalties that we would incur if we
14420 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14421 // relevant.
14422 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14423 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14424 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14425 return DAG.getBitcast(MVT::v4i32, ShufPS);
14426}
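// Worked example (illustrative, not from the original source): the
// single-input v4i32 mask <0,0,1,1> stays in PSHUFD form with immediate
// 0x50 rather than being emitted as PUNPCKLDQ, so a load of V1 can still be
// folded into the shuffle.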
14427
14428/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14429/// shuffle lowering, and the most complex part.
14430///
14431/// The lowering strategy is to try to form pairs of input lanes which are
14432/// targeted at the same half of the final vector, and then use a dword shuffle
14433/// to place them onto the right half, and finally unpack the paired lanes into
14434/// their final position.
14435///
14436/// The exact breakdown of how to form these dword pairs and align them on the
14437/// correct sides is really tricky. See the comments within the function for
14438/// more of the details.
14439///
14440/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14441/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14442/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14443/// vector, form the analogous 128-bit 8-element Mask.
14444static SDValue lowerV8I16GeneralSingleInputShuffle(
14445 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14446 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14447 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14448 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14449
14450 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14451 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14452 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14453
14454 // Attempt to directly match PSHUFLW or PSHUFHW.
14455 if (isUndefOrInRange(LoMask, 0, 4) &&
14456 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14457 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14458 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14459 }
14460 if (isUndefOrInRange(HiMask, 4, 8) &&
14461 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14462 for (int i = 0; i != 4; ++i)
14463 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14464 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14465 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14466 }
14467
14468 SmallVector<int, 4> LoInputs;
14469 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14470 array_pod_sort(LoInputs.begin(), LoInputs.end());
14471 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14472 SmallVector<int, 4> HiInputs;
14473 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14474 array_pod_sort(HiInputs.begin(), HiInputs.end());
14475 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14476 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14477 int NumHToL = LoInputs.size() - NumLToL;
14478 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14479 int NumHToH = HiInputs.size() - NumLToH;
14480 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14481 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14482 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14483 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14484
14485 // If we are shuffling values from one half - check how many different DWORD
14486 // pairs we need to create. If only 1 or 2 then we can perform this as a
14487 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14488 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14489 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14490 V = DAG.getNode(ShufWOp, DL, VT, V,
14491 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14492 V = DAG.getBitcast(PSHUFDVT, V);
14493 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14494 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14495 return DAG.getBitcast(VT, V);
14496 };
14497
14498 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14499 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14500 SmallVector<std::pair<int, int>, 4> DWordPairs;
14501 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14502
14503 // Collect the different DWORD pairs.
14504 for (int DWord = 0; DWord != 4; ++DWord) {
14505 int M0 = Mask[2 * DWord + 0];
14506 int M1 = Mask[2 * DWord + 1];
14507 M0 = (M0 >= 0 ? M0 % 4 : M0);
14508 M1 = (M1 >= 0 ? M1 % 4 : M1);
14509 if (M0 < 0 && M1 < 0)
14510 continue;
14511
14512 bool Match = false;
14513 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14514 auto &DWordPair = DWordPairs[j];
14515 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14516 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14517 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14518 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14519 PSHUFDMask[DWord] = DOffset + j;
14520 Match = true;
14521 break;
14522 }
14523 }
14524 if (!Match) {
14525 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14526 DWordPairs.push_back(std::make_pair(M0, M1));
14527 }
14528 }
14529
14530 if (DWordPairs.size() <= 2) {
14531 DWordPairs.resize(2, std::make_pair(-1, -1));
14532 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14533 DWordPairs[1].first, DWordPairs[1].second};
14534 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
14535 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
14536 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
14537 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
14538 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14539 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14540 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14541 }
14542 if ((NumHToL + NumHToH) == 0)
14543 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14544 if ((NumLToL + NumLToH) == 0)
14545 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14546 }
14547 }
14548
14549 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14550 // such inputs we can swap two of the dwords across the half mark and end up
14551 // with <=2 inputs to each half in each half. Once there, we can fall through
14552 // to the generic code below. For example:
14553 //
14554 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14555 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14556 //
14557 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14558 // and an existing 2-into-2 on the other half. In this case we may have to
14559 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14560 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14561 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14562 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14563 // half than the one we target for fixing) will be fixed when we re-enter this
14564 // path. We will also combine away any resulting sequence of PSHUFD
14565 // instructions into a single instruction. Here is an example of the tricky case:
14566 //
14567 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14568 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14569 //
14570 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14571 //
14572 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14573 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14574 //
14575 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14576 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14577 //
14578 // The result is fine to be handled by the generic logic.
14579 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14580 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14581 int AOffset, int BOffset) {
14582 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14583 "Must call this with A having 3 or 1 inputs from the A half.");
14584 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14585 "Must call this with B having 1 or 3 inputs from the B half.");
14586 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14587 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14588
14589 bool ThreeAInputs = AToAInputs.size() == 3;
14590
14591 // Compute the index of the dword with only one word among the three inputs in
14592 // a half by taking the sum of the half with three inputs and subtracting
14593 // the sum of the actual three inputs. The difference is the remaining
14594 // slot.
14595 int ADWord = 0, BDWord = 0;
14596 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14597 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14598 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14599 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14600 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14601 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14602 int TripleNonInputIdx =
14603 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14604 TripleDWord = TripleNonInputIdx / 2;
14605
14606 // We use xor with one to compute the adjacent DWord to whichever one the
14607 // OneInput is in.
14608 OneInputDWord = (OneInput / 2) ^ 1;
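// Worked example (assumed values): if AToAInputs = {0, 1, 3} with
// AOffset = 0, then TripleInputSum = 0+1+2+3 = 6 and TripleNonInputIdx =
// 6 - (0+1+3) = 2, so word 2 is the spare slot and TripleDWord = 1. If
// the lone cross-half input is word 5, OneInputDWord = (5/2)^1 = 3, the
// dword adjacent to word 5's dword within the same half.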
14609
14610 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14611 // and BToA inputs. If there is also such a problem with the BToB and AToB
14612 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14613 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14614 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14615 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14616 // Compute how many inputs will be flipped by swapping these DWords.
14617 // We need to balance this to ensure we don't form a 3-1 shuffle in
14618 // the other half.
14620 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14621 llvm::count(AToBInputs, 2 * ADWord + 1);
14622 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14623 llvm::count(BToBInputs, 2 * BDWord + 1);
14624 if ((NumFlippedAToBInputs == 1 &&
14625 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14626 (NumFlippedBToBInputs == 1 &&
14627 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14628 // We choose whether to fix the A half or B half based on whether that
14629 // half has zero flipped inputs. At zero, we may not be able to fix it
14630 // with that half. We also bias towards fixing the B half because that
14631 // will more commonly be the high half, and we have to bias one way.
14632 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14633 ArrayRef<int> Inputs) {
14634 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14635 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14636 // Determine whether the free index is in the flipped dword or the
14637 // unflipped dword based on where the pinned index is. We use this bit
14638 // in an xor to conditionally select the adjacent dword.
14639 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14640 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14641 if (IsFixIdxInput == IsFixFreeIdxInput)
14642 FixFreeIdx += 1;
14643 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14644 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14645 "We need to be changing the number of flipped inputs!");
14646 int PSHUFHalfMask[] = {0, 1, 2, 3};
14647 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14648 V = DAG.getNode(
14649 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14650 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14651 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14652
14653 for (int &M : Mask)
14654 if (M >= 0 && M == FixIdx)
14655 M = FixFreeIdx;
14656 else if (M >= 0 && M == FixFreeIdx)
14657 M = FixIdx;
14658 };
14659 if (NumFlippedBToBInputs != 0) {
14660 int BPinnedIdx =
14661 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14662 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14663 } else {
14664 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14665 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14666 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14667 }
14668 }
14669 }
14670
14671 int PSHUFDMask[] = {0, 1, 2, 3};
14672 PSHUFDMask[ADWord] = BDWord;
14673 PSHUFDMask[BDWord] = ADWord;
14674 V = DAG.getBitcast(
14675 VT,
14676 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14677 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14678
14679 // Adjust the mask to match the new locations of A and B.
14680 for (int &M : Mask)
14681 if (M >= 0 && M/2 == ADWord)
14682 M = 2 * BDWord + M % 2;
14683 else if (M >= 0 && M/2 == BDWord)
14684 M = 2 * ADWord + M % 2;
14685
14686 // Recurse back into this routine to re-compute state now that this isn't
14687 // a 3 and 1 problem.
14688 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14689 };
14690 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14691 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14692 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14693 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14694
14695 // At this point there are at most two inputs to the low and high halves from
14696 // each half. That means the inputs can always be grouped into dwords and
14697 // those dwords can then be moved to the correct half with a dword shuffle.
14698 // We use at most one low and one high word shuffle to collect these paired
14699 // inputs into dwords, and finally a dword shuffle to place them.
14700 int PSHUFLMask[4] = {-1, -1, -1, -1};
14701 int PSHUFHMask[4] = {-1, -1, -1, -1};
14702 int PSHUFDMask[4] = {-1, -1, -1, -1};
14703
14704 // First fix the masks for all the inputs that are staying in their
14705 // original halves. This will then dictate the targets of the cross-half
14706 // shuffles.
14707 auto fixInPlaceInputs =
14708 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14709 MutableArrayRef<int> SourceHalfMask,
14710 MutableArrayRef<int> HalfMask, int HalfOffset) {
14711 if (InPlaceInputs.empty())
14712 return;
14713 if (InPlaceInputs.size() == 1) {
14714 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14715 InPlaceInputs[0] - HalfOffset;
14716 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14717 return;
14718 }
14719 if (IncomingInputs.empty()) {
14720 // Just fix all of the in place inputs.
14721 for (int Input : InPlaceInputs) {
14722 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14723 PSHUFDMask[Input / 2] = Input / 2;
14724 }
14725 return;
14726 }
14727
14728 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14729 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14730 InPlaceInputs[0] - HalfOffset;
14731 // Put the second input next to the first so that they are packed into
14732 // a dword. We find the adjacent index by toggling the low bit.
14733 int AdjIndex = InPlaceInputs[0] ^ 1;
14734 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14735 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14736 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14737 };
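// Illustrative example (assumed values): for InPlaceInputs = {4, 7} with
// HalfOffset = 4, word 4 keeps slot 0 of its half, AdjIndex = 4^1 = 5,
// and word 7 is pulled into slot 1 so the pair occupies dword 2; the
// half mask is then rewritten to read 5 wherever it used to read 7.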
14738 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14739 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14740
14741 // Now gather the cross-half inputs and place them into a free dword of
14742 // their target half.
14743 // FIXME: This operation could almost certainly be simplified dramatically to
14744 // look more like the 3-1 fixing operation.
14745 auto moveInputsToRightHalf = [&PSHUFDMask](
14746 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14747 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14748 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14749 int DestOffset) {
14750 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14751 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14752 };
14753 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14754 int Word) {
14755 int LowWord = Word & ~1;
14756 int HighWord = Word | 1;
14757 return isWordClobbered(SourceHalfMask, LowWord) ||
14758 isWordClobbered(SourceHalfMask, HighWord);
14759 };
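// Example (assumed values): if SourceHalfMask = [2,-1,-1,-1], word 0 is
// clobbered because slot 0 now sources word 2 rather than itself, and
// isDWordClobbered reports dword 0 as clobbered for word 0 or word 1.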
14760
14761 if (IncomingInputs.empty())
14762 return;
14763
14764 if (ExistingInputs.empty()) {
14765 // Map any dwords with inputs from them into the right half.
14766 for (int Input : IncomingInputs) {
14767 // If the source half mask maps over the inputs, turn those into
14768 // swaps and use the swapped lane.
14769 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14770 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14771 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14772 Input - SourceOffset;
14773 // We have to swap the uses in our half mask in one sweep.
14774 for (int &M : HalfMask)
14775 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14776 M = Input;
14777 else if (M == Input)
14778 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14779 } else {
14780 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14781 Input - SourceOffset &&
14782 "Previous placement doesn't match!");
14783 }
14784 // Note that this correctly re-maps both when we do a swap and when
14785 // we observe the other side of the swap above. We rely on that to
14786 // avoid swapping the members of the input list directly.
14787 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14788 }
14789
14790 // Map the input's dword into the correct half.
14791 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14792 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14793 else
14794 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14795 Input / 2 &&
14796 "Previous placement doesn't match!");
14797 }
14798
14799 // And just directly shift any other-half mask elements to be same-half
14800 // as we will have mirrored the dword containing the element into the
14801 // same position within that half.
14802 for (int &M : HalfMask)
14803 if (M >= SourceOffset && M < SourceOffset + 4) {
14804 M = M - SourceOffset + DestOffset;
14805 assert(M >= 0 && "This should never wrap below zero!");
14806 }
14807 return;
14808 }
14809
14810 // Ensure we have the input in a viable dword of its current half. This
14811 // is particularly tricky because the original position may be clobbered
14812 // by inputs being moved and *staying* in that half.
14813 if (IncomingInputs.size() == 1) {
14814 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14815 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14816 SourceOffset;
14817 SourceHalfMask[InputFixed - SourceOffset] =
14818 IncomingInputs[0] - SourceOffset;
14819 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14820 IncomingInputs[0] = InputFixed;
14821 }
14822 } else if (IncomingInputs.size() == 2) {
14823 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14824 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14825 // We have two non-adjacent or clobbered inputs we need to extract from
14826 // the source half. To do this, we need to map them into some adjacent
14827 // dword slot in the source mask.
14828 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14829 IncomingInputs[1] - SourceOffset};
14830
14831 // If there is a free slot in the source half mask adjacent to one of
14832 // the inputs, place the other input in it. We use (Index XOR 1) to
14833 // compute an adjacent index.
14834 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14835 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14836 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14837 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14838 InputsFixed[1] = InputsFixed[0] ^ 1;
14839 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14840 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14841 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14842 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14843 InputsFixed[0] = InputsFixed[1] ^ 1;
14844 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14845 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14846 // The two inputs are in the same DWord but it is clobbered and the
14847 // adjacent DWord isn't used at all. Move both inputs to the free
14848 // slot.
14849 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14850 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14851 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14852 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14853 } else {
14854 // The only way we hit this point is if there is no clobbering
14855 // (because there are no off-half inputs to this half) and there is no
14856 // free slot adjacent to one of the inputs. In this case, we have to
14857 // swap an input with a non-input.
14858 for (int i = 0; i < 4; ++i)
14859 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14860 "We can't handle any clobbers here!");
14861 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14862 "Cannot have adjacent inputs here!");
14863
14864 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14865 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14866
14867 // We also have to update the final source mask in this case because
14868 // it may need to undo the above swap.
14869 for (int &M : FinalSourceHalfMask)
14870 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14871 M = InputsFixed[1] + SourceOffset;
14872 else if (M == InputsFixed[1] + SourceOffset)
14873 M = (InputsFixed[0] ^ 1) + SourceOffset;
14874
14875 InputsFixed[1] = InputsFixed[0] ^ 1;
14876 }
14877
14878 // Point everything at the fixed inputs.
14879 for (int &M : HalfMask)
14880 if (M == IncomingInputs[0])
14881 M = InputsFixed[0] + SourceOffset;
14882 else if (M == IncomingInputs[1])
14883 M = InputsFixed[1] + SourceOffset;
14884
14885 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14886 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14887 }
14888 } else {
14889 llvm_unreachable("Unhandled input size!");
14890 }
14891
14892 // Now hoist the DWord down to the right half.
14893 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14894 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14895 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14896 for (int &M : HalfMask)
14897 for (int Input : IncomingInputs)
14898 if (M == Input)
14899 M = FreeDWord * 2 + Input % 2;
14900 };
14901 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14902 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14903 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14904 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14905
14906 // Now enact all the shuffles we've computed to move the inputs into their
14907 // target half.
14908 if (!isNoopShuffleMask(PSHUFLMask))
14909 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14910 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14911 if (!isNoopShuffleMask(PSHUFHMask))
14912 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14913 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14914 if (!isNoopShuffleMask(PSHUFDMask))
14915 V = DAG.getBitcast(
14916 VT,
14917 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14918 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14919
14920 // At this point, each half should contain all its inputs, and we can then
14921 // just shuffle them into their final position.
14922 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14923 "Failed to lift all the high half inputs to the low mask!");
14924 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14925 "Failed to lift all the low half inputs to the high mask!");
14926
14927 // Do a half shuffle for the low mask.
14928 if (!isNoopShuffleMask(LoMask))
14929 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14930 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14931
14932 // Do a half shuffle with the high mask after shifting its values down.
14933 for (int &M : HiMask)
14934 if (M >= 0)
14935 M -= 4;
14936 if (!isNoopShuffleMask(HiMask))
14937 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14938 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14939
14940 return V;
14941}
14942
14943/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14944/// blend if only one input is used.
14945static SDValue lowerShuffleAsBlendOfPSHUFBs(
14946 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14947 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14948 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14949 "Lane crossing shuffle masks not supported");
14950
14951 int NumBytes = VT.getSizeInBits() / 8;
14952 int Size = Mask.size();
14953 int Scale = NumBytes / Size;
14954
14955 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14956 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14957 V1InUse = false;
14958 V2InUse = false;
14959
14960 for (int i = 0; i < NumBytes; ++i) {
14961 int M = Mask[i / Scale];
14962 if (M < 0)
14963 continue;
14964
14965 const int ZeroMask = 0x80;
14966 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14967 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14968 if (Zeroable[i / Scale])
14969 V1Idx = V2Idx = ZeroMask;
14970
14971 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14972 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14973 V1InUse |= (ZeroMask != V1Idx);
14974 V2InUse |= (ZeroMask != V2Idx);
14975 }
14976
14977 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14978 if (V1InUse)
14979 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14980 DAG.getBuildVector(ShufVT, DL, V1Mask));
14981 if (V2InUse)
14982 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14983 DAG.getBuildVector(ShufVT, DL, V2Mask));
14984
14985 // If we need shuffled inputs from both, blend the two.
14986 SDValue V;
14987 if (V1InUse && V2InUse)
14988 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14989 else
14990 V = V1InUse ? V1 : V2;
14991
14992 // Cast the result back to the correct type.
14993 return DAG.getBitcast(VT, V);
14994}
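// Scaling sketch (illustrative): for MVT::v8i16, NumBytes = 16 and
// Scale = 2, so mask element M expands to the byte selectors 2*M and
// 2*M+1; a 0x80 selector makes PSHUFB write a zero byte, which is how
// both the unused input's bytes and Zeroable lanes are handled before
// the final OR blend.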
14995
14996/// Generic lowering of 8-lane i16 shuffles.
14997///
14998/// This handles both single-input shuffles and combined shuffle/blends with
14999/// two inputs. The single input shuffles are immediately delegated to
15000/// a dedicated lowering routine.
15001///
15002/// The blends are lowered in one of three fundamental ways. If there are few
15003/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15004/// of the input is significantly cheaper when lowered as an interleaving of
15005/// the two inputs, try to interleave them. Otherwise, blend the low and high
15006/// halves of the inputs separately (making them have relatively few inputs)
15007/// and then concatenate them.
15008static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15009 const APInt &Zeroable, SDValue V1, SDValue V2,
15010 const X86Subtarget &Subtarget,
15011 SelectionDAG &DAG) {
15012 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15013 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15014 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15015
15016 // Whenever we can lower this as a zext, that instruction is strictly faster
15017 // than any alternative.
15018 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15019 Zeroable, Subtarget, DAG))
15020 return ZExt;
15021
15022 // Try to lower using a truncation.
15023 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15024 Subtarget, DAG))
15025 return V;
15026
15027 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15028
15029 if (NumV2Inputs == 0) {
15030 // Try to use shift instructions.
15031 if (SDValue Shift =
15032 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
15033 Subtarget, DAG, /*BitwiseOnly*/ false))
15034 return Shift;
15035
15036 // Check for being able to broadcast a single element.
15037 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15038 Mask, Subtarget, DAG))
15039 return Broadcast;
15040
15041 // Try to use bit rotation instructions.
15042 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15043 Subtarget, DAG))
15044 return Rotate;
15045
15046 // Use dedicated unpack instructions for masks that match their pattern.
15047 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15048 return V;
15049
15050 // Use dedicated pack instructions for masks that match their pattern.
15051 if (SDValue V =
15052 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15053 return V;
15054
15055 // Try to use byte rotation instructions.
15056 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15057 Subtarget, DAG))
15058 return Rotate;
15059
15060 // Make a copy of the mask so it can be modified.
15061 SmallVector<int, 8> MutableMask(Mask);
15062 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15063 Subtarget, DAG);
15064 }
15065
15066 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15067 "All single-input shuffles should be canonicalized to be V1-input "
15068 "shuffles.");
15069
15070 // Try to use shift instructions.
15071 if (SDValue Shift =
15072 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
15073 DAG, /*BitwiseOnly*/ false))
15074 return Shift;
15075
15076 // See if we can use SSE4A Extraction / Insertion.
15077 if (Subtarget.hasSSE4A())
15078 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15079 Zeroable, DAG))
15080 return V;
15081
15082 // There are special ways we can lower some single-element blends.
15083 if (NumV2Inputs == 1)
15084 if (SDValue V = lowerShuffleAsElementInsertion(
15085 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15086 return V;
15087
15088 // We have different paths for blend lowering, but they all must use the
15089 // *exact* same predicate.
15090 bool IsBlendSupported = Subtarget.hasSSE41();
15091 if (IsBlendSupported)
15092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15093 Zeroable, Subtarget, DAG))
15094 return Blend;
15095
15096 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15097 Zeroable, Subtarget, DAG))
15098 return Masked;
15099
15100 // Use dedicated unpack instructions for masks that match their pattern.
15101 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15102 return V;
15103
15104 // Use dedicated pack instructions for masks that match their pattern.
15105 if (SDValue V =
15106 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15107 return V;
15108
15109 // Try to lower using a truncation.
15110 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15111 Subtarget, DAG))
15112 return V;
15113
15114 // Try to use byte rotation instructions.
15115 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15116 Subtarget, DAG))
15117 return Rotate;
15118
15119 if (SDValue BitBlend =
15120 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15121 return BitBlend;
15122
15123 // Try to use byte shift instructions to mask.
15124 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15125 Zeroable, Subtarget, DAG))
15126 return V;
15127
15128 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15129 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15130 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
15131 !Subtarget.hasVLX()) {
15132 // Check if this is part of a 256-bit vector truncation.
15133 unsigned PackOpc = 0;
15134 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15135 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15136 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
15137 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15138 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15139 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15140 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15141 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15142 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15143 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15144 PackOpc = X86ISD::PACKUS;
15145 } else if (Subtarget.hasSSE41()) {
15146 SmallVector<SDValue, 4> DWordClearOps(4,
15147 DAG.getConstant(0, DL, MVT::i32));
15148 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15149 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15150 SDValue DWordClearMask =
15151 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15152 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15153 DWordClearMask);
15154 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15155 DWordClearMask);
15156 PackOpc = X86ISD::PACKUS;
15157 } else if (!Subtarget.hasSSSE3()) {
15158 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
15159 V1 = DAG.getBitcast(MVT::v4i32, V1);
15160 V2 = DAG.getBitcast(MVT::v4i32, V2);
15161 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
15162 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
15163 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
15164 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
15165 PackOpc = X86ISD::PACKSS;
15166 }
15167 if (PackOpc) {
15168 // Now pack things back together.
15169 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
15170 if (NumEvenDrops == 2) {
15171 Result = DAG.getBitcast(MVT::v4i32, Result);
15172 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
15173 }
15174 return Result;
15175 }
15176 }
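// Worked example (assumed mask): [0,2,4,6,8,10,12,14] keeps every other
// word (NumEvenDrops == 1), so on SSE41 each v4i32 lane is ANDed with
// 0x0000FFFF to clear the dropped words and one PACKUSDW compacts the
// two masked inputs into the v8i16 result.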
15177
15178 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
15179 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15180 if (NumOddDrops == 1) {
15181 bool HasSSE41 = Subtarget.hasSSE41();
15182 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15183 DAG.getBitcast(MVT::v4i32, V1),
15184 DAG.getTargetConstant(16, DL, MVT::i8));
15185 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15186 DAG.getBitcast(MVT::v4i32, V2),
15187 DAG.getTargetConstant(16, DL, MVT::i8));
15188 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15189 MVT::v8i16, V1, V2);
15190 }
15191
15192 // Try to lower by permuting the inputs into an unpack instruction.
15193 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15194 Mask, Subtarget, DAG))
15195 return Unpack;
15196
15197 // If we can't directly blend but can use PSHUFB, that will be better as it
15198 // can both shuffle and set up the inefficient blend.
15199 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15200 bool V1InUse, V2InUse;
15201 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15202 Zeroable, DAG, V1InUse, V2InUse);
15203 }
15204
15205 // We can always bit-blend if we have to so the fallback strategy is to
15206 // decompose into single-input permutes and blends/unpacks.
15207 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
15208 Zeroable, Subtarget, DAG);
15209}
15210
15211/// Lower 8-lane 16-bit floating point shuffles.
15212static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15213 const APInt &Zeroable, SDValue V1, SDValue V2,
15214 const X86Subtarget &Subtarget,
15215 SelectionDAG &DAG) {
15216 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15217 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15218 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15219 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15220
15221 if (Subtarget.hasFP16()) {
15222 if (NumV2Elements == 0) {
15223 // Check for being able to broadcast a single element.
15224 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15225 Mask, Subtarget, DAG))
15226 return Broadcast;
15227 }
15228 if (NumV2Elements == 1 && Mask[0] >= 8)
15229 if (SDValue V = lowerShuffleAsElementInsertion(
15230 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15231 return V;
15232 }
15233
15234 V1 = DAG.getBitcast(MVT::v8i16, V1);
15235 V2 = DAG.getBitcast(MVT::v8i16, V2);
15236 return DAG.getBitcast(MVT::v8f16,
15237 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15238}
15239
15240 // Lowers a unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
15241 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
15242// the active subvector is extracted.
15243static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15244 ArrayRef<int> OriginalMask, SDValue V1,
15245 SDValue V2, const X86Subtarget &Subtarget,
15246 SelectionDAG &DAG) {
15247 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
15248 SmallVector<int, 32> Mask(OriginalMask);
15249 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
15250 !isShuffleFoldableLoad(V2)) {
15251 ShuffleVectorSDNode::commuteMask(Mask);
15252 std::swap(V1, V2);
15253 }
15254
15255 MVT MaskVT = VT.changeTypeToInteger();
15256 SDValue MaskNode;
15257 MVT ShuffleVT = VT;
15258 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15259 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15260 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15261 ShuffleVT = V1.getSimpleValueType();
15262
15263 // Adjust mask to correct indices for the second input.
15264 int NumElts = VT.getVectorNumElements();
15265 unsigned Scale = 512 / VT.getSizeInBits();
15266 SmallVector<int, 32> AdjustedMask(Mask);
15267 for (int &M : AdjustedMask)
15268 if (NumElts <= M)
15269 M += (Scale - 1) * NumElts;
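// E.g. (illustrative): a v16i8 shuffle on a non-VLX target is widened to
// v64i8, so Scale = 4 and a mask index of 16 (the first V2 element)
// becomes 16 + 3*16 = 64, the first element of the widened V2 operand as
// VPERMV3 numbers them.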
15270 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15271 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15272 } else {
15273 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15274 }
15275
15276 SDValue Result;
15277 if (V2.isUndef())
15278 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15279 else
15280 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15281
15282 if (VT != ShuffleVT)
15283 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15284
15285 return Result;
15286}
15287
15288/// Generic lowering of v16i8 shuffles.
15289///
15290/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15291/// detect any complexity reducing interleaving. If that doesn't help, it uses
15292/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15293/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15294/// back together.
15295static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15296 const APInt &Zeroable, SDValue V1, SDValue V2,
15297 const X86Subtarget &Subtarget,
15298 SelectionDAG &DAG) {
15299 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15300 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15301 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15302
15303 // Try to use shift instructions.
15304 if (SDValue Shift =
15305 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
15306 DAG, /*BitwiseOnly*/ false))
15307 return Shift;
15308
15309 // Try to use byte rotation instructions.
15310 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15311 Subtarget, DAG))
15312 return Rotate;
15313
15314 // Use dedicated pack instructions for masks that match their pattern.
15315 if (SDValue V =
15316 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15317 return V;
15318
15319 // Try to use a zext lowering.
15320 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15321 Zeroable, Subtarget, DAG))
15322 return ZExt;
15323
15324 // Try to lower using a truncation.
15325 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15326 Subtarget, DAG))
15327 return V;
15328
15329 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15330 Subtarget, DAG))
15331 return V;
15332
15333 // See if we can use SSE4A Extraction / Insertion.
15334 if (Subtarget.hasSSE4A())
15335 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15336 Zeroable, DAG))
15337 return V;
15338
15339 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15340
15341 // For single-input shuffles, there are some nicer lowering tricks we can use.
15342 if (NumV2Elements == 0) {
15343 // Check for being able to broadcast a single element.
15344 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15345 Mask, Subtarget, DAG))
15346 return Broadcast;
15347
15348 // Try to use bit rotation instructions.
15349 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15350 Subtarget, DAG))
15351 return Rotate;
15352
15353 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15354 return V;
15355
15356 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15357 // Notably, this handles splat and partial-splat shuffles more efficiently.
15358 // However, it only makes sense if the pre-duplication shuffle simplifies
15359 // things significantly. Currently, this means we need to be able to
15360 // express the pre-duplication shuffle as an i16 shuffle.
15361 //
15362 // FIXME: We should check for other patterns which can be widened into an
15363 // i16 shuffle as well.
15364 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15365 for (int i = 0; i < 16; i += 2)
15366 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15367 return false;
15368
15369 return true;
15370 };
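// Example (assumed masks): [3,3,3,3,3,3,3,3,...] and [4,4,5,5,6,6,7,7,...]
// pass this test because every even/odd byte pair agrees (or is undef),
// so the shuffle can be expressed as an i16 shuffle plus a byte unpack.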
15371 auto tryToWidenViaDuplication = [&]() -> SDValue {
15372 if (!canWidenViaDuplication(Mask))
15373 return SDValue();
15374 SmallVector<int, 4> LoInputs;
15375 copy_if(Mask, std::back_inserter(LoInputs),
15376 [](int M) { return M >= 0 && M < 8; });
15377 array_pod_sort(LoInputs.begin(), LoInputs.end());
15378 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
15379 SmallVector<int, 4> HiInputs;
15380 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15381 array_pod_sort(HiInputs.begin(), HiInputs.end());
15382 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
15383
15384 bool TargetLo = LoInputs.size() >= HiInputs.size();
15385 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15386 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15387
15388 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15389 SmallDenseMap<int, int, 8> LaneMap;
15390 for (int I : InPlaceInputs) {
15391 PreDupI16Shuffle[I/2] = I/2;
15392 LaneMap[I] = I;
15393 }
15394 int j = TargetLo ? 0 : 4, je = j + 4;
15395 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15396 // Check if j is already a shuffle of this input. This happens when
15397 // there are two adjacent bytes after we move the low one.
15398 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15399 // If we haven't yet mapped the input, search for a slot into which
15400 // we can map it.
15401 while (j < je && PreDupI16Shuffle[j] >= 0)
15402 ++j;
15403
15404 if (j == je)
15405 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15406 return SDValue();
15407
15408 // Map this input with the i16 shuffle.
15409 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15410 }
15411
15412 // Update the lane map based on the mapping we ended up with.
15413 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15414 }
15415 V1 = DAG.getBitcast(
15416 MVT::v16i8,
15417 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15418 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15419
15420 // Unpack the bytes to form the i16s that will be shuffled into place.
15421 bool EvenInUse = false, OddInUse = false;
15422 for (int i = 0; i < 16; i += 2) {
15423 EvenInUse |= (Mask[i + 0] >= 0);
15424 OddInUse |= (Mask[i + 1] >= 0);
15425 if (EvenInUse && OddInUse)
15426 break;
15427 }
15428 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15429 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15430 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15431
15432 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15433 for (int i = 0; i < 16; ++i)
15434 if (Mask[i] >= 0) {
15435 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15436 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15437 if (PostDupI16Shuffle[i / 2] < 0)
15438 PostDupI16Shuffle[i / 2] = MappedMask;
15439 else
15440 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15441 "Conflicting entries in the original shuffle!");
15442 }
15443 return DAG.getBitcast(
15444 MVT::v16i8,
15445 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15446 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15447 };
15448 if (SDValue V = tryToWidenViaDuplication())
15449 return V;
15450 }
15451
15452 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15453 Zeroable, Subtarget, DAG))
15454 return Masked;
15455
15456 // Use dedicated unpack instructions for masks that match their pattern.
15457 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15458 return V;
15459
15460 // Try to use byte shift instructions to mask.
15461 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15462 Zeroable, Subtarget, DAG))
15463 return V;
15464
15465 // Check for compaction patterns.
15466 bool IsSingleInput = V2.isUndef();
15467 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
15468
15469 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15470 // with PSHUFB. It is important to do this before we attempt to generate any
15471 // blends but after all of the single-input lowerings. If the single input
15472 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15473 // want to preserve that and we can DAG combine any longer sequences into
15474 // a PSHUFB in the end. But once we start blending from multiple inputs,
15475 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15476 // and there are *very* few patterns that would actually be faster than the
15477 // PSHUFB approach because of its ability to zero lanes.
15478 //
15479 // If the mask is a binary compaction, we can more efficiently perform this
15480 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15481 //
15482 // FIXME: The only exceptions to the above are blends which are exact
15483 // interleavings with direct instructions supporting them. We currently don't
15484 // handle those well here.
15485 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15486 bool V1InUse = false;
15487 bool V2InUse = false;
15488
15489 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15490 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15491
15492 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15493 // do so. This avoids using them to handle blends-with-zero which is
15494 // important as a single pshufb is significantly faster for that.
15495 if (V1InUse && V2InUse) {
15496 if (Subtarget.hasSSE41())
15497 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15498 Zeroable, Subtarget, DAG))
15499 return Blend;
15500
15501 // We can use an unpack to do the blending rather than an or in some
15502 // cases. Even though the or may be (very minorly) more efficient, we
15503 // prefer this lowering because there are common cases where part of
15504 // the complexity of the shuffles goes away when we do the final blend as
15505 // an unpack.
15506 // FIXME: It might be worth trying to detect if the unpack-feeding
15507 // shuffles will both be pshufb, in which case we shouldn't bother with
15508 // this.
15509 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15510 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15511 return Unpack;
15512
15513 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15514 if (Subtarget.hasVBMI())
15515 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15516 DAG);
15517
15518 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15519 if (Subtarget.hasXOP()) {
15520 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15521 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15522 }
15523
15524 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15525 // PALIGNR will be cheaper than the second PSHUFB+OR.
15526 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15527 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15528 return V;
15529 }
15530
15531 return PSHUFB;
15532 }
15533
15534 // There are special ways we can lower some single-element blends.
15535 if (NumV2Elements == 1)
15536 if (SDValue V = lowerShuffleAsElementInsertion(
15537 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15538 return V;
15539
15540 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15541 return Blend;
15542
15543 // Check whether a compaction lowering can be done. This handles shuffles
15544 // which take every Nth element for some even N. See the helper function for
15545 // details.
15546 //
15547 // We special case these as they can be particularly efficiently handled with
15548 // the PACKUSWB instruction on x86 and they show up in common patterns of
15549 // rearranging bytes to truncate wide elements.
15550 if (NumEvenDrops) {
15551 // NumEvenDrops is the power of two stride of the elements. Another way of
15552 // thinking about it is that we need to drop the even elements this many
15553 // times to get the original input.
15554
15555 // First we need to zero all the dropped bytes.
15556 assert(NumEvenDrops <= 3 &&
15557 "No support for dropping even elements more than 3 times.");
15558 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15559 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15560 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15561 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15562 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15563 WordClearMask);
15564 if (!IsSingleInput)
15565 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15566 WordClearMask);
15567
15568 // Now pack things back together.
15569 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15570 IsSingleInput ? V1 : V2);
15571 for (int i = 1; i < NumEvenDrops; ++i) {
15572 Result = DAG.getBitcast(MVT::v8i16, Result);
15573 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15574 }
15575 return Result;
15576 }
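// Worked example (illustrative): when NumEvenDrops == 1, every v8i16
// word is ANDed with 0x00FF so only the even-indexed bytes survive, and
// a single PACKUSWB packs the two masked inputs into the final v16i8;
// each additional drop level repeats the pack on its own result.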
15577
15578 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15579 if (NumOddDrops == 1) {
15580 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15581 DAG.getBitcast(MVT::v8i16, V1),
15582 DAG.getTargetConstant(8, DL, MVT::i8));
15583 if (!IsSingleInput)
15584 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15585 DAG.getBitcast(MVT::v8i16, V2),
15586 DAG.getTargetConstant(8, DL, MVT::i8));
15587 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15588 IsSingleInput ? V1 : V2);
15589 }
15590
15591 // Handle multi-input cases by blending/unpacking single-input shuffles.
15592 if (NumV2Elements > 0)
15593 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15594 Zeroable, Subtarget, DAG);
15595
15596 // The fallback path for single-input shuffles widens this into two v8i16
15597 // vectors with unpacks, shuffles those, and then pulls them back together
15598 // with a pack.
15599 SDValue V = V1;
15600
15601 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15602 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15603 for (int i = 0; i < 16; ++i)
15604 if (Mask[i] >= 0)
15605 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15606
15607 SDValue VLoHalf, VHiHalf;
15608 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15609 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15610 // i16s.
15611 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15612 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15613 // Use a mask to drop the high bytes.
15614 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15615 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15616 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15617
15618 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15619 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15620
15621 // Squash the masks to point directly into VLoHalf.
15622 for (int &M : LoBlendMask)
15623 if (M >= 0)
15624 M /= 2;
15625 for (int &M : HiBlendMask)
15626 if (M >= 0)
15627 M /= 2;
15628 } else {
15629 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15630 // VHiHalf so that we can blend them as i16s.
15631 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15632
15633 VLoHalf = DAG.getBitcast(
15634 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15635 VHiHalf = DAG.getBitcast(
15636 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15637 }
15638
15639 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15640 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15641
15642 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15643}
15644
15645/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15646///
15647/// This routine breaks down the specific type of 128-bit shuffle and
15648/// dispatches to the lowering routines accordingly.
15649static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15650 MVT VT, SDValue V1, SDValue V2,
15651 const APInt &Zeroable,
15652 const X86Subtarget &Subtarget,
15653 SelectionDAG &DAG) {
15654 if (VT == MVT::v8bf16) {
15655 V1 = DAG.getBitcast(MVT::v8i16, V1);
15656 V2 = DAG.getBitcast(MVT::v8i16, V2);
15657 return DAG.getBitcast(VT,
15658 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15659 }
15660
15661 switch (VT.SimpleTy) {
15662 case MVT::v2i64:
15663 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15664 case MVT::v2f64:
15665 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15666 case MVT::v4i32:
15667 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15668 case MVT::v4f32:
15669 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15670 case MVT::v8i16:
15671 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15672 case MVT::v8f16:
15673 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15674 case MVT::v16i8:
15675 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15676
15677 default:
15678 llvm_unreachable("Unimplemented!");
15679 }
15680}
15681
15682/// Generic routine to split vector shuffle into half-sized shuffles.
15683///
15684/// This routine just extracts two subvectors, shuffles them independently, and
15685/// then concatenates them back together. This should work effectively with all
15686/// AVX vector shuffle types.
15687static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15688 SDValue V2, ArrayRef<int> Mask,
15689 SelectionDAG &DAG, bool SimpleOnly) {
15690 assert(VT.getSizeInBits() >= 256 &&
15691 "Only for 256-bit or wider vector shuffles!");
15692 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15693 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15694
15695 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15696 if (VT == MVT::v8f32) {
15697 SDValue BC1 = peekThroughBitcasts(V1);
15698 SDValue BC2 = peekThroughBitcasts(V2);
15699 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15700 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15701 DAG, SimpleOnly))
15702 return DAG.getBitcast(VT, Split);
15703 }
15704 }
15705
15706 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15707 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15708
15709 int NumElements = VT.getVectorNumElements();
15710 int SplitNumElements = NumElements / 2;
15711 MVT ScalarVT = VT.getVectorElementType();
15712 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15713
15714 // Use splitVector/extractSubVector so that split build-vectors just build two
15715 // narrower build vectors. This helps shuffling with splats and zeros.
15716 auto SplitVector = [&](SDValue V) {
15717 SDValue LoV, HiV;
15718 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15719 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15720 DAG.getBitcast(SplitVT, HiV));
15721 };
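// Sketch of the overall flow (assumed mask): a v8i32 mask such as
// [0,8,1,9,6,14,7,15] splits into LoMask = [0,8,1,9], which needs only
// LoV1/LoV2, and HiMask = [6,14,7,15], which needs only HiV1/HiV2, so
// each half lowers as an independent v4i32 shuffle before the final
// CONCAT_VECTORS.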
15722
15723 SDValue LoV1, HiV1, LoV2, HiV2;
15724 std::tie(LoV1, HiV1) = SplitVector(V1);
15725 std::tie(LoV2, HiV2) = SplitVector(V2);
15726
15727 // Now create two 4-way blends of these half-width vectors.
15728 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15729 bool &UseHiV1, bool &UseLoV2,
15730 bool &UseHiV2) {
15731 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15732 for (int i = 0; i < SplitNumElements; ++i) {
15733 int M = HalfMask[i];
15734 if (M >= NumElements) {
15735 if (M >= NumElements + SplitNumElements)
15736 UseHiV2 = true;
15737 else
15738 UseLoV2 = true;
15739 } else if (M >= 0) {
15740 if (M >= SplitNumElements)
15741 UseHiV1 = true;
15742 else
15743 UseLoV1 = true;
15744 }
15745 }
15746 };
15747
15748 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15749 if (!SimpleOnly)
15750 return true;
15751
15752 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15753 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15754
15755 return !(UseHiV1 || UseHiV2);
15756 };
15757
15758 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15759 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15760 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15761 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15762 for (int i = 0; i < SplitNumElements; ++i) {
15763 int M = HalfMask[i];
15764 if (M >= NumElements) {
15765 V2BlendMask[i] = M - NumElements;
15766 BlendMask[i] = SplitNumElements + i;
15767 } else if (M >= 0) {
15768 V1BlendMask[i] = M;
15769 BlendMask[i] = i;
15770 }
15771 }
15772
15773 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15774 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15775
15776 // Because the lowering happens after all combining takes place, we need to
15777 // manually combine these blend masks as much as possible so that we create
15778 // a minimal number of high-level vector shuffle nodes.
15779 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15780
15781 // First try just blending the halves of V1 or V2.
15782 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15783 return DAG.getUNDEF(SplitVT);
15784 if (!UseLoV2 && !UseHiV2)
15785 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15786 if (!UseLoV1 && !UseHiV1)
15787 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15788
15789 SDValue V1Blend, V2Blend;
15790 if (UseLoV1 && UseHiV1) {
15791 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15792 } else {
15793 // We only use half of V1 so map the usage down into the final blend mask.
15794 V1Blend = UseLoV1 ? LoV1 : HiV1;
15795 for (int i = 0; i < SplitNumElements; ++i)
15796 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15797 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15798 }
15799 if (UseLoV2 && UseHiV2) {
15800 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15801 } else {
15802 // We only use half of V2 so map the usage down into the final blend mask.
15803 V2Blend = UseLoV2 ? LoV2 : HiV2;
15804 for (int i = 0; i < SplitNumElements; ++i)
15805 if (BlendMask[i] >= SplitNumElements)
15806 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15807 }
15808 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15809 };
15810
15811 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15812 return SDValue();
15813
15814 SDValue Lo = HalfBlend(LoMask);
15815 SDValue Hi = HalfBlend(HiMask);
15816 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15817}
15818
15819/// Either split a vector in halves or decompose the shuffles and the
15820/// blend/unpack.
15821///
15822/// This is provided as a good fallback for many lowerings of non-single-input
15823/// shuffles with more than one 128-bit lane. In those cases, we want to select
15824/// between splitting the shuffle into 128-bit components and stitching those
15825/// back together vs. extracting the single-input shuffles and blending those
15826/// results.
15827static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15828 SDValue V2, ArrayRef<int> Mask,
15829 const APInt &Zeroable,
15830 const X86Subtarget &Subtarget,
15831 SelectionDAG &DAG) {
15832 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15833 "shuffles as it could then recurse on itself.");
15834 int Size = Mask.size();
15835
15836 // If this can be modeled as a broadcast of two elements followed by a blend,
15837 // prefer that lowering. This is especially important because broadcasts can
15838 // often fold with memory operands.
15839 auto DoBothBroadcast = [&] {
15840 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15841 for (int M : Mask)
15842 if (M >= Size) {
15843 if (V2BroadcastIdx < 0)
15844 V2BroadcastIdx = M - Size;
15845 else if ((M - Size) != V2BroadcastIdx &&
15846 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15847 return false;
15848 } else if (M >= 0) {
15849 if (V1BroadcastIdx < 0)
15850 V1BroadcastIdx = M;
15851 else if (M != V1BroadcastIdx &&
15852 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15853 return false;
15854 }
15855 return true;
15856 };
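// E.g. (assumed mask): a v4f64 mask [1,5,1,5] reads only element 1 of V1
// and element 1 of V2, so broadcasting each input once and blending the
// two broadcasts beats a generic two-input shuffle, particularly when a
// broadcast can fold a memory operand.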
15857 if (DoBothBroadcast())
15858 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15859 Subtarget, DAG);
15860
15861 // If the inputs all stem from a single 128-bit lane of each input, then we
15862 // split them rather than blending because the split will decompose to
15863 // unusually few instructions.
15864 int LaneCount = VT.getSizeInBits() / 128;
15865 int LaneSize = Size / LaneCount;
15866 SmallBitVector LaneInputs[2];
15867 LaneInputs[0].resize(LaneCount, false);
15868 LaneInputs[1].resize(LaneCount, false);
15869 for (int i = 0; i < Size; ++i)
15870 if (Mask[i] >= 0)
15871 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15872 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15873 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15874 /*SimpleOnly*/ false);
15875
15876 // Without AVX2, if we can freely split the subvectors then we're better off
15877 // performing half width shuffles.
15878 if (!Subtarget.hasAVX2()) {
15879 SDValue BC1 = peekThroughBitcasts(V1);
15880 SDValue BC2 = peekThroughBitcasts(V2);
15881 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15882 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15883 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15884 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15885 if (SplatOrSplitV1 && SplatOrSplitV2)
15886 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15887 /*SimpleOnly*/ false);
15888 }
15889
15890 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15891 // requires that the decomposed single-input shuffles don't end up here.
15892 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15893 Subtarget, DAG);
15894}
15895
15896// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15897// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15898static SDValue lowerShuffleAsLanesPermuteAndSHUFPD(const SDLoc &DL, MVT VT,
15898 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15899 SDValue V1, SDValue V2,
15900 ArrayRef<int> Mask,
15901 SelectionDAG &DAG) {
15902 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15903
15904 int LHSMask[4] = {-1, -1, -1, -1};
15905 int RHSMask[4] = {-1, -1, -1, -1};
15906 int SHUFPDMask[4] = {-1, -1, -1, -1};
15907
15908 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15909 // perform the shuffle once the lanes have been shuffled in place.
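// e.g. (sketch) for Mask <1, 6, 3, 4>:
//   LHSMask    = <u, 1, u, 3>  (sources of the even result elements)
//   RHSMask    = <6, u, 4, u>  (sources of the odd result elements)
//   SHUFPDMask = <1, 0, 1, 0>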
15910 for (int i = 0; i != 4; ++i) {
15911 int M = Mask[i];
15912 if (M < 0)
15913 continue;
15914 int LaneBase = i & ~1;
15915 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15916 LaneMask[LaneBase + (M & 1)] = M;
15917 SHUFPDMask[i] = M & 1;
15918 }
15919
15920 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15921 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15922 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15923 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15924}
15925
15926/// Lower a vector shuffle crossing multiple 128-bit lanes as
15927/// a lane permutation followed by a per-lane permutation.
15928///
15929/// This is mainly for cases where we can have non-repeating permutes
15930/// in each lane.
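///
/// For example, the v8i32 reverse <7,6,5,4,3,2,1,0> can be performed as the
/// cross-lane permute <4,5,6,7,0,1,2,3> followed by the in-lane permute
/// <3,2,1,0,7,6,5,4>.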
15931///
15932/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15933/// we should investigate merging them.
15934 static SDValue lowerShuffleAsLanePermuteAndPermute(
15935 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15936 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15937 int NumElts = VT.getVectorNumElements();
15938 int NumLanes = VT.getSizeInBits() / 128;
15939 int NumEltsPerLane = NumElts / NumLanes;
15940 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15941
15942 /// Attempts to find a sublane permute with the given size
15943 /// that gets all elements into their target lanes.
15944 ///
15945 /// If successful, fills CrossLaneMask and InLaneMask and returns the new
15946 /// shuffle. If unsuccessful, returns SDValue() and may overwrite InLaneMask.
15947 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15948 int NumSublanesPerLane = NumSublanes / NumLanes;
15949 int NumEltsPerSublane = NumElts / NumSublanes;
15950
15951 SmallVector<int, 16> CrossLaneMask;
15952 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15953 // Like CrossLaneMask, but with one entry per sublane.
15954 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15955 APInt DemandedCrossLane = APInt::getZero(NumElts);
15956
15957 for (int i = 0; i != NumElts; ++i) {
15958 int M = Mask[i];
15959 if (M < 0)
15960 continue;
15961
15962 int SrcSublane = M / NumEltsPerSublane;
15963 int DstLane = i / NumEltsPerLane;
15964
15965 // We only need to get the elements into the right lane, not sublane.
15966 // So search all sublanes that make up the destination lane.
15967 bool Found = false;
15968 int DstSubStart = DstLane * NumSublanesPerLane;
15969 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15970 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15971 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15972 continue;
15973
15974 Found = true;
15975 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15976 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15977 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15978 DemandedCrossLane.setBit(InLaneMask[i]);
15979 break;
15980 }
15981 if (!Found)
15982 return SDValue();
15983 }
15984
15985 // Fill CrossLaneMask using CrossLaneMaskLarge.
15986 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15987
15988 if (!CanUseSublanes) {
15989 // If we're only shuffling a single lowest lane and the rest are identity
15990 // then don't bother.
15991 // TODO - isShuffleMaskInputInPlace could be extended to something like
15992 // this.
15993 int NumIdentityLanes = 0;
15994 bool OnlyShuffleLowestLane = true;
15995 for (int i = 0; i != NumLanes; ++i) {
15996 int LaneOffset = i * NumEltsPerLane;
15997 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15998 i * NumEltsPerLane))
15999 NumIdentityLanes++;
16000 else if (CrossLaneMask[LaneOffset] != 0)
16001 OnlyShuffleLowestLane = false;
16002 }
16003 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16004 return SDValue();
16005 }
16006
16007 // Simplify CrossLaneMask based on the actual demanded elements.
16008 if (V1.hasOneUse())
16009 for (int i = 0; i != NumElts; ++i)
16010 if (!DemandedCrossLane[i])
16011 CrossLaneMask[i] = SM_SentinelUndef;
16012
16013 // Avoid returning the same shuffle operation. For example,
16014 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
16015 // undef:v16i16
16016 if (CrossLaneMask == Mask || InLaneMask == Mask)
16017 return SDValue();
16018
16019 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16020 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16021 InLaneMask);
16022 };
16023
16024 // First attempt a solution with full lanes.
16025 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16026 return V;
16027
16028 // The rest of the solutions use sublanes.
16029 if (!CanUseSublanes)
16030 return SDValue();
16031
16032 // Then attempt a solution with 64-bit sublanes (vpermq).
16033 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16034 return V;
16035
16036 // If that doesn't work and we have fast variable cross-lane shuffle,
16037 // attempt 32-bit sublanes (vpermd).
16038 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16039 return SDValue();
16040
16041 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16042}
16043
16044 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
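///
/// For example, with Size = 8 and LaneSize = 4, the lane-swapping mask
/// <4,5,6,7,0,1,2,3> becomes <8,9,10,11,12,13,14,15>: each cross-lane element
/// is redirected to the matching in-lane position of a second (lane-flipped)
/// operand.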
16045static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
16046 SmallVector<int> &InLaneMask) {
16047 int Size = Mask.size();
16048 InLaneMask.assign(Mask.begin(), Mask.end());
16049 for (int i = 0; i < Size; ++i) {
16050 int &M = InLaneMask[i];
16051 if (M < 0)
16052 continue;
16053 if (((M % Size) / LaneSize) != (i / LaneSize))
16054 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16055 }
16056}
16057
16058/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16059/// source with a lane permutation.
16060///
16061/// This lowering strategy results in four instructions in the worst case for a
16062/// single-input cross lane shuffle which is lower than any other fully general
16063/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16064/// shuffle pattern should be handled prior to trying this lowering.
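///
/// For example, the v8f32 reverse <7,6,5,4,3,2,1,0> becomes (roughly): bitcast
/// to v4f64, flip the lanes with <2,3,0,1>, bitcast back, then shuffle V1
/// against the flipped value with the in-lane mask <11,10,9,8,15,14,13,12>.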
16065 static SDValue lowerShuffleAsLanePermuteAndShuffle(
16066 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16067 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16068 // FIXME: This should probably be generalized for 512-bit vectors as well.
16069 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16070 int Size = Mask.size();
16071 int LaneSize = Size / 2;
16072
16073 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16074 // Only do this if the elements aren't all from the lower lane,
16075 // otherwise we're (probably) better off doing a split.
16076 if (VT == MVT::v4f64 &&
16077 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16078 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
16079
16080 // If there are only inputs from one 128-bit lane, splitting will in fact be
16081 // less expensive. The flags track whether the given lane contains an element
16082 // that crosses to another lane.
16083 bool AllLanes;
16084 if (!Subtarget.hasAVX2()) {
16085 bool LaneCrossing[2] = {false, false};
16086 for (int i = 0; i < Size; ++i)
16087 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16088 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16089 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16090 } else {
16091 bool LaneUsed[2] = {false, false};
16092 for (int i = 0; i < Size; ++i)
16093 if (Mask[i] >= 0)
16094 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16095 AllLanes = LaneUsed[0] && LaneUsed[1];
16096 }
16097
16098 // TODO - we could support shuffling V2 in the Flipped input.
16099 assert(V2.isUndef() &&
16100 "This last part of this routine only works on single input shuffles");
16101
16102 SmallVector<int> InLaneMask;
16103 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16104
16105 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16106 "In-lane shuffle mask expected");
16107
16108 // If we're not using both source lanes in each lane and the in-lane mask is
16109 // not repeating, then we're better off splitting.
16110 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16111 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
16112 /*SimpleOnly*/ false);
16113
16114 // Flip the lanes, and shuffle the results which should now be in-lane.
16115 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16116 SDValue Flipped = DAG.getBitcast(PVT, V1);
16117 Flipped =
16118 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16119 Flipped = DAG.getBitcast(VT, Flipped);
16120 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16121}
16122
16123/// Handle lowering 2-lane 128-bit shuffles.
16124 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16125 SDValue V2, ArrayRef<int> Mask,
16126 const APInt &Zeroable,
16127 const X86Subtarget &Subtarget,
16128 SelectionDAG &DAG) {
16129 if (V2.isUndef()) {
16130 // Attempt to match VBROADCAST*128 subvector broadcast load.
16131 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16132 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16133 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16134 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
16135 MVT MemVT = VT.getHalfNumVectorElementsVT();
16136 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16137 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16138 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
16139 VT, MemVT, Ld, Ofs, DAG))
16140 return BcstLd;
16141 }
16142
16143 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16144 if (Subtarget.hasAVX2())
16145 return SDValue();
16146 }
16147
16148 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16149
16150 SmallVector<int, 4> WidenedMask;
16151 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16152 return SDValue();
16153
16154 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16155 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16156
16157 // Try to use an insert into a zero vector.
16158 if (WidenedMask[0] == 0 && IsHighZero) {
16159 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16160 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16161 DAG.getVectorIdxConstant(0, DL));
16162 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16163 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16164 DAG.getVectorIdxConstant(0, DL));
16165 }
16166
16167 // TODO: If minimizing size and one of the inputs is a zero vector and the
16168 // zero vector has only one use, we could use a VPERM2X128 to save the
16169 // instruction bytes needed to explicitly generate the zero vector.
16170
16171 // Blends are faster and handle all the non-lane-crossing cases.
16172 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16173 Subtarget, DAG))
16174 return Blend;
16175
16176 // If either input operand is a zero vector, use VPERM2X128 because its mask
16177 // allows us to replace the zero input with an implicit zero.
16178 if (!IsLowZero && !IsHighZero) {
16179 // Check for patterns which can be matched with a single insert of a 128-bit
16180 // subvector.
16181 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16182 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16183
16184 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16185 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16186 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16187 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16188 SDValue SubVec =
16189 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16190 DAG.getVectorIdxConstant(0, DL));
16191 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16192 DAG.getVectorIdxConstant(2, DL));
16193 }
16194 }
16195
16196 // Try to use SHUF128 if possible.
16197 if (Subtarget.hasVLX()) {
16198 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16199 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16200 ((WidenedMask[1] % 2) << 1);
16201 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16202 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16203 }
16204 }
16205 }
16206
16207 // Otherwise form a 128-bit permutation. After accounting for undefs,
16208 // convert the 64-bit shuffle mask selection values into 128-bit
16209 // selection bits by dividing the indexes by 2 and shifting into positions
16210 // defined by a vperm2*128 instruction's immediate control byte.
16211
16212 // The immediate permute control byte looks like this:
16213 // [1:0] - select 128 bits from sources for low half of destination
16214 // [2] - ignore
16215 // [3] - zero low half of destination
16216 // [5:4] - select 128 bits from sources for high half of destination
16217 // [6] - ignore
16218 // [7] - zero high half of destination
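// For example, WidenedMask <1, 2> (high half of V1, low half of V2) gives
// PermMask = (1 << 0) | (2 << 4) = 0x21, while a zeroable low half with
// WidenedMask[1] == 3 would give 0x08 | (3 << 4) = 0x38.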
16219
16220 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16221 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16222
16223 unsigned PermMask = 0;
16224 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16225 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16226
16227 // Check the immediate mask and replace unused sources with undef.
16228 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16229 V1 = DAG.getUNDEF(VT);
16230 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16231 V2 = DAG.getUNDEF(VT);
16232
16233 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16234 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16235}
16236
16237/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16238/// shuffling each lane.
16239///
16240/// This attempts to create a repeated lane shuffle where each lane uses one
16241/// or two of the lanes of the inputs. The lanes of the input vectors are
16242/// shuffled in one or two independent shuffles to get the lanes into the
16243/// position needed by the final shuffle.
16244 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16245 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16246 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16247 // This is only useful for binary shuffle with a non-repeating mask.
16248 if (V2.isUndef() || is128BitLaneRepeatedShuffleMask(VT, Mask))
16249 return SDValue();
16250
16251 int NumElts = Mask.size();
16252 int NumLanes = VT.getSizeInBits() / 128;
16253 int NumLaneElts = 128 / VT.getScalarSizeInBits();
16254 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16255 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16256
16257 // First pass will try to fill in the RepeatMask from lanes that need two
16258 // sources.
16259 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16260 int Srcs[2] = {-1, -1};
16261 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16262 for (int i = 0; i != NumLaneElts; ++i) {
16263 int M = Mask[(Lane * NumLaneElts) + i];
16264 if (M < 0)
16265 continue;
16266 // Determine which of the possible input lanes (NumLanes from each source)
16267 // this element comes from. Assign that as one of the sources for this
16268 // lane. We can assign up to 2 sources for this lane. If we run out of
16269 // sources we can't do anything.
16270 int LaneSrc = M / NumLaneElts;
16271 int Src;
16272 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16273 Src = 0;
16274 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16275 Src = 1;
16276 else
16277 return SDValue();
16278
16279 Srcs[Src] = LaneSrc;
16280 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16281 }
16282
16283 // If this lane has two sources, see if it fits with the repeat mask so far.
16284 if (Srcs[1] < 0)
16285 continue;
16286
16287 LaneSrcs[Lane][0] = Srcs[0];
16288 LaneSrcs[Lane][1] = Srcs[1];
16289
16290 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16291 assert(M1.size() == M2.size() && "Unexpected mask size");
16292 for (int i = 0, e = M1.size(); i != e; ++i)
16293 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16294 return false;
16295 return true;
16296 };
16297
16298 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16299 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16300 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16301 int M = Mask[i];
16302 if (M < 0)
16303 continue;
16304 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16305 "Unexpected mask element");
16306 MergedMask[i] = M;
16307 }
16308 };
16309
16310 if (MatchMasks(InLaneMask, RepeatMask)) {
16311 // Merge this lane mask into the final repeat mask.
16312 MergeMasks(InLaneMask, RepeatMask);
16313 continue;
16314 }
16315
16316 // Didn't find a match. Swap the operands and try again.
16317 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16318 ShuffleVectorSDNode::commuteMask(InLaneMask);
16319
16320 if (MatchMasks(InLaneMask, RepeatMask)) {
16321 // Merge this lane mask into the final repeat mask.
16322 MergeMasks(InLaneMask, RepeatMask);
16323 continue;
16324 }
16325
16326 // Couldn't find a match with the operands in either order.
16327 return SDValue();
16328 }
16329
16330 // Now handle any lanes with only one source.
16331 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16332 // If this lane has already been processed, skip it.
16333 if (LaneSrcs[Lane][0] >= 0)
16334 continue;
16335
16336 for (int i = 0; i != NumLaneElts; ++i) {
16337 int M = Mask[(Lane * NumLaneElts) + i];
16338 if (M < 0)
16339 continue;
16340
16341 // If RepeatMask isn't defined yet, we can define it ourselves.
16342 if (RepeatMask[i] < 0)
16343 RepeatMask[i] = M % NumLaneElts;
16344
16345 if (RepeatMask[i] < NumElts) {
16346 if (RepeatMask[i] != M % NumLaneElts)
16347 return SDValue();
16348 LaneSrcs[Lane][0] = M / NumLaneElts;
16349 } else {
16350 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16351 return SDValue();
16352 LaneSrcs[Lane][1] = M / NumLaneElts;
16353 }
16354 }
16355
16356 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16357 return SDValue();
16358 }
16359
16360 SmallVector<int, 16> NewMask(NumElts, -1);
16361 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16362 int Src = LaneSrcs[Lane][0];
16363 for (int i = 0; i != NumLaneElts; ++i) {
16364 int M = -1;
16365 if (Src >= 0)
16366 M = Src * NumLaneElts + i;
16367 NewMask[Lane * NumLaneElts + i] = M;
16368 }
16369 }
16370 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16371 // Ensure we didn't get back the shuffle we started with.
16372 // FIXME: This is a hack to make up for some splat handling code in
16373 // getVectorShuffle.
16374 if (isa<ShuffleVectorSDNode>(NewV1) &&
16375 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16376 return SDValue();
16377
16378 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16379 int Src = LaneSrcs[Lane][1];
16380 for (int i = 0; i != NumLaneElts; ++i) {
16381 int M = -1;
16382 if (Src >= 0)
16383 M = Src * NumLaneElts + i;
16384 NewMask[Lane * NumLaneElts + i] = M;
16385 }
16386 }
16387 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16388 // Ensure we didn't get back the shuffle we started with.
16389 // FIXME: This is a hack to make up for some splat handling code in
16390 // getVectorShuffle.
16391 if (isa<ShuffleVectorSDNode>(NewV2) &&
16392 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16393 return SDValue();
16394
16395 for (int i = 0; i != NumElts; ++i) {
16396 if (Mask[i] < 0) {
16397 NewMask[i] = -1;
16398 continue;
16399 }
16400 NewMask[i] = RepeatMask[i % NumLaneElts];
16401 if (NewMask[i] < 0)
16402 continue;
16403
16404 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16405 }
16406 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16407}
16408
16409/// If the input shuffle mask results in a vector that is undefined in all upper
16410/// or lower half elements and that mask accesses only 2 halves of the
16411/// shuffle's operands, return true. A mask of half the width with mask indexes
16412/// adjusted to access the extracted halves of the original shuffle operands is
16413 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
16414 /// upper) of each input operand is accessed.
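///
/// For example, the v8i32 mask <2,7,u,6,u,u,u,u> (undef upper half) yields
/// HalfMask <2,7,u,6> with HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 1
/// (upper half of V1).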
16415static bool
16416 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16417 int &HalfIdx1, int &HalfIdx2) {
16418 assert((Mask.size() == HalfMask.size() * 2) &&
16419 "Expected input mask to be twice as long as output");
16420
16421 // Exactly one half of the result must be undef to allow narrowing.
16422 bool UndefLower = isUndefLowerHalf(Mask);
16423 bool UndefUpper = isUndefUpperHalf(Mask);
16424 if (UndefLower == UndefUpper)
16425 return false;
16426
16427 unsigned HalfNumElts = HalfMask.size();
16428 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16429 HalfIdx1 = -1;
16430 HalfIdx2 = -1;
16431 for (unsigned i = 0; i != HalfNumElts; ++i) {
16432 int M = Mask[i + MaskIndexOffset];
16433 if (M < 0) {
16434 HalfMask[i] = M;
16435 continue;
16436 }
16437
16438 // Determine which of the 4 half vectors this element is from.
16439 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16440 int HalfIdx = M / HalfNumElts;
16441
16442 // Determine the element index into its half vector source.
16443 int HalfElt = M % HalfNumElts;
16444
16445 // We can shuffle with up to 2 half vectors, set the new 'half'
16446 // shuffle mask accordingly.
16447 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16448 HalfMask[i] = HalfElt;
16449 HalfIdx1 = HalfIdx;
16450 continue;
16451 }
16452 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16453 HalfMask[i] = HalfElt + HalfNumElts;
16454 HalfIdx2 = HalfIdx;
16455 continue;
16456 }
16457
16458 // Too many half vectors referenced.
16459 return false;
16460 }
16461
16462 return true;
16463}
16464
16465/// Given the output values from getHalfShuffleMask(), create a half width
16466/// shuffle of extracted vectors followed by an insert back to full width.
16467 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16468 ArrayRef<int> HalfMask, int HalfIdx1,
16469 int HalfIdx2, bool UndefLower,
16470 SelectionDAG &DAG, bool UseConcat = false) {
16471 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16472 assert(V1.getValueType().isSimple() && "Expecting only simple types");
16473
16474 MVT VT = V1.getSimpleValueType();
16475 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16476 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16477
16478 auto getHalfVector = [&](int HalfIdx) {
16479 if (HalfIdx < 0)
16480 return DAG.getUNDEF(HalfVT);
16481 SDValue V = (HalfIdx < 2 ? V1 : V2);
16482 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16483 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16484 DAG.getVectorIdxConstant(HalfIdx, DL));
16485 };
16486
16487 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16488 SDValue Half1 = getHalfVector(HalfIdx1);
16489 SDValue Half2 = getHalfVector(HalfIdx2);
16490 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16491 if (UseConcat) {
16492 SDValue Op0 = V;
16493 SDValue Op1 = DAG.getUNDEF(HalfVT);
16494 if (UndefLower)
16495 std::swap(Op0, Op1);
16496 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16497 }
16498
16499 unsigned Offset = UndefLower ? HalfNumElts : 0;
16500 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16501 DAG.getVectorIdxConstant(Offset, DL));
16502}
16503
16504/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16505/// This allows for fast cases such as subvector extraction/insertion
16506/// or shuffling smaller vector types which can lower more efficiently.
16507 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16508 SDValue V2, ArrayRef<int> Mask,
16509 const X86Subtarget &Subtarget,
16510 SelectionDAG &DAG) {
16511 assert((VT.is256BitVector() || VT.is512BitVector()) &&
16512 "Expected 256-bit or 512-bit vector");
16513
16514 bool UndefLower = isUndefLowerHalf(Mask);
16515 if (!UndefLower && !isUndefUpperHalf(Mask))
16516 return SDValue();
16517
16518 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16519 "Completely undef shuffle mask should have been simplified already");
16520
16521 // Upper half is undef and lower half is whole upper subvector.
16522 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16523 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16524 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16525 if (!UndefLower &&
16526 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16527 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16528 DAG.getVectorIdxConstant(HalfNumElts, DL));
16529 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16530 DAG.getVectorIdxConstant(0, DL));
16531 }
16532
16533 // Lower half is undef and upper half is whole lower subvector.
16534 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16535 if (UndefLower &&
16536 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16537 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16538 DAG.getVectorIdxConstant(0, DL));
16539 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16540 DAG.getVectorIdxConstant(HalfNumElts, DL));
16541 }
16542
16543 int HalfIdx1, HalfIdx2;
16544 SmallVector<int, 8> HalfMask(HalfNumElts);
16545 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16546 return SDValue();
16547
16548 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16549
16550 // Only shuffle the halves of the inputs when useful.
16551 unsigned NumLowerHalves =
16552 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16553 unsigned NumUpperHalves =
16554 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16555 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16556
16557 // Determine the larger pattern of undef/halves, then decide if it's worth
16558 // splitting the shuffle based on subtarget capabilities and types.
16559 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16560 if (!UndefLower) {
16561 // XXXXuuuu: no insert is needed.
16562 // Always extract lowers when setting lower - these are all free subreg ops.
16563 if (NumUpperHalves == 0)
16564 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16565 UndefLower, DAG);
16566
16567 if (NumUpperHalves == 1) {
16568 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16569 if (Subtarget.hasAVX2()) {
16570 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16571 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16572 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16573 (!isSingleSHUFPSMask(HalfMask) ||
16574 Subtarget.hasFastVariableCrossLaneShuffle()))
16575 return SDValue();
16576 // If this is a unary shuffle (assume that the 2nd operand is
16577 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16578 // are better off extracting the upper half of 1 operand and using a
16579 // narrow shuffle.
16580 if (EltWidth == 64 && V2.isUndef())
16581 return SDValue();
16582 // If this is a unary vXi8 shuffle with in-place halves, then perform as
16583 // full width pshufb, and then merge.
16584 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16585 return SDValue();
16586 }
16587 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16588 if (Subtarget.hasAVX512() && VT.is512BitVector())
16589 return SDValue();
16590 // Extract + narrow shuffle is better than the wide alternative.
16591 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16592 UndefLower, DAG);
16593 }
16594
16595 // Don't extract both uppers, instead shuffle and then extract.
16596 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16597 return SDValue();
16598 }
16599
16600 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16601 if (NumUpperHalves == 0) {
16602 // AVX2 has efficient 64-bit element cross-lane shuffles.
16603 // TODO: Refine to account for unary shuffle, splat, and other masks?
16604 if (Subtarget.hasAVX2() && EltWidth == 64)
16605 return SDValue();
16606 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16607 if (Subtarget.hasAVX512() && VT.is512BitVector())
16608 return SDValue();
16609 // Narrow shuffle + insert is better than the wide alternative.
16610 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16611 UndefLower, DAG);
16612 }
16613
16614 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16615 return SDValue();
16616}
16617
16618/// Handle case where shuffle sources are coming from the same 128-bit lane and
16619/// every lane can be represented as the same repeating mask - allowing us to
16620/// shuffle the sources with the repeating shuffle and then permute the result
16621/// to the destination lanes.
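///
/// For example, the v8i32 mask <5,4,7,6,1,0,3,2> repeats <1,0,3,2> in each
/// lane: first shuffle to <1,0,3,2,5,4,7,6>, then permute the lanes into place
/// with <4,5,6,7,0,1,2,3>.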
16622 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16623 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16624 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16625 int NumElts = VT.getVectorNumElements();
16626 int NumLanes = VT.getSizeInBits() / 128;
16627 int NumLaneElts = NumElts / NumLanes;
16628
16629 // On AVX2 we may be able to just shuffle the lowest elements and then
16630 // broadcast the result.
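// e.g. (sketch) the v8i32 mask <1,0,1,0,1,0,1,0> repeats <1,0> every 64
// bits: shuffle <1,0,u,u,u,u,u,u> in place, then broadcast the result with
// <0,1,0,1,0,1,0,1>.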
16631 if (Subtarget.hasAVX2()) {
16632 for (unsigned BroadcastSize : {16, 32, 64}) {
16633 if (BroadcastSize <= VT.getScalarSizeInBits())
16634 continue;
16635 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16636
16637 // Attempt to match a repeating pattern every NumBroadcastElts,
16638 // accounting for UNDEFs, but only referencing the lowest 128-bit
16639 // lane of the inputs.
16640 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16641 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16642 for (int j = 0; j != NumBroadcastElts; ++j) {
16643 int M = Mask[i + j];
16644 if (M < 0)
16645 continue;
16646 int &R = RepeatMask[j];
16647 if (0 != ((M % NumElts) / NumLaneElts))
16648 return false;
16649 if (0 <= R && R != M)
16650 return false;
16651 R = M;
16652 }
16653 return true;
16654 };
16655
16656 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16657 if (!FindRepeatingBroadcastMask(RepeatMask))
16658 continue;
16659
16660 // Shuffle the (lowest) repeated elements in place for broadcast.
16661 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16662
16663 // Shuffle the actual broadcast.
16664 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16665 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16666 for (int j = 0; j != NumBroadcastElts; ++j)
16667 BroadcastMask[i + j] = j;
16668
16669 // Avoid returning the same shuffle operation. For example,
16670 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16671 if (BroadcastMask == Mask)
16672 return SDValue();
16673
16674 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16675 BroadcastMask);
16676 }
16677 }
16678
16679 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16680 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16681 return SDValue();
16682
16683 // Bail if we already have a repeated lane shuffle mask.
16684 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16685 return SDValue();
16686
16687 // Helper to look for a repeated mask in each split sublane, checking that
16688 // those sublanes can then be permuted into place.
16689 auto ShuffleSubLanes = [&](int SubLaneScale) {
16690 int NumSubLanes = NumLanes * SubLaneScale;
16691 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16692
16693 // Check that all the sources are coming from the same lane and see if we
16694 // can form a repeating shuffle mask (local to each sub-lane). At the same
16695 // time, determine the source sub-lane for each destination sub-lane.
16696 int TopSrcSubLane = -1;
16697 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16698 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16699 SubLaneScale,
16700 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16701
16702 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16703 // Extract the sub-lane mask, check that it all comes from the same lane
16704 // and normalize the mask entries to come from the first lane.
16705 int SrcLane = -1;
16706 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16707 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16708 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16709 if (M < 0)
16710 continue;
16711 int Lane = (M % NumElts) / NumLaneElts;
16712 if ((0 <= SrcLane) && (SrcLane != Lane))
16713 return SDValue();
16714 SrcLane = Lane;
16715 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16716 SubLaneMask[Elt] = LocalM;
16717 }
16718
16719 // Whole sub-lane is UNDEF.
16720 if (SrcLane < 0)
16721 continue;
16722
16723 // Attempt to match against the candidate repeated sub-lane masks.
16724 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16725 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16726 for (int i = 0; i != NumSubLaneElts; ++i) {
16727 if (M1[i] < 0 || M2[i] < 0)
16728 continue;
16729 if (M1[i] != M2[i])
16730 return false;
16731 }
16732 return true;
16733 };
16734
16735 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16736 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16737 continue;
16738
16739 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16740 for (int i = 0; i != NumSubLaneElts; ++i) {
16741 int M = SubLaneMask[i];
16742 if (M < 0)
16743 continue;
16744 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16745 "Unexpected mask element");
16746 RepeatedSubLaneMask[i] = M;
16747 }
16748
16749 // Track the topmost source sub-lane - by setting the remaining to
16750 // UNDEF we can greatly simplify shuffle matching.
16751 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16752 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16753 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16754 break;
16755 }
16756
16757 // Bail if we failed to find a matching repeated sub-lane mask.
16758 if (Dst2SrcSubLanes[DstSubLane] < 0)
16759 return SDValue();
16760 }
16761 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16762 "Unexpected source lane");
16763
16764 // Create a repeating shuffle mask for the entire vector.
16765 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16766 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16767 int Lane = SubLane / SubLaneScale;
16768 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16769 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16770 int M = RepeatedSubLaneMask[Elt];
16771 if (M < 0)
16772 continue;
16773 int Idx = (SubLane * NumSubLaneElts) + Elt;
16774 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16775 }
16776 }
16777
16778 // Shuffle each source sub-lane to its destination.
16779 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16780 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16781 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16782 if (SrcSubLane < 0)
16783 continue;
16784 for (int j = 0; j != NumSubLaneElts; ++j)
16785 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16786 }
16787
16788 // Avoid returning the same shuffle operation.
16789 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16790 if (RepeatedMask == Mask || SubLaneMask == Mask)
16791 return SDValue();
16792
16793 SDValue RepeatedShuffle =
16794 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16795
16796 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16797 SubLaneMask);
16798 };
16799
16800 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16801 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16802 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16803 // Otherwise we can only permute whole 128-bit lanes.
16804 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16805 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16806 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16807 MinSubLaneScale = 2;
16808 MaxSubLaneScale =
16809 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16810 }
16811 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16812 MinSubLaneScale = MaxSubLaneScale = 4;
16813
16814 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16815 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16816 return Shuffle;
16817
16818 return SDValue();
16819}
16820
16821 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16822 bool &ForceV1Zero, bool &ForceV2Zero,
16823 unsigned &ShuffleImm, ArrayRef<int> Mask,
16824 const APInt &Zeroable) {
16825 int NumElts = VT.getVectorNumElements();
16826 assert(VT.getScalarSizeInBits() == 64 &&
16827 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16828 "Unexpected data type for VSHUFPD");
16829 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16830 "Illegal shuffle mask");
16831
16832 bool ZeroLane[2] = { true, true };
16833 for (int i = 0; i < NumElts; ++i)
16834 ZeroLane[i & 1] &= Zeroable[i];
16835
16836 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16837 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
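// e.g. (sketch) the v4f64 mask <1, 5, 2, 7> matches directly with
// SHUFPDMask <1,1,0,1>, while <4, 1, 6, 2> only matches after commuting
// V1 and V2 (SHUFPDMask <0,1,0,0>).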
16838 bool IsSHUFPD = true;
16839 bool IsCommutable = true;
16840 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16841 for (int i = 0; i < NumElts; ++i) {
16842 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16843 continue;
16844 if (Mask[i] < 0)
16845 return false;
16846 int Val = (i & 6) + NumElts * (i & 1);
16847 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16848 if (Mask[i] < Val || Mask[i] > Val + 1)
16849 IsSHUFPD = false;
16850 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16851 IsCommutable = false;
16852 SHUFPDMask[i] = Mask[i] % 2;
16853 }
16854
16855 if (!IsSHUFPD && !IsCommutable)
16856 return false;
16857
16858 if (!IsSHUFPD && IsCommutable)
16859 std::swap(V1, V2);
16860
16861 ForceV1Zero = ZeroLane[0];
16862 ForceV2Zero = ZeroLane[1];
16863 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16864 return true;
16865}
16866
16867 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16868 SDValue V2, ArrayRef<int> Mask,
16869 const APInt &Zeroable,
16870 const X86Subtarget &Subtarget,
16871 SelectionDAG &DAG) {
16872 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16873 "Unexpected data type for VSHUFPD");
16874
16875 unsigned Immediate = 0;
16876 bool ForceV1Zero = false, ForceV2Zero = false;
16877 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16878 Mask, Zeroable))
16879 return SDValue();
16880
16881 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16882 if (ForceV1Zero)
16883 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16884 if (ForceV2Zero)
16885 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16886
16887 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16888 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16889}
16890
16891// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16892// by zeroable elements in the remaining 24 elements. Turn this into two
16893// vmovqb instructions shuffled together.
16894 static SDValue lowerShuffleWithVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16895 SDValue V1, SDValue V2,
16896 ArrayRef<int> Mask,
16897 const APInt &Zeroable,
16898 SelectionDAG &DAG) {
16899 assert(VT == MVT::v32i8 && "Unexpected type!");
16900
16901 // The first 8 indices should be every 8th element.
16902 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16903 return SDValue();
16904
16905 // Remaining elements need to be zeroable.
16906 if (Zeroable.countl_one() < (Mask.size() - 8))
16907 return SDValue();
16908
16909 V1 = DAG.getBitcast(MVT::v4i64, V1);
16910 V2 = DAG.getBitcast(MVT::v4i64, V2);
16911
16912 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16913 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16914
16915 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16916 // the upper bits of the result using an unpckldq.
16917 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16918 { 0, 1, 2, 3, 16, 17, 18, 19,
16919 4, 5, 6, 7, 20, 21, 22, 23 });
16920 // Insert the unpckldq into a zero vector to widen to v32i8.
16921 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16922 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16923 DAG.getVectorIdxConstant(0, DL));
16924}
16925
16926// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16927// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16928// =>
16929// ul = unpckl v1, v2
16930// uh = unpckh v1, v2
16931// a = vperm ul, uh
16932// b = vperm ul, uh
16933//
16934// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16935// and permute. We cannot directly match v3 because it is split into two
16936// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16937// pair of 256-bit shuffles and makes sure the masks are consecutive.
16938//
16939// Once unpck and permute nodes are created, the permute corresponding to this
16940// shuffle is returned, while the other permute replaces the other half of the
16941// shuffle in the selection dag.
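//
// In the lowering below, immediate 0x20 selects the low 128-bit halves of ul
// and uh, and 0x31 the high halves, giving the lower and upper half of the
// interleave respectively.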
16942 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16943 SDValue V1, SDValue V2,
16944 ArrayRef<int> Mask,
16945 SelectionDAG &DAG) {
16946 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16947 VT != MVT::v32i8)
16948 return SDValue();
16949 // <B0, B1, B0+1, B1+1, ..., >
16950 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16951 unsigned Begin1) {
16952 size_t Size = Mask.size();
16953 assert(Size % 2 == 0 && "Expected even mask size");
16954 for (unsigned I = 0; I < Size; I += 2) {
16955 if (Mask[I] != (int)(Begin0 + I / 2) ||
16956 Mask[I + 1] != (int)(Begin1 + I / 2))
16957 return false;
16958 }
16959 return true;
16960 };
16961 // Check which half of the interleave this shuffle node is.
16962 int NumElts = VT.getVectorNumElements();
16963 size_t FirstQtr = NumElts / 2;
16964 size_t ThirdQtr = NumElts + NumElts / 2;
16965 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16966 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16967 if (!IsFirstHalf && !IsSecondHalf)
16968 return SDValue();
16969
16970 // Find the intersection between shuffle users of V1 and V2.
16971 SmallVector<SDNode *, 2> Shuffles;
16972 for (SDNode *User : V1->users())
16973 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16974 User->getOperand(1) == V2)
16975 Shuffles.push_back(User);
16976 // Limit the number of shuffle users to two for now.
16977 if (Shuffles.size() != 2)
16978 return SDValue();
16979 // Find out which half of the 512-bit shuffle each smaller shuffle is.
16980 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16981 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16982 SDNode *FirstHalf;
16983 SDNode *SecondHalf;
16984 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16985 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16986 FirstHalf = Shuffles[0];
16987 SecondHalf = Shuffles[1];
16988 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16989 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16990 FirstHalf = Shuffles[1];
16991 SecondHalf = Shuffles[0];
16992 } else {
16993 return SDValue();
16994 }
16995 // Lower into unpck and perm. Return the perm of this shuffle and replace
16996 // the other.
16997 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16998 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16999 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17000 DAG.getTargetConstant(0x20, DL, MVT::i8));
17001 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17002 DAG.getTargetConstant(0x31, DL, MVT::i8));
17003 if (IsFirstHalf) {
17004 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
17005 return Perm1;
17006 }
17007 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
17008 return Perm2;
17009}
17010
17011/// Handle lowering of 4-lane 64-bit floating point shuffles.
17012///
17013/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17014/// isn't available.
17015 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17016 const APInt &Zeroable, SDValue V1, SDValue V2,
17017 const X86Subtarget &Subtarget,
17018 SelectionDAG &DAG) {
17019 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17020 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17021 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17022
17023 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17024 Subtarget, DAG))
17025 return V;
17026
17027 if (V2.isUndef()) {
17028 // Check for being able to broadcast a single element.
17029 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17030 Mask, Subtarget, DAG))
17031 return Broadcast;
17032
17033 // Use low duplicate instructions for masks that match their pattern.
17034 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17035 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17036
17037 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17038 // Non-half-crossing single input shuffles can be lowered with an
17039 // interleaved permutation.
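// e.g. Mask <0,1,3,2> gives VPERMILPMask 0b0110: a set bit i selects the
// high element of the owning 128-bit lane for result element i.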
17040 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17041 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17042 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17043 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17044 }
17045
17046 // With AVX2 we have direct support for this permutation.
17047 if (Subtarget.hasAVX2())
17048 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17049 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17050
17051 // Try to create an in-lane repeating shuffle mask and then shuffle the
17052 // results into the target lanes.
17053 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17054 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17055 return V;
17056
17057 // Try to permute the lanes and then use a per-lane permute.
17058 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17059 Mask, DAG, Subtarget))
17060 return V;
17061
17062 // Otherwise, fall back.
17063 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17064 DAG, Subtarget);
17065 }
17066
17067 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17068 Zeroable, Subtarget, DAG))
17069 return Blend;
17070
17071 // Use dedicated unpack instructions for masks that match their pattern.
17072 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
17073 return V;
17074
17075 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17076 Zeroable, Subtarget, DAG))
17077 return Op;
17078
17079 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
17080 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
17081 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
17082 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
17083
17084 // If we have lane crossing shuffles AND they don't all come from the lower
17085 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17086 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17087 // canonicalize to a blend of splat which isn't necessary for this combine.
17088 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17089 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17090 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17091 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
17092 (!Subtarget.hasAVX2() ||
17093 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
17094 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
17095
17096 // If we have one input in place, then we can permute the other input and
17097 // blend the result.
17098 if (V1IsInPlace || V2IsInPlace)
17099 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17100 Zeroable, Subtarget, DAG);
17101
17102 // Try to create an in-lane repeating shuffle mask and then shuffle the
17103 // results into the target lanes.
17104 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17105 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17106 return V;
17107
17108 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17109 // shuffle. However, if we have AVX2 and either inputs are already in place,
17110 // we will be able to shuffle even across lanes the other input in a single
17111 // instruction so skip this pattern.
17112 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
17113 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17114 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17115 return V;
17116
17117 // If we have VLX support, we can use VEXPAND.
17118 if (Subtarget.hasVLX())
17119 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
17120 Zeroable, Subtarget, DAG))
17121 return V;
17122
17123 // If we have AVX2 then we always want to lower with a blend because at v4 we
17124 // can fully permute the elements.
17125 if (Subtarget.hasAVX2())
17126 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17127 Zeroable, Subtarget, DAG);
17128
17129 // Otherwise fall back on generic lowering.
17130 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17131 Subtarget, DAG);
17132}
17133
17134/// Handle lowering of 4-lane 64-bit integer shuffles.
17135///
17136/// This routine is only called when we have AVX2 and thus a reasonable
17137 /// instruction set for v4i64 shuffling.
17138 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17139 const APInt &Zeroable, SDValue V1, SDValue V2,
17140 const X86Subtarget &Subtarget,
17141 SelectionDAG &DAG) {
17142 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17143 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17144 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17145 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17146
17147 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17148 Subtarget, DAG))
17149 return V;
17150
17151 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17152 Zeroable, Subtarget, DAG))
17153 return Blend;
17154
17155 // Check for being able to broadcast a single element.
17156 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17157 Subtarget, DAG))
17158 return Broadcast;
17159
17160 // Try to use shift instructions if fast.
17161 if (Subtarget.preferLowerShuffleAsShift())
17162 if (SDValue Shift =
17163 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17164 Subtarget, DAG, /*BitwiseOnly*/ true))
17165 return Shift;
17166
17167 if (V2.isUndef()) {
17168 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17169 // can use lower latency instructions that will operate on both lanes.
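// e.g. (sketch) v4i64 <1,0,3,2> repeats <1,0> in each lane, which widens to
// the v8i32 PSHUFD mask <2,3,0,1>.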
17170 SmallVector<int, 2> RepeatedMask;
17171 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17172 SmallVector<int, 4> PSHUFDMask;
17173 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17174 return DAG.getBitcast(
17175 MVT::v4i64,
17176 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17177 DAG.getBitcast(MVT::v8i32, V1),
17178 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17179 }
17180
17181 // AVX2 provides a direct instruction for permuting a single input across
17182 // lanes.
17183 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17184 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17185 }
17186
17187 // Try to use shift instructions.
17188 if (SDValue Shift =
17189 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
17190 DAG, /*BitwiseOnly*/ false))
17191 return Shift;
17192
17193 // If we have VLX support, we can use VALIGN or VEXPAND.
17194 if (Subtarget.hasVLX()) {
17195 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17196 Zeroable, Subtarget, DAG))
17197 return Rotate;
17198
17199 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
17200 Zeroable, Subtarget, DAG))
17201 return V;
17202 }
17203
17204 // Try to use PALIGNR.
17205 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17206 Subtarget, DAG))
17207 return Rotate;
17208
17209 // Use dedicated unpack instructions for masks that match their pattern.
17210 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
17211 return V;
17212
17213 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
17214 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
17215
17216 // If we have one input in place, then we can permute the other input and
17217 // blend the result.
17218 if (V1IsInPlace || V2IsInPlace)
17219 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17220 Zeroable, Subtarget, DAG);
17221
17222 // Try to create an in-lane repeating shuffle mask and then shuffle the
17223 // results into the target lanes.
17224 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17225 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17226 return V;
17227
17228 // Try to lower to PERMQ(BLENDD(V1,V2)).
17229 if (SDValue V =
17230 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
17231 return V;
17232
17233 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17234 // shuffle. However, if we have AVX2 and either inputs are already in place,
17235 // we will be able to shuffle even across lanes the other input in a single
17236 // instruction so skip this pattern.
17237 if (!V1IsInPlace && !V2IsInPlace)
17238 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17239 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17240 return Result;
17241
17242 // Otherwise fall back on generic blend lowering.
17243 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17244 Zeroable, Subtarget, DAG);
17245}
17246
17247/// Handle lowering of 8-lane 32-bit floating point shuffles.
17248///
17249/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17250/// isn't available.
17251 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17252 const APInt &Zeroable, SDValue V1, SDValue V2,
17253 const X86Subtarget &Subtarget,
17254 SelectionDAG &DAG) {
17255 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17256 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17257 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17258
17259 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17260 Zeroable, Subtarget, DAG))
17261 return Blend;
17262
17263 // Check for being able to broadcast a single element.
17264 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17265 Subtarget, DAG))
17266 return Broadcast;
17267
17268 if (!Subtarget.hasAVX2()) {
17269 SmallVector<int> InLaneMask;
17270 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17271
17272 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
17273 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
17274 /*SimpleOnly*/ true))
17275 return R;
17276 }
17277 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17278 Zeroable, Subtarget, DAG))
17279 return DAG.getBitcast(MVT::v8f32, ZExt);
17280
17281 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17282 // options to efficiently lower the shuffle.
17283 SmallVector<int, 4> RepeatedMask;
17284 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17285 assert(RepeatedMask.size() == 4 &&
17286 "Repeated masks must be half the mask width!");
17287
17288 // Use even/odd duplicate instructions for masks that match their pattern.
17289 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17290 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17291 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17292 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17293
17294 if (V2.isUndef())
17295 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17296 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17297
17298 // Use dedicated unpack instructions for masks that match their pattern.
17299 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
17300 return V;
17301
17302 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17303 // have already handled any direct blends.
17304 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17305 }
17306
17307 // Try to create an in-lane repeating shuffle mask and then shuffle the
17308 // results into the target lanes.
17309 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17310 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17311 return V;
17312
17313 // If we have a single input shuffle with different shuffle patterns in the
17314 // two 128-bit lanes use the variable mask to VPERMILPS.
17315 if (V2.isUndef()) {
17316 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17317 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17318 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17319 }
17320 if (Subtarget.hasAVX2()) {
17321 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17322 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17323 }
17324 // Otherwise, fall back.
17325 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17326 DAG, Subtarget);
17327 }
17328
17329 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17330 // shuffle.
17331 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17332 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17333 return Result;
17334
17335 // If we have VLX support, we can use VEXPAND.
17336 if (Subtarget.hasVLX())
17337 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
17338 Zeroable, Subtarget, DAG))
17339 return V;
17340
17341 // Try to match an interleave of two v8f32s and lower them as unpck and
17342 // permutes using ymms. This needs to go before we try to split the vectors.
17343 // Don't attempt on AVX1 if we're likely to split vectors anyway.
17344 if ((Subtarget.hasAVX2() ||
17345 !(isFreeToSplitVector(peekThroughBitcasts(V1), DAG) ||
17346 isFreeToSplitVector(peekThroughBitcasts(V2), DAG))) &&
17347 !Subtarget.hasAVX512())
17348 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
17349 Mask, DAG))
17350 return V;
17351
17352 // For non-AVX512, if the mask is equivalent to an in-lane 16-bit unpack,
17353 // try to split, since after the split we get more efficient code using
17354 // vpunpcklwd and vpunpckhwd than we would from vblend.
17355 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17356 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17357 Subtarget, DAG);
17358
17359 // If we have AVX2 then we always want to lower with a blend because at v8 we
17360 // can fully permute the elements.
17361 if (Subtarget.hasAVX2())
17362 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17363 Zeroable, Subtarget, DAG);
17364
17365 // Otherwise fall back on generic lowering.
17366 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17367 Subtarget, DAG);
17368}
17369
17370/// Handle lowering of 8-lane 32-bit integer shuffles.
17371///
17372/// This routine is only called when we have AVX2 and thus a reasonable
17373/// instruction set for v8i32 shuffling.
17374static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17375 const APInt &Zeroable, SDValue V1, SDValue V2,
17376 const X86Subtarget &Subtarget,
17377 SelectionDAG &DAG) {
17378 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17379 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17380 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17381 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17382
17383 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
17384
17385 // Whenever we can lower this as a zext, that instruction is strictly faster
17386 // than any alternative. It also allows us to fold memory operands into the
17387 // shuffle in many cases.
17388 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17389 Zeroable, Subtarget, DAG))
17390 return ZExt;
17391
17392 // Try to match an interleave of two v8i32s and lower them as unpck and
17393 // permutes using ymms. This needs to go before we try to split the vectors.
17394 if (!Subtarget.hasAVX512())
17395 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
17396 Mask, DAG))
17397 return V;
17398
17399 // For non-AVX512, if the mask is equivalent to an in-lane 16-bit unpack,
17400 // try to split, since after the split we get more efficient code than
17401 // vblend by using vpunpcklwd and vpunpckhwd.
17402 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
17403 !Subtarget.hasAVX512())
17404 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17405 Subtarget, DAG);
17406
17407 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17408 Zeroable, Subtarget, DAG))
17409 return Blend;
17410
17411 // Check for being able to broadcast a single element.
17412 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17413 Subtarget, DAG))
17414 return Broadcast;
17415
17416 // Try to use shift instructions if fast.
17417 if (Subtarget.preferLowerShuffleAsShift()) {
17418 if (SDValue Shift =
17419 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17420 Subtarget, DAG, /*BitwiseOnly*/ true))
17421 return Shift;
17422 if (NumV2Elements == 0)
17423 if (SDValue Rotate =
17424 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17425 return Rotate;
17426 }
17427
17428 // If the shuffle mask is repeated in each 128-bit lane we can use more
17429 // efficient instructions that mirror the shuffles across the two 128-bit
17430 // lanes.
17431 SmallVector<int, 4> RepeatedMask;
17432 bool Is128BitLaneRepeatedShuffle =
17433 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17434 if (Is128BitLaneRepeatedShuffle) {
17435 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17436 if (V2.isUndef())
17437 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17438 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17439
17440 // Use dedicated unpack instructions for masks that match their pattern.
17441 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
17442 return V;
17443 }
17444
17445 // Try to use shift instructions.
17446 if (SDValue Shift =
17447 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
17448 DAG, /*BitwiseOnly*/ false))
17449 return Shift;
17450
17451 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17452 if (SDValue Rotate =
17453 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17454 return Rotate;
17455
17456 // If we have VLX support, we can use VALIGN or EXPAND.
17457 if (Subtarget.hasVLX()) {
17458 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17459 Zeroable, Subtarget, DAG))
17460 return Rotate;
17461
17462 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
17463 Zeroable, Subtarget, DAG))
17464 return V;
17465 }
17466
17467 // Try to use byte rotation instructions.
17468 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17469 Subtarget, DAG))
17470 return Rotate;
17471
17472 // Try to create an in-lane repeating shuffle mask and then shuffle the
17473 // results into the target lanes.
17474 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17475 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17476 return V;
17477
17478 if (V2.isUndef()) {
17479 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17480 // because that should be faster than the variable permute alternatives.
17481 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
17482 return V;
17483
17484 // If the shuffle patterns aren't repeated but it's a single input, directly
17485 // generate a cross-lane VPERMD instruction.
17486 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17487 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17488 }
17489
17490 // Assume that a single SHUFPS is faster than an alternative sequence of
17491 // multiple instructions (even if the CPU has a domain penalty).
17492 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17493 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17494 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17495 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17496 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17497 CastV1, CastV2, DAG);
17498 return DAG.getBitcast(MVT::v8i32, ShufPS);
17499 }
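 // Worked example: the repeated mask {0,2,4,6} takes its two low result
 // elements from V1 and its two high ones from V2, which fits a single
 // shufps; even with the two domain-crossing bitcasts this beats a
 // multi-instruction integer-domain sequence.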
17500
17501 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17502 // shuffle.
17503 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17504 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17505 return Result;
17506
17507 // Otherwise fall back on generic blend lowering.
17508 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17509 Zeroable, Subtarget, DAG);
17510}
17511
17512/// Handle lowering of 16-lane 16-bit integer shuffles.
17513///
17514/// This routine is only called when we have AVX2 and thus a reasonable
17515/// instruction set for v16i16 shuffling.
17516static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17517 const APInt &Zeroable, SDValue V1, SDValue V2,
17518 const X86Subtarget &Subtarget,
17519 SelectionDAG &DAG) {
17520 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17521 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17522 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17523 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17524
17525 // Whenever we can lower this as a zext, that instruction is strictly faster
17526 // than any alternative. It also allows us to fold memory operands into the
17527 // shuffle in many cases.
17528 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17529 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17530 return ZExt;
17531
17532 // Check for being able to broadcast a single element.
17533 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17534 Subtarget, DAG))
17535 return Broadcast;
17536
17537 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17538 Zeroable, Subtarget, DAG))
17539 return Blend;
17540
17541 // Use dedicated unpack instructions for masks that match their pattern.
17542 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
17543 return V;
17544
17545 // Use dedicated pack instructions for masks that match their pattern.
17546 if (SDValue V =
17547 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17548 return V;
17549
17550 // Try to lower using a truncation.
17551 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17552 Subtarget, DAG))
17553 return V;
17554
17555 // Try to use shift instructions.
17556 if (SDValue Shift =
17557 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17558 Subtarget, DAG, /*BitwiseOnly*/ false))
17559 return Shift;
17560
17561 // Try to use byte rotation instructions.
17562 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17563 Subtarget, DAG))
17564 return Rotate;
17565
17566 // Try to create an in-lane repeating shuffle mask and then shuffle the
17567 // results into the target lanes.
17568 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17569 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17570 return V;
17571
17572 if (V2.isUndef()) {
17573 // Try to use bit rotation instructions.
17574 if (SDValue Rotate =
17575 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17576 return Rotate;
17577
17578 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17579 // because that should be faster than the variable permute alternatives.
17580 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17581 return V;
17582
17583 // There are no generalized cross-lane shuffle operations available on i16
17584 // element types.
17585 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17586 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17587 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17588 return V;
17589
17590 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17591 DAG, Subtarget);
17592 }
17593
17594 SmallVector<int, 8> RepeatedMask;
17595 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17596 // As this is a single-input shuffle, the repeated mask should be
17597 // a strictly valid v8i16 mask that we can pass through to the v8i16
17598 // lowering to handle even the v16 case.
17599 return lowerV8I16GeneralSingleInputShuffle(
17600 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17601 }
17602 }
17603
17604 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17605 Zeroable, Subtarget, DAG))
17606 return PSHUFB;
17607
17608 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17609 if (Subtarget.hasBWI())
17610 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17611
17612 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17613 // shuffle.
17614 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17615 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17616 return Result;
17617
17618 // Try to permute the lanes and then use a per-lane permute.
17619 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17620 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17621 return V;
17622
17623 // Try to match an interleave of two v16i16s and lower them as unpck and
17624 // permutes using ymms.
17625 if (!Subtarget.hasAVX512())
17626 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17627 Mask, DAG))
17628 return V;
17629
17630 // Otherwise fall back on generic lowering.
17631 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17632 Subtarget, DAG);
17633}
17634
17635/// Handle lowering of 32-lane 8-bit integer shuffles.
17636///
17637/// This routine is only called when we have AVX2 and thus a reasonable
17638/// instruction set for v32i8 shuffling.
17639static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17640 const APInt &Zeroable, SDValue V1, SDValue V2,
17641 const X86Subtarget &Subtarget,
17642 SelectionDAG &DAG) {
17643 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17644 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17645 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17646 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17647
17648 // Whenever we can lower this as a zext, that instruction is strictly faster
17649 // than any alternative. It also allows us to fold memory operands into the
17650 // shuffle in many cases.
17651 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17652 Zeroable, Subtarget, DAG))
17653 return ZExt;
17654
17655 // Check for being able to broadcast a single element.
17656 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17657 Subtarget, DAG))
17658 return Broadcast;
17659
17660 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17661 Zeroable, Subtarget, DAG))
17662 return Blend;
17663
17664 // Use dedicated unpack instructions for masks that match their pattern.
17665 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17666 return V;
17667
17668 // Use dedicated pack instructions for masks that match their pattern.
17669 if (SDValue V =
17670 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17671 return V;
17672
17673 // Try to lower using a truncation.
17674 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17675 Subtarget, DAG))
17676 return V;
17677
17678 // Try to use shift instructions.
17679 if (SDValue Shift =
17680 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17681 DAG, /*BitwiseOnly*/ false))
17682 return Shift;
17683
17684 // Try to use byte rotation instructions.
17685 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17686 Subtarget, DAG))
17687 return Rotate;
17688
17689 // Try to use bit rotation instructions.
17690 if (V2.isUndef())
17691 if (SDValue Rotate =
17692 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17693 return Rotate;
17694
17695 // Try to create an in-lane repeating shuffle mask and then shuffle the
17696 // results into the target lanes.
17697 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17698 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17699 return V;
17700
17701 // There are no generalized cross-lane shuffle operations available on i8
17702 // element types.
17703 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17704 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17705 // because that should be faster than the variable permute alternatives.
17706 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17707 return V;
17708
17709 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17710 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17711 return V;
17712
17713 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17714 DAG, Subtarget);
17715 }
17716
17717 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17718 Zeroable, Subtarget, DAG))
17719 return PSHUFB;
17720
17721 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17722 if (Subtarget.hasVBMI())
17723 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17724
17725 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17726 // shuffle.
17727 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17728 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17729 return Result;
17730
17731 // Try to permute the lanes and then use a per-lane permute.
17732 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17733 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17734 return V;
17735
17736 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17737 // by zeroable elements in the remaining 24 elements. Turn this into two
17738 // vmovqb instructions shuffled together.
17739 if (Subtarget.hasVLX())
17740 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17741 Mask, Zeroable, DAG))
17742 return V;
17743
17744 // Try to match an interleave of two v32i8s and lower them as unpck and
17745 // permutes using ymms.
17746 if (!Subtarget.hasAVX512())
17747 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17748 Mask, DAG))
17749 return V;
17750
17751 // Otherwise fall back on generic lowering.
17752 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17753 Subtarget, DAG);
17754}
17755
17756/// High-level routine to lower various 256-bit x86 vector shuffles.
17757///
17758/// This routine either breaks down the specific type of a 256-bit x86 vector
17759/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17760/// together based on the available instructions.
17761static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17762 SDValue V1, SDValue V2, const APInt &Zeroable,
17763 const X86Subtarget &Subtarget,
17764 SelectionDAG &DAG) {
17765 // If we have a single input to the zero element, insert that into V1 if we
17766 // can do so cheaply.
17767 int NumElts = VT.getVectorNumElements();
17768 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17769
17770 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17771 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17772 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17773 return Insertion;
17774
17775 // Handle special cases where the lower or upper half is UNDEF.
17776 if (SDValue V =
17777 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17778 return V;
17779
17780 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17781 // can check for those subtargets here and avoid much of the subtarget
17782 // querying in the per-vector-type lowering routines. With AVX1 we have
17783 // essentially *zero* ability to manipulate a 256-bit vector with integer
17784 // types. Since we'll use floating point types there eventually, just
17785 // immediately cast everything to a float and operate entirely in that domain.
17786 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17787 int ElementBits = VT.getScalarSizeInBits();
17788 if (ElementBits < 32) {
17789 // No floating point type available, if we can't use the bit operations
17790 // for masking/blending then decompose into 128-bit vectors.
17791 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17792 Subtarget, DAG))
17793 return V;
17794 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17795 return V;
17796 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17797 }
17798
17799 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17800 VT.getVectorNumElements());
17801 V1 = DAG.getBitcast(FpVT, V1);
17802 V2 = DAG.getBitcast(FpVT, V2);
17803 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17804 }
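 // Worked example: on AVX1-only targets a v8i32 shuffle is bitcast to v8f32
 // and handled by the floating-point lowering (vshufps/vpermilps and
 // friends), while a v16i16 shuffle, which has no 256-bit FP twin, is split
 // into two 128-bit halves unless a bit mask or bit blend suffices.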
17805
17806 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17807 V1 = DAG.getBitcast(MVT::v16i16, V1);
17808 V2 = DAG.getBitcast(MVT::v16i16, V2);
17809 return DAG.getBitcast(VT,
17810 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17811 }
17812
17813 switch (VT.SimpleTy) {
17814 case MVT::v4f64:
17815 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17816 case MVT::v4i64:
17817 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17818 case MVT::v8f32:
17819 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17820 case MVT::v8i32:
17821 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17822 case MVT::v16i16:
17823 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17824 case MVT::v32i8:
17825 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17826
17827 default:
17828 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17829 }
17830}
17831
17832/// Try to lower a vector shuffle as a 128-bit shuffles.
17833static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17834 const APInt &Zeroable, SDValue V1, SDValue V2,
17835 const X86Subtarget &Subtarget,
17836 SelectionDAG &DAG) {
17837 assert(VT.getScalarSizeInBits() == 64 &&
17838 "Unexpected element type size for 128bit shuffle.");
17839
17840 // Handling a 256-bit vector would require VLX, and most probably
17841 // lowerV2X128VectorShuffle() would be the better solution there.
17842 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17843
17844 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17845 SmallVector<int, 4> Widened128Mask;
17846 if (!canWidenShuffleElements(Mask, Widened128Mask))
17847 return SDValue();
17848 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17849
17850 // Try to use an insert into a zero vector.
17851 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17852 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17853 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17854 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17855 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17856 DAG.getVectorIdxConstant(0, DL));
17857 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17858 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17859 DAG.getVectorIdxConstant(0, DL));
17860 }
17861
17862 // Check for patterns which can be matched with a single insert of a 256-bit
17863 // subvector.
17864 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17865 if (OnlyUsesV1 ||
17866 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17867 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17868 SDValue SubVec =
17869 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17870 DAG.getVectorIdxConstant(0, DL));
17871 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17872 DAG.getVectorIdxConstant(4, DL));
17873 }
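 // Worked example: for v8i64 the mask <0,1,2,3,8,9,10,11> keeps the low 256
 // bits of V1 and inserts the low 256 bits of V2 as the upper half (a single
 // vinserti64x4); <0,1,2,3,0,1,2,3> is the same pattern with V1 supplying
 // both halves.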
17874
17875 // See if this is an insertion of the lower 128-bits of V2 into V1.
17876 bool IsInsert = true;
17877 int V2Index = -1;
17878 for (int i = 0; i < 4; ++i) {
17879 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17880 if (Widened128Mask[i] < 0)
17881 continue;
17882
17883 // Make sure all V1 subvectors are in place.
17884 if (Widened128Mask[i] < 4) {
17885 if (Widened128Mask[i] != i) {
17886 IsInsert = false;
17887 break;
17888 }
17889 } else {
17890 // Make sure we only have a single V2 index and it's the lowest 128-bits.
17891 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17892 IsInsert = false;
17893 break;
17894 }
17895 V2Index = i;
17896 }
17897 }
17898 if (IsInsert && V2Index >= 0) {
17899 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17900 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17901 DAG.getVectorIdxConstant(0, DL));
17902 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17903 }
17904
17905 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17906 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17907 // widening where possible we at least ensure the lanes stay sequential to
17908 // help later combines.
17909 SmallVector<int, 2> Widened256Mask;
17910 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17911 Widened128Mask.clear();
17912 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17913 }
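 // e.g. a Widened128Mask of <2,-1,0,1> widens to the 256-bit mask <1,0> and
 // re-narrows to <2,3,0,1>: the undef slot is concretized so each 256-bit
 // half stays sequential for later combines.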
17914
17915 // Try to lower to vshuf64x2/vshuf32x4.
17916 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17917 int PermMask[4] = {-1, -1, -1, -1};
17918 // Ensure elements came from the same Op.
17919 for (int i = 0; i < 4; ++i) {
17920 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17921 if (Widened128Mask[i] < 0)
17922 continue;
17923
17924 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17925 unsigned OpIndex = i / 2;
17926 if (Ops[OpIndex].isUndef())
17927 Ops[OpIndex] = Op;
17928 else if (Ops[OpIndex] != Op)
17929 return SDValue();
17930
17931 PermMask[i] = Widened128Mask[i] % 4;
17932 }
17933
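 // Worked example: the v8i64 mask <4,5,0,1,12,13,8,9> widens to <2,0,6,4>;
 // the low result half reads chunks {2,0} of V1 and the high half chunks
 // {2,0} of V2, so this becomes SHUF128(V1, V2) with immediate 0x22.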
17934 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17935 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17936}
17937
17938/// Handle lowering of 8-lane 64-bit floating point shuffles.
17939static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17940 const APInt &Zeroable, SDValue V1, SDValue V2,
17941 const X86Subtarget &Subtarget,
17942 SelectionDAG &DAG) {
17943 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17944 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17945 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17946
17947 if (V2.isUndef()) {
17948 // Use low duplicate instructions for masks that match their pattern.
17949 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17950 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17951
17952 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17953 // Non-half-crossing single input shuffles can be lowered with an
17954 // interleaved permutation.
17955 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17956 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17957 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17958 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17959 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17960 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17961 }
17962
17963 SmallVector<int, 4> RepeatedMask;
17964 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17965 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17966 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17967 }
17968
17969 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17970 V2, Subtarget, DAG))
17971 return Shuf128;
17972
17973 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17974 return Unpck;
17975
17976 // Check if the blend happens to exactly fit that of SHUFPD.
17977 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17978 Zeroable, Subtarget, DAG))
17979 return Op;
17980
17981 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17982 Subtarget, DAG))
17983 return V;
17984
17985 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17986 Zeroable, Subtarget, DAG))
17987 return Blend;
17988
17989 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17990}
17991
17992/// Handle lowering of 16-lane 32-bit floating point shuffles.
17993static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17994 const APInt &Zeroable, SDValue V1, SDValue V2,
17995 const X86Subtarget &Subtarget,
17996 SelectionDAG &DAG) {
17997 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17998 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17999 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18000
18001 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18002 // options to efficiently lower the shuffle.
18003 SmallVector<int, 4> RepeatedMask;
18004 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18005 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18006
18007 // Use even/odd duplicate instructions for masks that match their pattern.
18008 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18009 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18010 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18011 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18012
18013 if (V2.isUndef())
18014 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18015 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18016
18017 // Use dedicated unpack instructions for masks that match their pattern.
18018 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
18019 return V;
18020
18021 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18022 Zeroable, Subtarget, DAG))
18023 return Blend;
18024
18025 // Otherwise, fall back to a SHUFPS sequence.
18026 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18027 }
18028
18029 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18030 Zeroable, Subtarget, DAG))
18031 return Blend;
18032
18033 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18034 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18035 return DAG.getBitcast(MVT::v16f32, ZExt);
18036
18037 // Try to create an in-lane repeating shuffle mask and then shuffle the
18038 // results into the target lanes.
18039 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18040 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18041 return V;
18042
18043 // If we have a single input shuffle with different shuffle patterns in the
18044 // 128-bit lanes and no lane crossing, use a variable mask VPERMILPS.
18045 if (V2.isUndef() &&
18046 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18047 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18048 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18049 }
18050
18051 // If we have AVX512F support, we can use VEXPAND.
18052 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
18053 Zeroable, Subtarget, DAG))
18054 return V;
18055
18056 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18057}
18058
18059/// Handle lowering of 8-lane 64-bit integer shuffles.
18060static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18061 const APInt &Zeroable, SDValue V1, SDValue V2,
18062 const X86Subtarget &Subtarget,
18063 SelectionDAG &DAG) {
18064 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18065 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18066 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18067
18068 // Try to use shift instructions if fast.
18069 if (Subtarget.preferLowerShuffleAsShift())
18070 if (SDValue Shift =
18071 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18072 Subtarget, DAG, /*BitwiseOnly*/ true))
18073 return Shift;
18074
18075 if (V2.isUndef()) {
18076 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
18077 // can use lower-latency instructions that operate on all four 128-bit
18078 // lanes.
18079 SmallVector<int, 2> Repeated128Mask;
18080 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18081 SmallVector<int, 4> PSHUFDMask;
18082 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18083 return DAG.getBitcast(
18084 MVT::v8i64,
18085 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18086 DAG.getBitcast(MVT::v16i32, V1),
18087 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18088 }
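 // Worked example: swapping the two i64 halves of every 128-bit lane
 // (Repeated128Mask <1,0>) narrows to the per-lane v16i32 mask <2,3,0,1>,
 // i.e. a single pshufd with immediate 0x4E on the bitcast vector.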
18089
18090 SmallVector<int, 4> Repeated256Mask;
18091 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18092 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18093 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18094 }
18095
18096 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18097 V2, Subtarget, DAG))
18098 return Shuf128;
18099
18100 // Try to use shift instructions.
18101 if (SDValue Shift =
18102 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
18103 DAG, /*BitwiseOnly*/ false))
18104 return Shift;
18105
18106 // Try to use VALIGN.
18107 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18108 Zeroable, Subtarget, DAG))
18109 return Rotate;
18110
18111 // Try to use PALIGNR.
18112 if (Subtarget.hasBWI())
18113 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18114 Subtarget, DAG))
18115 return Rotate;
18116
18117 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
18118 return Unpck;
18119
18120 // If we have AVX512F support, we can use VEXPAND.
18121 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18122 Subtarget, DAG))
18123 return V;
18124
18125 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18126 Zeroable, Subtarget, DAG))
18127 return Blend;
18128
18129 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18130}
18131
18132/// Handle lowering of 16-lane 32-bit integer shuffles.
18133static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18134 const APInt &Zeroable, SDValue V1, SDValue V2,
18135 const X86Subtarget &Subtarget,
18136 SelectionDAG &DAG) {
18137 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18138 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18139 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18140
18141 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
18142
18143 // Whenever we can lower this as a zext, that instruction is strictly faster
18144 // than any alternative. It also allows us to fold memory operands into the
18145 // shuffle in many cases.
18146 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18147 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18148 return ZExt;
18149
18150 // Try to use shift instructions if fast.
18151 if (Subtarget.preferLowerShuffleAsShift()) {
18152 if (SDValue Shift =
18153 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18154 Subtarget, DAG, /*BitwiseOnly*/ true))
18155 return Shift;
18156 if (NumV2Elements == 0)
18157 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
18158 Subtarget, DAG))
18159 return Rotate;
18160 }
18161
18162 // If the shuffle mask is repeated in each 128-bit lane we can use more
18163 // efficient instructions that mirror the shuffles across the four 128-bit
18164 // lanes.
18165 SmallVector<int, 4> RepeatedMask;
18166 bool Is128BitLaneRepeatedShuffle =
18167 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18168 if (Is128BitLaneRepeatedShuffle) {
18169 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18170 if (V2.isUndef())
18171 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18172 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18173
18174 // Use dedicated unpack instructions for masks that match their pattern.
18175 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
18176 return V;
18177 }
18178
18179 // Try to use shift instructions.
18180 if (SDValue Shift =
18181 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18182 Subtarget, DAG, /*BitwiseOnly*/ false))
18183 return Shift;
18184
18185 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18186 if (SDValue Rotate =
18187 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
18188 return Rotate;
18189
18190 // Try to use VALIGN.
18191 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18192 Zeroable, Subtarget, DAG))
18193 return Rotate;
18194
18195 // Try to use byte rotation instructions.
18196 if (Subtarget.hasBWI())
18197 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18198 Subtarget, DAG))
18199 return Rotate;
18200
18201 // Assume that a single SHUFPS is faster than using a permv shuffle.
18202 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18203 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18204 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18205 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18206 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18207 CastV1, CastV2, DAG);
18208 return DAG.getBitcast(MVT::v16i32, ShufPS);
18209 }
18210
18211 // Try to create an in-lane repeating shuffle mask and then shuffle the
18212 // results into the target lanes.
18213 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18214 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18215 return V;
18216
18217 // If we have AVX512F support, we can use VEXPAND.
18218 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
18219 Zeroable, Subtarget, DAG))
18220 return V;
18221
18222 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18223 Zeroable, Subtarget, DAG))
18224 return Blend;
18225
18226 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18227}
18228
18229/// Handle lowering of 32-lane 16-bit integer shuffles.
18230static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18231 const APInt &Zeroable, SDValue V1, SDValue V2,
18232 const X86Subtarget &Subtarget,
18233 SelectionDAG &DAG) {
18234 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18235 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18236 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18237 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18238
18239 // Whenever we can lower this as a zext, that instruction is strictly faster
18240 // than any alternative. It also allows us to fold memory operands into the
18241 // shuffle in many cases.
18242 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18243 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18244 return ZExt;
18245
18246 // Use dedicated unpack instructions for masks that match their pattern.
18247 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
18248 return V;
18249
18250 // Use dedicated pack instructions for masks that match their pattern.
18251 if (SDValue V =
18252 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18253 return V;
18254
18255 // Try to use shift instructions.
18256 if (SDValue Shift =
18257 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
18258 Subtarget, DAG, /*BitwiseOnly*/ false))
18259 return Shift;
18260
18261 // Try to use byte rotation instructions.
18262 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18263 Subtarget, DAG))
18264 return Rotate;
18265
18266 if (V2.isUndef()) {
18267 // Try to use bit rotation instructions.
18268 if (SDValue Rotate =
18269 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18270 return Rotate;
18271
18272 SmallVector<int, 8> RepeatedMask;
18273 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18274 // As this is a single-input shuffle, the repeated mask should be
18275 // a strictly valid v8i16 mask that we can pass through to the v8i16
18276 // lowering to handle even the v32 case.
18277 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18278 RepeatedMask, Subtarget, DAG);
18279 }
18280 }
18281
18282 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18283 Zeroable, Subtarget, DAG))
18284 return Blend;
18285
18286 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18287 Zeroable, Subtarget, DAG))
18288 return PSHUFB;
18289
18290 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18291 // shuffle.
18292 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18293 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18294 return Result;
18295
18296 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18297}
18298
18299/// Handle lowering of 64-lane 8-bit integer shuffles.
18300static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18301 const APInt &Zeroable, SDValue V1, SDValue V2,
18302 const X86Subtarget &Subtarget,
18303 SelectionDAG &DAG) {
18304 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18305 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18306 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18307 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18308
18309 // Whenever we can lower this as a zext, that instruction is strictly faster
18310 // than any alternative. It also allows us to fold memory operands into the
18311 // shuffle in many cases.
18312 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18313 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18314 return ZExt;
18315
18316 // Use dedicated unpack instructions for masks that match their pattern.
18317 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
18318 return V;
18319
18320 // Use dedicated pack instructions for masks that match their pattern.
18321 if (SDValue V =
18322 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18323 return V;
18324
18325 // Try to use shift instructions.
18326 if (SDValue Shift =
18327 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
18328 DAG, /*BitwiseOnly*/ false))
18329 return Shift;
18330
18331 // Try to use byte rotation instructions.
18332 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18333 Subtarget, DAG))
18334 return Rotate;
18335
18336 // Try to use bit rotation instructions.
18337 if (V2.isUndef())
18338 if (SDValue Rotate =
18339 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18340 return Rotate;
18341
18342 // Lower as AND if possible.
18343 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18344 Zeroable, Subtarget, DAG))
18345 return Masked;
18346
18347 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18348 Zeroable, Subtarget, DAG))
18349 return PSHUFB;
18350
18351 // Try to create an in-lane repeating shuffle mask and then shuffle the
18352 // results into the target lanes.
18353 // FIXME: Avoid on VBMI targets as the post lane permute often interferes
18354 // with shuffle combining (should be fixed by topological DAG sorting).
18355 if (!Subtarget.hasVBMI())
18356 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18357 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18358 return V;
18359
18360 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
18361 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18362 return Result;
18363
18364 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18365 Zeroable, Subtarget, DAG))
18366 return Blend;
18367
18368 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18369 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18370 // PALIGNR will be cheaper than the second PSHUFB+OR.
18371 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18372 Mask, Subtarget, DAG))
18373 return V;
18374
18375 // VBMI can use VPERMV/VPERMV3 byte shuffles more efficiently than
18376 // OR(PSHUFB,PSHUFB).
18377 if (Subtarget.hasVBMI())
18378 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget,
18379 DAG);
18380
18381 // If we can't directly blend but can use PSHUFB, that will be better as it
18382 // can both shuffle and set up the inefficient blend.
18383 bool V1InUse, V2InUse;
18384 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18385 DAG, V1InUse, V2InUse);
18386 }
18387
18388 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18389 // shuffle.
18390 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18391 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18392 return Result;
18393
18394 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18395 if (Subtarget.hasVBMI())
18396 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18397
18398 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
18399 /*SimpleOnly*/ false);
18400}
18401
18402/// High-level routine to lower various 512-bit x86 vector shuffles.
18403///
18404/// This routine either breaks down the specific type of a 512-bit x86 vector
18405/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18406/// together based on the available instructions.
18407static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18408 MVT VT, SDValue V1, SDValue V2,
18409 const APInt &Zeroable,
18410 const X86Subtarget &Subtarget,
18411 SelectionDAG &DAG) {
18412 assert(Subtarget.hasAVX512() &&
18413 "Cannot lower 512-bit vectors w/ basic ISA!");
18414
18415 // If we have a single input to the zero element, insert that into V1 if we
18416 // can do so cheaply.
18417 int NumElts = Mask.size();
18418 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18419
18420 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18421 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18422 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18423 return Insertion;
18424
18425 // Handle special cases where the lower or upper half is UNDEF.
18426 if (SDValue V =
18427 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18428 return V;
18429
18430 // Check for being able to broadcast a single element.
18431 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18432 Subtarget, DAG))
18433 return Broadcast;
18434
18435 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18436 // Try using bit ops for masking and blending before falling back to
18437 // splitting.
18438 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18439 Subtarget, DAG))
18440 return V;
18441 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18442 return V;
18443
18444 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18445 }
18446
18447 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18448 if (!Subtarget.hasBWI())
18449 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
18450 /*SimpleOnly*/ false);
18451
18452 V1 = DAG.getBitcast(MVT::v32i16, V1);
18453 V2 = DAG.getBitcast(MVT::v32i16, V2);
18454 return DAG.getBitcast(VT,
18455 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18456 }
18457
18458 // Dispatch to each element type for lowering. If we don't have support for
18459 // specific element type shuffles at 512 bits, immediately split them and
18460 // lower them. Each lowering routine of a given type is allowed to assume that
18461 // the requisite ISA extensions for that element type are available.
18462 switch (VT.SimpleTy) {
18463 case MVT::v8f64:
18464 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18465 case MVT::v16f32:
18466 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18467 case MVT::v8i64:
18468 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18469 case MVT::v16i32:
18470 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18471 case MVT::v32i16:
18472 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18473 case MVT::v64i8:
18474 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18475
18476 default:
18477 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18478 }
18479}
18480
18481static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18482 MVT VT, SDValue V1, SDValue V2,
18483 const X86Subtarget &Subtarget,
18484 SelectionDAG &DAG) {
18485 // Shuffle should be unary.
18486 if (!V2.isUndef())
18487 return SDValue();
18488
18489 int ShiftAmt = -1;
18490 int NumElts = Mask.size();
18491 for (int i = 0; i != NumElts; ++i) {
18492 int M = Mask[i];
18493 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18494 "Unexpected mask index.");
18495 if (M < 0)
18496 continue;
18497
18498 // The first non-undef element determines our shift amount.
18499 if (ShiftAmt < 0) {
18500 ShiftAmt = M - i;
18501 // Need to be shifting right.
18502 if (ShiftAmt <= 0)
18503 return SDValue();
18504 }
18505 // All non-undef elements must shift by the same amount.
18506 if (ShiftAmt != M - i)
18507 return SDValue();
18508 }
18509 assert(ShiftAmt >= 0 && "All undef?");
18510
18511 // Great, we found a right shift.
18512 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
18513 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
18514 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18515 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18516 DAG.getVectorIdxConstant(0, DL));
18517}
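// Worked example: the v8i1 mask <2,3,4,5,6,7,-1,-1> has a uniform positive
// offset of 2, so after widening the mask vector to a legal width this lowers
// to a single kshiftr; the top two elements are undef, so the zeroes shifted
// in are fine.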
18518
18519// Determine if this shuffle can be implemented with a KSHIFT instruction.
18520// Returns the shift amount if possible or -1 if not. This is a simplified
18521// version of matchShuffleAsShift.
18522static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18523 int MaskOffset, const APInt &Zeroable) {
18524 int Size = Mask.size();
18525
18526 auto CheckZeros = [&](int Shift, bool Left) {
18527 for (int j = 0; j < Shift; ++j)
18528 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18529 return false;
18530
18531 return true;
18532 };
18533
18534 auto MatchShift = [&](int Shift, bool Left) {
18535 unsigned Pos = Left ? Shift : 0;
18536 unsigned Low = Left ? 0 : Shift;
18537 unsigned Len = Size - Shift;
18538 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18539 };
18540
18541 for (int Shift = 1; Shift != Size; ++Shift)
18542 for (bool Left : {true, false})
18543 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18544 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18545 return Shift;
18546 }
18547
18548 return -1;
18549}
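// Worked example: with MaskOffset = 0, the mask <-1,-1,0,1,2,3,4,5> whose low
// two elements are zeroable matches KSHIFTL with Shift = 2; the zeroable
// prefix absorbs exactly the bits that the left shift fills with zeroes.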
18550
18551
18552// Lower vXi1 vector shuffles.
18553// There is no dedicated instruction on AVX-512 that shuffles the masks.
18554// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18555// vector, shuffle that, and then truncate it back.
18556static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18557 MVT VT, SDValue V1, SDValue V2,
18558 const APInt &Zeroable,
18559 const X86Subtarget &Subtarget,
18560 SelectionDAG &DAG) {
18561 assert(Subtarget.hasAVX512() &&
18562 "Cannot lower 512-bit vectors w/o basic ISA!");
18563
18564 int NumElts = Mask.size();
18565 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18566
18567 // Try to recognize shuffles that are just padding a subvector with zeros.
18568 int SubvecElts = 0;
18569 int Src = -1;
18570 for (int i = 0; i != NumElts; ++i) {
18571 if (Mask[i] >= 0) {
18572 // Grab the source from the first valid mask. All subsequent elements need
18573 // to use this same source.
18574 if (Src < 0)
18575 Src = Mask[i] / NumElts;
18576 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18577 break;
18578 }
18579
18580 ++SubvecElts;
18581 }
18582 assert(SubvecElts != NumElts && "Identity shuffle?");
18583
18584 // Clip to a power of 2.
18585 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18586
18587 // Make sure the number of zeroable bits in the top at least covers the bits
18588 // not covered by the subvector.
18589 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18590 assert(Src >= 0 && "Expected a source!");
18591 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18592 SDValue Extract =
18593 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18594 DAG.getVectorIdxConstant(0, DL));
18595 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18596 DAG.getConstant(0, DL, VT), Extract,
18597 DAG.getVectorIdxConstant(0, DL));
18598 }
18599
18600 // Try a simple shift right with undef elements. Later we'll try with zeros.
18601 if (SDValue Shift =
18602 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18603 return Shift;
18604
18605 // Try to match KSHIFTs.
18606 unsigned Offset = 0;
18607 for (SDValue V : {V1, V2}) {
18608 unsigned Opcode;
18609 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18610 if (ShiftAmt >= 0) {
18611 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18612 MVT WideVT = Res.getSimpleValueType();
18613 // Widened right shifts need two shifts to ensure we shift in zeroes.
18614 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18615 int WideElts = WideVT.getVectorNumElements();
18616 // Shift left to put the original vector in the MSBs of the new size.
18617 Res =
18618 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18619 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18620 // Increase the shift amount to account for the left shift.
18621 ShiftAmt += WideElts - NumElts;
18622 }
18623
18624 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18625 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18626 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18627 DAG.getVectorIdxConstant(0, DL));
18628 }
18629 Offset += NumElts; // Increment for next iteration.
18630 }
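 // Illustrative sketch: if a v4i1 shuffle was widened (say to v16i1) and
 // needs a KSHIFTR by 1, the code above first emits KSHIFTL by 12 to park the
 // four live bits in the MSBs, then a single KSHIFTR by 13, so only zeroes
 // are shifted into the low result bits.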
18631
18632 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18633 // ops instead.
18634 // TODO: What other unary shuffles would benefit from this?
18635 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18636 SDValue Op0 = V1.getOperand(0);
18637 SDValue Op1 = V1.getOperand(1);
18638 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18639 EVT OpVT = Op0.getValueType();
18640 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18641 return DAG.getSetCC(
18642 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18643 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18644 }
18645
18646 // If this is a sequential shuffle with zeroed elements, lower to AND.
18647 bool IsBlendWithZero = all_of(enumerate(Mask), [&Zeroable](auto M) {
18648 return Zeroable[M.index()] || (M.value() == (int)M.index());
18649 });
18650 if (IsBlendWithZero) {
18651 const unsigned Width = std::max<unsigned>(NumElts, 8u);
18652 MVT IntVT = MVT::getIntegerVT(Width);
18653
18654 APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
18655 SDValue MaskNode = DAG.getConstant(MaskValue, DL, IntVT);
18656
18657 MVT MaskVecVT = MVT::getVectorVT(MVT::i1, Width);
18658 SDValue MaskVecNode = DAG.getBitcast(MaskVecVT, MaskNode);
18659
18660 SDValue MaskVec = DAG.getExtractSubvector(DL, VT, MaskVecNode, 0);
18661 return DAG.getNode(ISD::AND, DL, VT, V1, MaskVec);
18662 }
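 // Worked example: for v8i1 with elements 2, 3, 6 and 7 zeroable and the rest
 // identity, ~Zeroable is the i8 constant 0b00110011; bitcast to v8i1 it
 // becomes the AND mask that clears exactly the zeroable lanes (e.g. one
 // kandw).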
18663
18664 MVT ExtVT;
18665 switch (VT.SimpleTy) {
18666 default:
18667 llvm_unreachable("Expected a vector of i1 elements");
18668 case MVT::v2i1:
18669 ExtVT = MVT::v2i64;
18670 break;
18671 case MVT::v4i1:
18672 ExtVT = MVT::v4i32;
18673 break;
18674 case MVT::v8i1:
18675 // Take a 512-bit type; more shuffle options exist on KNL. If we have VLX, use a 256-bit
18676 // shuffle.
18677 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18678 break;
18679 case MVT::v16i1:
18680 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18681 // 256-bit operation available.
18682 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18683 break;
18684 case MVT::v32i1:
18685 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18686 // 256-bit operation available.
18687 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18688 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18689 break;
18690 case MVT::v64i1:
18691 // Fall back to scalarization. FIXME: We can do better if the shuffle
18692 // can be partitioned cleanly.
18693 if (!Subtarget.useBWIRegs())
18694 return SDValue();
18695 ExtVT = MVT::v64i8;
18696 break;
18697 }
18698
18699 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18700 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18701
18702 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18703 // The i1 elements were sign-extended, so comparing against zero converts back to a mask.
18704 int NumElems = VT.getVectorNumElements();
18705 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18706 (Subtarget.hasDQI() && (NumElems < 32)))
18707 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18708 Shuffle, ISD::SETGT);
18709
18710 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18711}
18712
18713/// Helper function that returns true if the shuffle mask should be
18714/// commuted to improve canonicalization.
18715static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18716 int NumElements = Mask.size();
18717
18718 int NumV1Elements = 0, NumV2Elements = 0;
18719 for (int M : Mask)
18720 if (M < 0)
18721 continue;
18722 else if (M < NumElements)
18723 ++NumV1Elements;
18724 else
18725 ++NumV2Elements;
18726
18727 // Commute the shuffle as needed such that more elements come from V1 than
18728 // V2. This allows us to match the shuffle pattern strictly on how many
18729 // elements come from V1 without handling the symmetric cases.
18730 if (NumV2Elements > NumV1Elements)
18731 return true;
18732
18733 assert(NumV1Elements > 0 && "No V1 indices");
18734
18735 if (NumV2Elements == 0)
18736 return false;
18737
18738 // When the number of V1 and V2 elements is the same, try to minimize the
18739 // number of uses of V2 in the low half of the vector. When that is tied,
18740 // ensure that the sum of indices for V1 is equal to or lower than the sum
18741 // of indices for V2. When those are equal, try to ensure that the number of
18742 // odd indices for V1 is lower than the number of odd indices for V2.
18743 if (NumV1Elements == NumV2Elements) {
18744 int LowV1Elements = 0, LowV2Elements = 0;
18745 for (int M : Mask.slice(0, NumElements / 2))
18746 if (M >= NumElements)
18747 ++LowV2Elements;
18748 else if (M >= 0)
18749 ++LowV1Elements;
18750 if (LowV2Elements > LowV1Elements)
18751 return true;
18752 if (LowV2Elements == LowV1Elements) {
18753 int SumV1Indices = 0, SumV2Indices = 0;
18754 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18755 if (Mask[i] >= NumElements)
18756 SumV2Indices += i;
18757 else if (Mask[i] >= 0)
18758 SumV1Indices += i;
18759 if (SumV2Indices < SumV1Indices)
18760 return true;
18761 if (SumV2Indices == SumV1Indices) {
18762 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18763 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18764 if (Mask[i] >= NumElements)
18765 NumV2OddIndices += i % 2;
18766 else if (Mask[i] >= 0)
18767 NumV1OddIndices += i % 2;
18768 if (NumV2OddIndices < NumV1OddIndices)
18769 return true;
18770 }
18771 }
18772 }
18773
18774 return false;
18775}
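// Worked example (editorial): for 4-element vectors, Mask = {4, 5, 6, 3}
// uses three elements of V2 and one of V1, so this returns true; the
// caller then swaps V1/V2 and rewrites the mask to {0, 1, 2, 7}, so later
// matching can assume most elements come from V1.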
18776
18777static bool canCombineAsMaskOperation(SDValue V,
18778 const X86Subtarget &Subtarget) {
18779 if (!Subtarget.hasAVX512())
18780 return false;
18781
18782 if (!V.getValueType().isSimple())
18783 return false;
18784
18785 MVT VT = V.getSimpleValueType().getScalarType();
18786 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18787 return false;
18788
18789 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18790 // are preferable to blendw/blendvb/masked-mov.
18791 if ((VT == MVT::i16 || VT == MVT::i8) &&
18792 V.getSimpleValueType().getSizeInBits() < 512)
18793 return false;
18794
18795 auto HasMaskOperation = [&](SDValue V) {
18796 // TODO: Currently we only check a limited set of opcodes. We could
18797 // probably extend this to all binary operations by checking TLI.isBinOp().
18798 switch (V->getOpcode()) {
18799 default:
18800 return false;
18801 case ISD::ADD:
18802 case ISD::SUB:
18803 case ISD::AND:
18804 case ISD::XOR:
18805 case ISD::OR:
18806 case ISD::SMAX:
18807 case ISD::SMIN:
18808 case ISD::UMAX:
18809 case ISD::UMIN:
18810 case ISD::ABS:
18811 case ISD::SHL:
18812 case ISD::SRL:
18813 case ISD::SRA:
18814 case ISD::MUL:
18815 break;
18816 }
18817 if (!V->hasOneUse())
18818 return false;
18819
18820 return true;
18821 };
18822
18823 if (HasMaskOperation(V))
18824 return true;
18825
18826 return false;
18827}
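// Example of the pattern this protects (editor's sketch): with AVX512, a
// one-use arithmetic input such as
//   %sum = add <16 x i32> %a, %b
//   %res = shufflevector %sum, zeroinitializer, <blend mask>
// can later be matched as a zero-masked vpaddd, so the shuffle is kept at
// its original element width instead of being widened away.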
18828
18829// Forward declaration.
18830static SDValue canonicalizeShuffleMaskWithHorizOp(
18831 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18832 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18833 const X86Subtarget &Subtarget);
18834
18835/// Top-level lowering for x86 vector shuffles.
18836///
18837/// This handles decomposition, canonicalization, and lowering of all x86
18838/// vector shuffles. Most of the specific lowering strategies are encapsulated
18839/// above in helper routines. The canonicalization attempts to widen shuffles
18840/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18841/// s.t. only one of the two inputs needs to be tested, etc.
18842static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18843 SelectionDAG &DAG) {
18844 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18845 ArrayRef<int> OrigMask = SVOp->getMask();
18846 SDValue V1 = Op.getOperand(0);
18847 SDValue V2 = Op.getOperand(1);
18848 MVT VT = Op.getSimpleValueType();
18849 int NumElements = VT.getVectorNumElements();
18850 SDLoc DL(Op);
18851 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18852
18853 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18854 "Can't lower MMX shuffles");
18855
18856 bool V1IsUndef = V1.isUndef();
18857 bool V2IsUndef = V2.isUndef();
18858 if (V1IsUndef && V2IsUndef)
18859 return DAG.getUNDEF(VT);
18860
18861 // When we create a shuffle node we put the UNDEF node as the second
18862 // operand, but in some cases the first operand may be transformed to UNDEF.
18863 // In this case we should just commute the node.
18864 if (V1IsUndef)
18865 return DAG.getCommutedVectorShuffle(*SVOp);
18866
18867 // Check for non-undef masks pointing at an undef vector and make the masks
18868 // undef as well. This makes it easier to match the shuffle based solely on
18869 // the mask.
18870 if (V2IsUndef &&
18871 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18872 SmallVector<int, 8> NewMask(OrigMask);
18873 for (int &M : NewMask)
18874 if (M >= NumElements)
18875 M = -1;
18876 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18877 }
18878
18879 // Check for illegal shuffle mask element index values.
18880 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18881 (void)MaskUpperLimit;
18882 assert(llvm::all_of(OrigMask,
18883 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18884 "Out of bounds shuffle index");
18885
18886 // We actually see shuffles that are entirely re-arrangements of a set of
18887 // zero inputs. This mostly happens while decomposing complex shuffles into
18888 // simple ones. Directly lower these as a buildvector of zeros.
18889 APInt KnownUndef, KnownZero;
18890 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18891
18892 APInt Zeroable = KnownUndef | KnownZero;
18893 if (Zeroable.isAllOnes())
18894 return getZeroVector(VT, Subtarget, DAG, DL);
18895
18896 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18897
18898 // Try to collapse shuffles into using a vector type with fewer elements but
18899 // wider element types. We cap this to not form integers or floating point
18900 // elements wider than 64 bits. It does not seem beneficial to form i128
18901 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18902 SmallVector<int, 16> WidenedMask;
18903 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18904 !canCombineAsMaskOperation(V1, Subtarget) &&
18905 !canCombineAsMaskOperation(V2, Subtarget) &&
18906 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18907 // Shuffle mask widening should not interfere with a broadcast opportunity
18908 // by obfuscating the operands with bitcasts.
18909 // TODO: Avoid lowering directly from this top-level function: make this
18910 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18911 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18912 Subtarget, DAG))
18913 return Broadcast;
18914
18915 MVT NewEltVT = VT.isFloatingPoint()
18916 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18917 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18918 int NewNumElts = NumElements / 2;
18919 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18920 // Make sure that the new vector type is legal. For example, v2f64 isn't
18921 // legal on SSE1.
18922 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18923 if (V2IsZero) {
18924 // Modify the new Mask to take all zeros from the all-zero vector.
18925 // Choose indices that are blend-friendly.
18926 bool UsedZeroVector = false;
18927 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18928 "V2's non-undef elements are used?!");
18929 for (int i = 0; i != NewNumElts; ++i)
18930 if (WidenedMask[i] == SM_SentinelZero) {
18931 WidenedMask[i] = i + NewNumElts;
18932 UsedZeroVector = true;
18933 }
18934 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18935 // some elements to be undef.
18936 if (UsedZeroVector)
18937 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18938 }
18939 V1 = DAG.getBitcast(NewVT, V1);
18940 V2 = DAG.getBitcast(NewVT, V2);
18941 return DAG.getBitcast(
18942 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18943 }
18944 }
18945
18946 SmallVector<SDValue> Ops = {V1, V2};
18947 SmallVector<int> Mask(OrigMask);
18948
18949 // Canonicalize the shuffle with any horizontal ops inputs.
18950 // Don't attempt this if the shuffle can still be widened as we may lose
18951 // whole lane shuffle patterns.
18952 // NOTE: This may update Ops and Mask.
18953 if (!canWidenShuffleElements(Mask)) {
18954 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18955 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18956 return DAG.getBitcast(VT, HOp);
18957
18958 V1 = DAG.getBitcast(VT, Ops[0]);
18959 V2 = DAG.getBitcast(VT, Ops[1]);
18960 assert(NumElements == (int)Mask.size() &&
18961 "canonicalizeShuffleMaskWithHorizOp "
18962 "shouldn't alter the shuffle mask size");
18963 }
18964
18965 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18966 // These will be materialized uniformly anyway, so make splat matching easier.
18967 // TODO: Allow all int constants?
18968 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18969 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18970 BitVector Undefs;
18971 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18972 if (Undefs.any() &&
18973 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18974 isa<ConstantFPSDNode>(Splat))) {
18975 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18976 }
18977 }
18978 }
18979 return V;
18980 };
18981 V1 = CanonicalizeConstant(V1);
18982 V2 = CanonicalizeConstant(V2);
18983
18984 // Commute the shuffle if it will improve canonicalization.
18985 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18986 ShuffleVectorSDNode::commuteMask(Mask);
18987 std::swap(V1, V2);
18988 }
18989
18990 // For each vector width, delegate to a specialized lowering routine.
18991 if (VT.is128BitVector())
18992 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18993
18994 if (VT.is256BitVector())
18995 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18996
18997 if (VT.is512BitVector())
18998 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18999
19000 if (Is1BitVector)
19001 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19002
19003 llvm_unreachable("Unimplemented!");
19004}
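// Widening example for the canonicalization above (editor's sketch): a
// v4i32 shuffle with mask <0, 1, 4, 5> moves two adjacent 32-bit pairs,
// so it is rewritten as the v2i64 shuffle <0, 2> over bitcast operands,
// which typically matches a single punpcklqdq/unpcklpd.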
19005
19006// As legal vpcompress instructions depend on various AVX512 extensions, try to
19007// convert illegal vector sizes to legal ones to avoid expansion.
19008static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
19009 SelectionDAG &DAG) {
19010 assert(Subtarget.hasAVX512() &&
19011 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
19012
19013 SDLoc DL(Op);
19014 SDValue Vec = Op.getOperand(0);
19015 SDValue Mask = Op.getOperand(1);
19016 SDValue Passthru = Op.getOperand(2);
19017
19018 EVT VecVT = Vec.getValueType();
19019 EVT ElementVT = VecVT.getVectorElementType();
19020 unsigned NumElements = VecVT.getVectorNumElements();
19021 unsigned NumVecBits = VecVT.getFixedSizeInBits();
19022 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
19023
19024 // 128- and 256-bit vectors with <= 16 elements can be converted to and
19025 // compressed as 512-bit vectors in AVX512F.
19026 if (NumVecBits != 128 && NumVecBits != 256)
19027 return SDValue();
19028
19029 if (NumElementBits == 32 || NumElementBits == 64) {
19030 unsigned NumLargeElements = 512 / NumElementBits;
19031 MVT LargeVecVT =
19032 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
19033 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
19034
19035 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
19036 DAG, DL);
19037 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
19038 Subtarget, DAG, DL);
19039 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
19040 : widenSubVector(LargeVecVT, Passthru,
19041 /*ZeroNewElements=*/false,
19042 Subtarget, DAG, DL);
19043
19044 SDValue Compressed =
19045 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
19046 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
19047 DAG.getConstant(0, DL, MVT::i64));
19048 }
19049
19050 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
19051 VecVT == MVT::v16i16) {
19052 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
19053 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
19054
19055 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
19056 Passthru = Passthru.isUndef()
19057 ? DAG.getUNDEF(LargeVecVT)
19058 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
19059
19060 SDValue Compressed =
19061 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
19062 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
19063 }
19064
19065 return SDValue();
19066}
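// Sketch of the 32/64-bit path above (editorial): a v4i32 VECTOR_COMPRESS
// on AVX512F is widened to v16i32, the mask is widened with zeroed new
// lanes so no extra elements are selected, the 512-bit compress maps onto
// vpcompressd, and the low 128 bits are extracted back out.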
19067
19068/// Try to lower a VSELECT instruction to a vector shuffle.
19069static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19070 const X86Subtarget &Subtarget,
19071 SelectionDAG &DAG) {
19072 SDValue Cond = Op.getOperand(0);
19073 SDValue LHS = Op.getOperand(1);
19074 SDValue RHS = Op.getOperand(2);
19075 MVT VT = Op.getSimpleValueType();
19076
19077 // Only non-legal VSELECTs reach this lowering; convert those into generic
19078 // shuffles and re-use the shuffle lowering path for blends.
19079 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19080 SmallVector<int, 32> Mask;
19081 if (createShuffleMaskFromVSELECT(Mask, Cond))
19082 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19083 }
19084
19085 return SDValue();
19086}
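// For example (editor's illustration): with the constant condition
//   vselect <4 x i1> <0, -1, -1, 0>, %lhs, %rhs
// the equivalent shuffle mask is <4, 1, 2, 7>: true lanes keep LHS
// elements 1..2 and false lanes take RHS elements, whose indices are
// offset by the element count (4).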
19087
19088SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19089 SDValue Cond = Op.getOperand(0);
19090 SDValue LHS = Op.getOperand(1);
19091 SDValue RHS = Op.getOperand(2);
19092
19093 SDLoc dl(Op);
19094 MVT VT = Op.getSimpleValueType();
19095 if (isSoftF16(VT, Subtarget)) {
19096 MVT NVT = VT.changeVectorElementTypeToInteger();
19097 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
19098 DAG.getBitcast(NVT, LHS),
19099 DAG.getBitcast(NVT, RHS)));
19100 }
19101
19102 // A vselect where all conditions and data are constants can be optimized into
19103 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19104 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19105 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19106 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19107 return SDValue();
19108
19109 // Try to lower this to a blend-style vector shuffle. This can handle all
19110 // constant condition cases.
19111 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19112 return BlendOp;
19113
19114 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19115 // with patterns on the mask registers on AVX-512.
19116 MVT CondVT = Cond.getSimpleValueType();
19117 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19118 if (CondEltSize == 1)
19119 return Op;
19120
19121 // Variable blends are only legal from SSE4.1 onward.
19122 if (!Subtarget.hasSSE41())
19123 return SDValue();
19124
19125 unsigned EltSize = VT.getScalarSizeInBits();
19126 unsigned NumElts = VT.getVectorNumElements();
19127
19128 // Expand v32i16/v64i8 without BWI.
19129 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19130 return SDValue();
19131
19132 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19133 // into an i1 condition so that we can use the mask-based 512-bit blend
19134 // instructions.
19135 if (VT.getSizeInBits() == 512) {
19136 // Build a mask by testing the condition against zero.
19137 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19138 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19139 DAG.getConstant(0, dl, CondVT),
19140 ISD::SETNE);
19141 // Now return a new VSELECT using the mask.
19142 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19143 }
19144
19145 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19146 if (CondEltSize != EltSize) {
19147 // If we don't have a sign splat, rely on the expansion.
19148 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19149 return SDValue();
19150
19151 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19152 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19153 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19154 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19155 }
19156
19157 // v16i16/v32i8 selects without AVX2, if the condition and another operand
19158 // are free to split, then better to split before expanding the
19159 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
19160 // TODO: This is very similar to narrowVectorSelect.
19161 // TODO: Add Load splitting to isFreeToSplitVector ?
19162 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
19163 !Subtarget.hasXOP()) {
19164 bool FreeCond = isFreeToSplitVector(Cond, DAG);
19165 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
19166 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
19167 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
19168 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
19169 if (FreeCond && (FreeLHS || FreeRHS))
19170 return splitVectorOp(Op, DAG, dl);
19171 }
19172
19173 // Only some types will be legal on some subtargets. If we can emit a legal
19174 // VSELECT-matching blend, return Op; if we need to expand, return
19175 // a null value.
19176 switch (VT.SimpleTy) {
19177 default:
19178 // Most of the vector types have blends past SSE4.1.
19179 return Op;
19180
19181 case MVT::v32i8:
19182 // The byte blends for AVX vectors were introduced only in AVX2.
19183 if (Subtarget.hasAVX2())
19184 return Op;
19185
19186 return SDValue();
19187
19188 case MVT::v8i16:
19189 case MVT::v16i16:
19190 case MVT::v8f16:
19191 case MVT::v16f16: {
19192 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19193 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19194 Cond = DAG.getBitcast(CastVT, Cond);
19195 LHS = DAG.getBitcast(CastVT, LHS);
19196 RHS = DAG.getBitcast(CastVT, RHS);
19197 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19198 return DAG.getBitcast(VT, Select);
19199 }
19200 }
19201}
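// Note on the final case above (editorial): pblendvb selects on the sign
// bit of each byte, so once the i16 condition lanes are full sign-splats,
// bitcasting the v8i16 select to v16i8 is safe; every word lane simply
// contributes two mask bytes with the same sign.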
19202
19203static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
19204 MVT VT = Op.getSimpleValueType();
19205 SDValue Vec = Op.getOperand(0);
19206 SDValue Idx = Op.getOperand(1);
19207 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
19208 SDLoc dl(Op);
19209
19210 if (!Vec.getSimpleValueType().is128BitVector())
19211 return SDValue();
19212
19213 if (VT.getSizeInBits() == 8) {
19214 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
19215 // we're going to zero extend the register or fold the store.
19216 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
19217 !X86::mayFoldIntoStore(Op))
19218 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19219 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19220 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19221
19222 unsigned IdxVal = Idx->getAsZExtVal();
19223 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19224 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19225 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19226 }
19227
19228 if (VT == MVT::f32) {
19229 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19230 // the result back to FR32 register. It's only worth matching if the
19231 // result has a single use which is a store or a bitcast to i32. And in
19232 // the case of a store, it's not worth it if the index is a constant 0,
19233 // because a MOVSSmr can be used instead, which is smaller and faster.
19234 if (!Op.hasOneUse())
19235 return SDValue();
19236 SDNode *User = *Op.getNode()->user_begin();
19237 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19238 (User->getOpcode() != ISD::BITCAST ||
19239 User->getValueType(0) != MVT::i32))
19240 return SDValue();
19241 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19242 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19243 return DAG.getBitcast(MVT::f32, Extract);
19244 }
19245
19246 if (VT == MVT::i32 || VT == MVT::i64)
19247 return Op;
19248
19249 return SDValue();
19250}
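// Usage sketch (editorial): on SSE4.1,
//   extractelement <16 x i8> %v, i32 5
// becomes a PEXTRB with immediate 5 into a GPR plus a truncate to i8,
// while an f32 extract whose only use is a store or an i32 bitcast can
// become a single EXTRACTPS.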
19251
19252/// Extract one bit from mask vector, like v16i1 or v8i1.
19253/// AVX-512 feature.
19255 const X86Subtarget &Subtarget) {
19256 SDValue Vec = Op.getOperand(0);
19257 SDLoc dl(Vec);
19258 MVT VecVT = Vec.getSimpleValueType();
19259 SDValue Idx = Op.getOperand(1);
19260 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19261 MVT EltVT = Op.getSimpleValueType();
19262
19263 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19264 "Unexpected vector type in ExtractBitFromMaskVector");
19265
19266 // A variable index can't be handled in mask registers;
19267 // extend the vector to VR512/VR128.
19268 if (!IdxC) {
19269 unsigned NumElts = VecVT.getVectorNumElements();
19270 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19271 // than extending to 128/256-bit.
19272 if (NumElts == 1) {
19273 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19274 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
19275 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
19276 }
19277 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19278 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19279 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19280 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19281 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19282 }
19283
19284 unsigned IdxVal = IdxC->getZExtValue();
19285 if (IdxVal == 0) // the operation is legal
19286 return Op;
19287
19288 // Extend to natively supported kshift.
19289 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19290
19291 // Use kshiftr instruction to move to the lower element.
19292 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19293 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19294
19295 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19296 DAG.getVectorIdxConstant(0, dl));
19297}
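// Illustration (editor's sketch): extracting bit 3 of a v8i1 mask on a
// target without DQI first widens the mask to v16i1, then
//   kshiftrw $3, %k0, %k0    ; the wanted bit moves to lane 0
// and the now-legal extract of lane 0 reads it out.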
19298
19299// Helper to find all the extracted elements from a vector.
19300static APInt getExtractedDemandedElts(SDNode *N) {
19301 MVT VT = N->getSimpleValueType(0);
19302 unsigned NumElts = VT.getVectorNumElements();
19303 APInt DemandedElts = APInt::getZero(NumElts);
19304 for (SDNode *User : N->users()) {
19305 switch (User->getOpcode()) {
19306 case X86ISD::PEXTRB:
19307 case X86ISD::PEXTRW:
19308 case ISD::EXTRACT_VECTOR_ELT:
19309 if (!isa<ConstantSDNode>(User->getOperand(1))) {
19310 DemandedElts.setAllBits();
19311 return DemandedElts;
19312 }
19313 DemandedElts.setBit(User->getConstantOperandVal(1));
19314 break;
19315 case ISD::BITCAST: {
19316 if (!User->getValueType(0).isSimple() ||
19317 !User->getValueType(0).isVector()) {
19318 DemandedElts.setAllBits();
19319 return DemandedElts;
19320 }
19321 APInt DemandedSrcElts = getExtractedDemandedElts(User);
19322 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
19323 break;
19324 }
19325 default:
19326 DemandedElts.setAllBits();
19327 return DemandedElts;
19328 }
19329 }
19330 return DemandedElts;
19331}
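// Example (editorial): if a v16i8 node is consumed only by PEXTRB users
// with constant indices 3 and 7, this returns an APInt with exactly bits
// 3 and 7 set; a non-constant index or an unrecognized user type
// conservatively returns the all-ones mask.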
19332
19333SDValue
19334X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19335 SelectionDAG &DAG) const {
19336 SDLoc dl(Op);
19337 SDValue Vec = Op.getOperand(0);
19338 MVT VecVT = Vec.getSimpleValueType();
19339 SDValue Idx = Op.getOperand(1);
19340 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19341
19342 if (VecVT.getVectorElementType() == MVT::i1)
19343 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19344
19345 if (!IdxC) {
19346 // It's more profitable to go through memory (1 cycle throughput)
19347 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19348 // The IACA tool was used to get the performance estimates
19349 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19350 //
19351 // example : extractelement <16 x i8> %a, i32 %i
19352 //
19353 // Block Throughput: 3.00 Cycles
19354 // Throughput Bottleneck: Port5
19355 //
19356 // | Num Of | Ports pressure in cycles | |
19357 // | Uops | 0 - DV | 5 | 6 | 7 | |
19358 // ---------------------------------------------
19359 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19360 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19361 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19362 // Total Num Of Uops: 4
19363 //
19364 //
19365 // Block Throughput: 1.00 Cycles
19366 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19367 //
19368 // | | Ports pressure in cycles | |
19369 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19370 // ---------------------------------------------------------
19371 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19372 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19373 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19374 // Total Num Of Uops: 4
19375
19376 return SDValue();
19377 }
19378
19379 unsigned IdxVal = IdxC->getZExtValue();
19380
19381 // If this is a 256-bit vector result, first extract the 128-bit vector and
19382 // then extract the element from the 128-bit vector.
19383 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19384 // Get the 128-bit vector.
19385 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19386 MVT EltVT = VecVT.getVectorElementType();
19387
19388 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19389 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19390
19391 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19392 // this can be done with a mask.
19393 IdxVal &= ElemsPerChunk - 1;
19394 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19395 DAG.getVectorIdxConstant(IdxVal, dl));
19396 }
19397
19398 assert(VecVT.is128BitVector() && "Unexpected vector length");
19399
19400 MVT VT = Op.getSimpleValueType();
19401
19402 if (VT == MVT::i16) {
19403 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19404 // we're going to zero extend the register or fold the store (SSE41 only).
19405 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19406 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19407 if (Subtarget.hasFP16())
19408 return Op;
19409
19410 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19411 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19412 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19413 }
19414
19415 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19416 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19417 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19418 }
19419
19420 if (Subtarget.hasSSE41())
19421 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19422 return Res;
19423
19424 // Only extract a single element from a v16i8 source - determine the common
19425 // DWORD/WORD that all extractions share, and extract the sub-byte.
19426 // TODO: Add QWORD MOVQ extraction?
19427 if (VT == MVT::i8) {
19428 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
19429 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
19430
19431 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19432 int DWordIdx = IdxVal / 4;
19433 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19434 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19435 DAG.getBitcast(MVT::v4i32, Vec),
19436 DAG.getVectorIdxConstant(DWordIdx, dl));
19437 int ShiftVal = (IdxVal % 4) * 8;
19438 if (ShiftVal != 0)
19439 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19440 DAG.getConstant(ShiftVal, dl, MVT::i8));
19441 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19442 }
19443
19444 int WordIdx = IdxVal / 2;
19445 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19446 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19447 DAG.getBitcast(MVT::v8i16, Vec),
19448 DAG.getVectorIdxConstant(WordIdx, dl));
19449 int ShiftVal = (IdxVal % 2) * 8;
19450 if (ShiftVal != 0)
19451 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19452 DAG.getConstant(ShiftVal, dl, MVT::i8));
19453 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19454 }
19455 }
19456
19457 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19458 if (IdxVal == 0)
19459 return Op;
19460
19461 // Shuffle the element to the lowest element, then movss or movsh.
19462 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19463 Mask[0] = static_cast<int>(IdxVal);
19464 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19465 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19466 DAG.getVectorIdxConstant(0, dl));
19467 }
19468
19469 if (VT.getSizeInBits() == 64) {
19470 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19471 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19472 // to match extract_elt for f64.
19473 if (IdxVal == 0)
19474 return Op;
19475
19476 // UNPCKHPD the element to the lowest double word, then movsd.
19477 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19478 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19479 int Mask[2] = { 1, -1 };
19480 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19481 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19482 DAG.getVectorIdxConstant(0, dl));
19483 }
19484
19485 return SDValue();
19486}
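// Worked example for the v16i8 path above (editor's sketch): extracting
// byte 5 when only bytes 4 and 5 are demanded becomes
//   %w = extractelement (bitcast %v to <8 x i16>), i32 2   ; bytes 4..5
//   %b = trunc i16 (lshr %w, 8) to i8                      ; high byte
// so a single PEXTRW can be shared by both byte extracts.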
19487
19488/// Insert one bit to mask vector, like v16i1 or v8i1.
19489/// AVX-512 feature.
19491 const X86Subtarget &Subtarget) {
19492 SDLoc dl(Op);
19493 SDValue Vec = Op.getOperand(0);
19494 SDValue Elt = Op.getOperand(1);
19495 SDValue Idx = Op.getOperand(2);
19496 MVT VecVT = Vec.getSimpleValueType();
19497
19498 if (!isa<ConstantSDNode>(Idx)) {
19499 // Non-constant index: extend the source and destination,
19500 // insert the element, and then truncate the result.
19501 unsigned NumElts = VecVT.getVectorNumElements();
19502 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19503 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19504 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19505 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19506 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19507 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19508 }
19509
19510 // Copy into a k-register, extract to v1i1 and insert_subvector.
19511 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19512 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19513}
19514
19515SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19516 SelectionDAG &DAG) const {
19517 MVT VT = Op.getSimpleValueType();
19518 MVT EltVT = VT.getVectorElementType();
19519 unsigned NumElts = VT.getVectorNumElements();
19520 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19521
19522 if (EltVT == MVT::i1)
19523 return InsertBitToMaskVector(Op, DAG, Subtarget);
19524
19525 SDLoc dl(Op);
19526 SDValue N0 = Op.getOperand(0);
19527 SDValue N1 = Op.getOperand(1);
19528 SDValue N2 = Op.getOperand(2);
19529 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19530
19531 if (EltVT == MVT::bf16) {
19532 MVT IVT = VT.changeVectorElementTypeToInteger();
19533 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
19534 DAG.getBitcast(IVT, N0),
19535 DAG.getBitcast(MVT::i16, N1), N2);
19536 return DAG.getBitcast(VT, Res);
19537 }
19538
19539 if (!N2C) {
19540 // Variable insertion indices, usually we're better off spilling to stack,
19541 // but AVX512 can use a variable compare+select by comparing against all
19542 // possible vector indices, and FP insertion has less gpr->simd traffic.
19543 if (!(Subtarget.hasBWI() ||
19544 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19545 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
19546 return SDValue();
19547
19548 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19549 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19550 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19551 return SDValue();
19552
19553 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19554 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19555 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19556
19557 SmallVector<SDValue, 16> RawIndices;
19558 for (unsigned I = 0; I != NumElts; ++I)
19559 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19560 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19561
19562 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19563 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19564 ISD::CondCode::SETEQ);
19565 }
19566
19567 if (N2C->getAPIntValue().uge(NumElts))
19568 return SDValue();
19569 uint64_t IdxVal = N2C->getZExtValue();
19570
19571 bool IsZeroElt = X86::isZeroNode(N1);
19572 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19573
19574 if (IsZeroElt || IsAllOnesElt) {
19575 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
19576 // We don't deal with i8 0 since it appears to be handled elsewhere.
19577 if (IsAllOnesElt &&
19578 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
19579 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
19580 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19581 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19582 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19583 CstVectorElts[IdxVal] = OnesCst;
19584 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19585 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19586 }
19587 // See if we can do this more efficiently with a blend shuffle with a
19588 // rematerializable vector.
19589 if (Subtarget.hasSSE41() &&
19590 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19591 SmallVector<int, 8> BlendMask;
19592 for (unsigned i = 0; i != NumElts; ++i)
19593 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19594 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19595 : getOnesVector(VT, DAG, dl);
19596 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19597 }
19598 }
19599
19600 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19601 // into that, and then insert the subvector back into the result.
19602 if (VT.is256BitVector() || VT.is512BitVector()) {
19603 // With a 256-bit vector, we can insert into the zero element efficiently
19604 // using a blend if we have AVX or AVX2 and the right data type.
19605 if (VT.is256BitVector() && IdxVal == 0) {
19606 // TODO: It is worthwhile to cast integer to floating point and back
19607 // and incur a domain crossing penalty if that's what we'll end up
19608 // doing anyway after extracting to a 128-bit vector.
19609 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19610 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19611 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19612 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19613 DAG.getTargetConstant(1, dl, MVT::i8));
19614 }
19615 }
19616
19617 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19618 assert(isPowerOf2_32(NumEltsIn128) &&
19619 "Vectors will always have power-of-two number of elements.");
19620
19621 // If we are not inserting into the low 128-bit vector chunk,
19622 // then prefer the broadcast+blend sequence.
19623 // FIXME: relax the profitability check iff all N1 uses are insertions.
19624 if (IdxVal >= NumEltsIn128 &&
19625 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19626 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19627 X86::mayFoldLoad(N1, Subtarget)))) {
19628 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19629 SmallVector<int, 8> BlendMask;
19630 for (unsigned i = 0; i != NumElts; ++i)
19631 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19632 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19633 }
19634
19635 // Get the desired 128-bit vector chunk.
19636 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19637
19638 // Insert the element into the desired chunk.
19639 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19640 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19641
19642 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19643 DAG.getVectorIdxConstant(IdxIn128, dl));
19644
19645 // Insert the changed part back into the bigger vector
19646 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19647 }
19648 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19649
19650 // This will be just movw/movd/movq/movsh/movss/movsd.
19651 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19652 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19653 EltVT == MVT::f16 || EltVT == MVT::i64) {
19654 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19655 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19656 }
19657
19658 // We can't directly insert an i8 or i16 into a vector, so zero extend
19659 // it to i32 first.
19660 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19661 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19662 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19663 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19664 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19665 return DAG.getBitcast(VT, N1);
19666 }
19667 }
19668
19669 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19670 // argument. SSE41 required for pinsrb.
19671 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19672 unsigned Opc;
19673 if (VT == MVT::v8i16) {
19674 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19675 Opc = X86ISD::PINSRW;
19676 } else {
19677 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19678 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19679 Opc = X86ISD::PINSRB;
19680 }
19681
19682 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19683 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19684 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19685 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19686 }
19687
19688 if (Subtarget.hasSSE41()) {
19689 if (EltVT == MVT::f32) {
19690 // Bits [7:6] of the constant are the source select. This will always be
19691 // zero here. The DAG Combiner may combine an extract_elt index into
19692 // these bits. For example (insert (extract, 3), 2) could be matched by
19693 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19694 // Bits [5:4] of the constant are the destination select. This is the
19695 // value of the incoming immediate.
19696 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19697 // combine either bitwise AND or insert of float 0.0 to set these bits.
19698
19699 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19700 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19701 // If this is an insertion of 32-bits into the low 32-bits of
19702 // a vector, we prefer to generate a blend with immediate rather
19703 // than an insertps. Blends are simpler operations in hardware and so
19704 // will always have equal or better performance than insertps.
19705 // But if optimizing for size and there's a load folding opportunity,
19706 // generate insertps because blendps does not have a 32-bit memory
19707 // operand form.
19708 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19709 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19710 DAG.getTargetConstant(1, dl, MVT::i8));
19711 }
19712 // Create this as a scalar-to-vector.
19713 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19714 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19715 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19716 }
19717
19718 // PINSR* works with constant index.
19719 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19720 return Op;
19721 }
19722
19723 return SDValue();
19724}
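// Immediate sketch for the INSERTPS path above (editorial): inserting an
// f32 into lane 2 uses imm = 2 << 4 = 0x20, where bits [5:4] select the
// destination lane, bits [7:6] pick lane 0 of the scalar-to-vector source,
// and the zero-mask bits [3:0] stay clear.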
19725
19726static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
19727 SelectionDAG &DAG) {
19728 SDLoc DL(Op);
19729 SDValue X = Op.getOperand(0);
19730 MVT XTy = X.getSimpleValueType();
19731 SDValue Exp = Op.getOperand(1);
19732
19733 switch (XTy.SimpleTy) {
19734 default:
19735 return SDValue();
19736 case MVT::f16:
19737 if (!Subtarget.hasFP16())
19738 X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
19739 [[fallthrough]];
19740 case MVT::f32:
19741 case MVT::f64: {
19742 MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
19743 128 / X.getSimpleValueType().getSizeInBits());
19744 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
19745 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
19746 SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
19747 SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
19748 SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
19749 return DAG.getFPExtendOrRound(Final, DL, XTy);
19750 }
19751 case MVT::v4f32:
19752 case MVT::v2f64:
19753 case MVT::v8f32:
19754 case MVT::v4f64:
19755 case MVT::v16f32:
19756 case MVT::v8f64:
19757 if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
19758 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19759 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19760 }
19761 break;
19762 case MVT::v8f16:
19763 case MVT::v16f16:
19764 if (Subtarget.hasFP16()) {
19765 if (Subtarget.hasVLX()) {
19766 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19767 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19768 }
19769 break;
19770 }
19771 X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
19772 Exp = DAG.getSExtOrTrunc(Exp, DL,
19773 X.getSimpleValueType().changeTypeToInteger());
19774 break;
19775 case MVT::v32f16:
19776 if (Subtarget.hasFP16()) {
19777 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19778 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19779 }
19780 return splitVectorOp(Op, DAG, DL);
19781 }
19782 SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
19783 SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
19784 WideExp = DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp);
19785 SDValue Scalef =
19786 DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
19787 SDValue Final =
19788 DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
19789 return DAG.getFPExtendOrRound(Final, DL, XTy);
19790}
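// Usage sketch (editorial): a scalar
//   %r = call float @llvm.ldexp.f32.i32(float %x, i32 %n)
// takes the f32 path above: %n is converted with SINT_TO_FP, both values
// are placed in xmm registers via SCALAR_TO_VECTOR, and one vscalefss
// computes x * 2^n before lane 0 is extracted back out.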
19791
19792static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19793 SelectionDAG &DAG) {
19794 SDLoc dl(Op);
19795 MVT OpVT = Op.getSimpleValueType();
19796
19797 // It's always cheaper to replace xor+movd with xorps, and doing so
19798 // simplifies further combines.
19799 if (X86::isZeroNode(Op.getOperand(0)))
19800 return getZeroVector(OpVT, Subtarget, DAG, dl);
19801
19802 // If this is a 256-bit vector result, first insert into a 128-bit
19803 // vector and then insert into the 256-bit vector.
19804 if (!OpVT.is128BitVector()) {
19805 // Insert into a 128-bit vector.
19806 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19807 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19808 OpVT.getVectorNumElements() / SizeFactor);
19809
19810 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19811
19812 // Insert the 128-bit vector.
19813 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19814 }
19815 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19816 "Expected an SSE type!");
19817
19818 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19819 // tblgen.
19820 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19821 return Op;
19822
19823 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19824 return DAG.getBitcast(
19825 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19826}
19827
19828// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19829// simple superregister reference or explicit instructions to insert
19830// the upper bits of a vector.
19831static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19832 SelectionDAG &DAG) {
19833 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19834
19835 return insert1BitVector(Op, DAG, Subtarget);
19836}
19837
19838static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19839 SelectionDAG &DAG) {
19840 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19841 "Only vXi1 extract_subvectors need custom lowering");
19842
19843 SDLoc dl(Op);
19844 SDValue Vec = Op.getOperand(0);
19845 uint64_t IdxVal = Op.getConstantOperandVal(1);
19846
19847 if (IdxVal == 0) // the operation is legal
19848 return Op;
19849
19850 // Extend to natively supported kshift.
19851 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19852
19853 // Shift to the LSB.
19854 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19855 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19856
19857 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19858 DAG.getVectorIdxConstant(0, dl));
19859}
19860
19861// Returns the appropriate wrapper opcode for a global reference.
19862unsigned X86TargetLowering::getGlobalWrapperKind(
19863 const GlobalValue *GV, const unsigned char OpFlags) const {
19864 // References to absolute symbols are never PC-relative.
19865 if (GV && GV->isAbsoluteSymbolRef())
19866 return X86ISD::Wrapper;
19867
19868 // The following OpFlags under RIP-rel PIC use RIP.
19869 if (Subtarget.isPICStyleRIPRel() &&
19870 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19871 OpFlags == X86II::MO_DLLIMPORT))
19872 return X86ISD::WrapperRIP;
19873
19874 // GOTPCREL references must always use RIP.
19875 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19876 return X86ISD::WrapperRIP;
19877
19878 return X86ISD::Wrapper;
19879}
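// For instance (editor's sketch): under RIP-relative PIC on x86-64, a
// plain global reference gets X86ISD::WrapperRIP and selects to
//   leaq sym(%rip), %rax
// while a reference to an absolute symbol keeps X86ISD::Wrapper and can
// be materialized as a plain immediate, e.g. movl $sym, %eax.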
19880
19881// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19882// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19883 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19884 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19885 // be used to form an addressing mode. These wrapped nodes will be selected
19886// into MOV32ri.
19887SDValue
19888X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19889 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19890
19891 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19892 // global base reg.
19893 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19894
19895 auto PtrVT = getPointerTy(DAG.getDataLayout());
19896 SDValue Result = DAG.getTargetConstantPool(
19897 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19898 SDLoc DL(CP);
19899 Result =
19900 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19901 // With PIC, the address is actually $g + Offset.
19902 if (OpFlag) {
19903 Result =
19904 DAG.getNode(ISD::ADD, DL, PtrVT,
19905 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19906 }
19907
19908 return Result;
19909}
19910
19911SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19912 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19913
19914 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19915 // global base reg.
19916 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19917
19918 EVT PtrVT = Op.getValueType();
19919 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19920 SDLoc DL(JT);
19921 Result =
19922 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19923
19924 // With PIC, the address is actually $g + Offset.
19925 if (OpFlag)
19926 Result =
19927 DAG.getNode(ISD::ADD, DL, PtrVT,
19928 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19929
19930 return Result;
19931}
19932
19933SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19934 SelectionDAG &DAG) const {
19935 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19936}
19937
19938SDValue
19939X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19940 // Create the TargetBlockAddressAddress node.
19941 unsigned char OpFlags =
19942 Subtarget.classifyBlockAddressReference();
19943 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19944 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19945 SDLoc dl(Op);
19946 EVT PtrVT = Op.getValueType();
19947 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19948 Result =
19949 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19950
19951 // With PIC, the address is actually $g + Offset.
19952 if (isGlobalRelativeToPICBase(OpFlags)) {
19953 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19954 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19955 }
19956
19957 return Result;
19958}
19959
19960/// Creates target global address or external symbol nodes for calls or
19961/// other uses.
19962SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19963 bool ForCall,
19964 bool *IsImpCall) const {
19965 // Unpack the global address or external symbol.
19966 SDLoc dl(Op);
19967 const GlobalValue *GV = nullptr;
19968 int64_t Offset = 0;
19969 const char *ExternalSym = nullptr;
19970 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19971 GV = G->getGlobal();
19972 Offset = G->getOffset();
19973 } else {
19974 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19975 ExternalSym = ES->getSymbol();
19976 }
19977
19978 // Calculate some flags for address lowering.
19979 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19980 unsigned char OpFlags;
19981 if (ForCall)
19982 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19983 else
19984 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19985 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19986 bool NeedsLoad = isGlobalStubReference(OpFlags);
19987
19988 CodeModel::Model M = DAG.getTarget().getCodeModel();
19989 EVT PtrVT = Op.getValueType();
19990 SDValue Result;
19991
19992 if (GV) {
19993 // Create a target global address if this is a global. If possible, fold the
19994 // offset into the global address reference. Otherwise, ADD it on later.
19995 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19996 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19997 // relocation will compute to a negative value, which is invalid.
19998 int64_t GlobalOffset = 0;
19999 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20000 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20001 std::swap(GlobalOffset, Offset);
20002 }
20003 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20004 } else {
20005 // If this is not a global address, this must be an external symbol.
20006 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20007 }
20008
20009 // If this is a direct call, avoid the wrapper if we don't need to do any
20010 // loads or adds. This allows SDAG ISel to match direct calls.
20011 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20012 return Result;
20013
20014 // If Import Call Optimization is enabled and this is an imported function
20015 // then make a note of it and return the global address without wrapping.
20016 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
20017 Mod.getModuleFlag("import-call-optimization")) {
20018 assert(ForCall && "Should only enable import call optimization if we are "
20019 "lowering a call");
20020 *IsImpCall = true;
20021 return Result;
20022 }
20023
20024 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20025
20026 // With PIC, the address is actually $g + Offset.
20027 if (HasPICReg) {
20028 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20029 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20030 }
20031
20032 // For globals that require a load from a stub to get the address, emit the
20033 // load.
20034 if (NeedsLoad)
20035 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20036 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20037
20038 // If there was a non-zero offset that we didn't fold, create an explicit
20039 // addition for it.
20040 if (Offset != 0)
20041 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20042 DAG.getSignedConstant(Offset, dl, PtrVT));
20043
20044 return Result;
20045}
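// End-to-end sketch (editorial): on x86-64 ELF with PIC, an external
// global routed through a stub comes out as
//   movq sym@GOTPCREL(%rip), %rax   ; the NeedsLoad stub load
//   addq $8, %rax                   ; only if a non-foldable offset remains
// where the trailing add is the explicit Offset addition at the end above.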
20046
20047SDValue
20048X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20049 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
20050}
20051
20053 const EVT PtrVT, unsigned ReturnReg,
20054 unsigned char OperandFlags,
20055 bool LoadGlobalBaseReg = false,
20056 bool LocalDynamic = false) {
20058 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20059 SDLoc dl(GA);
20060 SDValue TGA;
20061 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
20062 SDValue Chain = DAG.getEntryNode();
20063 SDValue Ret;
20064 if (LocalDynamic && UseTLSDESC) {
20065 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
20066 // Reuse existing GetTLSADDR node if we can find it.
20067 if (TGA->hasOneUse()) {
20068 // TLSDESC uses TGA.
20069 SDNode *TLSDescOp = *TGA->user_begin();
20070 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
20071 "Unexpected TLSDESC DAG");
20072 // CALLSEQ_END uses TGA via a chain and glue.
20073 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
20074 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
20075 "Unexpected TLSDESC DAG");
20076 // CopyFromReg uses CALLSEQ_END via a chain and glue.
20077 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
20078 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
20079 "Unexpected TLSDESC DAG");
20080 Ret = SDValue(CopyFromRegOp, 0);
20081 }
20082 } else {
20083 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20084 GA->getOffset(), OperandFlags);
20085 }
20086
20087 if (!Ret) {
20088 unsigned CallType = UseTLSDESC ? X86ISD::TLSDESC
20089 : LocalDynamic ? X86ISD::TLSBASEADDR
20090 : X86ISD::TLSADDR;
20091
20092 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
20093 if (LoadGlobalBaseReg) {
20094 SDValue InGlue;
20095 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
20096 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
20097 InGlue);
20098 InGlue = Chain.getValue(1);
20099 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
20100 } else {
20101 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
20102 }
20103 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
20104
20105 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20106 MFI.setHasCalls(true);
20107
20108 SDValue Glue = Chain.getValue(1);
20109 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
20110 }
20111
20112 if (!UseTLSDESC)
20113 return Ret;
20114
20115 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
20116 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
20117
20118 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
20119 SDValue Offset =
20120 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20121 MachinePointerInfo(Ptr));
20122 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
20123}
20124
20125// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20126static SDValue
20128 const EVT PtrVT) {
20129 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
20130 /*LoadGlobalBaseReg=*/true);
20131}
20132
20133// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20134static SDValue
20136 const EVT PtrVT) {
20137 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
20138}
20139
20140// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20141static SDValue
20143 const EVT PtrVT) {
20144 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
20145}
20146
20148 SelectionDAG &DAG, const EVT PtrVT,
20149 bool Is64Bit, bool Is64BitLP64) {
20150 SDLoc dl(GA);
20151
20152 // Get the start address of the TLS block for this module.
20153 X86MachineFunctionInfo *MFI =
20154 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
20155 MFI->incNumLocalDynamicTLSAccesses();
20156
20157 SDValue Base;
20158 if (Is64Bit) {
20159 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20160 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
20161 /*LoadGlobalBaseReg=*/false,
20162 /*LocalDynamic=*/true);
20163 } else {
20164 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
20165 /*LoadGlobalBaseReg=*/true,
20166 /*LocalDynamic=*/true);
20167 }
20168
20169 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20170 // of Base.
20171
20172 // Build x@dtpoff.
20173 unsigned char OperandFlags = X86II::MO_DTPOFF;
20174 unsigned WrapperKind = X86ISD::Wrapper;
20175 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20176 GA->getValueType(0),
20177 GA->getOffset(), OperandFlags);
20178 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20179
20180 // Add x@dtpoff with the base.
20181 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20182}
20183
20184// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20186 const EVT PtrVT, TLSModel::Model model,
20187 bool is64Bit, bool isPIC) {
20188 SDLoc dl(GA);
20189
20190 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20191 Value *Ptr = Constant::getNullValue(
20192 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
20193
20194 SDValue ThreadPointer =
20195 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20196 MachinePointerInfo(Ptr));
20197
20198 unsigned char OperandFlags = 0;
20199 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20200 // initialexec.
20201 unsigned WrapperKind = X86ISD::Wrapper;
20202 if (model == TLSModel::LocalExec) {
20203 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20204 } else if (model == TLSModel::InitialExec) {
20205 if (is64Bit) {
20206 OperandFlags = X86II::MO_GOTTPOFF;
20207 WrapperKind = X86ISD::WrapperRIP;
20208 } else {
20209 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20210 }
20211 } else {
20212 llvm_unreachable("Unexpected model");
20213 }
20214
20215 // emit "addl x@ntpoff,%eax" (local exec)
20216 // or "addl x@indntpoff,%eax" (initial exec)
20217 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
20218 SDValue TGA =
20219 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20220 GA->getOffset(), OperandFlags);
20221 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20222
20223 if (model == TLSModel::InitialExec) {
20224 if (isPIC && !is64Bit) {
20225 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20226 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20227 Offset);
20228 }
20229
20230 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20232 }
20233
20234 // The address of the thread local variable is the add of the thread
20235 // pointer with the offset of the variable.
20236 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20237}
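// End-to-end sketch (editorial) for local exec on x86-64: the DAG built
// above corresponds to
//   movq %fs:0, %rax             ; ThreadPointer load
//   leaq x@tpoff(%rax), %rax     ; add the link-time-constant offset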
20238
20239SDValue
20240X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20241
20242 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20243
20244 if (DAG.getTarget().useEmulatedTLS())
20245 return LowerToTLSEmulatedModel(GA, DAG);
20246
20247 const GlobalValue *GV = GA->getGlobal();
20248 EVT PtrVT = Op.getValueType();
20249 bool PositionIndependent = isPositionIndependent();
20250
20251 if (Subtarget.isTargetELF()) {
20252 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20253 switch (model) {
20255 if (Subtarget.is64Bit()) {
20256 if (Subtarget.isTarget64BitLP64())
20257 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20258 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20259 }
20260 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20261 case TLSModel::LocalDynamic:
20262 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20263 Subtarget.isTarget64BitLP64());
20264 case TLSModel::InitialExec:
20265 case TLSModel::LocalExec:
20266 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20267 PositionIndependent);
20268 }
20269 llvm_unreachable("Unknown TLS model.");
20270 }
20271
20272 if (Subtarget.isTargetDarwin()) {
20273 // Darwin only has one model of TLS. Lower to that.
20274 unsigned char OpFlag = 0;
20275 unsigned WrapperKind = 0;
20276
20277 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20278 // global base reg.
20279 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20280 if (PIC32) {
20281 OpFlag = X86II::MO_TLVP_PIC_BASE;
20282 WrapperKind = X86ISD::Wrapper;
20283 } else {
20284 OpFlag = X86II::MO_TLVP;
20285 WrapperKind = X86ISD::WrapperRIP;
20286 }
20287 SDLoc DL(Op);
20288 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20289 GA->getValueType(0),
20290 GA->getOffset(), OpFlag);
20291 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20292
20293 // With PIC32, the address is actually $g + Offset.
20294 if (PIC32)
20295 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20296 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20297 Offset);
20298
20299 // Lowering the machine isd will make sure everything is in the right
20300 // location.
20301 SDValue Chain = DAG.getEntryNode();
20302 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20303 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20304 SDValue Args[] = { Chain, Offset };
20305 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20306 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
20307
20308 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
20309 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20310 MFI.setAdjustsStack(true);
20311
20312 // And our return value (tls address) is in the standard call return value
20313 // location.
20314 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20315 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20316 }
20317
20318 if (Subtarget.isOSWindows()) {
20319 // Just use the implicit TLS architecture
20320 // Need to generate something similar to:
20321 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20322 // ; from TEB
20323 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
20324 // mov rcx, qword [rdx+rcx*8]
20325 // mov eax, .tls$:tlsvar
20326 // [rax+rcx] contains the address
20327 // Windows 64bit: gs:0x58
20328 // Windows 32bit: fs:__tls_array
20329
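// Illustrative sketch (not from this file; names are descriptive only) of
// the same access in C-like pseudocode, assuming the usual TEB layout:
//   char **tls_array = *(char ***)(TEB + 0x58);  // gs:0x58 / fs:__tls_array
//   char *tls_base   = tls_array[_tls_index];    // one slot per module
//   value            = *(T *)(tls_base + secrel_offset_of_var);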
20330 SDLoc dl(GA);
20331 SDValue Chain = DAG.getEntryNode();
20332
20333 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20334 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20335 // use its literal value of 0x2C.
20336 Value *Ptr = Constant::getNullValue(
20337 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
20338 : PointerType::get(*DAG.getContext(), X86AS::FS));
20339
20340 SDValue TlsArray = Subtarget.is64Bit()
20341 ? DAG.getIntPtrConstant(0x58, dl)
20342 : (Subtarget.isTargetWindowsGNU()
20343 ? DAG.getIntPtrConstant(0x2C, dl)
20344 : DAG.getExternalSymbol("_tls_array", PtrVT));
20345
20346 SDValue ThreadPointer =
20347 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20348
20349 SDValue res;
20350 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
20351 res = ThreadPointer;
20352 } else {
20353 // Load the _tls_index variable
20354 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20355 if (Subtarget.is64Bit())
20356 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20357 MachinePointerInfo(), MVT::i32);
20358 else
20359 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20360
20361 const DataLayout &DL = DAG.getDataLayout();
20362 SDValue Scale =
20363 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20364 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20365
20366 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20367 }
20368
20369 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20370
20371 // Get the offset of start of .tls section
20372 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20373 GA->getValueType(0),
20374 GA->getOffset(), X86II::MO_SECREL);
20375 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20376
20377 // The address of the thread local variable is the add of the thread
20378 // pointer with the offset of the variable.
20379 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20380 }
20381
20382 llvm_unreachable("TLS not implemented for this target.");
20383}
20384
20385 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
20386 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20387 const TargetMachine &TM = getTargetMachine();
20388 TLSModel::Model Model = TM.getTLSModel(&GV);
20389 switch (Model) {
20390 case TLSModel::LocalExec:
20391 case TLSModel::InitialExec:
20392 // We can include the %fs segment register in addressing modes.
20393 return true;
20394 case TLSModel::GeneralDynamic:
20395 case TLSModel::LocalDynamic:
20396 // These models do not result in %fs-relative addresses unless
20397 // TLS descriptors are used.
20398 //
20399 // Even in the case of TLS descriptors we currently have no way to model
20400 // the difference between the %fs access and the computation needed for
20401 // the offset; returning `true` for TLS-desc currently duplicates both,
20402 // which is detrimental :-/
20403 return false;
20404 }
20405 }
20406 return false;
20407}
20408
20409/// Lower SRA_PARTS and friends, which return two i32 values
20410/// and take a 2 x i32 value to shift plus a shift amount.
20411/// TODO: Can this be moved to general expansion code?
20412 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20413 SDValue Lo, Hi;
20414 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20415 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20416}
20417
20418// Try to use a packed vector operation to handle i64 on 32-bit targets when
20419// AVX512DQ is enabled.
20420 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
20421 SelectionDAG &DAG,
20422 const X86Subtarget &Subtarget) {
20423 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20424 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20425 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20426 Op.getOpcode() == ISD::UINT_TO_FP) &&
20427 "Unexpected opcode!");
20428 bool IsStrict = Op->isStrictFPOpcode();
20429 unsigned OpNo = IsStrict ? 1 : 0;
20430 SDValue Src = Op.getOperand(OpNo);
20431 MVT SrcVT = Src.getSimpleValueType();
20432 MVT VT = Op.getSimpleValueType();
20433
20434 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20435 (VT != MVT::f32 && VT != MVT::f64))
20436 return SDValue();
20437
20438 // Pack the i64 into a vector, do the operation and extract.
20439
20440 // Use a 256-bit vector to ensure the result is 128 bits in the f32 case.
20441 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20442 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20443 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20444
20445 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20446 if (IsStrict) {
20447 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20448 {Op.getOperand(0), InVec});
20449 SDValue Chain = CvtVec.getValue(1);
20450 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20451 DAG.getVectorIdxConstant(0, dl));
20452 return DAG.getMergeValues({Value, Chain}, dl);
20453 }
20454
20455 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20456
20457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20458 DAG.getVectorIdxConstant(0, dl));
20459}
20460
20461// Try to use a packed vector operation to handle i64 on 32-bit targets.
20462 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20463 const X86Subtarget &Subtarget) {
20464 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20465 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20466 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20467 Op.getOpcode() == ISD::UINT_TO_FP) &&
20468 "Unexpected opcode!");
20469 bool IsStrict = Op->isStrictFPOpcode();
20470 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20471 MVT SrcVT = Src.getSimpleValueType();
20472 MVT VT = Op.getSimpleValueType();
20473
20474 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20475 return SDValue();
20476
20477 // Pack the i64 into a vector, do the operation and extract.
20478
20479 assert(Subtarget.hasFP16() && "Expected FP16");
20480
20481 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20482 if (IsStrict) {
20483 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20484 {Op.getOperand(0), InVec});
20485 SDValue Chain = CvtVec.getValue(1);
20486 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20487 DAG.getVectorIdxConstant(0, dl));
20488 return DAG.getMergeValues({Value, Chain}, dl);
20489 }
20490
20491 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20492
20493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20494 DAG.getVectorIdxConstant(0, dl));
20495}
20496
20497static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20498 const X86Subtarget &Subtarget) {
20499 switch (Opcode) {
20500 case ISD::SINT_TO_FP:
20501 // TODO: Handle wider types with AVX/AVX512.
20502 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20503 return false;
20504 // CVTDQ2PS or (V)CVTDQ2PD
20505 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20506
20507 case ISD::UINT_TO_FP:
20508 // TODO: Handle wider types and i64 elements.
20509 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20510 return false;
20511 // VCVTUDQ2PS or VCVTUDQ2PD
20512 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20513
20514 default:
20515 return false;
20516 }
20517}
20518
20519/// Given a scalar cast operation that is extracted from a vector, try to
20520/// vectorize the cast op followed by extraction. This will avoid an expensive
20521/// round-trip between XMM and GPR.
20522 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
20523 SelectionDAG &DAG,
20524 const X86Subtarget &Subtarget) {
20525 // TODO: This could be enhanced to handle smaller integer types by peeking
20526 // through an extend.
20527 SDValue Extract = Cast.getOperand(0);
20528 MVT DestVT = Cast.getSimpleValueType();
20529 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20530 !isa<ConstantSDNode>(Extract.getOperand(1)))
20531 return SDValue();
20532
20533 // See if we have a 128-bit vector cast op for this type of cast.
20534 SDValue VecOp = Extract.getOperand(0);
20535 EVT FromVT = VecOp.getValueType();
20536 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20537 MVT Vec128VT =
20538 MVT::getVectorVT(FromVT.getScalarType().getSimpleVT(), NumEltsInXMM);
20539 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20540 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20541 return SDValue();
20542
20543 // If we are extracting from a non-zero element, first shuffle the source
20544 // vector to allow extracting from element zero.
20545 if (!isNullConstant(Extract.getOperand(1))) {
20546 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20547 Mask[0] = Extract.getConstantOperandVal(1);
20548 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20549 }
20550 // If the source vector is wider than 128-bits, extract the low part. Do not
20551 // create an unnecessarily wide vector cast op.
20552 if (FromVT != Vec128VT)
20553 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20554
20555 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20556 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20557 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20559 DAG.getVectorIdxConstant(0, DL));
20560}
20561
20562/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20563/// try to vectorize the cast ops. This will avoid an expensive round-trip
20564/// between XMM and GPR.
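/// For example (illustrative, SSE2 case): for "double d = (double)(int)x;"
/// with x already in an XMM register, this turns cvttsd2si + cvtsi2sd (a
/// GPR round-trip) into cvttpd2dq + cvtdq2pd kept entirely in XMM, with the
/// scalar result read from lane 0.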
20565static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
20566 SelectionDAG &DAG,
20567 const X86Subtarget &Subtarget) {
20568 SDValue CastToInt = CastToFP.getOperand(0);
20569 MVT VT = CastToFP.getSimpleValueType();
20570 if ((CastToInt.getOpcode() != ISD::FP_TO_SINT &&
20571 CastToInt.getOpcode() != ISD::FP_TO_UINT) ||
20572 VT.isVector())
20573 return SDValue();
20574
20575 MVT IntVT = CastToInt.getSimpleValueType();
20576 SDValue X = CastToInt.getOperand(0);
20577 MVT SrcVT = X.getSimpleValueType();
20578 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20579 return SDValue();
20580
20581 // See if we have 128-bit vector cast instructions for this type of cast.
20582 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20583 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20584 (IntVT != MVT::i32 && IntVT != MVT::i64))
20585 return SDValue();
20586
20587 unsigned SrcSize = SrcVT.getSizeInBits();
20588 unsigned IntSize = IntVT.getSizeInBits();
20589 unsigned VTSize = VT.getSizeInBits();
20590 bool IsUnsigned = CastToInt.getOpcode() == ISD::FP_TO_UINT;
20591 unsigned ToIntOpcode =
20592 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20593 unsigned ToFPOpcode =
20594 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20595 unsigned Width = 128;
20596
20597 if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20598 // AVX512DQ+VLX
20599 if (IsUnsigned) {
20600 ToIntOpcode =
20601 SrcSize != IntSize ? X86ISD::CVTTP2UI : (unsigned)ISD::FP_TO_UINT;
20602 ToFPOpcode =
20603 IntSize != VTSize ? X86ISD::CVTUI2P : (unsigned)ISD::UINT_TO_FP;
20604 }
20605 } else {
20606 if (IsUnsigned || IntVT == MVT::i64) {
20607 // SSE2 can only perform f64/f32 <-> i32 signed.
20608 if (!Subtarget.useAVX512Regs() || !Subtarget.hasDQI())
20609 return SDValue();
20610
20611 // Need to extend width for AVX512DQ without AVX512VL.
20612 Width = 512;
20613 ToIntOpcode = CastToInt.getOpcode();
20614 ToFPOpcode = IsUnsigned ? ISD::UINT_TO_FP : ISD::SINT_TO_FP;
20615 }
20616 }
20617
20618 MVT VecSrcVT, VecIntVT, VecVT;
20619 unsigned NumElts;
20620 unsigned SrcElts, VTElts;
20621 // Some conversions are only legal with uniform vector sizes on AVX512DQ.
20622 if (Width == 512) {
20623 NumElts = std::min(Width / IntSize, Width / SrcSize);
20624 SrcElts = NumElts;
20625 VTElts = NumElts;
20626 } else {
20627 NumElts = Width / IntSize;
20628 SrcElts = Width / SrcSize;
20629 VTElts = Width / VTSize;
20630 }
20631 VecIntVT = MVT::getVectorVT(IntVT, NumElts);
20632 VecSrcVT = MVT::getVectorVT(SrcVT, SrcElts);
20633 VecVT = MVT::getVectorVT(VT, VTElts);
20634 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20635 //
20636 // We are not defining the high elements (for example, zeroing them) because
20637 // that could nullify any performance advantage that we hoped to gain from
20638 // this vector op hack. We do not expect any adverse effects (like denorm
20639 // penalties) with cast ops.
20640 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20641 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20642 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20643 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20644 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20645}
20646
20647 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
20648 SelectionDAG &DAG,
20649 const X86Subtarget &Subtarget) {
20650 bool IsStrict = Op->isStrictFPOpcode();
20651 MVT VT = Op->getSimpleValueType(0);
20652 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20653
20654 if (Subtarget.hasDQI()) {
20655 assert(!Subtarget.hasVLX() && "Unexpected features");
20656
20657 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20658 Src.getSimpleValueType() == MVT::v4i64) &&
20659 "Unsupported custom type");
20660
20661 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20662 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20663 "Unexpected VT!");
20664 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20665
20666 // Need to concat with zero vector for strict fp to avoid spurious
20667 // exceptions.
20668 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20669 : DAG.getUNDEF(MVT::v8i64);
20670 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20671 DAG.getVectorIdxConstant(0, DL));
20672 SDValue Res, Chain;
20673 if (IsStrict) {
20674 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20675 {Op->getOperand(0), Src});
20676 Chain = Res.getValue(1);
20677 } else {
20678 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20679 }
20680
20681 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20682 DAG.getVectorIdxConstant(0, DL));
20683
20684 if (IsStrict)
20685 return DAG.getMergeValues({Res, Chain}, DL);
20686 return Res;
20687 }
20688
20689 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20690 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20691 if (VT != MVT::v4f32 || IsSigned)
20692 return SDValue();
20693
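  // Sketch of the trick below (illustrative): a u64 with the sign bit set
  // does not fit in i64, so halve it with round-to-odd, (Src >> 1) | (Src & 1),
  // convert that exactly as a signed value, then double the result with an
  // FADD; the OR'd-in low bit keeps the final rounding correct. Values that
  // already fit in i64 are converted directly, and the select picks per lane.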
20694 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20695 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20696 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20697 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20698 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20699 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20700 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20701 SmallVector<SDValue, 4> SignCvts(4);
20702 SmallVector<SDValue, 4> Chains(4);
20703 for (int i = 0; i != 4; ++i) {
20704 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20705 DAG.getVectorIdxConstant(i, DL));
20706 if (IsStrict) {
20707 SignCvts[i] =
20708 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20709 {Op.getOperand(0), Elt});
20710 Chains[i] = SignCvts[i].getValue(1);
20711 } else {
20712 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20713 }
20714 }
20715 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20716
20717 SDValue Slow, Chain;
20718 if (IsStrict) {
20719 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20720 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20721 {Chain, SignCvt, SignCvt});
20722 Chain = Slow.getValue(1);
20723 } else {
20724 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20725 }
20726
20727 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20728 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20729
20730 if (IsStrict)
20731 return DAG.getMergeValues({Cvt, Chain}, DL);
20732
20733 return Cvt;
20734}
20735
20736 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20737 SelectionDAG &DAG) {
20738 bool IsStrict = Op->isStrictFPOpcode();
20739 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20740 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20741 MVT VT = Op.getSimpleValueType();
20742 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20743
20744 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20745 if (IsStrict)
20746 return DAG.getNode(
20747 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20748 {Chain,
20749 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20750 Rnd});
20751 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20752 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20753}
20754
20755static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20756 const X86Subtarget &Subtarget) {
20757 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20758 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20759 return true;
20760 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20761 return true;
20762 }
20763 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20764 return true;
20765 if (Subtarget.useAVX512Regs()) {
20766 if (VT == MVT::v16i32)
20767 return true;
20768 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20769 return true;
20770 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20771 return true;
20772 }
20773 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20774 (VT == MVT::v2i64 || VT == MVT::v4i64))
20775 return true;
20776 return false;
20777}
20778
20779SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20780 SelectionDAG &DAG) const {
20781 bool IsStrict = Op->isStrictFPOpcode();
20782 unsigned OpNo = IsStrict ? 1 : 0;
20783 SDValue Src = Op.getOperand(OpNo);
20784 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20785 MVT SrcVT = Src.getSimpleValueType();
20786 MVT VT = Op.getSimpleValueType();
20787 SDLoc dl(Op);
20788
20789 if (isSoftF16(VT, Subtarget))
20790 return promoteXINT_TO_FP(Op, dl, DAG);
20791 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20792 return Op;
20793
20794 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20795 return LowerWin64_INT128_TO_FP(Op, DAG);
20796
20797 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20798 return Extract;
20799
20800 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20801 return R;
20802
20803 if (SrcVT.isVector()) {
20804 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20805 // Note: since v2f64 is a legal type, we don't need to zero-extend the
20806 // source for strict FP.
20807 if (IsStrict)
20808 return DAG.getNode(
20809 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20810 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20811 DAG.getUNDEF(SrcVT))});
20812 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20813 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20814 DAG.getUNDEF(SrcVT)));
20815 }
20816 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20817 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20818
20819 return SDValue();
20820 }
20821
20822 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20823 "Unknown SINT_TO_FP to lower!");
20824
20825 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20826
20827 // These are really Legal; return the operand so the caller accepts it as
20828 // Legal.
20829 if (SrcVT == MVT::i32 && UseSSEReg)
20830 return Op;
20831 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20832 return Op;
20833
20834 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20835 return V;
20836 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20837 return V;
20838
20839 // SSE doesn't have an i16 conversion so we need to promote.
20840 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20841 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20842 if (IsStrict)
20843 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20844 {Chain, Ext});
20845
20846 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20847 }
20848
20849 if (VT == MVT::f128 || !Subtarget.hasX87())
20850 return SDValue();
20851
20852 SDValue ValueToStore = Src;
20853 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20854 // Bitcasting to f64 here allows us to do a single 64-bit store from
20855 // an SSE register, avoiding the store forwarding penalty that would come
20856 // with two 32-bit stores.
20857 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20858
20859 unsigned Size = SrcVT.getStoreSize();
20860 Align Alignment(Size);
20861 MachineFunction &MF = DAG.getMachineFunction();
20862 auto PtrVT = getPointerTy(MF.getDataLayout());
20863 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20864 MachinePointerInfo MPI =
20865 MachinePointerInfo::getFixedStack(MF, SSFI);
20866 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20867 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20868 std::pair<SDValue, SDValue> Tmp =
20869 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20870
20871 if (IsStrict)
20872 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20873
20874 return Tmp.first;
20875}
20876
20877std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20878 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20879 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20880 // Build the FILD
20881 SDVTList Tys;
20882 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20883 if (useSSE)
20884 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20885 else
20886 Tys = DAG.getVTList(DstVT, MVT::Other);
20887
20888 SDValue FILDOps[] = {Chain, Pointer};
20889 SDValue Result =
20890 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20891 Alignment, MachineMemOperand::MOLoad);
20892 Chain = Result.getValue(1);
20893
20894 if (useSSE) {
20895 MachineFunction &MF = DAG.getMachineFunction();
20896 unsigned SSFISize = DstVT.getStoreSize();
20897 int SSFI =
20898 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20899 auto PtrVT = getPointerTy(MF.getDataLayout());
20900 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20901 Tys = DAG.getVTList(MVT::Other);
20902 SDValue FSTOps[] = {Chain, Result, StackSlot};
20903 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20904 MachinePointerInfo::getFixedStack(MF, SSFI),
20905 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20906
20907 Chain =
20908 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20909 Result = DAG.getLoad(
20910 DstVT, DL, Chain, StackSlot,
20911 MachinePointerInfo::getFixedStack(MF, SSFI));
20912 Chain = Result.getValue(1);
20913 }
20914
20915 return { Result, Chain };
20916}
20917
20918/// Horizontal vector math instructions may be slower than normal math with
20919/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20920/// implementation, and likely shuffle complexity of the alternate sequence.
20921static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20922 const X86Subtarget &Subtarget) {
20923 bool IsOptimizingSize = DAG.shouldOptForSize();
20924 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20925 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20926}
20927
20928/// 64-bit unsigned integer to double expansion.
20929 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20930 SelectionDAG &DAG,
20931 const X86Subtarget &Subtarget) {
20932 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20933 // when converting 0 while rounding toward negative infinity. Callers will
20934 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20935 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20936 // This algorithm is not obvious. Here it is what we're trying to output:
20937 /*
20938 movq %rax, %xmm0
20939 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20940 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20941 #ifdef __SSE3__
20942 haddpd %xmm0, %xmm0
20943 #else
20944 pshufd $0x4e, %xmm0, %xmm1
20945 addpd %xmm1, %xmm0
20946 #endif
20947 */
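  // Why the magic constants work (illustrative): after the punpckldq, lane 0
  // holds bits 0x43300000'lo32 (the double 2^52 + lo32) and lane 1 holds
  // 0x45300000'hi32 (the double 2^84 + hi32 * 2^32). Subtracting
  // { 0x1.0p52, 0x1.0p84 } leaves { lo32, hi32 * 2^32 } exactly, and the
  // final horizontal add produces lo32 + hi32 * 2^32, i.e. the original u64
  // value, with a single rounding.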
20948
20949 LLVMContext *Context = DAG.getContext();
20950
20951 // Build some magic constants.
20952 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20953 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20954 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20955 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20956
20957 SmallVector<Constant *, 2> CV1;
20958 CV1.push_back(
20959 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20960 APInt(64, 0x4330000000000000ULL))));
20961 CV1.push_back(
20962 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20963 APInt(64, 0x4530000000000000ULL))));
20964 Constant *C1 = ConstantVector::get(CV1);
20965 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20966
20967 // Load the 64-bit value into an XMM register.
20968 SDValue XR1 =
20969 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20970 SDValue CLod0 = DAG.getLoad(
20971 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20972 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20973 SDValue Unpck1 =
20974 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20975
20976 SDValue CLod1 = DAG.getLoad(
20977 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20978 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20979 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20980 // TODO: Are there any fast-math-flags to propagate here?
20981 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20982 SDValue Result;
20983
20984 if (Subtarget.hasSSE3() &&
20985 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20986 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20987 } else {
20988 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20989 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20990 }
20991 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20992 DAG.getVectorIdxConstant(0, dl));
20993 return Result;
20994}
20995
20996/// 32-bit unsigned integer to float expansion.
20997 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20998 SelectionDAG &DAG,
20999 const X86Subtarget &Subtarget) {
21000 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21001 // FP constant to bias correct the final result.
21002 SDValue Bias = DAG.getConstantFP(
21003 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21004
21005 // Load the 32-bit value into an XMM register.
21006 SDValue Load =
21007 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21008
21009 // Zero out the upper parts of the register.
21010 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21011
21012 // Or the load with the bias.
21013 SDValue Or = DAG.getNode(
21014 ISD::OR, dl, MVT::v2i64,
21015 DAG.getBitcast(MVT::v2i64, Load),
21016 DAG.getBitcast(MVT::v2i64,
21017 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21018 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21019 DAG.getBitcast(MVT::v2f64, Or),
21020 DAG.getVectorIdxConstant(0, dl));
21021
21022 if (Op.getNode()->isStrictFPOpcode()) {
21023 // Subtract the bias.
21024 // TODO: Are there any fast-math-flags to propagate here?
21025 SDValue Chain = Op.getOperand(0);
21026 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21027 {Chain, Or, Bias});
21028
21029 if (Op.getValueType() == Sub.getValueType())
21030 return Sub;
21031
21032 // Handle final rounding.
21033 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21034 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21035
21036 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21037 }
21038
21039 // Subtract the bias.
21040 // TODO: Are there any fast-math-flags to propagate here?
21041 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21042
21043 // Handle final rounding.
21044 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21045}
21046
21047 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
21048 SelectionDAG &DAG,
21049 const X86Subtarget &Subtarget) {
21050 if (Op.getSimpleValueType() != MVT::v2f64)
21051 return SDValue();
21052
21053 bool IsStrict = Op->isStrictFPOpcode();
21054
21055 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21056 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21057
21058 if (Subtarget.hasAVX512()) {
21059 if (!Subtarget.hasVLX()) {
21060 // Let generic type legalization widen this.
21061 if (!IsStrict)
21062 return SDValue();
21063 // Otherwise pad the integer input with 0s and widen the operation.
21064 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21065 DAG.getConstant(0, DL, MVT::v2i32));
21066 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21067 {Op.getOperand(0), N0});
21068 SDValue Chain = Res.getValue(1);
21069 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21070 DAG.getVectorIdxConstant(0, DL));
21071 return DAG.getMergeValues({Res, Chain}, DL);
21072 }
21073
21074 // Legalize to v4i32 type.
21075 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21076 DAG.getUNDEF(MVT::v2i32));
21077 if (IsStrict)
21078 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21079 {Op.getOperand(0), N0});
21080 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21081 }
21082
21083 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21084 // This gives us the floating point equivalent of 2^52 + the i32 integer
21085 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21086 // point leaving just our i32 integers in double format.
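  // Worked example (illustrative): for the u32 value 7, the OR produces the
  // bit pattern 0x4330000000000007, which as a double equals 2^52 + 7; the
  // FSUB of VBias (2^52) then yields exactly 7.0. Every u32 fits in the
  // 52-bit mantissa, so no step here rounds.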
21087 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21088 SDValue VBias = DAG.getConstantFP(
21089 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21090 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21091 DAG.getBitcast(MVT::v2i64, VBias));
21092 Or = DAG.getBitcast(MVT::v2f64, Or);
21093
21094 if (IsStrict)
21095 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21096 {Op.getOperand(0), Or, VBias});
21097 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21098}
21099
21100 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
21101 SelectionDAG &DAG,
21102 const X86Subtarget &Subtarget) {
21103 bool IsStrict = Op->isStrictFPOpcode();
21104 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21105 MVT VecIntVT = V.getSimpleValueType();
21106 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21107 "Unsupported custom type");
21108
21109 if (Subtarget.hasAVX512()) {
21110 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21111 assert(!Subtarget.hasVLX() && "Unexpected features");
21112 MVT VT = Op->getSimpleValueType(0);
21113
21114 // v8i32->v8f64 is legal with AVX512 so just return it.
21115 if (VT == MVT::v8f64)
21116 return Op;
21117
21118 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
21119 VT == MVT::v8f16) &&
21120 "Unexpected VT!");
21121 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
21122 MVT WideIntVT = MVT::v16i32;
21123 if (VT == MVT::v4f64) {
21124 WideVT = MVT::v8f64;
21125 WideIntVT = MVT::v8i32;
21126 }
21127
21128 // Need to concat with zero vector for strict fp to avoid spurious
21129 // exceptions.
21130 SDValue Tmp =
21131 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21132 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21133 DAG.getVectorIdxConstant(0, DL));
21134 SDValue Res, Chain;
21135 if (IsStrict) {
21136 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21137 {Op->getOperand(0), V});
21138 Chain = Res.getValue(1);
21139 } else {
21140 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21141 }
21142
21143 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21144 DAG.getVectorIdxConstant(0, DL));
21145
21146 if (IsStrict)
21147 return DAG.getMergeValues({Res, Chain}, DL);
21148 return Res;
21149 }
21150
21151 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21152 Op->getSimpleValueType(0) == MVT::v4f64) {
21153 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21154 Constant *Bias = ConstantFP::get(
21155 *DAG.getContext(),
21156 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21157 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21158 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21159 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21160 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21161 SDValue VBias = DAG.getMemIntrinsicNode(
21162 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21163 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21164 MachineMemOperand::MOLoad);
21165
21166 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21167 DAG.getBitcast(MVT::v4i64, VBias));
21168 Or = DAG.getBitcast(MVT::v4f64, Or);
21169
21170 if (IsStrict)
21171 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21172 {Op.getOperand(0), Or, VBias});
21173 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21174 }
21175
21176 // The algorithm is the following:
21177 // #ifdef __SSE4_1__
21178 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21179 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21180 // (uint4) 0x53000000, 0xaa);
21181 // #else
21182 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21183 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21184 // #endif
21185 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21186 // return (float4) lo + fhi;
21187
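  // Why the constants work (illustrative): reinterpreted as floats,
  //   lo = 2^23 + (v & 0xffff)        (0x4b000000 is 2^23)
  //   hi = 2^39 + (v >> 16) * 2^16    (0x53000000 is 2^39)
  // both exact. Then fhi = hi - (2^39 + 2^23) = (v >> 16) * 2^16 - 2^23, and
  // lo + fhi = (v & 0xffff) + (v >> 16) * 2^16 = v, with a single rounding.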
21188 bool Is128 = VecIntVT == MVT::v4i32;
21189 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21190 // If we convert to something other than the supported type, e.g., to v4f64,
21191 // abort early.
21192 if (VecFloatVT != Op->getSimpleValueType(0))
21193 return SDValue();
21194
21195 // In the #ifdef/#else code, we have in common:
21196 // - The vector of constants:
21197 // -- 0x4b000000
21198 // -- 0x53000000
21199 // - A shift:
21200 // -- v >> 16
21201
21202 // Create the splat vector for 0x4b000000.
21203 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21204 // Create the splat vector for 0x53000000.
21205 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21206
21207 // Create the right shift.
21208 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21209 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21210
21211 SDValue Low, High;
21212 if (Subtarget.hasSSE41()) {
21213 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21214 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21215 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21216 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21217 // Low will be bitcasted right away, so do not bother bitcasting back to its
21218 // original type.
21219 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21220 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21221 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21222 // (uint4) 0x53000000, 0xaa);
21223 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21224 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21225 // High will be bitcasted right away, so do not bother bitcasting back to
21226 // its original type.
21227 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21228 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21229 } else {
21230 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21231 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21232 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21233 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21234
21235 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21236 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21237 }
21238
21239 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21240 SDValue VecCstFSub = DAG.getConstantFP(
21241 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21242
21243 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21244 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21245 // constant, we avoid reassociation in MachineCombiner when reassoc is
21246 // enabled. See PR24512.
21247 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21248 // TODO: Are there any fast-math-flags to propagate here?
21249 // (float4) lo;
21250 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21251 // return (float4) lo + fhi;
21252 if (IsStrict) {
21253 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21254 {Op.getOperand(0), HighBitcast, VecCstFSub});
21255 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21256 {FHigh.getValue(1), LowBitcast, FHigh});
21257 }
21258
21259 SDValue FHigh =
21260 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21261 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21262}
21263
21264 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
21265 const X86Subtarget &Subtarget) {
21266 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21267 SDValue N0 = Op.getOperand(OpNo);
21268 MVT SrcVT = N0.getSimpleValueType();
21269
21270 switch (SrcVT.SimpleTy) {
21271 default:
21272 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21273 case MVT::v2i32:
21274 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
21275 case MVT::v4i32:
21276 case MVT::v8i32:
21277 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
21278 case MVT::v2i64:
21279 case MVT::v4i64:
21280 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
21281 }
21282}
21283
21284SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21285 SelectionDAG &DAG) const {
21286 bool IsStrict = Op->isStrictFPOpcode();
21287 unsigned OpNo = IsStrict ? 1 : 0;
21288 SDValue Src = Op.getOperand(OpNo);
21289 SDLoc dl(Op);
21290 auto PtrVT = getPointerTy(DAG.getDataLayout());
21291 MVT SrcVT = Src.getSimpleValueType();
21292 MVT DstVT = Op->getSimpleValueType(0);
21293 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21294
21295 // Bail out when we don't have native conversion instructions.
21296 if (DstVT == MVT::f128)
21297 return SDValue();
21298
21299 if (isSoftF16(DstVT, Subtarget))
21300 return promoteXINT_TO_FP(Op, dl, DAG);
21301 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
21302 return Op;
21303
21304 if (SDValue V = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
21305 return V;
21306
21307 if (DstVT.isVector())
21308 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
21309
21310 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21311 return LowerWin64_INT128_TO_FP(Op, DAG);
21312
21313 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
21314 return Extract;
21315
21316 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21317 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21318 // Conversions from unsigned i32 to f32/f64 are legal,
21319 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21320 return Op;
21321 }
21322
21323 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21324 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21325 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21326 if (IsStrict)
21327 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21328 {Chain, Src});
21329 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21330 }
21331
21332 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
21333 return V;
21334 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
21335 return V;
21336
21337 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21338 // infinity. It produces -0.0, so disable under strictfp.
21339 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21340 !IsStrict)
21341 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
21342 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21343 // negative infinity, so disable it under strictfp and use FILD instead.
21344 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21345 !IsStrict)
21346 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
21347 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21348 (DstVT == MVT::f32 || DstVT == MVT::f64))
21349 return SDValue();
21350
21351 // Make a 64-bit buffer, and use it to build an FILD.
21352 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21353 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21354 Align SlotAlign(8);
21355 MachinePointerInfo MPI =
21356 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21357 if (SrcVT == MVT::i32) {
21358 SDValue OffsetSlot =
21359 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
21360 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21361 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21362 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21363 std::pair<SDValue, SDValue> Tmp =
21364 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21365 if (IsStrict)
21366 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21367
21368 return Tmp.first;
21369 }
21370
21371 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21372 SDValue ValueToStore = Src;
21373 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21374 // Bitcasting to f64 here allows us to do a single 64-bit store from
21375 // an SSE register, avoiding the store forwarding penalty that would come
21376 // with two 32-bit stores.
21377 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21378 }
21379 SDValue Store =
21380 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21381 // For i64 source, we need to add the appropriate power of 2 if the input
21382 // was negative. We must be careful to do the computation in x87 extended
21383 // precision, not in SSE.
21384 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21385 SDValue Ops[] = {Store, StackSlot};
21386 SDValue Fild =
21387 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21388 SlotAlign, MachineMemOperand::MOLoad);
21389 Chain = Fild.getValue(1);
21390
21391 // Check whether the sign bit is set.
21392 SDValue SignSet = DAG.getSetCC(
21393 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21394 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21395
21396 // Build a 64-bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21397 APInt FF(64, 0x5F80000000000000ULL);
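  // Illustrative note: 0x5F800000 is 2^64 as an IEEE single. If the stored
  // u64 had its sign bit set, FILD read it as (value - 2^64), so the code
  // below picks the f32 at offset 4 (the FF half, little-endian) to add
  // 2^64 back in; otherwise it picks the 0.0f at offset 0.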
21398 SDValue FudgePtr =
21399 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21400 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21401
21402 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21403 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21404 SDValue Four = DAG.getIntPtrConstant(4, dl);
21405 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21406 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21407
21408 // Load the value out, extending it from f32 to f80.
21409 SDValue Fudge = DAG.getExtLoad(
21410 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21411 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21412 CPAlignment);
21413 Chain = Fudge.getValue(1);
21414 // Extend everything to 80 bits to force it to be done on x87.
21415 // TODO: Are there any fast-math-flags to propagate here?
21416 if (IsStrict) {
21417 unsigned Opc = ISD::STRICT_FADD;
21418 // Windows needs the precision control changed to 80 bits around this add.
21419 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21420 Opc = X86ISD::STRICT_FP80_ADD;
21421
21422 SDValue Add =
21423 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21424 // STRICT_FP_ROUND can't handle equal types.
21425 if (DstVT == MVT::f80)
21426 return Add;
21427 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21428 {Add.getValue(1), Add,
21429 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
21430 }
21431 unsigned Opc = ISD::FADD;
21432 // Windows needs the precision control changed to 80 bits around this add.
21433 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21434 Opc = X86ISD::FP80_ADD;
21435
21436 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
21437 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21438 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21439}
21440
21441// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21442// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21443// just return an SDValue().
21444// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21445// to i16, i32 or i64, and we lower it to a legal sequence and return the
21446// result.
21447SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21448 bool IsSigned,
21449 SDValue &Chain) const {
21450 bool IsStrict = Op->isStrictFPOpcode();
21451 SDLoc DL(Op);
21452
21453 EVT DstTy = Op.getValueType();
21454 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21455 EVT TheVT = Value.getValueType();
21456 auto PtrVT = getPointerTy(DAG.getDataLayout());
21457
21458 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21459 // f16 must be promoted before using the lowering in this routine.
21460 // fp128 does not use this lowering.
21461 return SDValue();
21462 }
21463
21464 // If using FIST to compute an unsigned i64, we'll need some fixup
21465 // to handle values above the maximum signed i64. A FIST is always
21466 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21467 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21468
21469 // FIXME: This does not generate an invalid exception if the input does not
21470 // fit in i32. PR44019
21471 if (!IsSigned && DstTy != MVT::i64) {
21472 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21473 // The low 32 bits of the fist result will have the correct uint32 result.
21474 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21475 DstTy = MVT::i64;
21476 }
21477
21478 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21479 DstTy.getSimpleVT() >= MVT::i16 &&
21480 "Unknown FP_TO_INT to lower!");
21481
21482 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21483 // stack slot.
21484 MachineFunction &MF = DAG.getMachineFunction();
21485 unsigned MemSize = DstTy.getStoreSize();
21486 int SSFI =
21487 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21488 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21489
21490 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21491
21492 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21493
21494 if (UnsignedFixup) {
21495 //
21496 // Conversion to unsigned i64 is implemented with a select,
21497 // depending on whether the source value fits in the range
21498 // of a signed i64. Let Thresh be the FP equivalent of
21499 // 0x8000000000000000ULL.
21500 //
21501 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21502 // FltOfs = (Value >= Thresh) ? Thresh : 0;
21503 // FistSrc = (Value - FltOfs);
21504 // Fist-to-mem64 FistSrc
21505 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21506 // to XOR'ing the high 32 bits with Adjust.
21507 //
21508 // Being a power of 2, Thresh is exactly representable in all FP formats.
21509 // For X87 we'd like to use the smallest FP type for this constant, but
21510 // for DAG type consistency we have to match the FP operand type.
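    // Worked example (illustrative): converting the u64 value 2^63 + 42.
    // Value >= Thresh, so FltOfs = Thresh and FistSrc = 42.0, which the
    // FIST stores as 42; XOR'ing with Adjust = (1ull << 63) then yields
    // 2^63 + 42. The XOR equals the add because FistSrc < 2^63 leaves
    // bit 63 of the FIST result clear.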
21511
21512 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21513 [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
21514 bool LosesInfo = false;
21515 if (TheVT == MVT::f64)
21516 // The rounding mode is irrelevant as the conversion should be exact.
21517 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21518 &LosesInfo);
21519 else if (TheVT == MVT::f80)
21520 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21521 APFloat::rmNearestTiesToEven, &LosesInfo);
21522
21523 assert(Status == APFloat::opOK && !LosesInfo &&
21524 "FP conversion should have been exact");
21525
21526 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21527
21528 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21529 *DAG.getContext(), TheVT);
21530 SDValue Cmp;
21531 if (IsStrict) {
21532 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21533 /*IsSignaling*/ true);
21534 Chain = Cmp.getValue(1);
21535 } else {
21536 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21537 }
21538
21539 // Our preferred lowering of
21540 //
21541 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21542 //
21543 // is
21544 //
21545 // (Value >= Thresh) << 63
21546 //
21547 // but since we can get here after LegalOperations, DAGCombine might do the
21548 // wrong thing if we create a select. So, directly create the preferred
21549 // version.
21550 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21551 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21552 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21553
21554 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21555 DAG.getConstantFP(0.0, DL, TheVT));
21556
21557 if (IsStrict) {
21558 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21559 { Chain, Value, FltOfs });
21560 Chain = Value.getValue(1);
21561 } else
21562 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21563 }
21564
21565 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21566
21567 // FIXME This causes a redundant load/store if the SSE-class value is already
21568 // in memory, such as if it is on the callstack.
21569 if (isScalarFPTypeInSSEReg(TheVT)) {
21570 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21571 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21572 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21573 SDValue Ops[] = { Chain, StackSlot };
21574
21575 unsigned FLDSize = TheVT.getStoreSize();
21576 assert(FLDSize <= MemSize && "Stack slot not big enough");
21577 MachineMemOperand *MMO = MF.getMachineMemOperand(
21578 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21579 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21580 Chain = Value.getValue(1);
21581 }
21582
21583 // Build the FP_TO_INT*_IN_MEM
21584 MachineMemOperand *MMO = MF.getMachineMemOperand(
21585 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21586 SDValue Ops[] = { Chain, Value, StackSlot };
21587 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21588 DAG.getVTList(MVT::Other),
21589 Ops, DstTy, MMO);
21590
21591 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
21592 Chain = Res.getValue(1);
21593
21594 // If we need an unsigned fixup, XOR the result with adjust.
21595 if (UnsignedFixup)
21596 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21597
21598 return Res;
21599}
21600
21601 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21602 const X86Subtarget &Subtarget) {
21603 MVT VT = Op.getSimpleValueType();
21604 SDValue In = Op.getOperand(0);
21605 MVT InVT = In.getSimpleValueType();
21606 unsigned Opc = Op.getOpcode();
21607
21608 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21609 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21610 "Unexpected extension opcode");
21612 "Expected same number of elements");
21613 assert((VT.getVectorElementType() == MVT::i16 ||
21614 VT.getVectorElementType() == MVT::i32 ||
21615 VT.getVectorElementType() == MVT::i64) &&
21616 "Unexpected element type");
21617 assert((InVT.getVectorElementType() == MVT::i8 ||
21618 InVT.getVectorElementType() == MVT::i16 ||
21619 InVT.getVectorElementType() == MVT::i32) &&
21620 "Unexpected element type");
21621
21622