LLVM 23.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1// I
2//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
17#include "X86.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
24#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM, STI), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
305
306 if (!Subtarget.is64Bit()) {
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
352 if (Subtarget.is64Bit()) {
354 // Without SSE, i64->f64 goes through memory.
356 }
357 } else if (!Subtarget.is64Bit())
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
385 }
386 if (Subtarget.is64Bit())
391
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
537
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
549 }
550
551 if (!Subtarget.is64Bit())
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
569 }
570
573
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
581 else
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
593
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
640 };
641
642 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
643 // f16, f32 and f64 use SSE.
644 // Set up the FP register classes.
645 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
646 : &X86::FR16RegClass);
647 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
648 : &X86::FR32RegClass);
649 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
650 : &X86::FR64RegClass);
651
652 // Disable f32->f64 extload as we can only generate this in one instruction
653 // under optsize. So its easier to pattern match (fpext (load)) for that
654 // case instead of needing to emit 2 instructions for extload in the
655 // non-optsize case.
656 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
657
658 for (auto VT : { MVT::f32, MVT::f64 }) {
659 // Use ANDPD to simulate FABS.
661
662 // Use XORP to simulate FNEG.
664
665 // Use ANDPD and ORPD to simulate FCOPYSIGN.
667
668 // These might be better off as horizontal vector ops.
671
672 // We don't support sin/cos/fmod
676 }
677
678 // Half type will be promoted by default.
679 setF16Action(MVT::f16, Promote);
690
721
726
731
732 // Lower this to MOVMSK plus an AND.
735
736 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
737 (UseX87 || Is64Bit)) {
738 // Use SSE for f32, x87 for f64.
739 // Set up the FP register classes.
740 addRegisterClass(MVT::f32, &X86::FR32RegClass);
741 if (UseX87)
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743
744 // Use ANDPS to simulate FABS.
746
747 // Use XORP to simulate FNEG.
749
750 if (UseX87)
752
753 // Use ANDPS and ORPS to simulate FCOPYSIGN.
754 if (UseX87)
757
758 // We don't support sin/cos/fmod
762
763 if (UseX87) {
764 // Always expand sin/cos functions even though x87 has an instruction.
768 }
769 } else if (UseX87) {
770 // f32 and f64 in x87.
771 // Set up the FP register classes.
772 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
773 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
774
775 for (auto VT : { MVT::f32, MVT::f64 }) {
778
779 // Always expand sin/cos functions even though x87 has an instruction.
783 }
784 }
785
786 // Expand FP32 immediates into loads from the stack, save special cases.
787 if (isTypeLegal(MVT::f32)) {
788 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
789 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
790 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
791 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
792 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
793 } else // SSE immediates.
794 addLegalFPImmediate(APFloat(+0.0f)); // xorps
795 }
796 // Expand FP64 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f64)) {
798 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
799 addLegalFPImmediate(APFloat(+0.0)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0)); // xorpd
805 }
806 // Support fp16 0 immediate.
807 if (isTypeLegal(MVT::f16))
808 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
809
810 // Handle constrained floating-point operations of scalar.
823
824 // We don't support FMA.
827
828 // f80 always uses X87.
829 if (UseX87) {
830 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
833 {
835 addLegalFPImmediate(TmpFlt); // FLD0
836 TmpFlt.changeSign();
837 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
838
839 bool ignored;
840 APFloat TmpFlt2(+1.0);
842 &ignored);
843 addLegalFPImmediate(TmpFlt2); // FLD1
844 TmpFlt2.changeSign();
845 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
846 }
847
848 // Always expand sin/cos functions even though x87 has an instruction.
849 // clang-format off
861 // clang-format on
862
874
875 // Handle constrained floating-point operations of scalar.
882 if (isTypeLegal(MVT::f16)) {
885 } else {
887 }
888 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
889 // as Custom.
891 }
892
893 // f128 uses xmm registers, but most operations require libcalls.
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
895 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
896 : &X86::VR128RegClass);
897
898 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
899
910
914
915 // clang-format off
923 // clang-format on
924 // No STRICT_FSINCOS
927
930 // We need to custom handle any FP_ROUND with an f128 input, but
931 // LegalizeDAG uses the result type to know when to run a custom handler.
932 // So we have to list all legal floating point result types here.
933 if (isTypeLegal(MVT::f32)) {
936 }
937 if (isTypeLegal(MVT::f64)) {
940 }
941 if (isTypeLegal(MVT::f80)) {
945 }
946
948
949 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
951 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
952 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
953 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
954 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
955 }
956
957 // Always use a library call for pow.
958 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
959 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
960 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
961 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
962
971
972 // Some FP actions are always expanded for vector types.
973 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
976 // clang-format off
990 // clang-format on
991 }
992
993 // First set operation action for all vector types to either promote
994 // (for widening) or expand (for scalarization). Then we will selectively
995 // turn on ones that can be effectively codegen'd.
1035 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1036 setTruncStoreAction(InnerVT, VT, Expand);
1037
1038 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1039 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1040
1041 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1042 // types, we have to deal with them whether we ask for Expansion or not.
1043 // Setting Expand causes its own optimisation problems though, so leave
1044 // them legal.
1045 if (VT.getVectorElementType() == MVT::i1)
1046 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1047
1048 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1049 // split/scalarized right now.
1050 if (VT.getVectorElementType() == MVT::f16 ||
1051 VT.getVectorElementType() == MVT::bf16)
1052 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1053 }
1054 }
1055
1056 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1057 // with -msoft-float, disable use of MMX as well.
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1059 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1060 // No operations on x86mmx supported, everything uses intrinsics.
1061 }
1062
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1064 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1065 : &X86::VR128RegClass);
1066
1071
1072 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1073 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1081
1082 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1083 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1085
1091 }
1092
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1094 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096
1097 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1098 // registers cannot be used even for integer operations.
1099 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1100 : &X86::VR128RegClass);
1101 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1108 : &X86::VR128RegClass);
1109
1110 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1115 }
1116
1117 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1118 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1123 }
1124
1125 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1126 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1127 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1128
1129 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1130 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1131 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1132 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1133 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1134 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1135 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1136 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1137 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1138 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1141
1142 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1143 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1144 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1145
1146 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1148 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1150
1151 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1152 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1153
1154 setOperationAction(ISD::AND, MVT::i128, Custom);
1155 setOperationAction(ISD::OR, MVT::i128, Custom);
1156 setOperationAction(ISD::XOR, MVT::i128, Custom);
1157
1158 if (Subtarget.hasPCLMUL()) {
1159 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1162 }
1166 }
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1169 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1170 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1171 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1172 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1173 }
1174
1185
1190
1191 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1197
1198 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1199 // setcc all the way to isel and prefer SETGT in some isel patterns.
1202 }
1203
1204 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1205 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1210
1211 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1217 }
1218
1219 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1223
1224 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1225 continue;
1226
1229 }
1230 setF16Action(MVT::v8f16, Expand);
1231 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1232 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1233 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1234 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1235 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1236 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1238
1239 // Custom lower v2i64 and v2f64 selects.
1246
1253
1254 // Custom legalize these to avoid over promotion or custom promotion.
1255 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1260 }
1261
1266
1269
1272
1273 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1278
1283
1284 // We want to legalize this to an f64 load rather than an i64 load on
1285 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1286 // store.
1287 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1288 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1289 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1290 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1291 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1293
1294 // Add 32-bit vector stores to help vectorization opportunities.
1295 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1297
1301 if (!Subtarget.hasAVX512())
1303
1307
1309
1326
1327 // In the customized shift lowering, the legal v4i32/v2i64 cases
1328 // in AVX2 will be recognized.
1329 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1333 if (VT == MVT::v2i64) continue;
1338 }
1339
1345 }
1346
1347 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1352
1353 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1355 }
1356 }
1357
1358 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1359 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1360 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1361 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1362
1363 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1366 }
1367
1368 // These might be better off as horizontal vector ops.
1373 }
1374
1375 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1376 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1379 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1383 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1389
1391 }
1392
1393 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1394 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1395 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1396 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1397 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1398 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1399 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1400 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1401
1405
1406 // FIXME: Do we need to handle scalar-to-vector here?
1407 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1408 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1409
1410 // We directly match byte blends in the backend as they match the VSELECT
1411 // condition form.
1413
1414 // SSE41 brings specific instructions for doing vector sign extend even in
1415 // cases where we don't have SRA.
1416 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1419 }
1420
1421 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1422 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1423 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1424 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1425 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1426 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1427 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1428 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1429 }
1430
1431 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1432 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1433 // do the pre and post work in the vector domain.
1436 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1437 // so that DAG combine doesn't try to turn it into uint_to_fp.
1440 }
1441 }
1442
1443 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1445 }
1446
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1448 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1449 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1452 }
1453
1454 // XOP can efficiently perform BITREVERSE with VPPERM.
1455 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1457 }
1458
1459 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1460 bool HasInt256 = Subtarget.hasInt256();
1461
1462 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1463 : &X86::VR256RegClass);
1464 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1465 : &X86::VR256RegClass);
1466 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1467 : &X86::VR256RegClass);
1468 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1469 : &X86::VR256RegClass);
1470 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1471 : &X86::VR256RegClass);
1472 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1473 : &X86::VR256RegClass);
1474 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1475 : &X86::VR256RegClass);
1476
1477 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1490
1492
1496
1502 }
1503
1504 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1505 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1506
1507 setOperationAction(ISD::AND, MVT::i256, Custom);
1508 setOperationAction(ISD::OR, MVT::i256, Custom);
1509 setOperationAction(ISD::XOR, MVT::i256, Custom);
1510
1511 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1512 // even though v8i16 is a legal type.
1513 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1514 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1515 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1516 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1520
1527
1539
1540 if (!Subtarget.hasAVX512())
1542
1543 // In the customized shift lowering, the legal v8i32/v4i64 cases
1544 // in AVX2 will be recognized.
1545 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1551 if (VT == MVT::v4i64) continue;
1556 }
1557
1558 // These types need custom splitting if their input is a 128-bit vector.
1563
1567 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1568 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1571
1572 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1576 }
1577
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1588
1589 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1590 // setcc all the way to isel and prefer SETGT in some isel patterns.
1593 }
1594
1595 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1596 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1601
1602 if (Subtarget.hasAnyFMA()) {
1603 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1604 MVT::v2f64, MVT::v4f64 }) {
1607 }
1608 }
1609
1610 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1611 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1613 }
1614
1615 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1616 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1619
1620 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1621 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1622 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1623 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1624 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1625 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1626 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1628
1629 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1630 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1631
1632 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1633 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1634 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1635 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1636 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1637
1638 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1639 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1644 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1645 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1650
1651 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1652 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1653 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1654 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1655 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1656 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1657 }
1658
1659 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1662 }
1663
1664 if (HasInt256) {
1665 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1666 // when we have a 256bit-wide blend with immediate.
1669
1670 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1671 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1672 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1673 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1674 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1675 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1676 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1677 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1678 }
1679 }
1680
1681 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1682 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1683 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1685 }
1686
1687 // Extract subvector is special because the value type
1688 // (result) is 128-bit but the source is 256-bit wide.
1689 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1690 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1692 }
1693
1694 // Custom lower several nodes for 256-bit types.
1695 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1696 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1706 }
1707 setF16Action(MVT::v16f16, Expand);
1708 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1709 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1711 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1712 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1713 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1714 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1715
1716 if (HasInt256) {
1718
1719 // Custom legalize 2x32 to get a little better code.
1722
1723 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1724 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1726 }
1727 }
1728
1729 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1730 Subtarget.hasF16C()) {
1731 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1734 }
1735 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1738 }
1739 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1740 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1741 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1742 }
1743 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1744 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1745 }
1746
1747 // This block controls legalization of the mask vector sizes that are
1748 // available with AVX512. 512-bit vectors are in a separate block controlled
1749 // by useAVX512Regs.
1750 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1751 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1752 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1753 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1754 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1755 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1756
1760
1761 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1762 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1763 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1764 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1765 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1766 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1767 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1768 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1776
1777 // There is no byte sized k-register load or store without AVX512DQ.
1778 if (!Subtarget.hasDQI()) {
1779 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1780 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1781 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1782 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1783
1788 }
1789
1790 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1791 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1795 }
1796
1797 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1799
1800 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1804
1811 }
1812
1813 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1815 }
1816 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1817 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1820 }
1821 }
1822
1823 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1824 // elements. 512-bits can be disabled based on prefer-vector-width and
1825 // required-vector-width function attributes.
1826 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1827 bool HasBWI = Subtarget.hasBWI();
1828
1829 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1830 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1831 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1832 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1833 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1834 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1835 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1836
1837 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1838 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1839 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1840 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1841 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1842 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1843 if (HasBWI)
1844 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1845 }
1846
1847 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1858 }
1859 setOperationAction(ISD::LRINT, MVT::v16f32,
1860 Subtarget.hasDQI() ? Legal : Custom);
1861 setOperationAction(ISD::LRINT, MVT::v8f64,
1862 Subtarget.hasDQI() ? Legal : Custom);
1863 if (Subtarget.hasDQI())
1864 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1865
1866 setOperationAction(ISD::AND, MVT::i512, Custom);
1867 setOperationAction(ISD::OR, MVT::i512, Custom);
1868 setOperationAction(ISD::XOR, MVT::i512, Custom);
1869
1870 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1875 }
1876
1877 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1882 }
1883
1890
1902
1903 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1904 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1905 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1906 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1907 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1908 if (HasBWI)
1909 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1910
1911 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1912 // to 512-bit rather than use the AVX2 instructions so that we can use
1913 // k-masks.
1914 if (!Subtarget.hasVLX()) {
1915 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1916 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1919 }
1920 }
1921
1923 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1924 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1934
1935 if (HasBWI) {
1936 // Extends from v64i1 masks to 512-bit vectors.
1940 }
1941
1942 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1955
1957 }
1958
1959 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1962 }
1963
1964 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1965 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1966 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1967 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1968
1969 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1970 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1971 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1972 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1973
1974 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1975 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1976 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1977 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1978 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1979 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1980 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1981 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1982
1983 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1984 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1985
1986 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1996
1997 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1998 // setcc all the way to isel and prefer SETGT in some isel patterns.
2001 }
2002
2003 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
2004 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
2009
2010 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2017 }
2018
2019 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2020 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2021 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2023 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2024 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2025 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2026 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2031 }
2032
2033 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2034 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2035 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2036 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2037 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2038 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2039
2040 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2044 setOperationAction(Opc, MVT::v8i64, Custom);
2045
2046 if (Subtarget.hasDQI())
2047 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2048
2049 if (Subtarget.hasCDI()) {
2050 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2051 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2053 }
2054 } // Subtarget.hasCDI()
2055
2056 if (Subtarget.hasVPOPCNTDQ()) {
2057 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2059 }
2060
2061 // Extract subvector is special because the value type
2062 // (result) is 256-bit but the source is 512-bit wide.
2063 // 128-bit was made Legal under AVX1.
2064 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2065 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2067
2068 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2069 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2079 }
2080 setF16Action(MVT::v32f16, Expand);
2085 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2086 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2087 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2088
2089 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2094 }
2095 if (HasBWI) {
2096 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2099 }
2100 } else {
2101 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2102 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2103 }
2104
2105 if (Subtarget.hasVBMI2()) {
2106 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2109 }
2110
2111 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2112 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2113 }
2114
2115 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2116 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2118 }// useAVX512Regs
2119
2120 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2121 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2122 MVT::v4i64}) {
2123 setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom);
2124 setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom);
2125 }
2126 }
2127
2128 // This block controls legalization for operations that don't have
2129 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2130 // narrower widths.
2131 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2132 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2133 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2134 MVT::v16f32, MVT::v8f64})
2136
2137 // These operations are handled on non-VLX by artificially widening in
2138 // isel patterns.
2142
2143 if (Subtarget.hasDQI()) {
2144 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2145 // v2f32 UINT_TO_FP is already custom under SSE2.
2148 "Unexpected operation action!");
2149 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2154 }
2155
2156 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2162 }
2163
2164 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2167 }
2168
2169 // Custom legalize 2x32 to get a little better code.
2172
2173 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2174 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2176
2177 if (Subtarget.hasDQI()) {
2181 setOperationAction(Opc, MVT::v2i64, Custom);
2182 setOperationAction(Opc, MVT::v4i64, Custom);
2183 }
2184 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2185 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2186 }
2187
2188 if (Subtarget.hasCDI()) {
2189 for (auto VT : {MVT::i256, MVT::i512}) {
2190 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2191 continue;
2196 }
2197 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2199 }
2200 } // Subtarget.hasCDI()
2201
2202 if (Subtarget.hasVPOPCNTDQ()) {
2203 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2205 }
2206
2207 // We can try to convert vectors to different sizes to leverage legal
2208 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2209 // then specialize to Legal below.
2210 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2211 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2212 MVT::v16i16, MVT::v8i8})
2214
2215 // Legal vpcompress depends on various AVX512 extensions.
2216 // Legal in AVX512F
2217 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2219
2220 // Legal in AVX512F + AVX512VL
2221 if (Subtarget.hasVLX())
2222 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2223 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2225
2226 // Legal in AVX512F + AVX512VBMI2
2227 if (Subtarget.hasVBMI2())
2228 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2230
2231 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2232 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2233 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2235 }
2236
// This block controls legalization of v32i1/v64i1, which are available with
// AVX512BW.
2239 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2240 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2241 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2242
2243 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2254 }
2255
2256 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2258
2259 // Extends from v32i1 masks to 256-bit vectors.
2263
2264 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2265 MVT::v16f16, MVT::v8f16}) {
2266 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2267 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2268 }
2269
2270 // These operations are handled on non-VLX by artificially widening in
2271 // isel patterns.
2272 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2273
2274 if (Subtarget.hasBITALG()) {
2275 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2277 }
2278 }
2279
2280 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2281 auto setGroup = [&] (MVT VT) {
2292
2305
2307
2310
2316
2322
2326 };
2327
2328 // AVX512_FP16 scalar operations
2329 setGroup(MVT::f16);
2347
2350
2351 if (Subtarget.useAVX512Regs()) {
2352 setGroup(MVT::v32f16);
2358 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2365
2370 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2372 MVT::v32i16);
2373 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2375 MVT::v32i16);
2376 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2378 MVT::v32i16);
2379 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2381 MVT::v32i16);
2382
2386
2387 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2388 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2389
2394 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2395 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2396 }
2397
2402
2403 if (Subtarget.hasVLX()) {
2404 setGroup(MVT::v8f16);
2405 setGroup(MVT::v16f16);
2406
2417
2424
2425 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2428
2432
2433 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2434 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2435 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2436 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2437
2438 // Need to custom widen these to prevent scalarization.
2439 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2440 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2441
2446
2451 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2452 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2453 }
2454 }
2455
2456 if (!Subtarget.useSoftFloat() &&
2457 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2458 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2459 : &X86::VR128RegClass);
2460 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2461 : &X86::VR256RegClass);
2462 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2463 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2464 // Set the operation action Custom to do the customization later.
2467 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2468 setF16Action(VT, Expand);
2469 if (!Subtarget.hasBF16())
2475 }
2476 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2477 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2478 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2479 }
2480 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2481 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2483 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2484 }
2485
2486 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2487 Subtarget.useAVX512Regs()) {
2488 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2489 setF16Action(MVT::v32bf16, Expand);
2490 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2491 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2492 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2494 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2498 }
2499
2500 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2501 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2502 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2503 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2504 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2505 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2506 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2507 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2508 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2509 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2512 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2524 }
2525 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2528 }
2529 }
2530
2531 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2532 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2533 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2534 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2535 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2536 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2537
2538 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2539 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2540 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2541 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2542 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2543
2544 if (Subtarget.hasBWI()) {
2545 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2546 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2547 }
2548
2549 if (Subtarget.hasFP16()) {
2550 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2559 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2568 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2573 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2578 }
2579 }
2580
2581 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2582 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2583 }
2584
2585 // We want to custom lower some of our intrinsics.
2589 if (!Subtarget.is64Bit()) {
2591 }
2592
2593 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2594 // handle type legalization for these operations here.
2595 //
2596 // FIXME: We really should do custom legalization for addition and
2597 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2598 // than generic legalization for 64-bit multiplication-with-overflow, though.
2599 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2600 if (VT == MVT::i64 && !Subtarget.is64Bit())
2601 continue;
2602 // Add/Sub/Mul with overflow operations are custom lowered.
2609
2610 // Support carry in as value rather than glue.
2616 }
2617
2618 // Combine sin / cos into _sincos_stret if it is available.
2621
2622 if (Subtarget.isTargetWin64()) {
2623 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2624 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2625 setOperationAction(ISD::SREM, MVT::i128, Custom);
2626 setOperationAction(ISD::UREM, MVT::i128, Custom);
2635 }
2636
2637 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2638 // is. We should promote the value to 64-bits to solve this.
2639 // This is what the CRT headers do - `fmodf` is an inline header
2640 // function casting to f64 and calling `fmod`.
2641 if (Subtarget.is32Bit() &&
2642 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2643 // clang-format off
2644 for (ISD::NodeType Op :
2662 // TODO: Add ISD:::STRICT_FMODF too once implemented.
2663 ISD::FMODF})
2664 if (isOperationExpandOrLibCall(Op, MVT::f32))
2665 setOperationAction(Op, MVT::f32, Promote);
2666 // clang-format on
2667
2668 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2669 // it, but it's just a wrapper around ldexp.
2670 if (Subtarget.isOSWindows()) {
2672 if (isOperationExpand(Op, MVT::f32))
2673 setOperationAction(Op, MVT::f32, Promote);
2674 }
2675
2676 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
2677 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
2678 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
2679
2680 // We have target-specific dag combine patterns for the following nodes:
2691 ISD::SHL,
2692 ISD::SRA,
2693 ISD::SRL,
2694 ISD::OR,
2695 ISD::AND,
2701 ISD::ADD,
2704 ISD::FADD,
2705 ISD::FSUB,
2706 ISD::FNEG,
2707 ISD::FMA,
2711 ISD::SUB,
2712 ISD::LOAD,
2713 ISD::LRINT,
2715 ISD::MLOAD,
2716 ISD::STORE,
2733 ISD::SETCC,
2734 ISD::MUL,
2735 ISD::XOR,
2743 ISD::FSHL,
2744 ISD::FSHR,
2748
2749 computeRegisterProperties(Subtarget.getRegisterInfo());
2750
2751 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2753 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2755 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2757
2758 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2759 // that needs to benchmarked and balanced with the potential use of vector
2760 // load/store types (PR33329, PR33914).
2763
2764 // Default loop alignment, which can be overridden by -align-loops.
2766
2767 // An out-of-order CPU can speculatively execute past a predictable branch,
2768 // but a conditional move could be stalled by an expensive earlier operation.
2769 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2770 EnableExtLdPromotion = true;
2772
2774
2775 // Default to having -disable-strictnode-mutation on
2776 IsStrictFPEnabled = true;
2777}
2778
2779// This has so far only been implemented for 64-bit MachO.
2781 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2782}
2783
2785 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2786 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2787}
2788
2790 const SDLoc &DL) const {
2791 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2792 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2793 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2794 return SDValue(Node, 0);
2795}
2796
2799 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2800 !Subtarget.hasBWI())
2801 return TypeSplitVector;
2802
2803 // Since v8f16 is legal, widen anything over v4f16.
2804 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2805 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2806 VT.getVectorElementType() == MVT::f16)
2807 return TypeSplitVector;
2808
2809 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2810 VT.getVectorElementType() != MVT::i1)
2811 return TypeWidenVector;
2812
2814}
2815
2817 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2818 const LibcallLoweringInfo *libcallLowering) const {
2819 return X86::createFastISel(funcInfo, libInfo, libcallLowering);
2820}
2821
2822//===----------------------------------------------------------------------===//
2823// Other Lowering Hooks
2824//===----------------------------------------------------------------------===//
2825
2827 bool AssumeSingleUse, bool IgnoreAlignment) {
2828 if (!AssumeSingleUse && !Op.hasOneUse())
2829 return false;
2830 if (!ISD::isNormalLoad(Op.getNode()))
2831 return false;
2832
2833 // If this is an unaligned vector, make sure the target supports folding it.
2834 auto *Ld = cast<LoadSDNode>(Op.getNode());
2835 if (!IgnoreAlignment && !Subtarget.hasAVX() &&
2836 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2837 Ld->getAlign() < Align(16))
2838 return false;
2839
2840 // TODO: If this is a non-temporal load and the target has an instruction
2841 // for it, it should not be folded. See "useNonTemporalLoad()".
2842
2843 return true;
2844}
2845
2847 const X86Subtarget &Subtarget,
2848 bool AssumeSingleUse) {
2849 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2850 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2851 return false;
2852
2853 // We can not replace a wide volatile load with a broadcast-from-memory,
2854 // because that would narrow the load, which isn't legal for volatiles.
2855 auto *Ld = cast<LoadSDNode>(Op.getNode());
2856 return !Ld->isVolatile() ||
2857 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2858}
2859
  // Returns true if Op's only user - looking through single-use bitcasts - is
  // a normal store, i.e. the value could be folded into that store.
  if (!Op.hasOneUse())
    return false;
  // Peek through (oneuse) bitcast users
  SDNode *User = *Op->user_begin();
  while (User->getOpcode() == ISD::BITCAST) {
    if (!User->hasOneUse())
      return false;
    User = *User->user_begin();
  }
  return ISD::isNormalStore(User);
}
2872
  // Returns true if Op's sole user is a ZERO_EXTEND, meaning the producing
  // node could be merged with the extension.
  if (Op.hasOneUse()) {
    unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
  }
  return false;
}
2880
// Return true if it is cheap to bitcast this to a vector type.
static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget,
                              bool AssumeSingleUse = false) {
  // Already a vector (possibly behind bitcasts) - trivially cheap.
  if (peekThroughBitcasts(Op).getValueType().isVector())
    return true;
  // NOTE(review): a guard condition appears to be missing immediately before
  // this 'return true;' (likely lost in extraction) - verify upstream.
    return true;
  // Otherwise cheap only if the load itself is foldable; alignment is
  // irrelevant since we are just reinterpreting bits.
  return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
                          /*IgnoreAlignment=*/true);
}
2891
2892static bool isLogicOp(unsigned Opcode) {
2893 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2894 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2895}
2896
2897static bool isTargetShuffle(unsigned Opcode) {
2898 switch(Opcode) {
2899 default: return false;
2900 case X86ISD::BLENDI:
2901 case X86ISD::PSHUFB:
2902 case X86ISD::PSHUFD:
2903 case X86ISD::PSHUFHW:
2904 case X86ISD::PSHUFLW:
2905 case X86ISD::SHUFP:
2906 case X86ISD::INSERTPS:
2907 case X86ISD::EXTRQI:
2908 case X86ISD::INSERTQI:
2909 case X86ISD::VALIGN:
2910 case X86ISD::PALIGNR:
2911 case X86ISD::VSHLDQ:
2912 case X86ISD::VSRLDQ:
2913 case X86ISD::MOVLHPS:
2914 case X86ISD::MOVHLPS:
2915 case X86ISD::MOVSHDUP:
2916 case X86ISD::MOVSLDUP:
2917 case X86ISD::MOVDDUP:
2918 case X86ISD::MOVSS:
2919 case X86ISD::MOVSD:
2920 case X86ISD::MOVSH:
2921 case X86ISD::UNPCKL:
2922 case X86ISD::UNPCKH:
2923 case X86ISD::VBROADCAST:
2924 case X86ISD::VPERMILPI:
2925 case X86ISD::VPERMILPV:
2926 case X86ISD::VPERM2X128:
2927 case X86ISD::SHUF128:
2928 case X86ISD::VPERMIL2:
2929 case X86ISD::VPERMI:
2930 case X86ISD::VPPERM:
2931 case X86ISD::VPERMV:
2932 case X86ISD::VPERMV3:
2933 case X86ISD::VZEXT_MOVL:
2934 case X86ISD::COMPRESS:
2935 case X86ISD::EXPAND:
2936 return true;
2937 }
2938}
2939
2940static bool isTargetShuffleVariableMask(unsigned Opcode) {
2941 switch (Opcode) {
2942 default: return false;
2943 // Target Shuffles.
2944 case X86ISD::PSHUFB:
2945 case X86ISD::VPERMILPV:
2946 case X86ISD::VPERMIL2:
2947 case X86ISD::VPPERM:
2948 case X86ISD::VPERMV:
2949 case X86ISD::VPERMV3:
2950 return true;
2951 // 'Faux' Target Shuffles.
2952 case ISD::OR:
2953 case ISD::AND:
2954 case X86ISD::ANDNP:
2955 return true;
2956 }
2957}
2958
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  // NOTE(review): the declarations of MF/FuncInfo are not visible in this
  // chunk (extraction artifact); FuncInfo caches the frame index.
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    // The slot sits immediately below the incoming stack pointer, hence the
    // negative offset of one slot.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
2976
                                       bool HasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!HasSymbolicDisplacement)
    return true;

  // We can fold large offsets in the large code model because we always use
  // 64-bit offsets.
  if (CM == CodeModel::Large)
    return true;

  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets,
  // since they may be just off, but we may accept pretty large positive ones.
  if (CM == CodeModel::Kernel)
    return Offset >= 0;

  // For other non-large code models we assume that the latest small object is
  // 16MB before the end of the 31-bit boundary. We may also accept pretty
  // large negative constants knowing that all objects are in the positive
  // half of the address space.
  return Offset < 16 * 1024 * 1024;
}
3005
/// Return true if the condition is a signed integer comparison operation.
3007static bool isX86CCSigned(X86::CondCode X86CC) {
3008 switch (X86CC) {
3009 default:
3010 llvm_unreachable("Invalid integer condition!");
3011 case X86::COND_E:
3012 case X86::COND_NE:
3013 case X86::COND_B:
3014 case X86::COND_A:
3015 case X86::COND_BE:
3016 case X86::COND_AE:
3017 return false;
3018 case X86::COND_G:
3019 case X86::COND_GE:
3020 case X86::COND_L:
3021 case X86::COND_LE:
3022 return true;
3023 }
3024}
3025
  // One-to-one mapping from integer ISD condition codes to X86 condition
  // codes: signed predicates map to G/GE/L/LE, unsigned to A/AE/B/BE.
  switch (SetCCOpcode) {
  // clang-format off
  default: llvm_unreachable("Invalid integer condition!");
  case ISD::SETEQ: return X86::COND_E;
  case ISD::SETGT: return X86::COND_G;
  case ISD::SETGE: return X86::COND_GE;
  case ISD::SETLT: return X86::COND_L;
  case ISD::SETLE: return X86::COND_LE;
  case ISD::SETNE: return X86::COND_NE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  // clang-format on
  }
}
3043
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
                               bool isFP, SDValue &LHS, SDValue &RHS,
                               SelectionDAG &DAG) {
  if (!isFP) {
    // NOTE(review): the guard that declares RHSC (presumably a dyn_cast of
    // RHS to ConstantSDNode) is not visible in this chunk - verify upstream.
    if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
      // X > -1 -> X == 0, jump !sign.
      RHS = DAG.getConstant(0, DL, RHS.getValueType());
      return X86::COND_NS;
    }
    if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
      // X < 0 -> X == 0, jump on sign.
      return X86::COND_S;
    }
    if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
      // X >= 0 -> X == 0, jump on !sign.
      return X86::COND_NS;
    }
    if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
      // X < 1 -> X <= 0
      RHS = DAG.getConstant(0, DL, RHS.getValueType());
      return X86::COND_LE;
    }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These predicates are handled via their swapped forms below.
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  // clang-format off
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  // clang-format on
  }
}
3126
3127/// Is there a floating point cmov for the specific X86 condition code?
3128/// Current x86 isa includes the following FP cmov instructions:
3129/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3130static bool hasFPCMov(unsigned X86CC) {
3131 switch (X86CC) {
3132 default:
3133 return false;
3134 case X86::COND_B:
3135 case X86::COND_BE:
3136 case X86::COND_E:
3137 case X86::COND_P:
3138 case X86::COND_A:
3139 case X86::COND_AE:
3140 case X86::COND_NE:
3141 case X86::COND_NP:
3142 return true;
3143 }
3144}
3145
3146static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3147 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3148 VT.is512BitVector();
3149}
3150
                                           const CallBase &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  // Describe the memory access performed by a target memory intrinsic so the
  // DAG builder can attach the right MachineMemOperand.
  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  // NOTE(review): the IntrData lookup line is not visible in this chunk.
  if (!IntrData) {
    // Intrinsics without table data are handled explicitly here.
    switch (Intrinsic) {
    case Intrinsic::x86_aesenc128kl:
    case Intrinsic::x86_aesdec128kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(1);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesenc256kl:
    case Intrinsic::x86_aesdec256kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(1);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesencwide128kl:
    case Intrinsic::x86_aesdecwide128kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesencwide256kl:
    case Intrinsic::x86_aesdecwide256kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_cmpccxadd32:
    case Intrinsic::x86_cmpccxadd64:
    case Intrinsic::x86_atomic_bts:
    case Intrinsic::x86_atomic_btc:
    case Intrinsic::x86_atomic_btr: {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      unsigned Size = I.getType()->getScalarSizeInBits();
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
      Info.align = Align(Size);
      // NOTE(review): flag-setting lines (likely MOLoad|MOStore|MOVolatile)
      // are not visible in this chunk.
      return true;
    }
    case Intrinsic::x86_atomic_bts_rm:
    case Intrinsic::x86_atomic_btc_rm:
    case Intrinsic::x86_atomic_btr_rm: {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
      Info.align = Align(Size);
      // NOTE(review): flag-setting lines are not visible in this chunk.
      return true;
    }
    case Intrinsic::x86_aadd32:
    case Intrinsic::x86_aadd64:
    case Intrinsic::x86_aand32:
    case Intrinsic::x86_aand64:
    case Intrinsic::x86_aor32:
    case Intrinsic::x86_aor64:
    case Intrinsic::x86_axor32:
    case Intrinsic::x86_axor64:
    case Intrinsic::x86_atomic_add_cc:
    case Intrinsic::x86_atomic_sub_cc:
    case Intrinsic::x86_atomic_or_cc:
    case Intrinsic::x86_atomic_and_cc:
    case Intrinsic::x86_atomic_xor_cc: {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
      Info.align = Align(Size);
      // NOTE(review): flag-setting lines are not visible in this chunk.
      return true;
    }
    }
    return false;
  }

  switch (IntrData->Type) {
  // NOTE(review): the TRUNCATE_TO_MEM_VI8/VI16 case labels are not visible in
  // this chunk (extraction artifact).
  case TRUNCATE_TO_MEM_VI32: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    // NOTE(review): the ScalarVT declaration line is not visible here.
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  case GATHER:
  case GATHER_AVX2: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    // The effective element count is bounded by both data and index vectors.
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    break;
  }
  case SCATTER: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  default:
    return false;
  }

  return true;
}
3296
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
                                     bool ForCodeSize) const {
  // Compare bitwise against the registered list of legal immediates; bitwise
  // equality sidesteps FP comparison pitfalls (NaN, -0.0).
  for (const APFloat &FPImm : LegalFPImmediates)
    if (Imm.bitwiseIsEqual(FPImm))
      return true;
  return false;
}
3307
    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // Decide whether narrowing this load to NewVT is profitable.
  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");

  auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
    while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
      N = *N->user_begin();
    return N;
  };

  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
  // those uses are extracted directly into a store, then the extract + store
  // can be store-folded, or (4) any use will be used by legal full width
  // instruction. Then, it's probably not worth splitting the load.
  EVT VT = Load->getValueType(0);
  if ((VT.is256BitVector() || VT.is512BitVector()) &&
      !SDValue(Load, 0).hasOneUse()) {
    bool FullWidthUse = false;
    bool AllExtractStores = true;
    for (SDUse &Use : Load->uses()) {
      // Skip uses of the chain value. Result 0 of the node is the load value.
      if (Use.getResNo() != 0)
        continue;

      const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());

      // If this use is an extract + store, it's probably not worth splitting.
      if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          all_of(User->uses(), [&](const SDUse &U) {
            const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
            return Inner->getOpcode() == ISD::STORE;
          }))
        continue;

      AllExtractStores = false;

      // If any use is a full width legal/target bin op, then assume its legal
      // and won't split.
      if (isBinOp(User->getOpcode()) &&
          (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
           User->getOpcode() > ISD::BUILTIN_OP_END))
        FullWidthUse = true;
    }

    if (AllExtractStores)
      return false;

    // If we have a user that uses the full vector width, then this use is
    // only worth splitting if the offset isn't 0 (to avoid an
    // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
    if (FullWidthUse)
      return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
  }

  return true;
}
3372
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  // Only integers that fit in a 64-bit immediate are worth materializing
  // directly instead of loading from the constant pool.
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}
3384
  // If we are using XMM registers in the ABI and the condition of the select is
  // a floating-point compare and we have blendv or conditional move, then it is
  // cheaper to select instead of doing a cross-register move and creating a
  // load that depends on the compare result.
  // f128 is excluded: it is not compared in XMM-register flag form.
  bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
3393
  // Allow select-of-constants to become arithmetic except for AVX512 vectors.
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  if (VT.isVector() && Subtarget.hasAVX512())
    return false;

  return true;
}
3402
                                                     SDValue C) const {
  // Decide whether a multiply by splat-constant C should be decomposed into
  // shift + add/sub.
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Find the type this will be legalized to. Otherwise we might prematurely
  // convert this to shl+add/sub and then still have to type legalize those ops.
  // Another choice would be to defer the decision for illegal types until
  // after type legalization. But constant splat vectors of i64 can't make it
  // through type legalization on 32-bit targets so we would need to special
  // case vXi64.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // Multiply is a complex op with higher latency and lower throughput in
  // most implementations, sub-vXi32 vector multiplies are always fast,
  // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
  // is always going to be slow.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
      (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
    return false;

  // shl+add, shl+sub, shl+add+neg
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
3434
                                                unsigned Index) const {
  // NOTE(review): a legality pre-check condition preceding this early-out is
  // not visible in this chunk (extraction artifact) - verify upstream.
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  if (ResVT.getVectorElementType() == MVT::i1)
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
                          (Index == ResVT.getVectorNumElements()));

  // Otherwise only extracts at subvector-aligned indices are cheap.
  return (Index % ResVT.getVectorNumElements()) == 0;
}
3448
  unsigned Opc = VecOp.getOpcode();

  // Assume target opcodes can't be scalarized.
  // TODO - do we have any exceptions?
  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
    return false;

  // If the vector op is not supported, try to convert to scalar.
  EVT VecVT = VecOp.getValueType();
  // NOTE(review): the legality condition preceding this 'return true;' is not
  // visible in this chunk (extraction artifact) - verify upstream.
    return true;

  // If the vector op is supported, but the scalar op is not, the transform may
  // not be worthwhile.
  EVT ScalarVT = VecVT.getScalarType();
  return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
3467
                                             bool) const {
  // Overflow ops are formed only for scalars whose operation won't expand.
  // TODO: Allow vectors?
  if (VT.isVector())
    return false;
  return VT.isSimple() || !isOperationExpand(Opcode, VT);
}
3475
  // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
  // i32/i64 or can rely on BSF passthrough value.
  // The width check allows narrow types to be promoted to the native word
  // size, where a zero input can be handled without a branch.
  return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
         Subtarget.hasBitScanPassThrough() ||
         (!Ty->isVectorTy() &&
          Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
}
3484
  // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
  // passthrough value (no branch needed for the zero-input case).
  return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
         Subtarget.hasBitScanPassThrough();
}
3491
  // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
  // expensive than a straight movsd. On the other hand, it's important to
  // shrink long double fp constant since fldt is very slow.
  // (f80 constants are always shrunk for that reason.)
  return !Subtarget.hasSSE2() || VT == MVT::f80;
}
3498
  // f64/f32 live in SSE registers when the respective SSE level is available;
  // f16 is always handled in SSE registers.
  return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
         (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
}
3503
                                                  const SelectionDAG &DAG,
                                                  const MachineMemOperand &MMO) const {
  // Loading scalars into vXi1 mask vectors needs AVX512.
  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
      BitcastVT.getVectorElementType() == MVT::i1)
    return false;

  // i8 -> v8i1 load+bitcast needs DQI.
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
    return false;

  if (LoadVT.isVector() && BitcastVT.isVector()) {
    // If both types are legal vectors, it's always ok to convert them.
    // Don't convert to an illegal type.
    if (isTypeLegal(LoadVT))
      return isTypeLegal(BitcastVT);
  }

  // If we have a large vector type (even if illegal), don't bitcast to large
  // (illegal) scalar types. Better to load fewer vectors and extract.
  if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
      BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
    return false;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
3529
                                         const MachineFunction &MF) const {
  // Do not merge to float value size if no implicit float attribute is set.
  // NOTE(review): the original comment said "(128 bytes)" - given the bit
  // comparisons below, "bits" looks intended; verify upstream.
  bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);

  if (NoFloat) {
    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
    return (MemVT.getSizeInBits() <= MaxIntSize);
  }
  // Make sure we don't merge greater than our preferred vector
  // width.
  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
    return false;

  return true;
}
3547
  // Fast only when the subtarget advertises a fast LZCNT implementation.
  return Subtarget.hasFastLZCNT();
}
3551
                                                    const Instruction &AndI) const {
  // Unconditionally beneficial on x86 (test/and with immediate is cheap).
  return true;
}
3556
  EVT VT = Y.getValueType();

  // Vector andn handling lives elsewhere (see hasAndNot).
  if (VT.isVector())
    return false;

  // Scalar ANDN requires BMI.
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  // Plain constants can be folded into an AND-immediate instead; only opaque
  // constants (or non-constants) benefit from ANDN.
  return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
}
3572
  EVT VT = Y.getValueType();

  // Scalars defer to the compare-specific hook.
  if (!VT.isVector())
    return hasAndNotCompare(Y);

  // Vector.

  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
    return false;

  if (VT == MVT::v4i32)
    return true;

  // All other vector element types need SSE2 integer ops.
  return Subtarget.hasSSE2();
}
3589
  // Scalar integers can use the BT instruction family.
  return X.getValueType().isScalarInteger(); // 'bt'
}
3593
    unsigned OldShiftOpcode, unsigned NewShiftOpcode,
    SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  // NOTE(review): the base-class query heading this condition is not visible
  // in this chunk (extraction artifact) - verify upstream.
      X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // For scalars this transform is always beneficial.
  if (X.getValueType().isScalarInteger())
    return true;
  // If all the shift amounts are identical, then transform is beneficial even
  // with rudimentary SSE2 shifts.
  if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
    return true;
  // If we have AVX2 with its powerful shift operations, then it's also good.
  if (Subtarget.hasAVX2())
    return true;
  // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
  return NewShiftOpcode == ISD::SHL;
}
3616
    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
    const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
  if (!VT.isInteger())
    return ShiftOpc;

  bool PreferRotate = false;
  if (VT.isVector()) {
    // For vectors, if we have rotate instruction support, then it's definitely
    // best. Otherwise it's not clear what is best, so don't make changes.
    PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
                                             VT.getScalarType() == MVT::i64);
  } else {
    // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
    // rotate unless we have a zext mask+shr.
    PreferRotate = Subtarget.hasBMI2();
    if (!PreferRotate) {
      unsigned MaskBits =
          VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
      // A mask width of 8/16/32 corresponds to a free zero-extension.
      PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
    }
  }

  if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
    assert(AndMask.has_value() && "Null andmask when querying about shift+and");

    if (PreferRotate && MayTransformRotate)
      return ISD::ROTL;

    // If vector we don't really get much benefit swapping around constants.
    // Maybe we could check if the DAG has the flipped node already in the
    // future.
    if (VT.isVector())
      return ShiftOpc;

    // See if it is beneficial to swap the shift type.
    if (ShiftOpc == ISD::SHL) {
      // If the current setup has imm64 mask, then inverse will have
      // at least imm32 mask (or be zext i32 -> i64).
      if (VT == MVT::i64)
        return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
                                                  : ShiftOpc;

      // We can only benefit if req at least 7-bit for the mask. We
      // don't want to replace shl of 1,2,3 as they can be implemented
      // with lea/add.
      return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
    }

    if (VT == MVT::i64)
      // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
      // extremely efficient.
      return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;

    // Keep small shifts as shl so we can generate add/lea.
    return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
  }

  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
  // (PreferRotate will be set in the latter case).
  if (PreferRotate || !MayTransformRotate || VT.isVector())
    return ShiftOpc;

  // Non-vector type and we have a zext mask with SRL.
  return ISD::SRL;
}
3683
                                       const Value *Lhs,
                                       const Value *Rhs) const {
  using namespace llvm::PatternMatch;
  int BaseCost = BrMergingBaseCostThresh.getValue();
  // With CCMP, branches can be merged in a more efficient way.
  if (BaseCost >= 0 && Subtarget.hasCCMP())
    BaseCost += BrMergingCcmpBias;
  // a == b && a == c is a fast pattern on x86.
  if (BaseCost >= 0 && Opc == Instruction::And &&
      // NOTE(review): the PatternMatch conditions for this check are not
      // visible in this chunk (extraction artifact) - verify upstream.
    BaseCost += 1;

  // For OR conditions with EQ comparisons, prefer splitting into branches
  // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
  // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
  // comparisons (SLT, SGT) that can be optimized.
  if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
      // NOTE(review): the PatternMatch conditions for this check are not
      // visible in this chunk (extraction artifact) - verify upstream.
    return {-1, -1, -1};

  return {BaseCost, BrMergingLikelyBias.getValue(),
          BrMergingUnlikelyBias.getValue()};
}
3711
  // FP_EXTEND is the lone exception; every other opcode prefers this form.
  return N->getOpcode() != ISD::FP_EXTEND;
}
3715
                                                         const SDNode *N) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // TODO: Should we always create i64 masks? Or only folded immediates?
  EVT VT = N->getValueType(0);
  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
    // Only fold if the shift values are equal - so it folds to AND.
    // TODO - we should fold if either is a non-uniform vector but we don't do
    // the fold for non-splats yet.
    return N->getOperand(1) == N->getOperand(0).getOperand(1);
  }
  // NOTE(review): the fallback return (likely deferring to the base class) is
  // not visible in this chunk (extraction artifact) - verify upstream.
}
3734
  EVT VT = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  if (VT.isVector())
    return false;

  // Scalars up to the native register width prefer the shift pair.
  unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
  return VT.getScalarSizeInBits() <= MaxWidth;
}
3745
    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
  // NOTE(review): the condition head pairing with the next line is not
  // visible in this chunk (extraction artifact) - verify upstream.
      !Subtarget.isOSWindows())
  // NOTE(review): the call head and intermediate lines are missing as well.
                                                               ExpansionFactor);
}
3755
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  return isTypeLegal(VT);
}
3761
  // Prefer a legal integer type of the exact width for memcmp-style equality.
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  // NOTE(review): the fallback return (likely INVALID_SIMPLE_VALUE_TYPE) is
  // not visible in this chunk (extraction artifact) - verify upstream.
}
3781
3782/// Val is the undef sentinel value or equal to the specified value.
3783static bool isUndefOrEqual(int Val, int CmpVal) {
3784 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3785}
3786
3787/// Return true if every element in Mask is the undef sentinel value or equal to
3788/// the specified value.
3789static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3790 return llvm::all_of(Mask, [CmpVal](int M) {
3791 return (M == SM_SentinelUndef) || (M == CmpVal);
3792 });
3793}
3794
3795/// Return true if every element in Mask, beginning from position Pos and ending
3796/// in Pos+Size is the undef sentinel value or equal to the specified value.
3797static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3798 unsigned Size) {
3799 return llvm::all_of(Mask.slice(Pos, Size),
3800 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3801}
3802
3803/// Val is either the undef or zero sentinel value.
3804static bool isUndefOrZero(int Val) {
3805 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3806}
3807
3808/// Return true if every element in Mask, beginning from position Pos and ending
3809/// in Pos+Size is the undef sentinel value.
3810static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3811 return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef));
3812}
3813
3814/// Return true if the mask creates a vector whose lower half is undefined.
  unsigned NumElts = Mask.size();
  // Lower half = elements [0, NumElts/2).
  return isUndefInRange(Mask, 0, NumElts / 2);
}
3819
3820/// Return true if the mask creates a vector whose upper half is undefined.
  unsigned NumElts = Mask.size();
  // Upper half = elements [NumElts/2, NumElts).
  return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
}
3825
/// Return true if Val falls within the half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
3830
3831/// Return true if the value of any element in Mask falls within the specified
3832/// range (L, H].
3833static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3834 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3835}
3836
3837/// Return true if the value of any element in Mask is the zero sentinel value.
3838static bool isAnyZero(ArrayRef<int> Mask) {
3839 return llvm::any_of(Mask, equal_to(SM_SentinelZero));
3840}
3841
3842/// Return true if Val is undef or if its value falls within the
3843/// specified range (L, H].
3844static bool isUndefOrInRange(int Val, int Low, int Hi) {
3845 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3846}
3847
3848/// Return true if every element in Mask is undef or if its value
3849/// falls within the specified range (L, H].
3850static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3851 return llvm::all_of(
3852 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3853}
3854
3855/// Return true if Val is undef, zero or if its value falls within the
3856/// specified range (L, H].
3857static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3858 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3859}
3860
3861/// Return true if every element in Mask is undef, zero or if its value
3862/// falls within the specified range (L, H].
3863static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3864 return llvm::all_of(
3865 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3866}
3867
3868/// Return true if every element in Mask, is an in-place blend/select mask or is
3869/// undef.
3870[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3871 unsigned NumElts = Mask.size();
3872 for (auto [I, M] : enumerate(Mask))
3873 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3874 return false;
3875 return true;
3876}
3877
3878/// Return true if every element in Mask, beginning
3879/// from position Pos and ending in Pos + Size, falls within the specified
3880/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3881static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3882 unsigned Size, int Low, int Step = 1) {
3883 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3884 if (!isUndefOrEqual(Mask[i], Low))
3885 return false;
3886 return true;
3887}
3888
3889/// Return true if every element in Mask, beginning
3890/// from position Pos and ending in Pos+Size, falls within the specified
3891/// sequential range (Low, Low+Size], or is undef or is zero.
3893 unsigned Size, int Low,
3894 int Step = 1) {
3895 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3896 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3897 return false;
3898 return true;
3899}
3900
3901/// Return true if every element in Mask, beginning
3902/// from position Pos and ending in Pos+Size is undef or is zero.
3903static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3904 unsigned Size) {
3905 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3906}
3907
3908/// Return true if every element of a single input is referenced by the shuffle
3909/// mask. i.e. it just permutes them all.
3911 unsigned NumElts = Mask.size();
3912 APInt DemandedElts = APInt::getZero(NumElts);
3913 for (int M : Mask)
3914 if (isInRange(M, 0, NumElts))
3915 DemandedElts.setBit(M);
3916 return DemandedElts.isAllOnes();
3917}
3918
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  // Each widened element covers a pair of adjacent narrow elements.
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, its trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      // A zero sentinel paired with a real element cannot be widened.
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
3977
3979 const APInt &Zeroable,
3980 bool V2IsZero,
3981 SmallVectorImpl<int> &WidenedMask) {
3982 // Create an alternative mask with info about zeroable elements.
3983 // Here we do not set undef elements as zeroable.
3984 SmallVector<int, 64> ZeroableMask(Mask);
3985 if (V2IsZero) {
3986 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3987 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3988 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3989 ZeroableMask[i] = SM_SentinelZero;
3990 }
3991 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3992}
3993
3995 SmallVector<int, 32> WidenedMask;
3996 return canWidenShuffleElements(Mask, WidenedMask);
3997}
3998
3999// Attempt to narrow/widen shuffle mask until it matches the target number of
4000// elements.
4001static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
4002 SmallVectorImpl<int> &ScaledMask) {
4003 unsigned NumSrcElts = Mask.size();
4004 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4005 "Illegal shuffle scale factor");
4006
4007 // Narrowing is guaranteed to work.
4008 if (NumDstElts >= NumSrcElts) {
4009 int Scale = NumDstElts / NumSrcElts;
4010 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
4011 return true;
4012 }
4013
4014 // We have to repeat the widening until we reach the target size, but we can
4015 // split out the first widening as it sets up ScaledMask for us.
4016 if (canWidenShuffleElements(Mask, ScaledMask)) {
4017 while (ScaledMask.size() > NumDstElts) {
4018 SmallVector<int, 16> WidenedMask;
4019 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
4020 return false;
4021 ScaledMask = std::move(WidenedMask);
4022 }
4023 return true;
4024 }
4025
4026 return false;
4027}
4028
4029static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
4030 SmallVector<int, 32> ScaledMask;
4031 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
4032}
4033
4034// Helper to grow the shuffle mask for a larger value type.
4035// NOTE: This is different to scaleShuffleElements which is a same size type.
4036static void growShuffleMask(ArrayRef<int> SrcMask,
4037 SmallVectorImpl<int> &DstMask,
4038 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
4039 assert(DstMask.empty() && "Expected an empty shuffle mas");
4040 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
4041 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4042 unsigned NumSrcElts = SrcMask.size();
4043 DstMask.assign(SrcMask.begin(), SrcMask.end());
4044 for (int &M : DstMask) {
4045 if (M < 0)
4046 continue;
4047 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4048 }
4049 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
4050}
4051
4052/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4054 return isNullConstant(Elt) || isNullFPConstant(Elt);
4055}
4056
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  // Without a legal i64, build a twice-as-wide i32 vector and bitcast back.
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    // Negative values only mean undef when building a mask vector.
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    // In split mode emit a zero (or undef) upper 32-bit half per element.
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
4089
// Build a constant vector from raw APInt bit patterns; Undefs marks the
// elements that should be emitted as UNDEF instead. 64-bit elements are
// split into i32 pairs when i64 is not legal.
static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  // Without a legal i64, build a twice-as-wide i32 vector and bitcast back.
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  MVT EltIntVT = EltVT.changeTypeToInteger();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      // Split elements need two undef halves.
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Little-endian: low 32 bits first.
      Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
    } else {
      // Emit as integer and bitcast so FP element types reuse the same bits.
      Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
4125
4127 SelectionDAG &DAG, const SDLoc &dl) {
4128 APInt Undefs = APInt::getZero(Bits.size());
4129 return getConstVector(Bits, Undefs, VT, DAG, dl);
4130}
4131
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    // SSE1-only: v4f32 is the only legal 128-bit zero.
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.isFloatingPoint() &&
             TLI.isTypeLegal(VT.getVectorElementType())) {
    Vec = DAG.getConstantFP(+0.0, dl, VT);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // Mask vectors: zero constant directly (wide i1 vectors need BWI).
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    // Canonical integer form: all-zero <N x i32> bitcast to the result type.
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
4159
4160// Helper to determine if the ops are all the extracted subvectors come from a
4161// single source. If we allow commute they don't have to be in order (Lo/Hi).
4162static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4163 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4164 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4165 LHS.getValueType() != RHS.getValueType() ||
4166 LHS.getOperand(0) != RHS.getOperand(0))
4167 return SDValue();
4168
4169 SDValue Src = LHS.getOperand(0);
4170 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4171 return SDValue();
4172
4173 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4174 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4175 RHS.getConstantOperandAPInt(1) == NumElts) ||
4176 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4177 LHS.getConstantOperandAPInt(1) == NumElts))
4178 return Src;
4179
4180 return SDValue();
4181}
4182
// Extract a vectorWidth-bit chunk of Vec containing element IdxVal. IdxVal is
// rounded down to a chunk boundary, so it need not be aligned.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  // Result keeps the element type but holds only vectorWidth bits of elements.
  unsigned ResultNumElts =
      (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);

  assert(ResultVT.getSizeInBits() == vectorWidth &&
         "Illegal subvector extraction");

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  // Check if we're extracting the upper undef of a widening pattern.
  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
      Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
      isNullConstant(Vec.getOperand(2)))
    return DAG.getUNDEF(ResultVT);

  return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
}
4215
4216/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4217/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4218/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4219/// instructions or a simple subregister reference. Idx is an index in the
4220/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4221/// lowering EXTRACT_VECTOR_ELT operations easier.
4222static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4223 SelectionDAG &DAG, const SDLoc &dl) {
4225 Vec.getValueType().is512BitVector()) &&
4226 "Unexpected vector size!");
4227 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4228}
4229
4230/// Generate a DAG to grab 256-bits from a 512-bit vector.
4231static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4232 SelectionDAG &DAG, const SDLoc &dl) {
4233 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4234 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4235}
4236
4237static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4238 SelectionDAG &DAG, const SDLoc &dl,
4239 unsigned vectorWidth) {
4240 assert((vectorWidth == 128 || vectorWidth == 256) &&
4241 "Unsupported vector width");
4242 // Inserting UNDEF is Result
4243 if (Vec.isUndef())
4244 return Result;
4245
4246 // Insert the relevant vectorWidth bits.
4247 EVT VT = Vec.getValueType();
4248 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4249 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4250
4251 // This is the index of the first element of the vectorWidth-bit chunk
4252 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4253 IdxVal &= ~(ElemsPerChunk - 1);
4254 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4255}
4256
4257/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4258/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4259/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4260/// simple superregister reference. Idx is an index in the 128 bits
4261/// we want. It need not be aligned to a 128-bit boundary. That makes
4262/// lowering INSERT_VECTOR_ELT operations easier.
4263static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4264 SelectionDAG &DAG, const SDLoc &dl) {
4265 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4266 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4267}
4268
4269/// Widen a vector to a larger size with the same scalar type, with the new
4270/// elements either zero or undef.
4271static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4272 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4273 const SDLoc &dl) {
4274 EVT VecVT = Vec.getValueType();
4276 VecVT.getScalarType() == VT.getScalarType() &&
4277 "Unsupported vector widening type");
4278 // If the upper 128-bits of a build vector are already undef/zero, then try to
4279 // widen from the lower 128-bits.
4280 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4281 unsigned NumSrcElts = VecVT.getVectorNumElements();
4282 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4283 if (all_of(Hi, [&](SDValue V) {
4284 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4285 }))
4286 Vec = extract128BitVector(Vec, 0, DAG, dl);
4287 }
4288 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4289 : DAG.getUNDEF(VT);
4290 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4291}
4292
4293/// Widen a vector to a larger size with the same scalar type, with the new
4294/// elements either zero or undef.
4295static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4296 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4297 const SDLoc &dl, unsigned WideSizeInBits) {
4298 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4299 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4300 "Unsupported vector widening type");
4301 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4302 MVT SVT = Vec.getSimpleValueType().getScalarType();
4303 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4304 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4305}
4306
4307/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4308/// and bitcast with integer types.
4309static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4310 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4311 unsigned NumElts = VT.getVectorNumElements();
4312 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4313 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4314 return VT;
4315}
4316
4317/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4318/// bitcast with integer types.
4319static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4320 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4321 const SDLoc &dl) {
4322 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4323 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4324}
4325
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
// Returns true (with Ops filled) on success; on failure Ops stays empty.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
                             SelectionDAG &DAG) {
  assert(Ops.empty() && "Expected an empty ops vector");

  // Trivial case: an explicit concat already lists its parts.
  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  // Recognize half-width insertions that together describe a 2-way concat.
  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    // Only handle subvectors that are exactly half of the result.
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
      // insert_subvector(undef, x, lo)
      if (Idx == 0 && Src.isUndef()) {
        Ops.push_back(Sub);
        Ops.push_back(DAG.getUNDEF(SubVT));
        return true;
      }
      if (Idx == (VT.getVectorNumElements() / 2)) {
        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
            Src.getOperand(1).getValueType() == SubVT &&
            isNullConstant(Src.getOperand(2))) {
          // Attempt to recurse into inner (matching) concats.
          SDValue Lo = Src.getOperand(1);
          SDValue Hi = Sub;
          SmallVector<SDValue, 2> LoOps, HiOps;
          if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
              collectConcatOps(Hi.getNode(), HiOps, DAG) &&
              LoOps.size() == HiOps.size()) {
            Ops.append(LoOps);
            Ops.append(HiOps);
            return true;
          }
          Ops.push_back(Lo);
          Ops.push_back(Hi);
          return true;
        }
        // insert_subvector(x, extract_subvector(x, lo), hi)
        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
          Ops.append(2, Sub);
          return true;
        }
        // insert_subvector(undef, x, hi)
        if (Src.isUndef()) {
          Ops.push_back(DAG.getUNDEF(SubVT));
          Ops.push_back(Sub);
          return true;
        }
      }
    }
  }

  // An extract of a (wider) concat can be rewritten as a slice of its parts.
  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    EVT VT = N->getValueType(0);
    SDValue Src = N->getOperand(0);
    uint64_t Idx = N->getConstantOperandVal(1);

    // Collect all the subvectors from the source vector and slice off the
    // extraction.
    SmallVector<SDValue, 4> SrcOps;
    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
        VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
        (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
        (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
      unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
      unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
      Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
      return true;
    }
  }

  assert(Ops.empty() && "Expected an empty ops vector");
  return false;
}
4410
4411// Helper to check if \p V can be split into subvectors and the upper subvectors
4412// are all undef. In which case return the lower subvector.
4414 SelectionDAG &DAG) {
4415 SmallVector<SDValue> SubOps;
4416 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4417 return SDValue();
4418
4419 unsigned NumSubOps = SubOps.size();
4420 unsigned HalfNumSubOps = NumSubOps / 2;
4421 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4422
4423 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4424 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4425 return SDValue();
4426
4427 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4428 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4429 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4430}
4431
4432// Helper to check if we can access all the constituent subvectors without any
4433// extract ops.
4436 return collectConcatOps(V.getNode(), Ops, DAG);
4437}
4438
4439static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4440 const SDLoc &dl) {
4441 EVT VT = Op.getValueType();
4442 unsigned NumElems = VT.getVectorNumElements();
4443 unsigned SizeInBits = VT.getSizeInBits();
4444 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4445 "Can't split odd sized vector");
4446
4448 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4449 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4450 unsigned HalfOps = SubOps.size() / 2;
4451 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4452 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4453 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4454 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4455 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4456 return std::make_pair(Lo, Hi);
4457 }
4458
4459 // If this is a splat value (with no-undefs) then use the lower subvector,
4460 // which should be a free extraction.
4461 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4462 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4463 return std::make_pair(Lo, Lo);
4464
4465 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4466 return std::make_pair(Lo, Hi);
4467}
4468
4469/// Break an operation into 2 half sized ops and then concatenate the results.
4471 unsigned NumOps = Op.getNumOperands();
4472 EVT VT = Op.getValueType();
4473
4474 // Extract the LHS Lo/Hi vectors
4477 for (unsigned I = 0; I != NumOps; ++I) {
4478 SDValue SrcOp = Op.getOperand(I);
4479 if (!SrcOp.getValueType().isVector()) {
4480 LoOps[I] = HiOps[I] = SrcOp;
4481 continue;
4482 }
4483 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4484 }
4485
4486 EVT LoVT, HiVT;
4487 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4488 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4489 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4490 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4491}
4492
4493/// Break an unary integer operation into 2 half sized ops and then
4494/// concatenate the result back.
4496 const SDLoc &dl) {
4497 // Make sure we only try to split 256/512-bit types to avoid creating
4498 // narrow vectors.
4499 [[maybe_unused]] EVT VT = Op.getValueType();
4500 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4501 Op.getOperand(0).getValueType().is512BitVector()) &&
4502 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4503 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4504 VT.getVectorNumElements() &&
4505 "Unexpected VTs!");
4506 return splitVectorOp(Op, DAG, dl);
4507}
4508
4509/// Break a binary integer operation into 2 half sized ops and then
4510/// concatenate the result back.
4512 const SDLoc &dl) {
4513 // Assert that all the types match.
4514 [[maybe_unused]] EVT VT = Op.getValueType();
4515 assert(Op.getOperand(0).getValueType() == VT &&
4516 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4517 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4518 return splitVectorOp(Op, DAG, dl);
4519}
4520
4521// Helper for splitting operands of an operation to legal target size and
4522// apply a function on each part.
4523// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4524// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4525// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4526// The argument Builder is a function that will be applied on each split part:
4527// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4528template <typename F>
4530 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4531 F Builder, bool CheckBWI = true,
4532 bool AllowAVX512 = true) {
4533 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4534 unsigned NumSubs = 1;
4535 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4536 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4537 if (VT.getSizeInBits() > 512) {
4538 NumSubs = VT.getSizeInBits() / 512;
4539 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4540 }
4541 } else if (Subtarget.hasAVX2()) {
4542 if (VT.getSizeInBits() > 256) {
4543 NumSubs = VT.getSizeInBits() / 256;
4544 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4545 }
4546 } else {
4547 if (VT.getSizeInBits() > 128) {
4548 NumSubs = VT.getSizeInBits() / 128;
4549 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4550 }
4551 }
4552
4553 if (NumSubs == 1)
4554 return Builder(DAG, DL, Ops);
4555
4557 for (unsigned i = 0; i != NumSubs; ++i) {
4559 for (SDValue Op : Ops) {
4560 EVT OpVT = Op.getValueType();
4561 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4562 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4563 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4564 }
4565 Subs.push_back(Builder(DAG, DL, SubOps));
4566 }
4567 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4568}
4569
// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
// targets.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX512() && "AVX512 target expected");
  MVT SVT = VT.getScalarType();

  // If we have a 32/64 splatted constant, splat it to DstTy to
  // encourage a foldable broadcast'd operand.
  auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
    unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
    // AVX512 broadcasts 32/64-bit operands.
    // TODO: Support float once getAVX512Node is used by fp-ops.
    if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
        !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
      return SDValue();
    // If we're not widening, don't bother if we're not bitcasting.
    if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
      return SDValue();
    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
      APInt SplatValue, SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      // Only rebuild as a broadcastable constant when the splat exactly
      // matches the operand's element width with no undef lanes.
      if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, OpEltSizeInBits) &&
          !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
        return DAG.getConstant(SplatValue, DL, DstVT);
    }
    return SDValue();
  };

  // Without VLX, sub-512-bit ops must be performed in 512-bit registers.
  bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());

  MVT DstVT = VT;
  if (Widen)
    DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());

  // Canonicalize src operands.
  SmallVector<SDValue> SrcOps(Ops);
  for (SDValue &Op : SrcOps) {
    MVT OpVT = Op.getSimpleValueType();
    // Just pass through scalar operands.
    if (!OpVT.isVector())
      continue;
    assert(OpVT == VT && "Vector type mismatch");

    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
      Op = BroadcastOp;
      continue;
    }

    // Just widen the subvector by inserting into an undef wide vector.
    if (Widen)
      Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
  }

  SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);

  // Perform the 512-bit op then extract the bottom subvector.
  if (Widen)
    Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
  return Res;
}
4634
4635/// Insert i1-subvector to i1-vector.
// Strategy: widen the i1 mask vector to a width with native KSHIFT support,
// splice the subvector bits into place with KSHIFTL/KSHIFTR + OR (or a masked
// AND/OR where a scalar mask constant is cheaper), then extract back to the
// original width.
// NOTE(review): the first signature line (orig. 4636, carrying the function
// name and leading parameters) was lost in extraction; upstream this is
// insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &) --
// confirm against the real file before editing.
 4637 const X86Subtarget &Subtarget) {
 4638
 4639 SDLoc dl(Op);
 4640 SDValue Vec = Op.getOperand(0);
 4641 SDValue SubVec = Op.getOperand(1);
 4642 SDValue Idx = Op.getOperand(2);
 4643 unsigned IdxVal = Op.getConstantOperandVal(2);
 4644
 4645 // Inserting undef is a nop. We can just return the original vector.
 4646 if (SubVec.isUndef())
 4647 return Vec;
 4648
 4649 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
 4650 return Op;
 4651
 4652 MVT OpVT = Op.getSimpleValueType();
 4653 unsigned NumElems = OpVT.getVectorNumElements();
 4654 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
 4655
 4656 // Extend to natively supported kshift.
 4657 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
 4658
 4659 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
 4660 // if necessary.
 4661 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
 4662 // May need to promote to a legal type.
 4663 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
 4664 DAG.getConstant(0, dl, WideOpVT),
 4665 SubVec, Idx);
 4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
 4667 }
 4668
 4669 MVT SubVecVT = SubVec.getSimpleValueType();
 4670 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
 4671 assert(IdxVal + SubVecNumElems <= NumElems &&
 4672 IdxVal % SubVecVT.getSizeInBits() == 0 &&
 4673 "Unexpected index value in INSERT_SUBVECTOR");
 4674
 4675 SDValue Undef = DAG.getUNDEF(WideOpVT);
 4676
 4677 if (IdxVal == 0) {
// Insert at the bottom: clear the low SubVecNumElems bits of Vec with a
// shift-right/shift-left pair, zero-extend SubVec, and OR them together.
 4678 // Zero lower bits of the Vec
 4679 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
 4680 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
 4681 ZeroIdx);
 4682 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
 4683 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
 4684 // Merge them together, SubVec should be zero extended.
 4685 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
 4686 DAG.getConstant(0, dl, WideOpVT),
 4687 SubVec, ZeroIdx);
 4688 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
 4689 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
 4690 }
 4691
// Widen SubVec (undef upper bits) so the kshifts below operate at the
// natively supported width.
 4692 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
 4693 Undef, SubVec, ZeroIdx);
 4694
 4695 if (Vec.isUndef()) {
 4696 assert(IdxVal != 0 && "Unexpected index");
 4697 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4698 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
 4699 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
 4700 }
 4701
// NOTE(review): a guard line (orig. 4702) was stripped by extraction here;
// the indentation and the extra closing brace at orig. 4720 indicate the
// following region is the body of a conditional (upstream: the
// "Vec is all-zeros" case, i.e. if (ISD::isBuildVectorAllZeros(Vec.getNode()))
// { ) -- confirm before editing.
 4703 assert(IdxVal != 0 && "Unexpected index");
 4704 // If upper elements of Vec are known undef, then just shift into place.
 4705 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
 4706 [](SDValue V) { return V.isUndef(); })) {
 4707 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4708 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
 4709 } else {
// Shift left to the top of the wide type, then right back down to the
// insert position; this zeros both the bits above and below SubVec.
 4710 NumElems = WideOpVT.getVectorNumElements();
 4711 unsigned ShiftLeft = NumElems - SubVecNumElems;
 4712 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
 4713 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4714 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
 4715 if (ShiftRight != 0)
 4716 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
 4717 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
 4718 }
 4719 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
 4720 }
 4721
 4722 // Simple case when we put subvector in the upper part
 4723 if (IdxVal + SubVecNumElems == NumElems) {
 4724 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4725 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
 4726 if (SubVecNumElems * 2 == NumElems) {
 4727 // Special case, use legal zero extending insert_subvector. This allows
 4728 // isel to optimize when bits are known zero.
 4729 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
 4730 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
 4731 DAG.getConstant(0, dl, WideOpVT),
 4732 Vec, ZeroIdx);
 4733 } else {
 4734 // Otherwise use explicit shifts to zero the bits.
 4735 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
 4736 Undef, Vec, ZeroIdx);
 4737 NumElems = WideOpVT.getVectorNumElements();
 4738 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
 4739 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
 4740 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
 4741 }
 4742 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
 4743 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
 4744 }
 4745
 4746 // Inserting into the middle is more complicated.
 4747
 4748 NumElems = WideOpVT.getVectorNumElements();
 4749
 4750 // Widen the vector if needed.
 4751 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
 4752
 4753 unsigned ShiftLeft = NumElems - SubVecNumElems;
 4754 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
 4755
 4756 // Do an optimization for the most frequently used types.
// Fast path: when a scalar integer constant of NumElems bits is legal
// (v64i1 masks need 64-bit immediates, hence the is64Bit check), clear the
// insertion window in Vec with a single AND mask instead of four kshifts.
 4757 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
 4758 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
 4759 Mask0.flipAllBits();
 4760 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
 4761 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
 4762 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
 4763 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4764 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
 4765 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
 4766 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
 4767 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
 4768
 4769 // Reduce to original width if needed.
 4770 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
 4771 }
 4772
 4773 // Clear the upper bits of the subvector and move it to its insert position.
 4774 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
 4775 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
 4776 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
 4777 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
 4778
 4779 // Isolate the bits below the insertion point.
 4780 unsigned LowShift = NumElems - IdxVal;
 4781 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
 4782 DAG.getTargetConstant(LowShift, dl, MVT::i8));
 4783 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
 4784 DAG.getTargetConstant(LowShift, dl, MVT::i8));
 4785
 4786 // Isolate the bits after the last inserted bit.
 4787 unsigned HighShift = IdxVal + SubVecNumElems;
 4788 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
 4789 DAG.getTargetConstant(HighShift, dl, MVT::i8));
 4790 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
 4791 DAG.getTargetConstant(HighShift, dl, MVT::i8));
 4792
 4793 // Now OR all 3 pieces together.
 4794 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
 4795 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
 4796
 4797 // Reduce to original width if needed.
 4798 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
 4799}
4800
// Concatenate two equal-typed subvectors V1 and V2 into one double-width
// vector: V1 occupies elements [0, SubNumElts), V2 occupies
// [SubNumElts, 2*SubNumElts).
// NOTE(review): the opening signature line (orig. 4801, carrying the function
// name and leading parameters) was lost in extraction; upstream this is
// concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &)
// -- confirm against the real file.
 4802 const SDLoc &dl) {
 4803 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
 4804 EVT SubVT = V1.getValueType();
 4805 EVT SubSVT = SubVT.getScalarType();
 4806 unsigned SubNumElts = SubVT.getVectorNumElements();
 4807 unsigned SubVectorWidth = SubVT.getSizeInBits();
// Result type: same scalar type, twice the element count.
 4808 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
 4809 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
 4810 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
 4811}
4812
4813/// Returns a vector of specified type with all bits set.
4814/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4815/// Then bitcast to their original type, ensuring they get CSE'd.
4816static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4817 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4818 "Expected a 128/256/512-bit vector type");
4819 unsigned NumElts = VT.getSizeInBits() / 32;
4820 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4821 return DAG.getBitcast(VT, Vec);
4822}
4823
// Create an extension of \p In to \p VT, choosing between the general
// ANY/SIGN/ZERO_EXTEND opcodes and their *_EXTEND_VECTOR_INREG forms
// depending on whether the element counts match, and shrinking oversized
// inputs so only the consumed low subvector is used.
4824static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4825 SDValue In, SelectionDAG &DAG) {
4826 EVT InVT = In.getValueType();
4827 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4828
 4829 // Canonicalize Opcode to general extension version.
// NOTE(review): each case below originally had a second fall-through label
// (orig. lines 4832/4836/4840: case ISD::ANY/SIGN/ZERO_EXTEND_VECTOR_INREG:)
// that was stripped by extraction -- confirm against the real file.
 4830 switch (Opcode) {
 4831 case ISD::ANY_EXTEND:
 4833 Opcode = ISD::ANY_EXTEND;
 4834 break;
 4835 case ISD::SIGN_EXTEND:
 4837 Opcode = ISD::SIGN_EXTEND;
 4838 break;
 4839 case ISD::ZERO_EXTEND:
 4841 Opcode = ISD::ZERO_EXTEND;
 4842 break;
 4843 default:
 4844 llvm_unreachable("Unknown extension opcode");
 4845 }
 4846
 4847 // For 256-bit vectors, we only need the lower (128-bit) input half.
 4848 // For 512-bit vectors, we only need the lower input half or quarter.
 4849 if (InVT.getSizeInBits() > 128) {
 4850 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
 4851 "Expected VTs to be the same size!");
 4852 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
 4853 In = extractSubVector(In, 0, DAG, DL,
 4854 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
 4855 InVT = In.getValueType();
 4856 }
 4857
// Element counts differ => only the low input elements are extended, which
// is exactly the *_EXTEND_VECTOR_INREG semantics.
 4858 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
 4859 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
 4860
 4861 return DAG.getNode(Opcode, DL, VT, In);
 4862}
4863
4864// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
// i.e. pick bits from LHS where Mask is set, from RHS where Mask is clear.
// NOTE(review): the opening signature line (orig. 4865) was lost in
// extraction; upstream this is getBitSelect(const SDLoc &DL, MVT VT,
// SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG) -- confirm.
 4866 SDValue Mask, SelectionDAG &DAG) {
 4867 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
// X86ISD::ANDNP computes (~Mask) & RHS, so no explicit NOT node is needed.
 4868 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
 4869 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
 4870}
4871
// Build the shuffle mask for an x86 unpack (PUNPCKL/H-style) operation:
// within each 128-bit lane, interleave elements from the low (Lo=true) or
// high (Lo=false) half; Unary selects whether both inputs are the same
// vector or the second half comes from operand 1.
// NOTE(review): the opening signature line (orig. 4872) was lost in
// extraction; upstream this is createUnpackShuffleMask(EVT VT,
// SmallVectorImpl<int> &Mask, bool Lo, bool Unary) -- confirm.
 4873 bool Lo, bool Unary) {
 4874 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
 4875 "Illegal vector type to unpack");
 4876 assert(Mask.empty() && "Expected an empty shuffle mask vector");
 4877 int NumElts = VT.getVectorNumElements();
 4878 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
 4879 for (int i = 0; i < NumElts; ++i) {
 4880 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
// Base position: pairs of output elements map to consecutive inputs.
 4881 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
// Odd outputs come from the second operand in the binary (non-unary) form.
 4882 Pos += (Unary ? 0 : NumElts * (i % 2));
// Hi unpack reads from the upper half of each lane.
 4883 Pos += (Lo ? 0 : NumEltsInLane / 2);
 4884 Mask.push_back(Pos);
 4885 }
 4886}
4887
4888/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4889/// imposed by AVX and specific to the unary pattern. Example:
4890/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4891/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
// NOTE(review): the opening signature line (orig. 4892) was lost in
// extraction; upstream this is createSplat2ShuffleMask(MVT VT,
// SmallVectorImpl<int> &Mask, bool Lo) -- confirm.
 4893 bool Lo) {
 4894 assert(Mask.empty() && "Expected an empty shuffle mask vector");
 4895 int NumElts = VT.getVectorNumElements();
 4896 for (int i = 0; i < NumElts; ++i) {
// Each source element is duplicated into two adjacent output slots.
 4897 int Pos = i / 2;
 4898 Pos += (Lo ? 0 : NumElts / 2);
 4899 Mask.push_back(Pos);
 4900 }
 4901}
4902
4903// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4904static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4905 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
// NOTE(review): the condition guarding this constant-fold block (orig. lines
// 4906-4907) was stripped by extraction; upstream it checks that V1 (and V2,
// if used) are build vectors of constants -- confirm before editing. When it
// holds, the shuffle is folded directly into a BUILD_VECTOR.
 4908 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
 4909 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
 4910 int M = Mask[I];
// Negative mask entries mean undef: leave the prefilled UNDEF element.
 4911 if (M < 0)
 4912 continue;
// Indices [0, NumElts) select from V1; [NumElts, 2*NumElts) from V2.
 4913 SDValue V = (M < NumElts) ? V1 : V2;
 4914 if (V.isUndef())
 4915 continue;
 4916 Ops[I] = V.getOperand(M % NumElts);
 4917 }
 4918 return DAG.getBuildVector(VT, dl, Ops);
 4919 }
 4920
 4921 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4922}
4923
4924/// Returns a vector_shuffle node for an unpackl operation.
4925static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4926 SDValue V1, SDValue V2) {
// NOTE(review): the local mask declaration (orig. 4927, upstream
// SmallVector<int, 8> Mask;) was stripped by extraction -- confirm.
 4928 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
 4929 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4930}
4931
4932/// Returns a vector_shuffle node for an unpackh operation.
4933static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4934 SDValue V1, SDValue V2) {
// NOTE(review): the local mask declaration (orig. 4935, upstream
// SmallVector<int, 8> Mask;) was stripped by extraction -- confirm.
 4936 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
 4937 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4938}
4939
4940/// Returns a node that packs the LHS + RHS nodes together at half width.
4941/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4942/// TODO: Add subvector splitting if/when we have a need for it.
4943static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4944 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4945 bool PackHiHalf = false) {
4946 MVT OpVT = LHS.getSimpleValueType();
4947 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4948 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4949 assert(OpVT == RHS.getSimpleValueType() &&
4950 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4951 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4952 "Unexpected PACK operand types");
4953 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4954 "Unexpected PACK result type");
4955
4956 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4957 if (EltSizeInBits == 32) {
4958 SmallVector<int> PackMask;
4959 int Offset = PackHiHalf ? 1 : 0;
4960 int NumElts = VT.getVectorNumElements();
4961 for (int I = 0; I != NumElts; I += 4) {
4962 PackMask.push_back(I + Offset);
4963 PackMask.push_back(I + Offset + 2);
4964 PackMask.push_back(I + Offset + NumElts);
4965 PackMask.push_back(I + Offset + NumElts + 2);
4966 }
4967 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4968 DAG.getBitcast(VT, RHS), PackMask);
4969 }
4970
4971 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4972 if (!PackHiHalf) {
4973 if (UsePackUS &&
4974 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4975 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4976 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4977
4978 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4979 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4980 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4981 }
4982
4983 // Fallback to sign/zero extending the requested half and pack.
4984 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4985 if (UsePackUS) {
4986 if (PackHiHalf) {
4987 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4988 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4989 } else {
4990 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4991 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4992 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4993 };
4994 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4995 };
4996
4997 if (!PackHiHalf) {
4998 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4999 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
5000 }
5001 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
5002 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
5003 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5004}
5005
5006/// Return a vector_shuffle of the specified vector of zero or undef vector.
5007/// This produces a shuffle where the low element of V2 is swizzled into the
5008/// zero/undef vector, landing at element Idx.
5009/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
// NOTE(review): the opening signature line (orig. 5010) was lost in
// extraction; upstream this is getShuffleVectorZeroOrUndef(SDValue V2,
// int Idx, bool IsZero, const X86Subtarget &, SelectionDAG &) -- confirm.
 5011 bool IsZero,
 5012 const X86Subtarget &Subtarget,
 5013 SelectionDAG &DAG) {
 5014 MVT VT = V2.getSimpleValueType();
// V1 is the background vector: all-zeros or undef depending on IsZero.
 5015 SDValue V1 = IsZero
 5016 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
 5017 int NumElems = VT.getVectorNumElements();
 5018 SmallVector<int, 16> MaskVec(NumElems);
 5019 for (int i = 0; i != NumElems; ++i)
 5020 // If this is the insertion idx, put the low elt of V2 here.
 5021 MaskVec[i] = (i == Idx) ? NumElems : i;
 5022 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5023}
5024
// Peek through an X86 wrapper node around a pointer and return the
// constant-pool node it addresses, or null if it is not a constant pool.
// NOTE(review): the signature line (orig. 5025) and the second half of the
// condition (orig. 5027, presumably "Ptr.getOpcode() == X86ISD::WrapperRIP)")
// were lost in extraction -- confirm against the real file.
 5026 if (Ptr.getOpcode() == X86ISD::Wrapper ||
 5028 Ptr = Ptr.getOperand(0);
 5029 return dyn_cast<ConstantPoolSDNode>(Ptr);
5030}
5031
5032// TODO: Add support for non-zero offsets.
// Return the IR Constant addressed by a (wrapped) constant-pool pointer, or
// null if the pointer is not a plain zero-offset constant-pool reference.
// NOTE(review): the signature line (orig. 5033-5034) was lost in extraction;
// upstream this is getTargetConstantFromBasePtr(SDValue Ptr), assigning
// "CNode" from the constant-pool peek helper above -- confirm.
 5035 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
 5036 return nullptr;
 5037 return CNode->getConstVal();
5038}
5039
// Return the IR Constant a (normal, non-extending/indexed) load reads from
// the constant pool, or null.
// NOTE(review): the signature line (orig. 5040) was lost in extraction;
// upstream this is getTargetConstantFromNode(LoadSDNode *Load) -- confirm.
 5041 if (!Load || !ISD::isNormalLoad(Load))
 5042 return nullptr;
 5043 return getTargetConstantFromBasePtr(Load->getBasePtr());
5044}
5045
5050
5051const Constant *
// Public wrapper: expose constant-pool load inspection to target-independent
// DAG combines.
// NOTE(review): the qualified name line (orig. 5052,
// X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {) was
// lost in extraction -- confirm.
 5053 assert(LD && "Unexpected null LoadSDNode");
 5054 return getTargetConstantFromNode(LD);
5055}
5056
// Predicate: true for an AVX512 VSELECT with an i1 condition vector whose
// false operand is all-zeros. Used to block folds that would flip the
// condition, since (vselect C, X, 0) maps directly to a zero-masked op.
// NOTE(review): the signature line (orig. 5057, carrying the function name
// and parameters N / Subtarget) was lost in extraction -- confirm the
// upstream name before referencing it.
 5058 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
 5059 SDValue Cond = N->getOperand(0);
 5060 SDValue RHS = N->getOperand(2);
 5061 EVT CondVT = Cond.getValueType();
 5062 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
 5063 CondVT.getVectorElementType() == MVT::i1 &&
 5064 ISD::isBuildVectorAllZeros(RHS.getNode());
5065}
5066
5067// Extract raw constant bits from constant pools.
// Decompose \p Op into NumElts = SizeInBits/EltSizeInBits constant elements
// of EltSizeInBits bits each, reporting per-element undef state in
// \p UndefElts and the bit values in \p EltBits. Handles scalar constants,
// build vectors, constant-pool loads, broadcasts, insert/extract_subvector
// and shuffles of constants. AllowWholeUndefs/AllowPartialUndefs control
// whether fully/partially undef result elements are tolerated.
// Returns false if the constant bits cannot be determined.
5068static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5069 APInt &UndefElts,
5070 SmallVectorImpl<APInt> &EltBits,
5071 bool AllowWholeUndefs = true,
5072 bool AllowPartialUndefs = false) {
 5073 assert(EltBits.empty() && "Expected an empty EltBits vector");
 5074
// NOTE(review): a statement was stripped here by extraction (orig. 5075/5076;
// upstream: Op = peekThroughBitcasts(Op);) -- confirm before editing.
 5076
 5077 EVT VT = Op.getValueType();
 5078 unsigned SizeInBits = VT.getSizeInBits();
 5079 unsigned NumElts = SizeInBits / EltSizeInBits;
 5080
 5081 // Can't split constant.
 5082 if ((SizeInBits % EltSizeInBits) != 0)
 5083 return false;
 5084
 5085 // Bitcast a source array of element bits to the target size.
 5086 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
 5087 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
 5088 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
 5089 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
 5090 "Constant bit sizes don't match");
 5091
 5092 // Don't split if we don't allow undef bits.
 5093 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
 5094 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
 5095 return false;
 5096
 5097 // If we're already the right size, don't bother bitcasting.
 5098 if (NumSrcElts == NumElts) {
 5099 UndefElts = UndefSrcElts;
 5100 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
 5101 return true;
 5102 }
 5103
 5104 // Extract all the undef/constant element data and pack into single bitsets.
 5105 APInt UndefBits(SizeInBits, 0);
 5106 APInt MaskBits(SizeInBits, 0);
 5107
 5108 for (unsigned i = 0; i != NumSrcElts; ++i) {
 5109 unsigned BitOffset = i * SrcEltSizeInBits;
 5110 if (UndefSrcElts[i])
 5111 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
 5112 MaskBits.insertBits(SrcEltBits[i], BitOffset);
 5113 }
 5114
 5115 // Split the undef/constant single bitset data into the target elements.
 5116 UndefElts = APInt(NumElts, 0);
 5117 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
 5118
 5119 for (unsigned i = 0; i != NumElts; ++i) {
 5120 unsigned BitOffset = i * EltSizeInBits;
 5121 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
 5122
 5123 // Only treat an element as UNDEF if all bits are UNDEF.
 5124 if (UndefEltBits.isAllOnes()) {
 5125 if (!AllowWholeUndefs)
 5126 return false;
 5127 UndefElts.setBit(i);
 5128 continue;
 5129 }
 5130
 5131 // If only some bits are UNDEF then treat them as zero (or bail if not
 5132 // supported).
 5133 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
 5134 return false;
 5135
 5136 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
 5137 }
 5138 return true;
 5139 };
 5140
 5141 // Collect constant bits and insert into mask/undef bit masks.
 5142 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
 5143 unsigned UndefBitIndex) {
 5144 if (!Cst)
 5145 return false;
 5146 if (isa<UndefValue>(Cst)) {
 5147 Undefs.setBit(UndefBitIndex);
 5148 return true;
 5149 }
 5150 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
 5151 Mask = CInt->getValue();
 5152 return true;
 5153 }
 5154 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
 5155 Mask = CFP->getValueAPF().bitcastToAPInt();
 5156 return true;
 5157 }
 5158 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
 5159 Type *Ty = CDS->getType();
 5160 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
 5161 Type *EltTy = CDS->getElementType();
 5162 bool IsInteger = EltTy->isIntegerTy();
 5163 bool IsFP =
 5164 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
 5165 if (!IsInteger && !IsFP)
 5166 return false;
 5167 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
 5168 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
 5169 if (IsInteger)
 5170 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
 5171 else
 5172 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
 5173 I * EltBits);
 5174 return true;
 5175 }
 5176 return false;
 5177 };
 5178
 5179 // Handle UNDEFs.
 5180 if (Op.isUndef()) {
 5181 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
 5182 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
 5183 return CastBitData(UndefSrcElts, SrcEltBits);
 5184 }
 5185
 5186 // Extract scalar constant bits.
 5187 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
 5188 APInt UndefSrcElts = APInt::getZero(1);
 5189 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
 5190 return CastBitData(UndefSrcElts, SrcEltBits);
 5191 }
 5192 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
 5193 APInt UndefSrcElts = APInt::getZero(1);
 5194 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
 5195 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
 5196 return CastBitData(UndefSrcElts, SrcEltBits);
 5197 }
 5198
 5199 // Extract constant bits from build vector.
 5200 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
 5201 BitVector Undefs;
 5202 SmallVector<APInt> SrcEltBits;
 5203 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
 5204 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
 5205 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
 5206 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
 5207 if (Undefs[I])
 5208 UndefSrcElts.setBit(I);
 5209 return CastBitData(UndefSrcElts, SrcEltBits);
 5210 }
 5211 }
 5212
 5213 // Extract constant bits from constant pool vector.
 5214 if (auto *Cst = getTargetConstantFromNode(Op)) {
 5215 Type *CstTy = Cst->getType();
 5216 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
 5217 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
 5218 return false;
 5219
 5220 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
 5221 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
 5222 if ((SizeInBits % SrcEltSizeInBits) != 0)
 5223 return false;
 5224
 5225 APInt UndefSrcElts(NumSrcElts, 0);
 5226 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
 5227 for (unsigned i = 0; i != NumSrcElts; ++i)
 5228 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
 5229 UndefSrcElts, i))
 5230 return false;
 5231
 5232 return CastBitData(UndefSrcElts, SrcEltBits);
 5233 }
 5234
 5235 // Extract constant bits from a broadcasted constant pool scalar.
 5236 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
 5237 EltSizeInBits <= VT.getScalarSizeInBits()) {
 5238 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
 5239 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
 5240 return false;
 5241
 5242 SDValue Ptr = MemIntr->getBasePtr();
 5243 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
 5244 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
 5245 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
 5246
 5247 APInt UndefSrcElts(NumSrcElts, 0);
 5248 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
 5249 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
// A broadcast of an undef scalar makes every element undef.
 5250 if (UndefSrcElts[0])
 5251 UndefSrcElts.setBits(0, NumSrcElts)
 5252 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
 5253 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
 5254 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
 5255 return CastBitData(UndefSrcElts, SrcEltBits);
 5256 }
 5257 }
 5258 }
 5259
 5260 // Extract constant bits from a subvector broadcast.
 5261 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
 5262 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
 5263 SDValue Ptr = MemIntr->getBasePtr();
 5264 // The source constant may be larger than the subvector broadcast,
 5265 // ensure we extract the correct subvector constants.
 5266 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
 5267 Type *CstTy = Cst->getType();
 5268 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
 5269 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
 5270 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
 5271 (SizeInBits % SubVecSizeInBits) != 0)
 5272 return false;
 5273 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
 5274 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
 5275 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
 5276 APInt UndefSubElts(NumSubElts, 0);
 5277 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
 5278 APInt(CstEltSizeInBits, 0));
// Read one subvector's worth of constants and replicate into each copy.
 5279 for (unsigned i = 0; i != NumSubElts; ++i) {
 5280 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
 5281 UndefSubElts, i))
 5282 return false;
 5283 for (unsigned j = 1; j != NumSubVecs; ++j)
 5284 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
 5285 }
 5286 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
 5287 UndefSubElts);
 5288 return CastBitData(UndefSubElts, SubEltBits);
 5289 }
 5290 }
 5291
 5292 // Extract a rematerialized scalar constant insertion.
 5293 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
 5294 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
 5295 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
 5296 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
 5297 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
 5298
 5299 APInt UndefSrcElts(NumSrcElts, 0);
 5300 SmallVector<APInt, 64> SrcEltBits;
 5301 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
// VZEXT_MOVL zero-fills everything above element 0.
 5302 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
 5303 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
 5304 return CastBitData(UndefSrcElts, SrcEltBits);
 5305 }
 5306
 5307 // Insert constant bits from a base and sub vector sources.
 5308 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
 5309 // If bitcasts to larger elements we might lose track of undefs - don't
 5310 // allow any to be safe.
 5311 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
 5312 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
 5313
 5314 APInt UndefSrcElts, UndefSubElts;
 5315 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
 5316 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
 5317 UndefSubElts, EltSubBits,
 5318 AllowWholeUndefs && AllowUndefs,
 5319 AllowPartialUndefs && AllowUndefs) &&
 5320 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
 5321 UndefSrcElts, EltSrcBits,
 5322 AllowWholeUndefs && AllowUndefs,
 5323 AllowPartialUndefs && AllowUndefs)) {
 5324 unsigned BaseIdx = Op.getConstantOperandVal(2);
 5325 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
 5326 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
 5327 EltSrcBits[BaseIdx + i] = EltSubBits[i];
 5328 return CastBitData(UndefSrcElts, EltSrcBits);
 5329 }
 5330 }
 5331
 5332 // Extract constant bits from a subvector's source.
 5333 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 5334 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
 5335 EltBits, AllowWholeUndefs,
 5336 AllowPartialUndefs)) {
 5337 EVT SrcVT = Op.getOperand(0).getValueType();
 5338 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
 5339 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
 5340 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
 5341 unsigned BaseIdx = BaseOfs / EltSizeInBits;
 5342 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
 5343 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
 5344 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
 5345
// Trim the recursively-extracted source bits down to the subvector window.
 5346 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
 5347 if ((BaseIdx + NumSubElts) != NumSrcElts)
 5348 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
 5349 if (BaseIdx != 0)
 5350 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
 5351 return true;
 5352 }
 5353
 5354 // Extract constant bits from shuffle node sources.
 5355 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
 5356 // TODO - support shuffle through bitcasts.
 5357 if (EltSizeInBits != VT.getScalarSizeInBits())
 5358 return false;
 5359
 5360 ArrayRef<int> Mask = SVN->getMask();
 5361 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
 5362 llvm::any_of(Mask, [](int M) { return M < 0; }))
 5363 return false;
 5364
// Only recurse into the operands the mask actually references.
 5365 APInt UndefElts0, UndefElts1;
 5366 SmallVector<APInt, 32> EltBits0, EltBits1;
 5367 if (isAnyInRange(Mask, 0, NumElts) &&
 5368 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
 5369 UndefElts0, EltBits0, AllowWholeUndefs,
 5370 AllowPartialUndefs))
 5371 return false;
 5372 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
 5373 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
 5374 UndefElts1, EltBits1, AllowWholeUndefs,
 5375 AllowPartialUndefs))
 5376 return false;
 5377
 5378 UndefElts = APInt::getZero(NumElts);
 5379 for (int i = 0; i != (int)NumElts; ++i) {
 5380 int M = Mask[i];
 5381 if (M < 0) {
 5382 UndefElts.setBit(i);
 5383 EltBits.push_back(APInt::getZero(EltSizeInBits));
 5384 } else if (M < (int)NumElts) {
 5385 if (UndefElts0[M])
 5386 UndefElts.setBit(i);
 5387 EltBits.push_back(EltBits0[M]);
 5388 } else {
 5389 if (UndefElts1[M - NumElts])
 5390 UndefElts.setBit(i);
 5391 EltBits.push_back(EltBits1[M - NumElts]);
 5392 }
 5393 }
 5394 return true;
 5395 }
 5396
 5397 return false;
5398}
5399
5400namespace llvm {
5401namespace X86 {
// Return true if Op is a constant vector whose defined elements all share one
// value, storing that value in \p SplatVal. Undef elements are ignored for
// the splat comparison; a vector that is entirely undef does not match.
5402bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
 5403 APInt UndefElts;
 5404 SmallVector<APInt, 16> EltBits;
// NOTE(review): the call line opening this condition (orig. 5405, upstream:
// if (getTargetConstantBitsFromNode() was stripped by extraction -- confirm.
 5406 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
 5407 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
 5408 int SplatIndex = -1;
 5409 for (int i = 0, e = EltBits.size(); i != e; ++i) {
 5410 if (UndefElts[i])
 5411 continue;
// Any mismatch against the previously seen defined element kills the splat.
 5412 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
 5413 SplatIndex = -1;
 5414 break;
 5415 }
 5416 SplatIndex = i;
 5417 }
 5418 if (0 <= SplatIndex) {
 5419 SplatVal = EltBits[SplatIndex];
 5420 return true;
 5421 }
 5422 }
 5423
 5424 return false;
5425}
5426
5427int getRoundingModeX86(unsigned RM) {
5428 switch (static_cast<::llvm::RoundingMode>(RM)) {
5429 // clang-format off
5430 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5431 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5432 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5433 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5434 default: return X86::rmInvalid;
5435 // clang-format on
5436 }
5437}
5438
5439} // namespace X86
5440} // namespace llvm
5441
// Decode a constant shuffle-mask operand into a vector of raw integer mask
// values (one per element of MaskEltSizeInBits bits), recording undef
// elements in \p UndefElts. Partial undefs are rejected.
// NOTE(review): the signature lines (orig. 5442 and 5444, carrying the
// function name, the MaskNode parameter and the RawMask output parameter)
// were lost in extraction; upstream this is
// getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits,
// SmallVectorImpl<uint64_t> &RawMask, APInt &UndefElts) -- confirm.
 5443 unsigned MaskEltSizeInBits,
 5445 APInt &UndefElts) {
 5446 // Extract the raw target constant bits.
 5447 SmallVector<APInt, 64> EltBits;
 5448 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
 5449 EltBits, /* AllowWholeUndefs */ true,
 5450 /* AllowPartialUndefs */ false))
 5451 return false;
 5452
 5453 // Insert the extracted elements into the mask.
 5454 for (const APInt &Elt : EltBits)
 5455 RawMask.push_back(Elt.getZExtValue());
 5456
 5457 return true;
5458}
5459
5460static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5461 bool AllowUndefs) {
5462 APInt UndefElts;
5463 SmallVector<APInt, 64> EltBits;
5464 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5465 /*AllowWholeUndefs*/ AllowUndefs,
5466 /*AllowPartialUndefs*/ false))
5467 return false;
5468
5469 bool IsPow2OrUndef = true;
5470 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5471 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5472 return IsPow2OrUndef;
5473}
5474
5475// Helper to attempt to return a cheaper, bit-inverted version of \p V.
// Returns an SDValue X such that ~V == X (possibly creating new nodes), or
// an empty SDValue if no cheaper inverted form is known.
// NOTE(review): the signature line (orig. 5476) was lost in extraction;
// upstream this is IsNOT(SDValue V, SelectionDAG &DAG) -- confirm.
 5477 // TODO: don't always ignore oneuse constraints.
 5478 V = peekThroughBitcasts(V);
 5479 EVT VT = V.getValueType();
 5480
 5481 // Match not(xor X, -1) -> X.
 5482 if (V.getOpcode() == ISD::XOR &&
 5483 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
 5484 isAllOnesConstant(V.getOperand(1))))
 5485 return V.getOperand(0);
 5486
 5487 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
 5488 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
 5489 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
 5490 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
 5491 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
 5492 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
 5493 V.getOperand(1));
 5494 }
 5495 }
 5496
 5497 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
 5498 if (V.getOpcode() == X86ISD::PCMPGT &&
 5499 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
 5500 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
 5501 V.getOperand(0).hasOneUse()) {
 5502 APInt UndefElts;
 5503 SmallVector<APInt> EltBits;
 5504 if (getTargetConstantBitsFromNode(V.getOperand(0),
 5505 V.getScalarValueSizeInBits(), UndefElts,
 5506 EltBits) &&
 5507 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
 5508 // Don't fold min_signed_value -> (min_signed_value - 1)
 5509 bool MinSigned = false;
 5510 for (APInt &Elt : EltBits) {
 5511 MinSigned |= Elt.isMinSignedValue();
 5512 Elt -= 1;
 5513 }
 5514 if (!MinSigned) {
 5515 SDLoc DL(V);
 5516 MVT VT = V.getSimpleValueType();
 5517 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
 5518 getConstVector(EltBits, UndefElts, VT, DAG, DL));
 5519 }
 5520 }
 5521 }
 5522
 5523 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
// NOTE(review): the declaration of CatOps (orig. 5524, upstream:
// SmallVector<SDValue, 2> CatOps;) was stripped by extraction -- confirm.
 5525 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
// All concat operands must invert, otherwise give up entirely.
 5526 for (SDValue &CatOp : CatOps) {
 5527 SDValue NotCat = IsNOT(CatOp, DAG);
 5528 if (!NotCat)
 5529 return SDValue();
 5530 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
 5531 }
 5532 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
 5533 }
 5534
 5535 // Match not(or(not(X),not(Y))) -> and(X, Y).
 5536 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
 5537 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
 5538 // TODO: Handle cases with single NOT operand -> ANDNP
 5539 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
 5540 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
 5541 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
 5542 DAG.getBitcast(VT, Op1));
 5543 }
 5544
 5545 return SDValue();
5546}
5547
5548/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5549/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5550/// Note: This ignores saturation, so inputs must be checked first.
5552 bool Unary, unsigned NumStages = 1) {
5553 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5554 unsigned NumElts = VT.getVectorNumElements();
5555 unsigned NumLanes = VT.getSizeInBits() / 128;
5556 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5557 unsigned Offset = Unary ? 0 : NumElts;
5558 unsigned Repetitions = 1u << (NumStages - 1);
5559 unsigned Increment = 1u << NumStages;
5560 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5561
5562 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5563 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5564 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5565 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5566 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5567 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5568 }
5569 }
5570}
5571
5572// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5573static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5574 APInt &DemandedLHS, APInt &DemandedRHS) {
5575 int NumLanes = VT.getSizeInBits() / 128;
5576 int NumElts = DemandedElts.getBitWidth();
5577 int NumInnerElts = NumElts / 2;
5578 int NumEltsPerLane = NumElts / NumLanes;
5579 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5580
5581 DemandedLHS = APInt::getZero(NumInnerElts);
5582 DemandedRHS = APInt::getZero(NumInnerElts);
5583
5584 // Map DemandedElts to the packed operands.
5585 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5586 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5587 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5588 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5589 if (DemandedElts[OuterIdx])
5590 DemandedLHS.setBit(InnerIdx);
5591 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5592 DemandedRHS.setBit(InnerIdx);
5593 }
5594 }
5595}
5596
5597// Split the demanded elts of a HADD/HSUB node between its operands.
5598static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5599 APInt &DemandedLHS, APInt &DemandedRHS) {
5601 DemandedLHS, DemandedRHS);
5602 DemandedLHS |= DemandedLHS << 1;
5603 DemandedRHS |= DemandedRHS << 1;
5604}
5605
5606/// Calculates the shuffle mask corresponding to the target-specific opcode.
5607/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5608/// operands in \p Ops, and returns true.
5609/// Sets \p IsUnary to true if only one source is used. Note that this will set
5610/// IsUnary for shuffles which use a single input multiple times, and in those
5611/// cases it will adjust the mask to only have indices within that single input.
5612/// It is an error to call this with non-empty Mask/Ops vectors.
5613static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5615 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5616 if (!isTargetShuffle(N.getOpcode()))
5617 return false;
5618
5619 MVT VT = N.getSimpleValueType();
5620 unsigned NumElems = VT.getVectorNumElements();
5621 unsigned MaskEltSize = VT.getScalarSizeInBits();
5623 APInt RawUndefs;
5624 uint64_t ImmN;
5625
5626 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5627 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5628
5629 IsUnary = false;
5630 bool IsFakeUnary = false;
5631 switch (N.getOpcode()) {
5632 case X86ISD::BLENDI:
5633 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5634 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5635 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5636 DecodeBLENDMask(NumElems, ImmN, Mask);
5637 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5638 break;
5639 case X86ISD::SHUFP:
5640 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5641 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5642 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5643 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5644 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5645 break;
5646 case X86ISD::INSERTPS:
5647 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5648 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5649 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5650 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5651 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5652 break;
5653 case X86ISD::EXTRQI:
5654 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5655 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5656 isa<ConstantSDNode>(N.getOperand(2))) {
5657 int BitLen = N.getConstantOperandVal(1);
5658 int BitIdx = N.getConstantOperandVal(2);
5659 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5660 IsUnary = true;
5661 }
5662 break;
5663 case X86ISD::INSERTQI:
5664 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5665 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5666 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5667 isa<ConstantSDNode>(N.getOperand(3))) {
5668 int BitLen = N.getConstantOperandVal(2);
5669 int BitIdx = N.getConstantOperandVal(3);
5670 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5671 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5672 }
5673 break;
5674 case X86ISD::UNPCKH:
5675 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5676 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5677 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5678 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5679 break;
5680 case X86ISD::UNPCKL:
5681 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5682 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5683 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5684 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5685 break;
5686 case X86ISD::MOVHLPS:
5687 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5688 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5689 DecodeMOVHLPSMask(NumElems, Mask);
5690 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5691 break;
5692 case X86ISD::MOVLHPS:
5693 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5694 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5695 DecodeMOVLHPSMask(NumElems, Mask);
5696 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5697 break;
5698 case X86ISD::VALIGN:
5699 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5700 "Only 32-bit and 64-bit elements are supported!");
5701 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5702 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5703 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5704 DecodeVALIGNMask(NumElems, ImmN, Mask);
5705 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5706 Ops.push_back(N.getOperand(1));
5707 Ops.push_back(N.getOperand(0));
5708 break;
5709 case X86ISD::PALIGNR:
5710 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5711 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5712 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5713 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5714 DecodePALIGNRMask(NumElems, ImmN, Mask);
5715 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5716 Ops.push_back(N.getOperand(1));
5717 Ops.push_back(N.getOperand(0));
5718 break;
5719 case X86ISD::VSHLDQ:
5720 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5721 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5722 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5723 DecodePSLLDQMask(NumElems, ImmN, Mask);
5724 IsUnary = true;
5725 break;
5726 case X86ISD::VSRLDQ:
5727 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5730 DecodePSRLDQMask(NumElems, ImmN, Mask);
5731 IsUnary = true;
5732 break;
5733 case X86ISD::PSHUFD:
5734 case X86ISD::VPERMILPI:
5735 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5736 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5737 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5738 IsUnary = true;
5739 break;
5740 case X86ISD::PSHUFHW:
5741 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5742 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5743 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5744 IsUnary = true;
5745 break;
5746 case X86ISD::PSHUFLW:
5747 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5748 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5749 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5750 IsUnary = true;
5751 break;
5752 case X86ISD::VZEXT_MOVL:
5753 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5754 DecodeZeroMoveLowMask(NumElems, Mask);
5755 IsUnary = true;
5756 break;
5757 case X86ISD::VBROADCAST:
5758 // We only decode broadcasts of same-sized vectors, peeking through to
5759 // extracted subvectors is likely to cause hasOneUse issues with
5760 // SimplifyDemandedBits etc.
5761 if (N.getOperand(0).getValueType() == VT) {
5762 DecodeVectorBroadcast(NumElems, Mask);
5763 IsUnary = true;
5764 break;
5765 }
5766 return false;
5767 case X86ISD::VPERMILPV: {
5768 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5769 IsUnary = true;
5770 SDValue MaskNode = N.getOperand(1);
5771 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5772 RawUndefs)) {
5773 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5774 break;
5775 }
5776 return false;
5777 }
5778 case X86ISD::PSHUFB: {
5779 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5780 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5781 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5782 IsUnary = true;
5783 SDValue MaskNode = N.getOperand(1);
5784 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5785 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5786 break;
5787 }
5788 return false;
5789 }
5790 case X86ISD::VPERMI:
5791 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5792 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5793 DecodeVPERMMask(NumElems, ImmN, Mask);
5794 IsUnary = true;
5795 break;
5796 case X86ISD::MOVSS:
5797 case X86ISD::MOVSD:
5798 case X86ISD::MOVSH:
5799 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5800 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5801 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5802 break;
5803 case X86ISD::VPERM2X128:
5804 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5805 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5806 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5807 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5808 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5809 break;
5810 case X86ISD::SHUF128:
5811 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5812 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5813 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5814 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5815 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5816 break;
5817 case X86ISD::MOVSLDUP:
5818 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5819 DecodeMOVSLDUPMask(NumElems, Mask);
5820 IsUnary = true;
5821 break;
5822 case X86ISD::MOVSHDUP:
5823 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5824 DecodeMOVSHDUPMask(NumElems, Mask);
5825 IsUnary = true;
5826 break;
5827 case X86ISD::MOVDDUP:
5828 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5829 DecodeMOVDDUPMask(NumElems, Mask);
5830 IsUnary = true;
5831 break;
5832 case X86ISD::VPERMIL2: {
5833 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5834 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5835 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5836 SDValue MaskNode = N.getOperand(2);
5837 SDValue CtrlNode = N.getOperand(3);
5838 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5839 unsigned CtrlImm = CtrlOp->getZExtValue();
5840 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5841 RawUndefs)) {
5842 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5843 Mask);
5844 break;
5845 }
5846 }
5847 return false;
5848 }
5849 case X86ISD::VPPERM: {
5850 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5851 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5852 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5853 SDValue MaskNode = N.getOperand(2);
5854 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5855 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5856 break;
5857 }
5858 return false;
5859 }
5860 case X86ISD::VPERMV: {
5861 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5862 IsUnary = true;
5863 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5864 Ops.push_back(N.getOperand(1));
5865 SDValue MaskNode = N.getOperand(0);
5866 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5867 RawUndefs)) {
5868 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5869 break;
5870 }
5871 return false;
5872 }
5873 case X86ISD::VPERMV3: {
5874 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5875 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5876 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5877 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5878 Ops.push_back(N.getOperand(0));
5879 Ops.push_back(N.getOperand(2));
5880 SDValue MaskNode = N.getOperand(1);
5881 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5882 RawUndefs)) {
5883 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5884 break;
5885 }
5886 return false;
5887 }
5888 case X86ISD::COMPRESS: {
5889 SDValue CmpVec = N.getOperand(0);
5890 SDValue PassThru = N.getOperand(1);
5891 SDValue CmpMask = N.getOperand(2);
5892 APInt UndefElts;
5893 SmallVector<APInt> EltBits;
5894 if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
5895 return false;
5896 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
5897 "Illegal compression mask");
5898 for (unsigned I = 0; I != NumElems; ++I) {
5899 if (!EltBits[I].isZero())
5900 Mask.push_back(I);
5901 }
5902 while (Mask.size() != NumElems) {
5903 Mask.push_back(NumElems + Mask.size());
5904 }
5905 Ops.push_back(CmpVec);
5906 Ops.push_back(PassThru);
5907 return true;
5908 }
5909 case X86ISD::EXPAND: {
5910 SDValue ExpVec = N.getOperand(0);
5911 SDValue PassThru = N.getOperand(1);
5912 SDValue ExpMask = N.getOperand(2);
5913 APInt UndefElts;
5914 SmallVector<APInt> EltBits;
5915 if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
5916 return false;
5917 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
5918 "Illegal expansion mask");
5919 unsigned ExpIndex = 0;
5920 for (unsigned I = 0; I != NumElems; ++I) {
5921 if (EltBits[I].isZero())
5922 Mask.push_back(I + NumElems);
5923 else
5924 Mask.push_back(ExpIndex++);
5925 }
5926 Ops.push_back(ExpVec);
5927 Ops.push_back(PassThru);
5928 return true;
5929 }
5930 default:
5931 llvm_unreachable("unknown target shuffle node");
5932 }
5933
5934 // Empty mask indicates the decode failed.
5935 if (Mask.empty())
5936 return false;
5937
5938 // Check if we're getting a shuffle mask with zero'd elements.
5939 if (!AllowSentinelZero && isAnyZero(Mask))
5940 return false;
5941
5942 // If we have a fake unary shuffle, the shuffle mask is spread across two
5943 // inputs that are actually the same node. Re-map the mask to always point
5944 // into the first input.
5945 if (IsFakeUnary)
5946 for (int &M : Mask)
5947 if (M >= (int)Mask.size())
5948 M -= Mask.size();
5949
5950 // If we didn't already add operands in the opcode-specific code, default to
5951 // adding 1 or 2 operands starting at 0.
5952 if (Ops.empty()) {
5953 Ops.push_back(N.getOperand(0));
5954 if (!IsUnary || IsFakeUnary)
5955 Ops.push_back(N.getOperand(1));
5956 }
5957
5958 return true;
5959}
5960
5961// Wrapper for getTargetShuffleMask with InUnary;
5962static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5964 SmallVectorImpl<int> &Mask) {
5965 bool IsUnary;
5966 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5967}
5968
5969/// Compute whether each element of a shuffle is zeroable.
5970///
5971/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5972/// Either it is an undef element in the shuffle mask, the element of the input
5973/// referenced is undef, or the element of the input referenced is known to be
5974/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5975/// as many lanes with this technique as possible to simplify the remaining
5976/// shuffle.
5978 SDValue V1, SDValue V2,
5979 APInt &KnownUndef, APInt &KnownZero) {
5980 int Size = Mask.size();
5981 KnownUndef = KnownZero = APInt::getZero(Size);
5982
5983 V1 = peekThroughBitcasts(V1);
5984 V2 = peekThroughBitcasts(V2);
5985
5986 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5987 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5988
5989 int VectorSizeInBits = V1.getValueSizeInBits();
5990 int ScalarSizeInBits = VectorSizeInBits / Size;
5991 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5992
5993 for (int i = 0; i < Size; ++i) {
5994 int M = Mask[i];
5995 // Handle the easy cases.
5996 if (M < 0) {
5997 KnownUndef.setBit(i);
5998 continue;
5999 }
6000 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6001 KnownZero.setBit(i);
6002 continue;
6003 }
6004
6005 // Determine shuffle input and normalize the mask.
6006 SDValue V = M < Size ? V1 : V2;
6007 M %= Size;
6008
6009 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6010 if (V.getOpcode() != ISD::BUILD_VECTOR)
6011 continue;
6012
6013 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
6014 // the (larger) source element must be UNDEF/ZERO.
6015 if ((Size % V.getNumOperands()) == 0) {
6016 int Scale = Size / V->getNumOperands();
6017 SDValue Op = V.getOperand(M / Scale);
6018 if (Op.isUndef())
6019 KnownUndef.setBit(i);
6020 if (X86::isZeroNode(Op))
6021 KnownZero.setBit(i);
6022 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6023 APInt Val = Cst->getAPIntValue();
6024 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6025 if (Val == 0)
6026 KnownZero.setBit(i);
6027 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6028 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6029 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6030 if (Val == 0)
6031 KnownZero.setBit(i);
6032 }
6033 continue;
6034 }
6035
6036 // If the BUILD_VECTOR has more elements then all the (smaller) source
6037 // elements must be UNDEF or ZERO.
6038 if ((V.getNumOperands() % Size) == 0) {
6039 int Scale = V->getNumOperands() / Size;
6040 bool AllUndef = true;
6041 bool AllZero = true;
6042 for (int j = 0; j < Scale; ++j) {
6043 SDValue Op = V.getOperand((M * Scale) + j);
6044 AllUndef &= Op.isUndef();
6045 AllZero &= X86::isZeroNode(Op);
6046 }
6047 if (AllUndef)
6048 KnownUndef.setBit(i);
6049 if (AllZero)
6050 KnownZero.setBit(i);
6051 continue;
6052 }
6053 }
6054}
6055
6056/// Decode a target shuffle mask and inputs and see if any values are
6057/// known to be undef or zero from their inputs.
6058/// Returns true if the target shuffle mask was decoded.
6059/// FIXME: Merge this with computeZeroableShuffleElements?
6062 APInt &KnownUndef, APInt &KnownZero) {
6063 bool IsUnary;
6064 if (!isTargetShuffle(N.getOpcode()))
6065 return false;
6066
6067 MVT VT = N.getSimpleValueType();
6068 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
6069 return false;
6070
6071 int Size = Mask.size();
6072 SDValue V1 = Ops[0];
6073 SDValue V2 = IsUnary ? V1 : Ops[1];
6074 KnownUndef = KnownZero = APInt::getZero(Size);
6075
6076 V1 = peekThroughBitcasts(V1);
6077 V2 = peekThroughBitcasts(V2);
6078
6079 assert((VT.getSizeInBits() % Size) == 0 &&
6080 "Illegal split of shuffle value type");
6081 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
6082
6083 // Extract known constant input data.
6084 APInt UndefSrcElts[2];
6085 SmallVector<APInt, 32> SrcEltBits[2];
6086 bool IsSrcConstant[2] = {
6087 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6088 SrcEltBits[0], /*AllowWholeUndefs*/ true,
6089 /*AllowPartialUndefs*/ false),
6090 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6091 SrcEltBits[1], /*AllowWholeUndefs*/ true,
6092 /*AllowPartialUndefs*/ false)};
6093
6094 for (int i = 0; i < Size; ++i) {
6095 int M = Mask[i];
6096
6097 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6098 if (M < 0) {
6099 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
6100 if (SM_SentinelUndef == M)
6101 KnownUndef.setBit(i);
6102 if (SM_SentinelZero == M)
6103 KnownZero.setBit(i);
6104 continue;
6105 }
6106
6107 // Determine shuffle input and normalize the mask.
6108 unsigned SrcIdx = M / Size;
6109 SDValue V = M < Size ? V1 : V2;
6110 M %= Size;
6111
6112 // We are referencing an UNDEF input.
6113 if (V.isUndef()) {
6114 KnownUndef.setBit(i);
6115 continue;
6116 }
6117
6118 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6119 // TODO: We currently only set UNDEF for integer types - floats use the same
6120 // registers as vectors and many of the scalar folded loads rely on the
6121 // SCALAR_TO_VECTOR pattern.
6122 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6123 (Size % V.getValueType().getVectorNumElements()) == 0) {
6124 int Scale = Size / V.getValueType().getVectorNumElements();
6125 int Idx = M / Scale;
6126 if (Idx != 0 && !VT.isFloatingPoint())
6127 KnownUndef.setBit(i);
6128 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6129 KnownZero.setBit(i);
6130 continue;
6131 }
6132
6133 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6134 // base vectors.
6135 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6136 SDValue Vec = V.getOperand(0);
6137 int NumVecElts = Vec.getValueType().getVectorNumElements();
6138 if (Vec.isUndef() && Size == NumVecElts) {
6139 int Idx = V.getConstantOperandVal(2);
6140 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6141 if (M < Idx || (Idx + NumSubElts) <= M)
6142 KnownUndef.setBit(i);
6143 }
6144 continue;
6145 }
6146
6147 // Attempt to extract from the source's constant bits.
6148 if (IsSrcConstant[SrcIdx]) {
6149 if (UndefSrcElts[SrcIdx][M])
6150 KnownUndef.setBit(i);
6151 else if (SrcEltBits[SrcIdx][M] == 0)
6152 KnownZero.setBit(i);
6153 }
6154 }
6155
6156 assert(VT.getVectorNumElements() == (unsigned)Size &&
6157 "Different mask size from vector size!");
6158 return true;
6159}
6160
6161// Replace target shuffle mask elements with known undef/zero sentinels.
6163 const APInt &KnownUndef,
6164 const APInt &KnownZero,
6165 bool ResolveKnownZeros= true) {
6166 unsigned NumElts = Mask.size();
6167 assert(KnownUndef.getBitWidth() == NumElts &&
6168 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6169
6170 for (unsigned i = 0; i != NumElts; ++i) {
6171 if (KnownUndef[i])
6172 Mask[i] = SM_SentinelUndef;
6173 else if (ResolveKnownZeros && KnownZero[i])
6174 Mask[i] = SM_SentinelZero;
6175 }
6176}
6177
6178// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6180 APInt &KnownUndef,
6181 APInt &KnownZero) {
6182 unsigned NumElts = Mask.size();
6183 KnownUndef = KnownZero = APInt::getZero(NumElts);
6184
6185 for (unsigned i = 0; i != NumElts; ++i) {
6186 int M = Mask[i];
6187 if (SM_SentinelUndef == M)
6188 KnownUndef.setBit(i);
6189 if (SM_SentinelZero == M)
6190 KnownZero.setBit(i);
6191 }
6192}
6193
6194// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6196 SDValue Cond, bool IsBLENDV = false) {
6197 EVT CondVT = Cond.getValueType();
6198 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6199 unsigned NumElts = CondVT.getVectorNumElements();
6200
6201 APInt UndefElts;
6202 SmallVector<APInt, 32> EltBits;
6203 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6204 /*AllowWholeUndefs*/ true,
6205 /*AllowPartialUndefs*/ false))
6206 return false;
6207
6208 Mask.resize(NumElts, SM_SentinelUndef);
6209
6210 for (int i = 0; i != (int)NumElts; ++i) {
6211 Mask[i] = i;
6212 // Arbitrarily choose from the 2nd operand if the select condition element
6213 // is undef.
6214 // TODO: Can we do better by matching patterns such as even/odd?
6215 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6216 (IsBLENDV && EltBits[i].isNonNegative()))
6217 Mask[i] += NumElts;
6218 }
6219
6220 return true;
6221}
6222
6223// Forward declaration (for getFauxShuffleMask recursive check).
6224static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6227 const SelectionDAG &DAG, unsigned Depth,
6228 bool ResolveKnownElts);
6229
6230// Attempt to decode ops that could be represented as a shuffle mask.
6231// The decoded shuffle mask may contain a different number of elements to the
6232// destination value type.
6233// TODO: Merge into getTargetShuffleInputs()
6234static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6237 const SelectionDAG &DAG, unsigned Depth,
6238 bool ResolveKnownElts) {
6239 Mask.clear();
6240 Ops.clear();
6241
6242 MVT VT = N.getSimpleValueType();
6243 unsigned NumElts = VT.getVectorNumElements();
6244 unsigned NumSizeInBits = VT.getSizeInBits();
6245 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6246 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6247 return false;
6248 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6249 unsigned NumSizeInBytes = NumSizeInBits / 8;
6250 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6251
6252 unsigned Opcode = N.getOpcode();
6253 switch (Opcode) {
6254 case ISD::VECTOR_SHUFFLE: {
6255 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6256 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6257 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6258 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6259 Ops.push_back(N.getOperand(0));
6260 Ops.push_back(N.getOperand(1));
6261 return true;
6262 }
6263 return false;
6264 }
6265 case ISD::AND:
6266 case X86ISD::ANDNP: {
6267 // Attempt to decode as a per-byte mask.
6268 APInt UndefElts;
6269 SmallVector<APInt, 32> EltBits;
6270 SDValue N0 = N.getOperand(0);
6271 SDValue N1 = N.getOperand(1);
6272 bool IsAndN = (X86ISD::ANDNP == Opcode);
6273 uint64_t ZeroMask = IsAndN ? 255 : 0;
6274 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6275 /*AllowWholeUndefs*/ false,
6276 /*AllowPartialUndefs*/ false))
6277 return false;
6278 // We can't assume an undef src element gives an undef dst - the other src
6279 // might be zero.
6280 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6281 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6282 const APInt &ByteBits = EltBits[i];
6283 if (ByteBits != 0 && ByteBits != 255)
6284 return false;
6285 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6286 }
6287 Ops.push_back(IsAndN ? N1 : N0);
6288 return true;
6289 }
6290 case ISD::OR: {
6291 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6292 // is a valid shuffle index.
6293 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6294 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6295 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6296 return false;
6297
6298 SmallVector<int, 64> SrcMask0, SrcMask1;
6299 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6302 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6303 Depth + 1, true) ||
6304 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6305 Depth + 1, true))
6306 return false;
6307
6308 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6309 SmallVector<int, 64> Mask0, Mask1;
6310 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6311 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6312 for (int i = 0; i != (int)MaskSize; ++i) {
6313 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6314 // loops converting between OR and BLEND shuffles due to
6315 // canWidenShuffleElements merging away undef elements, meaning we
6316 // fail to recognise the OR as the undef element isn't known zero.
6317 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6318 Mask.push_back(SM_SentinelZero);
6319 else if (Mask1[i] == SM_SentinelZero)
6320 Mask.push_back(i);
6321 else if (Mask0[i] == SM_SentinelZero)
6322 Mask.push_back(i + MaskSize);
6323 else
6324 return false;
6325 }
6326 Ops.push_back(N.getOperand(0));
6327 Ops.push_back(N.getOperand(1));
6328 return true;
6329 }
6330 case ISD::CONCAT_VECTORS: {
6331 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6332 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6333 if (NumBitsPerElt == 64) {
6334 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6335 for (unsigned M = 0; M != NumSubElts; ++M)
6336 Mask.push_back((I * NumElts) + M);
6337 Ops.push_back(N.getOperand(I));
6338 }
6339 return true;
6340 }
6341 return false;
6342 }
6343 case ISD::INSERT_SUBVECTOR: {
6344 SDValue Src = N.getOperand(0);
6345 SDValue Sub = N.getOperand(1);
6346 EVT SubVT = Sub.getValueType();
6347 unsigned NumSubElts = SubVT.getVectorNumElements();
6348 uint64_t InsertIdx = N.getConstantOperandVal(2);
6349 // Subvector isn't demanded - just return the base vector.
6350 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6351 Mask.resize(NumElts);
6352 std::iota(Mask.begin(), Mask.end(), 0);
6353 Ops.push_back(Src);
6354 return true;
6355 }
6356 // Handle CONCAT(SUB0, SUB1).
6357 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6358 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6359 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6360 Src.getOperand(0).isUndef() &&
6361 Src.getOperand(1).getValueType() == SubVT &&
6362 Src.getConstantOperandVal(2) == 0 &&
6363 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6364 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6365 Mask.resize(NumElts);
6366 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6367 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6368 Ops.push_back(Src.getOperand(1));
6369 Ops.push_back(Sub);
6370 return true;
6371 }
6372 if (!N->isOnlyUserOf(Sub.getNode()))
6373 return false;
6374
6375 SmallVector<int, 64> SubMask;
6376 SmallVector<SDValue, 2> SubInputs;
6378 EVT SubSrcVT = SubSrc.getValueType();
6379 if (!SubSrcVT.isVector())
6380 return false;
6381
6382 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6383 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6384 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6385 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6386 SDValue SubSrcSrc = SubSrc.getOperand(0);
6387 unsigned NumSubSrcSrcElts =
6388 SubSrcSrc.getValueType().getVectorNumElements();
6389 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6390 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6391 "Subvector valuetype mismatch");
6392 InsertIdx *= (MaxElts / NumElts);
6393 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6394 NumSubElts *= (MaxElts / NumElts);
6395 bool SrcIsUndef = Src.isUndef();
6396 for (int i = 0; i != (int)MaxElts; ++i)
6397 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6398 for (int i = 0; i != (int)NumSubElts; ++i)
6399 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6400 if (!SrcIsUndef)
6401 Ops.push_back(Src);
6402 Ops.push_back(SubSrcSrc);
6403 return true;
6404 }
6405
6406 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6407 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6408 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6409 Depth + 1, ResolveKnownElts))
6410 return false;
6411
6412 // Subvector shuffle inputs must not be larger than the subvector.
6413 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6414 return SubVT.getFixedSizeInBits() <
6415 SubInput.getValueSizeInBits().getFixedValue();
6416 }))
6417 return false;
6418
6419 if (SubMask.size() != NumSubElts) {
6420 assert(((SubMask.size() % NumSubElts) == 0 ||
6421 (NumSubElts % SubMask.size()) == 0) &&
6422 "Illegal submask scale");
6423 if ((NumSubElts % SubMask.size()) == 0) {
6424 int Scale = NumSubElts / SubMask.size();
6425 SmallVector<int, 64> ScaledSubMask;
6426 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6427 SubMask = ScaledSubMask;
6428 } else {
6429 int Scale = SubMask.size() / NumSubElts;
6430 NumSubElts = SubMask.size();
6431 NumElts *= Scale;
6432 InsertIdx *= Scale;
6433 }
6434 }
6435 Ops.push_back(Src);
6436 Ops.append(SubInputs.begin(), SubInputs.end());
6437 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6438 Mask.append(NumElts, SM_SentinelZero);
6439 else
6440 for (int i = 0; i != (int)NumElts; ++i)
6441 Mask.push_back(i);
6442 for (int i = 0; i != (int)NumSubElts; ++i) {
6443 int M = SubMask[i];
6444 if (0 <= M) {
6445 int InputIdx = M / NumSubElts;
6446 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6447 }
6448 Mask[i + InsertIdx] = M;
6449 }
6450 return true;
6451 }
6452 case X86ISD::PINSRB:
6453 case X86ISD::PINSRW:
6456 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6457 // vector, for matching src/dst vector types.
6458 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6459
6460 unsigned DstIdx = 0;
6461 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6462 // Check we have an in-range constant insertion index.
6463 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6464 N.getConstantOperandAPInt(2).uge(NumElts))
6465 return false;
6466 DstIdx = N.getConstantOperandVal(2);
6467
6468 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6469 if (X86::isZeroNode(Scl)) {
6470 Ops.push_back(N.getOperand(0));
6471 for (unsigned i = 0; i != NumElts; ++i)
6472 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6473 return true;
6474 }
6475 }
6476
6477 // Peek through trunc/aext/zext/bitcast.
6478 // TODO: aext shouldn't require SM_SentinelZero padding.
6479 // TODO: handle shift of scalars.
6480 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6481 while (Scl.getOpcode() == ISD::TRUNCATE ||
6482 Scl.getOpcode() == ISD::ANY_EXTEND ||
6483 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6484 (Scl.getOpcode() == ISD::BITCAST &&
6487 Scl = Scl.getOperand(0);
6488 MinBitsPerElt =
6489 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6490 }
6491 if ((MinBitsPerElt % 8) != 0)
6492 return false;
6493
6494 // Attempt to find the source vector the scalar was extracted from.
6495 SDValue SrcExtract;
6496 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6497 Scl.getOpcode() == X86ISD::PEXTRW ||
6498 Scl.getOpcode() == X86ISD::PEXTRB) &&
6499 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6500 SrcExtract = Scl;
6501 }
6502 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6503 return false;
6504
6505 SDValue SrcVec = SrcExtract.getOperand(0);
6506 EVT SrcVT = SrcVec.getValueType();
6507 if (!SrcVT.getScalarType().isByteSized())
6508 return false;
6509 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6510 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6511 unsigned DstByte = DstIdx * NumBytesPerElt;
6512 MinBitsPerElt =
6513 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6514
6515 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6516 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6517 Ops.push_back(SrcVec);
6518 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6519 } else {
6520 Ops.push_back(SrcVec);
6521 Ops.push_back(N.getOperand(0));
6522 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6523 Mask.push_back(NumSizeInBytes + i);
6524 }
6525
6526 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6527 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6528 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6529 Mask[DstByte + i] = SrcByte + i;
6530 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6531 Mask[DstByte + i] = SM_SentinelZero;
6532 return true;
6533 }
6534 case X86ISD::PACKSS:
6535 case X86ISD::PACKUS: {
6536 SDValue N0 = N.getOperand(0);
6537 SDValue N1 = N.getOperand(1);
6538 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6539 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6540 "Unexpected input value type");
6541
6542 APInt EltsLHS, EltsRHS;
6543 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6544
6545 // If we know input saturation won't happen (or we don't care for particular
6546 // lanes), we can treat this as a truncation shuffle.
6547 bool Offset0 = false, Offset1 = false;
6548 if (Opcode == X86ISD::PACKSS) {
6549 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6550 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6551 (!(N1.isUndef() || EltsRHS.isZero()) &&
6552 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6553 return false;
6554 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6555 // PACKSS then it was likely being used for sign-extension for a
6556 // truncation, so just peek through and adjust the mask accordingly.
6557 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6558 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6559 Offset0 = true;
6560 N0 = N0.getOperand(0);
6561 }
6562 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6563 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6564 Offset1 = true;
6565 N1 = N1.getOperand(0);
6566 }
6567 } else {
6568 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6569 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6570 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6571 (!(N1.isUndef() || EltsRHS.isZero()) &&
6572 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6573 return false;
6574 }
6575
6576 bool IsUnary = (N0 == N1);
6577
6578 Ops.push_back(N0);
6579 if (!IsUnary)
6580 Ops.push_back(N1);
6581
6582 createPackShuffleMask(VT, Mask, IsUnary);
6583
6584 if (Offset0 || Offset1) {
6585 for (int &M : Mask)
6586 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6587 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6588 ++M;
6589 }
6590 return true;
6591 }
6592 case ISD::VSELECT:
6593 case X86ISD::BLENDV: {
6594 SDValue Cond = N.getOperand(0);
6595 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6596 Ops.push_back(N.getOperand(1));
6597 Ops.push_back(N.getOperand(2));
6598 return true;
6599 }
6600 return false;
6601 }
6602 case X86ISD::VTRUNC: {
6603 SDValue Src = N.getOperand(0);
6604 EVT SrcVT = Src.getValueType();
6605 if (SrcVT.getSizeInBits() != NumSizeInBits)
6606 return false;
6607 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6608 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6609 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6610 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6611 for (unsigned i = 0; i != NumSrcElts; ++i)
6612 Mask.push_back(i * Scale);
6613 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6614 Ops.push_back(Src);
6615 return true;
6616 }
6617 case ISD::SHL:
6618 case ISD::SRL: {
6619 APInt UndefElts;
6620 SmallVector<APInt, 32> EltBits;
6621 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6622 UndefElts, EltBits,
6623 /*AllowWholeUndefs*/ true,
6624 /*AllowPartialUndefs*/ false))
6625 return false;
6626
6627 // We can only decode 'whole byte' bit shifts as shuffles.
6628 for (unsigned I = 0; I != NumElts; ++I)
6629 if (DemandedElts[I] && !UndefElts[I] &&
6630 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6631 return false;
6632
6633 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6634 Ops.push_back(N.getOperand(0));
6635
6636 for (unsigned I = 0; I != NumElts; ++I) {
6637 if (!DemandedElts[I] || UndefElts[I])
6638 continue;
6639 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6640 unsigned Lo = I * NumBytesPerElt;
6641 unsigned Hi = Lo + NumBytesPerElt;
6642 // Clear mask to all zeros and insert the shifted byte indices.
6643 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6644 if (ISD::SHL == Opcode)
6645 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6646 else
6647 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6648 Lo + ByteShift);
6649 }
6650 return true;
6651 }
6652 case X86ISD::VSHLI:
6653 case X86ISD::VSRLI: {
6654 uint64_t ShiftVal = N.getConstantOperandVal(1);
6655 // Out of range bit shifts are guaranteed to be zero.
6656 if (NumBitsPerElt <= ShiftVal) {
6657 Mask.append(NumElts, SM_SentinelZero);
6658 return true;
6659 }
6660
6661 // We can only decode 'whole byte' bit shifts as shuffles.
6662 if ((ShiftVal % 8) != 0)
6663 break;
6664
6665 uint64_t ByteShift = ShiftVal / 8;
6666 Ops.push_back(N.getOperand(0));
6667
6668 // Clear mask to all zeros and insert the shifted byte indices.
6669 Mask.append(NumSizeInBytes, SM_SentinelZero);
6670
6671 if (X86ISD::VSHLI == Opcode) {
6672 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6673 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6674 Mask[i + j] = i + j - ByteShift;
6675 } else {
6676 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6677 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6678 Mask[i + j - ByteShift] = i + j;
6679 }
6680 return true;
6681 }
6682 case X86ISD::VROTLI:
6683 case X86ISD::VROTRI: {
6684 // We can only decode 'whole byte' bit rotates as shuffles.
6685 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6686 if ((RotateVal % 8) != 0)
6687 return false;
6688 Ops.push_back(N.getOperand(0));
6689 int Offset = RotateVal / 8;
6690 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6691 for (int i = 0; i != (int)NumElts; ++i) {
6692 int BaseIdx = i * NumBytesPerElt;
6693 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6694 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6695 }
6696 }
6697 return true;
6698 }
6699 case X86ISD::VBROADCAST: {
6700 SDValue Src = N.getOperand(0);
6701 if (!Src.getSimpleValueType().isVector()) {
6702 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6703 !isNullConstant(Src.getOperand(1)) ||
6704 Src.getOperand(0).getValueType().getScalarType() !=
6705 VT.getScalarType())
6706 return false;
6707 Src = Src.getOperand(0);
6708 }
6709 Ops.push_back(Src);
6710 Mask.append(NumElts, 0);
6711 return true;
6712 }
6714 SDValue Src = N.getOperand(0);
6715 EVT SrcVT = Src.getValueType();
6716 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6717
6718 // Extended source must be a simple vector.
6719 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6720 (NumBitsPerSrcElt % 8) != 0)
6721 return false;
6722
6723 // We can only handle all-signbits extensions.
6724 APInt DemandedSrcElts =
6725 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6726 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6727 return false;
6728
6729 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6730 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6731 for (unsigned I = 0; I != NumElts; ++I)
6732 Mask.append(Scale, I);
6733 Ops.push_back(Src);
6734 return true;
6735 }
6736 case ISD::ZERO_EXTEND:
6737 case ISD::ANY_EXTEND:
6740 SDValue Src = N.getOperand(0);
6741 EVT SrcVT = Src.getValueType();
6742
6743 // Extended source must be a simple vector.
6744 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6745 (SrcVT.getScalarSizeInBits() % 8) != 0)
6746 return false;
6747
6748 bool IsAnyExtend =
6749 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6750 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6751 IsAnyExtend, Mask);
6752 Ops.push_back(Src);
6753 return true;
6754 }
6755 }
6756
6757 return false;
6758}
6759
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Inputs that are undef or never referenced by Mask are dropped; duplicate
/// inputs are merged, with mask indices remapped onto the surviving slot.
                                     SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // Input i (if kept) will own the contiguous index range [lo, hi).
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop this input: shift every later reference down one input slot.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Remap this input's range onto earlier duplicate j; shift later
      // references down one input slot.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  // Replace the caller's list with the deduplicated set.
  Inputs = UsedInputs;
}
6801
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
    return false; // Limit search depth.

  // Only simple fixed-width vector types can be decoded.
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  // First attempt to decode a genuine target shuffle node.
  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
    // Optionally fold the known undef/zero elements into mask sentinels.
    if (ResolveKnownElts)
      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
    return true;
  }
  // Otherwise try to match a "faux" shuffle (nodes such as OR / shifts /
  // insert_subvector that behave like shuffles).
  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
                         ResolveKnownElts)) {
    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
    return true;
  }
  return false;
}
6830
/// Convenience overload for callers that don't need the known undef/zero
/// element masks; forwards to the full overload with scratch APInts.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  APInt KnownUndef, KnownZero;
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
                                KnownZero, DAG, Depth, ResolveKnownElts);
}
6840
                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                   bool ResolveKnownElts = true) {
  // Convenience overload that demands every element of the result.
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  unsigned NumElts = Op.getValueType().getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnes(NumElts);
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
                                ResolveKnownElts);
}
6854
// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Returns SDValue() if the memop isn't a plain, simple, temporal read.
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
                                 SelectionDAG &DAG) {
  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
         "Unknown broadcast load type");

  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
  if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
    return SDValue();

  // Build the (possibly offset) pointer and emit the broadcast memop.
  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {Mem->getChain(), Ptr};
  SDValue BcstLd = DAG.getMemIntrinsicNode(
      Opcode, DL, Tys, Ops, MemVT,
          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Keep chain users of the original load correctly ordered after the new one.
  DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
  return BcstLd;
}
6878
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
/// Returns SDValue() when the element cannot be traced to a single scalar.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
                                   SelectionDAG &DAG, unsigned Depth) {
    return SDValue(); // Limit search depth.

  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  unsigned NumElems = VT.getVectorNumElements();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
    int Elt = SV->getMaskElt(Index);

    // Negative mask element means the result element is undef.
    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    // Mask indices < NumElems select operand 0, the rest operand 1.
    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = VT.getSimpleVT();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
      return SDValue();

    // Known-zero mask elements fold to a zero constant of the element type.
    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR) {
    SDValue Vec = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

    // If the index lands in the inserted subvector use it, else the base.
    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
  }

  // Recurse into concat_vectors sub vector to find scalars.
  if (Opcode == ISD::CONCAT_VECTORS) {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = Op.getOperand(0);
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
  }

  // We only peek through bitcasts of the same vector width.
  if (Opcode == ISD::BITCAST) {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
    return SDValue();
  }

  // Actual nodes that may contain scalar elements

  // For insert_vector_elt - either return the index matching scalar or recurse
  // into the base vector.
  if (Opcode == ISD::INSERT_VECTOR_ELT &&
      isa<ConstantSDNode>(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
  }

  // scalar_to_vector only defines element 0; all other lanes are undef.
  if (Opcode == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? Op.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (Opcode == ISD::BUILD_VECTOR)
    return Op.getOperand(Index);

  return SDValue();
}
6980
// Use PINSRB/PINSRW/PINSRD to create a build vector.
// Inserts only the non-zero elements (per NonZeroMask); the first insertion
// goes into either a zero vector or a SCALAR_TO_VECTOR, depending on whether
// the remaining lanes must be zero.
                                        const APInt &NonZeroMask,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  // PINSRW only needs SSE2; PINSRB/PINSRD require SSE4.1.
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = NonZeroMask[i];
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, DL);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        // MOVD of the first scalar; remaining lanes stay undefined.
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
                    DAG.getVectorIdxConstant(i, DL));
  }

  return V;
}
7022
/// Custom lower build_vector of v16i8.
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Without SSE4.1 (no PINSRB), more than 8 insertions isn't worthwhile.
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
                                    DAG, Subtarget);

  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  // If both the lowest 16-bits are non-zero, then convert to MOVD.
  if (!NonZeroMask.extractBits(2, 0).isZero() &&
      !NonZeroMask.extractBits(2, 2).isZero()) {
    // OR the low 4 bytes together into a single i32, move it into lane 0
    // with zero-extension of the upper lanes (VZEXT_MOVL).
    for (unsigned I = 0; I != 4; ++I) {
      if (!NonZeroMask[I])
        continue;
      SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
      if (I != 0)
        Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
                          DAG.getConstant(I * 8, DL, MVT::i8));
      V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
    }
    assert(V && "Failed to fold v16i8 vector to zero");
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
    V = DAG.getBitcast(MVT::v8i16, V);
  }
  // Insert remaining byte pairs as i16 elements (start at byte 4 if the MOVD
  // path above already produced the low 4 bytes).
  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // Build the low byte of the i16 value.
    SDValue Elt;
    if (ThisIsNonZero) {
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
    }

    // Merge in the high byte, shifted into place.
    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
      NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
                            DAG.getConstant(8, DL, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index or zeros are needed, then
    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
    if (!V) {
      if (i != 0 || NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
                    DAG.getVectorIdxConstant(i / 2, DL));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
7104
/// Custom lower build_vector of v8i16.
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Pre-SSE4.1, more than 4 element insertions isn't considered worthwhile.
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  // Use PINSRW to insert each byte directly.
  return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
                                  Subtarget);
}
7118
/// Custom lower build_vector of v4i32 or v4f32.
/// Tries, in order: MOVDDUP of a splat pair, a blend-with-zero shuffle, and
/// finally a single INSERTPS; returns SDValue() if none apply.
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
  // Because we're creating a less complicated build vector here, we may enable
  // further folding of the MOVDDUP via shuffle transforms.
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
    // Create a new build vector with the first 2 elements followed by undef
    // padding, bitcast to v2f64, duplicate, and bitcast back.
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
    return DAG.getBitcast(VT, Dup);
  }

  // Find all zeroable elements.
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op.getOperand(i);
    Undefs[i] = Elt.isUndef();
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZeroOrUndef = (Zeroable == Undefs)
                               ? DAG.getUNDEF(VT)
                               : getZeroVector(VT, Subtarget, DAG, DL);
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // All remaining non-zero elements must be identity extracts from one
  // common source vector for the INSERTPS form to apply.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  // INSERTPS imm8: bits [7:6] = source lane, [5:4] = dest lane, [3:0] = zmask.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDValue Result =
      DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                  DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
  return DAG.getBitcast(VT, Result);
}
7245
7246/// Return a vector logical shift node.
7247static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7248 SelectionDAG &DAG, const TargetLowering &TLI,
7249 const SDLoc &dl) {
7250 assert(VT.is128BitVector() && "Unknown type for VShift");
7251 MVT ShVT = MVT::v16i8;
7252 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7253 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7254 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7255 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7256 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7257}
7258
// Widen a simple scalar i32/f32 load from a (possibly offset) stack slot into
// a full vector load plus a splat shuffle of the loaded element, bumping the
// stack object alignment when legal to do so.
// NOTE(review): this is an extracted listing — the function's opening
// signature line (file line 7259) and interior lines 7265, 7279, 7281 and
// 7292 are missing from this view (presumably the LoadSDNode cast, the
// frame-index/offset checks and the MachineFrameInfo accessor); code below
// is kept byte-identical to what is visible.
7260 SelectionDAG &DAG) {
7261
7262 // Check if the scalar load can be widened into a vector load. And if
7263 // the address is "base + cst" see if the cst can be "absorbed" into
7264 // the shuffle mask.
7266 SDValue Ptr = LD->getBasePtr();
7267 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7268 return SDValue();
7269 EVT PVT = LD->getValueType(0);
7270 if (PVT != MVT::i32 && PVT != MVT::f32)
7271 return SDValue();
7272
7273 int FI = -1;
7274 int64_t Offset = 0;
7275 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7276 FI = FINode->getIndex();
7277 Offset = 0;
7278 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7280 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7282 Ptr = Ptr.getOperand(0);
7283 } else {
7284 return SDValue();
7285 }
7286
7287 // FIXME: 256-bit vector instructions don't require a strict alignment,
7288 // improve this code to support it better.
7289 Align RequiredAlign(VT.getSizeInBits() / 8);
7290 SDValue Chain = LD->getChain();
7291 // Make sure the stack object alignment is at least 16 or 32.
7293 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7294 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7295 if (MFI.isFixedObjectIndex(FI)) {
7296 // Can't change the alignment. FIXME: It's possible to compute
7297 // the exact stack offset and reference FI + adjust offset instead.
7298 // If someone *really* cares about this. That's the way to implement it.
7299 return SDValue();
7300 } else {
7301 MFI.setObjectAlignment(FI, RequiredAlign);
7302 }
7303 }
7304
7305 // (Offset % 16 or 32) must be multiple of 4. Then address is then
7306 // Ptr + (Offset & ~15).
7307 if (Offset < 0)
7308 return SDValue();
7309 if ((Offset % RequiredAlign.value()) & 3)
7310 return SDValue();
7311 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7312 if (StartOffset) {
7313 SDLoc DL(Ptr);
7314 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7315 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7316 }
7317
// EltNo is the 32-bit element index of the original scalar within the
// newly widened vector load starting at StartOffset.
7318 int EltNo = (Offset - StartOffset) >> 2;
7319 unsigned NumElems = VT.getVectorNumElements();
7320
7321 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7322 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7323 LD->getPointerInfo().getWithOffset(StartOffset));
7324
// Splat the loaded element across all lanes.
7325 SmallVector<int, 8> Mask(NumElems, EltNo);
7326
7327 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7328 }
7329
7330 return SDValue();
7331}
7332
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
// Peeks through BITCAST/TRUNCATE (and other pass-through opcodes), constant
// byte-aligned SRLs (which add Amt/8 to the offset), and a constant-index
// extract (which adds Idx * element-bytes). Returns true and sets Ld /
// ByteOffset on success.
// NOTE(review): extracted listing — file lines 7347 and 7358 are missing
// from this view; they presumably carry the remaining case labels for the
// pass-through group and for the constant-index extract handled at 7359.
7334 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7335 if (ISD::isNON_EXTLoad(Elt.getNode())) {
// Base case: a plain (non-extending) simple load terminates the recursion.
7336 auto *BaseLd = cast<LoadSDNode>(Elt);
7337 if (!BaseLd->isSimple())
7338 return false;
7339 Ld = BaseLd;
7340 ByteOffset = 0;
7341 return true;
7342 }
7343
7344 switch (Elt.getOpcode()) {
7345 case ISD::BITCAST:
7346 case ISD::TRUNCATE:
7348 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7349 case ISD::SRL:
7350 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7351 uint64_t Amt = AmtC->getZExtValue();
// Only whole-byte shifts can be folded into a byte offset.
7352 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7353 ByteOffset += Amt / 8;
7354 return true;
7355 }
7356 }
7357 break;
7359 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7360 SDValue Src = Elt.getOperand(0);
7361 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7362 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7363 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7364 findEltLoadSrc(Src, Ld, ByteOffset)) {
7365 uint64_t Idx = IdxC->getZExtValue();
7366 ByteOffset += Idx * (SrcSizeInBits / 8);
7367 return true;
7368 }
7369 }
7370 break;
7371 }
7372
7373 return false;
7374}
7375
7376 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7377 /// elements can be replaced by a single large load which has the same value as
7378 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7379 ///
7380 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
// NOTE(review): extracted listing — the function's opening signature line
// (file line 7381) and interior lines 7386, 7411, 7597, 7658 and 7678 are
// missing from this view (presumably the depth limit check, the zeroable
// element test, the VZEXT_LOAD memory-operand arguments, a broadcast-width
// helper call, and the reversed recursive call); code below is kept
// byte-identical to what is visible.
7382 const SDLoc &DL, SelectionDAG &DAG,
7383 const X86Subtarget &Subtarget,
7384 bool IsAfterLegalize,
7385 unsigned Depth = 0) {
7387 return SDValue(); // Limit search depth.
7388 if ((VT.getScalarSizeInBits() % 8) != 0)
7389 return SDValue();
7390
7391 unsigned NumElems = Elts.size();
7392
7393 int LastLoadedElt = -1;
7394 APInt LoadMask = APInt::getZero(NumElems);
7395 APInt ZeroMask = APInt::getZero(NumElems);
7396 APInt UndefMask = APInt::getZero(NumElems);
7397
7398 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7399 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7400
7401 // For each element in the initializer, see if we've found a load, zero or an
7402 // undef.
7403 for (unsigned i = 0; i < NumElems; ++i) {
7404 SDValue Elt = peekThroughBitcasts(Elts[i]);
7405 if (!Elt.getNode())
7406 return SDValue();
7407 if (Elt.isUndef()) {
7408 UndefMask.setBit(i);
7409 continue;
7410 }
7412 ZeroMask.setBit(i);
7413 continue;
7414 }
7415
7416 // Each loaded element must be the correct fractional portion of the
7417 // requested vector load.
7418 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7419 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7420 return SDValue();
7421
7422 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7423 return SDValue();
// The element must fit entirely inside the memory footprint of its load.
7424 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7425 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7426 return SDValue();
7427
7428 LoadMask.setBit(i);
7429 LastLoadedElt = i;
7430 }
7431 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7432 NumElems &&
7433 "Incomplete element masks");
7434
7435 // Handle Special Cases - all undef or undef/zero.
7436 if (UndefMask.popcount() == NumElems)
7437 return DAG.getUNDEF(VT);
7438 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7439 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7440 : DAG.getConstantFP(0.0, DL, VT);
7441
7442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7443 int FirstLoadedElt = LoadMask.countr_zero();
7444 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7445 EVT EltBaseVT = EltBase.getValueType();
7446 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7447 "Register/Memory size mismatch");
7448 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7449 assert(LDBase && "Did not find base load for merging consecutive loads");
7450 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7451 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7452 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7453 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7454 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7455
7456 // TODO: Support offsetting the base load.
7457 if (ByteOffsets[FirstLoadedElt] != 0)
7458 return SDValue();
7459
7460 // Check to see if the element's load is consecutive to the base load
7461 // or offset from a previous (already checked) load.
7462 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7463 LoadSDNode *Ld = Loads[EltIdx];
7464 int64_t ByteOffset = ByteOffsets[EltIdx];
7465 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
// This element reuses a previously-checked load at a whole-element offset.
7466 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7467 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7468 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7469 }
7470 int Stride = EltIdx - FirstLoadedElt;
7471 if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
7472 return true;
7473 // Try again using the memory load size (we might have broken a large load
7474 // into smaller elements), ensure the stride is the full memory load size
7475 // apart and a whole number of elements fit in each memory load.
7476 unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7477 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7478 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7479 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7480 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
7481 Stride / Scale);
7482 }
7483 return false;
7484 };
7485
7486 // Consecutive loads can contain UNDEFS but not ZERO elements.
7487 // Consecutive loads with UNDEFs and ZEROs elements require a
7488 // an additional shuffle stage to clear the ZERO elements.
7489 bool IsConsecutiveLoad = true;
7490 bool IsConsecutiveLoadWithZeros = true;
7491 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7492 if (LoadMask[i]) {
7493 if (!CheckConsecutiveLoad(LDBase, i)) {
7494 IsConsecutiveLoad = false;
7495 IsConsecutiveLoadWithZeros = false;
7496 break;
7497 }
7498 } else if (ZeroMask[i]) {
7499 IsConsecutiveLoad = false;
7500 }
7501 }
7502
// Build the single wide load and transfer the memory ordering of every
// original element load onto it.
7503 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7504 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7505 assert(LDBase->isSimple() &&
7506 "Cannot merge volatile or atomic loads.");
7507 SDValue NewLd =
7508 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7509 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7510 for (auto *LD : Loads)
7511 if (LD)
7512 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7513 return NewLd;
7514 };
7515
7516 // Check if the base load is entirely dereferenceable.
7517 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7518 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7519
7520 // LOAD - all consecutive load/undefs (must start/end with a load or be
7521 // entirely dereferenceable). If we have found an entire vector of loads and
7522 // undefs, then return a large load of the entire vector width starting at the
7523 // base pointer. If the vector contains zeros, then attempt to shuffle those
7524 // elements.
7525 if (FirstLoadedElt == 0 &&
7526 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7527 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7528 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7529 return SDValue();
7530
7531 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7532 // will lower to regular temporal loads and use the cache.
7533 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7534 VT.is256BitVector() && !Subtarget.hasInt256())
7535 return SDValue();
7536
7537 if (NumElems == 1)
7538 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7539
7540 if (!ZeroMask)
7541 return CreateLoad(VT, LDBase);
7542
7543 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7544 // vector and a zero vector to clear out the zero elements.
7545 if (!IsAfterLegalize && VT.isVector()) {
7546 unsigned NumMaskElts = VT.getVectorNumElements();
7547 if ((NumMaskElts % NumElems) == 0) {
7548 unsigned Scale = NumMaskElts / NumElems;
7549 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7550 for (unsigned i = 0; i < NumElems; ++i) {
7551 if (UndefMask[i])
7552 continue;
7553 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7554 for (unsigned j = 0; j != Scale; ++j)
7555 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7556 }
7557 SDValue V = CreateLoad(VT, LDBase);
7558 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7559 : DAG.getConstantFP(0.0, DL, VT);
7560 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7561 }
7562 }
7563 }
7564
7565 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7566 if (VT.is256BitVector() || VT.is512BitVector()) {
7567 unsigned HalfNumElems = NumElems / 2;
7568 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7569 EVT HalfVT =
7570 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7571 SDValue HalfLD =
7572 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7573 DAG, Subtarget, IsAfterLegalize, Depth + 1);
7574 if (HalfLD)
7575 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7576 HalfLD, DAG.getVectorIdxConstant(0, DL));
7577 }
7578 }
7579
7580 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7581 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7582 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7583 LoadSizeInBits == 64) &&
7584 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7585 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7586 : MVT::getIntegerVT(LoadSizeInBits);
7587 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7588 // Allow v4f32 on SSE1 only targets.
7589 // FIXME: Add more isel patterns so we can just use VT directly.
7590 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7591 VecVT = MVT::v4f32;
7592 if (TLI.isTypeLegal(VecVT)) {
7593 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7594 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7595 SDValue ResNode = DAG.getMemIntrinsicNode(
7596 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7598 for (auto *LD : Loads)
7599 if (LD)
7600 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7601 return DAG.getBitcast(VT, ResNode);
7602 }
7603 }
7604
7605 // BROADCAST - match the smallest possible repetition pattern, load that
7606 // scalar/subvector element and then broadcast to the entire vector.
7607 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7608 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7609 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7610 unsigned RepeatSize = SubElems * BaseSizeInBits;
7611 unsigned ScalarSize = std::min(RepeatSize, 64u);
7612 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7613 continue;
7614
7615 // Don't attempt a 1:N subvector broadcast - it should be caught by
7616 // combineConcatVectorOps, else will cause infinite loops.
7617 if (RepeatSize > ScalarSize && SubElems == 1)
7618 continue;
7619
7620 bool Match = true;
7621 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7622 for (unsigned i = 0; i != NumElems && Match; ++i) {
7623 if (!LoadMask[i])
7624 continue;
7625 SDValue Elt = peekThroughBitcasts(Elts[i]);
7626 if (RepeatedLoads[i % SubElems].isUndef())
7627 RepeatedLoads[i % SubElems] = Elt;
7628 else
7629 Match &= (RepeatedLoads[i % SubElems] == Elt);
7630 }
7631
7632 // We must have loads at both ends of the repetition.
7633 Match &= !RepeatedLoads.front().isUndef();
7634 Match &= !RepeatedLoads.back().isUndef();
7635 if (!Match)
7636 continue;
7637
7638 EVT RepeatVT =
7639 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7640 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7641 : EVT::getFloatingPointVT(ScalarSize);
7642 if (RepeatSize > ScalarSize)
7643 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7644 RepeatSize / ScalarSize);
7645 EVT BroadcastVT =
7646 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7647 VT.getSizeInBits() / ScalarSize);
7648 if (TLI.isTypeLegal(BroadcastVT)) {
7649 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7650 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
7651 Depth + 1)) {
7652 SDValue Broadcast = RepeatLoad;
7653 if (RepeatSize > ScalarSize) {
// Subvector broadcast: concat the repeat with itself up to full width.
7654 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7655 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7656 } else {
7657 if (!Subtarget.hasAVX2() &&
7659 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7660 Subtarget,
7661 /*AssumeSingleUse=*/true))
7662 return SDValue();
7663 Broadcast =
7664 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7665 }
7666 return DAG.getBitcast(VT, Broadcast);
7667 }
7668 }
7669 }
7670 }
7671
7672 // REVERSE - attempt to match the loads in reverse and then shuffle back.
7673 // TODO: Do this for any permute or mismatching element counts.
7674 if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
7675 TLI.isTypeLegal(VT) && VT.isVector() &&
7676 NumElems == VT.getVectorNumElements()) {
7677 SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
7679 VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
7680 SmallVector<int, 16> ReverseMask(NumElems);
7681 std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
7682 return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
7683 }
7684 }
7685
7686 return SDValue();
7687}
7688
7689 // Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7690 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7691 // are consecutive, non-overlapping, and in the right order.
// Collects each lane's scalar source via getShuffleScalarElt and defers the
// actual merging to EltsFromConsecutiveLoads; bails out if any lane cannot
// be resolved to a scalar.
// NOTE(review): extracted listing — the opening signature line (file line
// 7692) and line 7696 (presumably the Elts SmallVector declaration) are
// missing from this view; code below is kept byte-identical to what is
// visible.
7693 SelectionDAG &DAG,
7694 const X86Subtarget &Subtarget,
7695 bool IsAfterLegalize) {
7697 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7698 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7699 Elts.push_back(Elt);
7700 continue;
7701 }
// One lane has no resolvable scalar source - give up.
7702 return SDValue();
7703 }
7704 assert(Elts.size() == VT.getVectorNumElements());
7705 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7706 IsAfterLegalize);
7707}
7708
// Build an IR ConstantVector from per-element bit patterns, emitting
// UndefValue for elements flagged in 'Undefs' and scalar constants (integer
// or IEEE half/single/double, per VT) for the rest.
// NOTE(review): extracted listing — the opening signature line (file line
// 7709) is missing from this view; it presumably declares the MVT and the
// ArrayRef of APInt bit patterns. Code below is kept byte-identical.
7710 const APInt &Undefs, LLVMContext &C) {
7711 unsigned ScalarSize = VT.getScalarSizeInBits();
7712 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7713
// Convert a raw bit pattern into a scalar constant of VT's element type.
7714 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7715 if (VT.isFloatingPoint()) {
7716 if (ScalarSize == 16)
7717 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7718 if (ScalarSize == 32)
7719 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7720 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7721 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7722 }
7723 return Constant::getIntegerValue(Ty, Val);
7724 };
7725
7726 SmallVector<Constant *, 32> ConstantVec;
7727 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7728 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7729 : getConstantScalar(Bits[I]));
7730
7731 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7732}
7733
7734static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7735 unsigned SplatBitSize, LLVMContext &C) {
7736 unsigned ScalarSize = VT.getScalarSizeInBits();
7737
7738 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7739 if (VT.isFloatingPoint()) {
7740 if (ScalarSize == 16)
7741 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7742 if (ScalarSize == 32)
7743 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7744 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7745 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7746 }
7747 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7748 };
7749
7750 if (ScalarSize == SplatBitSize)
7751 return getConstantScalar(SplatValue);
7752
7753 unsigned NumElm = SplatBitSize / ScalarSize;
7754 SmallVector<Constant *, 32> ConstantVec;
7755 for (unsigned I = 0; I != NumElm; ++I) {
7756 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7757 ConstantVec.push_back(getConstantScalar(Val));
7758 }
7759 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7760}
7761
// Returns true if some user of N is a target shuffle (or, through bitcasts,
// leads to one) such that N's value is likely to be folded into that user.
// VPERMV/VPERMV3 index operands and VPDPBUSD non-accumulator operands are
// explicitly rejected as non-foldable positions.
// NOTE(review): extracted listing — the opening signature line (file line
// 7762, taking the SDNode *N being inspected) is missing from this view;
// code below is kept byte-identical to what is visible.
7763 for (auto *U : N->users()) {
7764 unsigned Opc = U->getOpcode();
7765 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7766 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7767 return false;
7768 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7769 return false;
7770 if (isTargetShuffle(Opc))
7771 return true;
7772 if (Opc == ISD::BITCAST) // Ignore bitcasts
7773 return isFoldableUseOfShuffle(U);
7774 if (N->hasOneUse()) {
7775 // TODO, there may be some general way to know if a SDNode can
7776 // be folded. We now only know whether an MI is foldable.
7777 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7778 return false;
7779 return true;
7780 }
7781 }
7782 return false;
7783}
7784
7785// If the node has a single use by a VSELECT then AVX512 targets may be able to
7786// fold as a predicated instruction.
7787static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7788 unsigned SizeInBits = V.getValueSizeInBits();
7789 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7790 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7791 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7792 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7793 return true;
7794 }
7795 }
7796 return false;
7797}
7798
7799 /// Attempt to use the vbroadcast instruction to generate a splat value
7800 /// from a splat BUILD_VECTOR which uses:
7801 /// a. A single scalar load, or a constant.
7802 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7803 ///
7804 /// The VBROADCAST node is returned when a pattern is found,
7805 /// or SDValue() otherwise.
// NOTE(review): extracted listing — the opening signature line (file line
// 7806, taking the BuildVectorSDNode *BVOp) and interior lines 7897, 7899,
// 7913-7916, 7973, 7975, 7981, 7987, 8012 and 8026 are missing from this
// view (presumably the constant-pool MachinePointerInfo construction, the
// VBROADCAST_LOAD node creation, and the ConstantSDNode/ConstantFPSDNode
// dyn_casts); code below is kept byte-identical to what is visible.
7807 const SDLoc &dl,
7808 const X86Subtarget &Subtarget,
7809 SelectionDAG &DAG) {
7810 // VBROADCAST requires AVX.
7811 // TODO: Splats could be generated for non-AVX CPUs using SSE
7812 // instructions, but there's less potential gain for only 128-bit vectors.
7813 if (!Subtarget.hasAVX())
7814 return SDValue();
7815
7816 MVT VT = BVOp->getSimpleValueType(0);
7817 unsigned NumElts = VT.getVectorNumElements();
7818 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7819 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7820 "Unsupported vector type for broadcast.");
7821
7822 // See if the build vector is a repeating sequence of scalars (inc. splat).
7823 SDValue Ld;
7824 BitVector UndefElements;
7825 SmallVector<SDValue, 16> Sequence;
7826 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7827 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7828 if (Sequence.size() == 1)
7829 Ld = Sequence[0];
7830 }
7831
7832 // Attempt to use VBROADCASTM
7833 // From this pattern:
7834 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7835 // b. t1 = (build_vector t0 t0)
7836 //
7837 // Create (VBROADCASTM v2i1 X)
7838 if (!Sequence.empty() && Subtarget.hasCDI()) {
7839 // If not a splat, are the upper sequence values zeroable?
7840 unsigned SeqLen = Sequence.size();
7841 bool UpperZeroOrUndef =
7842 SeqLen == 1 ||
7843 llvm::all_of(ArrayRef(Sequence).drop_front(),
7844 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7845 SDValue Op0 = Sequence[0];
7846 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7847 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7848 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7849 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7850 ? Op0.getOperand(0)
7851 : Op0.getOperand(0).getOperand(0);
7852 MVT MaskVT = BOperand.getSimpleValueType();
7853 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7854 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7855 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7856 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7857 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
// Without VLX, widen to 512-bit and extract back down afterwards.
7858 unsigned Scale = 512 / VT.getSizeInBits();
7859 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7860 }
7861 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7862 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7863 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7864 return DAG.getBitcast(VT, Bcst);
7865 }
7866 }
7867 }
7868
7869 unsigned NumUndefElts = UndefElements.count();
7870 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7871 APInt SplatValue, Undef;
7872 unsigned SplatBitSize;
7873 bool HasUndef;
7874 // Check if this is a repeated constant pattern suitable for broadcasting.
7875 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7876 SplatBitSize > VT.getScalarSizeInBits() &&
7877 SplatBitSize < VT.getSizeInBits()) {
7878 // Avoid replacing with broadcast when it's a use of a shuffle
7879 // instruction to preserve the present custom lowering of shuffles.
7880 if (isFoldableUseOfShuffle(BVOp))
7881 return SDValue();
7882 // replace BUILD_VECTOR with broadcast of the repeated constants.
7883 LLVMContext *Ctx = DAG.getContext();
7884 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7885 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7886 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7887 // Load the constant scalar/subvector and broadcast it.
7888 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7889 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7890 SDValue CP = DAG.getConstantPool(C, PVT);
7891 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7892
7893 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7894 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7895 SDValue Ops[] = {DAG.getEntryNode(), CP};
7896 MachinePointerInfo MPI =
7898 SDValue Brdcst =
7900 MPI, Alignment, MachineMemOperand::MOLoad);
7901 return DAG.getBitcast(VT, Brdcst);
7902 }
7903 if (SplatBitSize > 64) {
7904 // Load the vector of constants and broadcast it.
7905 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7906 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7907 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7908 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7909 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7910 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7911 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7912 MachinePointerInfo MPI =
7915 Ops, VVT, MPI, Alignment,
7917 }
7918 }
7919
7920 // If we are moving a scalar into a vector (Ld must be set and all elements
7921 // but 1 are undef) and that operation is not obviously supported by
7922 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7923 // That's better than general shuffling and may eliminate a load to GPR and
7924 // move from scalar to vector register.
7925 if (!Ld || NumElts - NumUndefElts != 1)
7926 return SDValue();
7927 unsigned ScalarSize = Ld.getValueSizeInBits();
7928 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7929 return SDValue();
7930 }
7931
7932 bool ConstSplatVal =
7933 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7934 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7935
7936 // TODO: Handle broadcasts of non-constant sequences.
7937
7938 // Make sure that all of the users of a non-constant load are from the
7939 // BUILD_VECTOR node.
7940 // FIXME: Is the use count needed for non-constant, non-load case?
7941 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7942 return SDValue();
7943
7944 unsigned ScalarSize = Ld.getValueSizeInBits();
7945 bool IsGE256 = (VT.getSizeInBits() >= 256);
7946
7947 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7948 // instruction to save 8 or more bytes of constant pool data.
7949 // TODO: If multiple splats are generated to load the same constant,
7950 // it may be detrimental to overall size. There needs to be a way to detect
7951 // that condition to know if this is truly a size win.
7952 bool OptForSize = DAG.shouldOptForSize();
7953
7954 // Handle broadcasting a single constant scalar from the constant pool
7955 // into a vector.
7956 // On Sandybridge (no AVX2), it is still better to load a constant vector
7957 // from the constant pool and not to broadcast it from a scalar.
7958 // But override that restriction when optimizing for size.
7959 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7960 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7961 EVT CVT = Ld.getValueType();
7962 assert(!CVT.isVector() && "Must not broadcast a vector type");
7963
7964 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7965 // For size optimization, also splat v2f64 and v2i64, and for size opt
7966 // with AVX2, also splat i8 and i16.
7967 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7968 if (ScalarSize == 32 ||
7969 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7970 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7971 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7972 const Constant *C = nullptr;
7974 C = CI->getConstantIntValue();
7976 C = CF->getConstantFPValue();
7977
7978 assert(C && "Invalid constant type");
7979
7980 SDValue CP =
7982 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7983
7984 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7985 SDValue Ops[] = {DAG.getEntryNode(), CP};
7986 MachinePointerInfo MPI =
7988 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7989 MPI, Alignment, MachineMemOperand::MOLoad);
7990 }
7991 }
7992
7993 // Handle AVX2 in-register broadcasts.
7994 if (!IsLoad && Subtarget.hasInt256() &&
7995 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7996 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7997
7998 // The scalar source must be a normal load.
7999 if (!IsLoad)
8000 return SDValue();
8001
8002 // Make sure the non-chain result is only used by this build vector.
8003 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8004 return SDValue();
8005
8006 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8007 (Subtarget.hasVLX() && ScalarSize == 64)) {
8008 auto *LN = cast<LoadSDNode>(Ld);
8009 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8010 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8011 SDValue BCast =
8013 LN->getMemoryVT(), LN->getMemOperand());
// Fold the original load's chain users onto the broadcast-load's chain.
8014 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8015 return BCast;
8016 }
8017
8018 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
8019 // double since there is no vbroadcastsd xmm
8020 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8021 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8022 auto *LN = cast<LoadSDNode>(Ld);
8023 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8024 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8025 SDValue BCast =
8027 LN->getMemoryVT(), LN->getMemOperand());
8028 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8029 return BCast;
8030 }
8031
8032 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8033 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8034
8035 // Unsupported broadcast.
8036 return SDValue();
8037}
8038
8039/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8040/// underlying vector and index.
8041///
8042/// Modifies \p ExtractedFromVec to the real vector and returns the real
8043/// index.
8044static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8045 SDValue ExtIdx) {
8046 int Idx = ExtIdx->getAsZExtVal();
8047 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8048 return Idx;
8049
8050 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8051 // lowered this:
8052 // (extract_vector_elt (v8f32 %1), Constant<6>)
8053 // to:
8054 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8055 // (extract_subvector (v8f32 %0), Constant<4>),
8056 // undef)
8057 // Constant<0>)
8058 // In this case the vector is the extract_subvector expression and the index
8059 // is 2, as specified by the shuffle.
8060 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8061 SDValue ShuffleVec = SVOp->getOperand(0);
8062 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8063 assert(ShuffleVecVT.getVectorElementType() ==
8064 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8065
8066 int ShuffleIdx = SVOp->getMaskElt(Idx);
8067 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8068 ExtractedFromVec = ShuffleVec;
8069 return ShuffleIdx;
8070 }
8071 return Idx;
8072}
8073
// Lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs from at
// most two source vectors into a vector_shuffle of those sources, followed by
// at most one INSERT_VECTOR_ELT for the single remaining non-extract element.
// NOTE(review): extracted listing — the opening signature line (file line
// 8074, presumably taking Op, DL and returning SDValue) and interior lines
// 8080 (presumably the insert_vec_elt legality query guarding the early
// return) and 8095 (presumably the check that the operand is not an
// EXTRACT_VECTOR_ELT) are missing from this view; code below is kept
// byte-identical to what is visible.
8075 SelectionDAG &DAG) {
8076 MVT VT = Op.getSimpleValueType();
8077
8078 // Skip if insert_vec_elt is not supported.
8079 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8081 return SDValue();
8082
8083 unsigned NumElems = Op.getNumOperands();
8084 SDValue VecIn1;
8085 SDValue VecIn2;
8086 SmallVector<unsigned, 4> InsertIndices;
8087 SmallVector<int, 8> Mask(NumElems, -1);
8088
8089 for (unsigned i = 0; i != NumElems; ++i) {
8090 unsigned Opc = Op.getOperand(i).getOpcode();
8091
8092 if (Opc == ISD::POISON || Opc == ISD::UNDEF)
8093 continue;
8094
8096 // Quit if more than 1 elements need inserting.
8097 if (InsertIndices.size() > 1)
8098 return SDValue();
8099
8100 InsertIndices.push_back(i);
8101 continue;
8102 }
8103
8104 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8105 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8106
8107 // Quit if non-constant index.
8108 if (!isa<ConstantSDNode>(ExtIdx))
8109 return SDValue();
8110 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8111
8112 // Quit if extracted from vector of different type.
8113 if (ExtractedFromVec.getValueType() != VT)
8114 return SDValue();
8115
// Track up to two distinct source vectors for the shuffle.
8116 if (!VecIn1.getNode())
8117 VecIn1 = ExtractedFromVec;
8118 else if (VecIn1 != ExtractedFromVec) {
8119 if (!VecIn2.getNode())
8120 VecIn2 = ExtractedFromVec;
8121 else if (VecIn2 != ExtractedFromVec)
8122 // Quit if more than 2 vectors to shuffle
8123 return SDValue();
8124 }
8125
8126 if (ExtractedFromVec == VecIn1)
8127 Mask[i] = Idx;
8128 else if (ExtractedFromVec == VecIn2)
8129 Mask[i] = Idx + NumElems;
8130 }
8131
8132 if (!VecIn1.getNode())
8133 return SDValue();
8134
8135 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
8136 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8137
// Re-insert the (at most one) element that was not an extract.
8138 for (unsigned Idx : InsertIndices)
8139 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8140 DAG.getVectorIdxConstant(Idx, DL));
8141
8142 return NV;
8143}
8144
8145// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
// NOTE(review): this extraction lost the opening signature line (original
// line 8146) and line 8151 — the latter presumably declares the NewOps
// SmallVector used below; restore from upstream before use.
8147 const X86Subtarget &Subtarget) {
8148 MVT VT = Op.getSimpleValueType();
// Build the vector in f16 (when FP16 is available) or i16 lanes, then
// bitcast back: bf16 and f16/i16 have the same bit width per lane.
8149 MVT IVT =
8150 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8152 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8153 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8154 Op.getOperand(I)));
8155 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8156 return DAG.getBitcast(VT, Res);
8157}
8158
8159// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
// NOTE(review): the opening signature line (original line 8160) is missing
// from this extraction; the assert below names the function
// LowerBUILD_VECTORvXi1. Restore the signature from upstream before use.
8161 SelectionDAG &DAG,
8162 const X86Subtarget &Subtarget) {
8163
8164 MVT VT = Op.getSimpleValueType();
8165 assert((VT.getVectorElementType() == MVT::i1) &&
8166 "Unexpected type in LowerBUILD_VECTORvXi1!");
// All-zeros / all-ones mask vectors are already in canonical form.
8167 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8168 ISD::isBuildVectorAllOnes(Op.getNode()))
8169 return Op;
8170
// Scan the operands: fold constant bits into 'Immediate', remember the
// positions of non-constant elements, and detect a splat along the way.
8171 uint64_t Immediate = 0;
8172 SmallVector<unsigned, 16> NonConstIdx;
8173 bool IsSplat = true;
8174 bool HasConstElts = false;
8175 int SplatIdx = -1;
8176 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8177 SDValue In = Op.getOperand(idx);
8178 if (In.isUndef())
8179 continue;
8180 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8181 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8182 HasConstElts = true;
8183 } else {
8184 NonConstIdx.push_back(idx);
8185 }
8186 if (SplatIdx < 0)
8187 SplatIdx = idx;
8188 else if (In != Op.getOperand(SplatIdx))
8189 IsSplat = false;
8190 }
8191
8192 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8193 if (IsSplat) {
8194 // The build_vector allows the scalar element to be larger than the vector
8195 // element type. We need to mask it to use as a condition unless we know
8196 // the upper bits are zero.
8197 // FIXME: Use computeKnownBits instead of checking specific opcode?
8198 SDValue Cond = Op.getOperand(SplatIdx);
8199 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8200 if (Cond.getOpcode() != ISD::SETCC)
8201 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8202 DAG.getConstant(1, dl, MVT::i8));
8203
8204 // Perform the select in the scalar domain so we can use cmov.
8205 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// 32-bit targets: build two v32i1 halves from an i32 select and concat.
8206 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8207 DAG.getAllOnesConstant(dl, MVT::i32),
8208 DAG.getConstant(0, dl, MVT::i32));
8209 Select = DAG.getBitcast(MVT::v32i1, Select);
8210 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8211 } else {
8212 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8213 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8214 DAG.getAllOnesConstant(dl, ImmVT),
8215 DAG.getConstant(0, dl, ImmVT));
8216 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8217 Select = DAG.getBitcast(VecVT, Select);
8218 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8219 DAG.getVectorIdxConstant(0, dl));
8220 }
8221 }
8222
8223 // insert elements one by one
8224 SDValue DstVec;
8225 if (HasConstElts) {
8226 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8227 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8228 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8229 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8230 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8231 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8232 } else {
8233 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8234 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8235 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8236 DstVec = DAG.getBitcast(VecVT, Imm);
8237 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8238 DAG.getVectorIdxConstant(0, dl));
8239 }
8240 } else
8241 DstVec = DAG.getUNDEF(VT);
8242
// Finally insert the non-constant elements over the constant base.
8243 for (unsigned InsertIdx : NonConstIdx) {
8244 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8245 Op.getOperand(InsertIdx),
8246 DAG.getVectorIdxConstant(InsertIdx, dl));
8247 }
8248 return DstVec;
8249}
8250
8251[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8252 switch (Opcode) {
8253 case X86ISD::PACKSS:
8254 case X86ISD::PACKUS:
8255 case X86ISD::FHADD:
8256 case X86ISD::FHSUB:
8257 case X86ISD::HADD:
8258 case X86ISD::HSUB:
8259 case X86ISD::HADDS:
8260 case X86ISD::HSUBS:
8261 return true;
8262 }
8263 return false;
8264}
8265
8266/// This is a helper function of LowerToHorizontalOp().
8267/// This function checks that the build_vector \p N in input implements a
8268/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8269/// may not match the layout of an x86 256-bit horizontal instruction.
8270/// In other words, if this returns true, then some extraction/insertion will
8271/// be required to produce a valid horizontal instruction.
8272///
8273/// Parameter \p Opcode defines the kind of horizontal operation to match.
8274/// For example, if \p Opcode is equal to ISD::ADD, then this function
8275/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8276/// is equal to ISD::SUB, then this function checks if this is a horizontal
8277/// arithmetic sub.
8278///
8279/// This function only analyzes elements of \p N whose indices are
8280/// in range [BaseIdx, LastIdx).
8281///
8282/// TODO: This function was originally used to match both real and fake partial
8283/// horizontal operations, but the index-matching logic is incorrect for that.
8284/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8285/// code because it is only used for partial h-op matching now?
8286static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8287 const SDLoc &DL, SelectionDAG &DAG,
8288 unsigned BaseIdx, unsigned LastIdx,
8289 SDValue &V0, SDValue &V1) {
8290 EVT VT = N->getValueType(0);
8291 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8292 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8293 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8294 "Invalid Vector in input!");
8295
8296 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8297 bool CanFold = true;
8298 unsigned ExpectedVExtractIdx = BaseIdx;
8299 unsigned NumElts = LastIdx - BaseIdx;
8300 V0 = DAG.getUNDEF(VT);
8301 V1 = DAG.getUNDEF(VT);
8302
8303 // Check if N implements a horizontal binop.
8304 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8305 SDValue Op = N->getOperand(i + BaseIdx);
8306
8307 // Skip UNDEFs.
8308 if (Op->isUndef()) {
8309 // Update the expected vector extract index.
8310 if (i * 2 == NumElts)
8311 ExpectedVExtractIdx = BaseIdx;
8312 ExpectedVExtractIdx += 2;
8313 continue;
8314 }
8315
8316 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8317
8318 if (!CanFold)
8319 break;
8320
8321 SDValue Op0 = Op.getOperand(0);
8322 SDValue Op1 = Op.getOperand(1);
8323
8324 // Try to match the following pattern:
8325 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8326 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
// NOTE(review): original line 8327 is missing from this extraction — likely
// the matching EXTRACT_VECTOR_ELT opcode test on Op1; confirm upstream.
8328 Op0.getOperand(0) == Op1.getOperand(0) &&
// NOTE(review): original lines 8329-8330 are missing here — presumably the
// constant-index checks on both extracts that close this expression; confirm.
8331 if (!CanFold)
8332 break;
8333
8334 unsigned I0 = Op0.getConstantOperandVal(1);
8335 unsigned I1 = Op1.getConstantOperandVal(1);
8336
// First half of the range feeds V0, second half feeds V1.
8337 if (i * 2 < NumElts) {
8338 if (V0.isUndef()) {
8339 V0 = Op0.getOperand(0);
8340 if (V0.getValueType() != VT)
8341 return false;
8342 }
8343 } else {
8344 if (V1.isUndef()) {
8345 V1 = Op0.getOperand(0);
8346 if (V1.getValueType() != VT)
8347 return false;
8348 }
8349 if (i * 2 == NumElts)
8350 ExpectedVExtractIdx = BaseIdx;
8351 }
8352
8353 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8354 if (I0 == ExpectedVExtractIdx)
8355 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8356 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8357 // Try to match the following dag sequence:
8358 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8359 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8360 } else
8361 CanFold = false;
8362
8363 ExpectedVExtractIdx += 2;
8364 }
8365
8366 return CanFold;
8367}
8368
8369/// Emit a sequence of two 128-bit horizontal add/sub followed by
8370/// a concat_vector.
8371///
8372/// This is a helper function of LowerToHorizontalOp().
8373/// This function expects two 256-bit vectors called V0 and V1.
8374/// At first, each vector is split into two separate 128-bit vectors.
8375/// Then, the resulting 128-bit vectors are used to implement two
8376/// horizontal binary operations.
8377///
8378/// The kind of horizontal binary operation is defined by \p X86Opcode.
8379///
8380/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8381/// the two new horizontal binop.
8382/// When Mode is set, the first horizontal binop dag node would take as input
8383/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8384/// horizontal binop dag node would take as input the lower 128-bit of V1
8385/// and the upper 128-bit of V1.
8386/// Example:
8387/// HADD V0_LO, V0_HI
8388/// HADD V1_LO, V1_HI
8389///
8390/// Otherwise, the first horizontal binop dag node takes as input the lower
8391/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8392/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8393/// Example:
8394/// HADD V0_LO, V1_LO
8395/// HADD V0_HI, V1_HI
8396///
8397/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8398/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8399/// the upper 128-bits of the result.
8400static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8401 const SDLoc &DL, SelectionDAG &DAG,
8402 unsigned X86Opcode, bool Mode,
8403 bool isUndefLO, bool isUndefHI) {
8404 MVT VT = V0.getSimpleValueType();
8405 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8406 "Invalid nodes in input!");
8407
8408 unsigned NumElts = VT.getVectorNumElements();
8409 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8410 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8411 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8412 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8413 MVT NewVT = V0_LO.getSimpleValueType();
8414
8415 SDValue LO = DAG.getUNDEF(NewVT);
8416 SDValue HI = DAG.getUNDEF(NewVT);
8417
8418 if (Mode) {
8419 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8420 if (!isUndefLO && !V0->isUndef())
8421 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8422 if (!isUndefHI && !V1->isUndef())
8423 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8424 } else {
8425 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8426 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8427 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8428
8429 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8430 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8431 }
8432
8433 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8434}
8435
8436/// Returns true iff \p BV builds a vector with the result equivalent to
8437/// the result of ADDSUB/SUBADD operation.
8438/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8439/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8440/// \p Opnd0 and \p Opnd1.
// NOTE(review): the opening signature line (original line 8441) is missing
// from this extraction — presumably "static bool isAddSubOrSubAdd(const
// BuildVectorSDNode *BV," given the doc comment above and the call site in
// lowerToAddSubOrFMAddSub. Restore from upstream before use.
8442 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8443 SDValue &Opnd0, SDValue &Opnd1,
8444 unsigned &NumExtracts, bool &IsSubAdd,
8445 bool &HasAllowContract) {
8446 using namespace SDPatternMatch;
8447
8448 MVT VT = BV->getSimpleValueType(0);
8449 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8450 return false;
8451
8452 unsigned NumElts = VT.getVectorNumElements();
8453 SDValue InVec0 = DAG.getUNDEF(VT);
8454 SDValue InVec1 = DAG.getUNDEF(VT);
8455
8456 NumExtracts = 0;
// Starts true for any non-empty vector; ANDed with each element's flag below.
8457 HasAllowContract = NumElts != 0;
8458
8459 // Odd-numbered elements in the input build vector are obtained from
8460 // adding/subtracting two integer/float elements.
8461 // Even-numbered elements in the input build vector are obtained from
8462 // subtracting/adding two integer/float elements.
8463 unsigned Opc[2] = {0, 0};
8464 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8465 SDValue Op = BV->getOperand(i);
8466
8467 // Skip 'undef' values.
8468 unsigned Opcode = Op.getOpcode();
8469 if (Opcode == ISD::UNDEF)
8470 continue;
8471
8472 // Early exit if we found an unexpected opcode.
8473 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8474 return false;
8475
8476 SDValue Op0 = Op.getOperand(0);
8477 SDValue Op1 = Op.getOperand(1);
8478
8479 // Try to match the following pattern:
8480 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8481 // Early exit if we cannot match that sequence.
8482 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8483 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8484 return false;
8485
8486 // We found a valid add/sub node, make sure its the same opcode as previous
8487 // elements for this parity.
8488 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8489 return false;
8490 Opc[i % 2] = Opcode;
8491
8492 // Update InVec0 and InVec1.
8493 if (InVec0.isUndef())
8494 InVec0 = Op0.getOperand(0);
8495 if (InVec1.isUndef())
8496 InVec1 = Op1.getOperand(0);
8497
8498 // Make sure that operands in input to each add/sub node always
8499 // come from a same pair of vectors.
8500 if (InVec0 != Op0.getOperand(0)) {
8501 if (Opcode == ISD::FSUB)
8502 return false;
8503
8504 // FADD is commutable. Try to commute the operands
8505 // and then test again.
8506 std::swap(Op0, Op1);
8507 if (InVec0 != Op0.getOperand(0))
8508 return false;
8509 }
8510
8511 if (InVec1 != Op1.getOperand(0))
8512 return false;
8513
8514 // Increment the number of extractions done.
8515 ++NumExtracts;
8516 HasAllowContract &= Op->getFlags().hasAllowContract();
8517 }
8518
8519 // Ensure we have found an opcode for both parities and that they are
8520 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8521 // inputs are undef.
8522 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8523 InVec0.isUndef() || InVec1.isUndef())
8524 return false;
8525
// Even lanes FADD means the pattern is SUBADD; otherwise it is ADDSUB.
8526 IsSubAdd = Opc[0] == ISD::FADD;
8527
8528 Opnd0 = InVec0;
8529 Opnd1 = InVec1;
8530 return true;
8531}
8532
8533/// Returns true if is possible to fold MUL and an idiom that has already been
8534/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8535/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8536/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8537///
8538/// Prior to calling this function it should be known that there is some
8539/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8540/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8541/// before replacement of such SDNode with ADDSUB operation. Thus the number
8542/// of \p Opnd0 uses is expected to be equal to 2.
8543/// For example, this function may be called for the following IR:
8544/// %AB = fmul fast <2 x double> %A, %B
8545/// %Sub = fsub fast <2 x double> %AB, %C
8546/// %Add = fadd fast <2 x double> %AB, %C
8547/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8548/// <2 x i32> <i32 0, i32 3>
8549/// There is a def for %Addsub here, which potentially can be replaced by
8550/// X86ISD::ADDSUB operation:
8551/// %Addsub = X86ISD::ADDSUB %AB, %C
8552/// and such ADDSUB can further be replaced with FMADDSUB:
8553/// %Addsub = FMADDSUB %A, %B, %C.
8554///
8555/// The main reason why this method is called before the replacement of the
8556/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8557/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8558/// FMADDSUB is.
8559static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8560 SelectionDAG &DAG, SDValue &Opnd0,
8561 SDValue &Opnd1, SDValue &Opnd2,
8562 unsigned ExpectedUses,
8563 bool AllowSubAddOrAddSubContract) {
8564 if (Opnd0.getOpcode() != ISD::FMUL ||
8565 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8566 return false;
8567
8568 // FIXME: These checks must match the similar ones in
8569 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8570 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8571 // or MUL + ADDSUB to FMADDSUB.
8572 bool AllowFusion =
8573 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8574 if (!AllowFusion)
8575 return false;
8576
8577 Opnd2 = Opnd1;
8578 Opnd1 = Opnd0.getOperand(1);
8579 Opnd0 = Opnd0.getOperand(0);
8580
8581 return true;
8582}
8583
8584/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8585/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8586/// X86ISD::FMSUBADD node.
// NOTE(review): the opening signature line (original line 8587) is missing
// from this extraction — presumably "static SDValue
// lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV," given the doc comment
// above and the uses of BV below. Restore from upstream before use.
8588 const SDLoc &DL,
8589 const X86Subtarget &Subtarget,
8590 SelectionDAG &DAG) {
8591 SDValue Opnd0, Opnd1;
8592 unsigned NumExtracts;
8593 bool IsSubAdd;
8594 bool HasAllowContract;
8595 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8596 HasAllowContract))
8597 return SDValue();
8598
8599 MVT VT = BV->getSimpleValueType(0);
8600
8601 // Try to generate X86ISD::FMADDSUB node here.
8602 SDValue Opnd2;
8603 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8604 HasAllowContract)) {
8605 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8606 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8607 }
8608
8609 // We only support ADDSUB.
8610 if (IsSubAdd)
8611 return SDValue();
8612
8613 // There are no known X86 targets with 512-bit ADDSUB instructions!
8614 // Convert to blend(fsub,fadd).
8615 if (VT.is512BitVector()) {
// Mask takes even lanes from the FSUB and odd lanes from the FADD.
8616 SmallVector<int> Mask;
8617 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8618 Mask.push_back(I);
8619 Mask.push_back(I + E + 1);
8620 }
8621 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8622 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8623 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8624 }
8625
8626 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8627}
8628
// NOTE(review): the opening signature line (original line 8629) is missing
// from this extraction — presumably "static bool isHopBuildVector(const
// BuildVectorSDNode *BV, SelectionDAG &DAG," given the call site in
// LowerToHorizontalOp. Restore from upstream before use.
8630 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8631 // Initialize outputs to known values.
8632 MVT VT = BV->getSimpleValueType(0);
8633 HOpcode = ISD::DELETED_NODE;
8634 V0 = DAG.getUNDEF(VT);
8635 V1 = DAG.getUNDEF(VT);
8636
8637 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8638 // half of the result is calculated independently from the 128-bit halves of
8639 // the inputs, so that makes the index-checking logic below more complicated.
8640 unsigned NumElts = VT.getVectorNumElements();
8641 unsigned GenericOpcode = ISD::DELETED_NODE;
8642 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8643 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8644 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8645 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8646 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8647 // Ignore undef elements.
8648 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8649 if (Op.isUndef())
8650 continue;
8651
8652 // If there's an opcode mismatch, we're done.
8653 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8654 return false;
8655
8656 // Initialize horizontal opcode.
8657 if (HOpcode == ISD::DELETED_NODE) {
8658 GenericOpcode = Op.getOpcode();
8659 switch (GenericOpcode) {
8660 // clang-format off
8661 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8662 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8663 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8664 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8665 default: return false;
8666 // clang-format on
8667 }
8668 }
8669
8670 SDValue Op0 = Op.getOperand(0);
8671 SDValue Op1 = Op.getOperand(1);
8672 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
// NOTE(review): original line 8673 is missing here — likely the matching
// EXTRACT_VECTOR_ELT opcode test on Op1; confirm upstream.
8674 Op0.getOperand(0) != Op1.getOperand(0) ||
// NOTE(review): original line 8675 is missing here — likely the
// constant-index check on Op0's index operand; confirm upstream.
8676 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8677 return false;
8678
8679 // The source vector is chosen based on which 64-bit half of the
8680 // destination vector is being calculated.
8681 if (j < NumEltsIn64Bits) {
8682 if (V0.isUndef())
8683 V0 = Op0.getOperand(0);
8684 } else {
8685 if (V1.isUndef())
8686 V1 = Op0.getOperand(0);
8687 }
8688
8689 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8690 if (SourceVec != Op0.getOperand(0))
8691 return false;
8692
8693 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8694 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8695 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8696 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8697 (j % NumEltsIn64Bits) * 2;
8698 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8699 continue;
8700
8701 // If this is not a commutative op, this does not match.
8702 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8703 return false;
8704
8705 // Addition is commutative, so try swapping the extract indexes.
8706 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8707 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8708 continue;
8709
8710 // Extract indexes do not match horizontal requirement.
8711 return false;
8712 }
8713 }
8714 // We matched. Opcode and operands are returned by reference as arguments.
8715 return true;
8716}
8717
// NOTE(review): the opening signature line (original line 8718) is missing
// from this extraction — presumably "static SDValue getHopForBuildVector(
// const BuildVectorSDNode *BV," given the call site in LowerToHorizontalOp.
// Restore from upstream before use.
8719 const SDLoc &DL, SelectionDAG &DAG,
8720 unsigned HOpcode, SDValue V0, SDValue V1) {
8721 // If either input vector is not the same size as the build vector,
8722 // extract/insert the low bits to the correct size.
8723 // This is free (examples: zmm --> xmm, xmm --> ymm).
8724 MVT VT = BV->getSimpleValueType(0);
8725 unsigned Width = VT.getSizeInBits();
8726 if (V0.getValueSizeInBits() > Width)
8727 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8728 else if (V0.getValueSizeInBits() < Width)
8729 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8730
8731 if (V1.getValueSizeInBits() > Width)
8732 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8733 else if (V1.getValueSizeInBits() < Width)
8734 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8735
// Track which result elements the build_vector actually demands.
8736 unsigned NumElts = VT.getVectorNumElements();
8737 APInt DemandedElts = APInt::getAllOnes(NumElts);
8738 for (unsigned i = 0; i != NumElts; ++i)
8739 if (BV->getOperand(i).isUndef())
8740 DemandedElts.clearBit(i);
8741
8742 // If we don't need the upper xmm, then perform as a xmm hop.
8743 unsigned HalfNumElts = NumElts / 2;
8744 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8745 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8746 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8747 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8748 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8749 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8750 }
8751
8752 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8753}
8754
8755/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
// NOTE(review): the opening signature line (original line 8756) is missing
// from this extraction — presumably "static SDValue LowerToHorizontalOp(
// const BuildVectorSDNode *BV, const SDLoc &DL," given the doc comment above
// and the uses of BV/DL below. Restore from upstream before use.
8757 const X86Subtarget &Subtarget,
8758 SelectionDAG &DAG) {
8759 // We need at least 2 non-undef elements to make this worthwhile by default.
8760 unsigned NumNonUndefs =
8761 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8762 if (NumNonUndefs < 2)
8763 return SDValue();
8764
8765 // There are 4 sets of horizontal math operations distinguished by type:
8766 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8767 // subtarget feature. Try to match those "native" patterns first.
8768 MVT VT = BV->getSimpleValueType(0);
8769 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8770 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8771 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8772 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8773 unsigned HOpcode;
8774 SDValue V0, V1;
8775 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8776 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8777 }
8778
8779 // Try harder to match 256-bit ops by using extract/concat.
8780 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8781 return SDValue();
8782
8783 // Count the number of UNDEF operands in the build_vector in input.
8784 unsigned NumElts = VT.getVectorNumElements();
8785 unsigned Half = NumElts / 2;
8786 unsigned NumUndefsLO = 0;
8787 unsigned NumUndefsHI = 0;
8788 for (unsigned i = 0, e = Half; i != e; ++i)
8789 if (BV->getOperand(i)->isUndef())
8790 NumUndefsLO++;
8791
8792 for (unsigned i = Half, e = NumElts; i != e; ++i)
8793 if (BV->getOperand(i)->isUndef())
8794 NumUndefsHI++;
8795
// Integer 256-bit case: match each 128-bit half as a partial h-op and then
// expand to two 128-bit hops plus a concat (no AVX2 needed).
8796 SDValue InVec0, InVec1;
8797 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8798 SDValue InVec2, InVec3;
8799 unsigned X86Opcode;
8800 bool CanFold = true;
8801
8802 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8803 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8804 InVec3) &&
8805 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8806 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8807 X86Opcode = X86ISD::HADD;
8808 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8809 InVec1) &&
8810 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8811 InVec3) &&
8812 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8813 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8814 X86Opcode = X86ISD::HSUB;
8815 else
8816 CanFold = false;
8817
8818 if (CanFold) {
8819 // Do not try to expand this build_vector into a pair of horizontal
8820 // add/sub if we can emit a pair of scalar add/sub.
8821 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8822 return SDValue();
8823
8824 // Convert this build_vector into a pair of horizontal binops followed by
8825 // a concat vector. We must adjust the outputs from the partial horizontal
8826 // matching calls above to account for undefined vector halves.
8827 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8828 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8829 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8830 bool isUndefLO = NumUndefsLO == Half;
8831 bool isUndefHI = NumUndefsHI == Half;
8832 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8833 isUndefHI);
8834 }
8835 }
8836
8837 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8838 VT == MVT::v16i16) {
8839 unsigned X86Opcode;
8840 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8841 InVec1))
8842 X86Opcode = X86ISD::HADD;
8843 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8844 InVec1))
8845 X86Opcode = X86ISD::HSUB;
8846 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8847 InVec1))
8848 X86Opcode = X86ISD::FHADD;
8849 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8850 InVec1))
8851 X86Opcode = X86ISD::FHSUB;
8852 else
8853 return SDValue();
8854
8855 // Don't try to expand this build_vector into a pair of horizontal add/sub
8856 // if we can simply emit a pair of scalar add/sub.
8857 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8858 return SDValue();
8859
8860 // Convert this build_vector into two horizontal add/sub followed by
8861 // a concat vector.
8862 bool isUndefLO = NumUndefsLO == Half;
8863 bool isUndefHI = NumUndefsHI == Half;
8864 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8865 isUndefLO, isUndefHI);
8866 }
8867
8868 return SDValue();
8869}
8870
8871static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8872 SelectionDAG &DAG);
8873
8874/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8875/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8876/// just apply the bit to the vectors.
8877/// NOTE: Its not in our interest to start make a general purpose vectorizer
8878/// from this, but enough scalar bit operations are created from the later
8879/// legalization + scalarization stages to need basic support.
// NOTE(review): the opening signature line (original line 8880) is missing
// from this extraction — presumably "static SDValue lowerBuildVectorToBitOp(
// BuildVectorSDNode *Op, const SDLoc &DL," given the doc comment above and
// the uses of Op/DL below. Restore from upstream before use.
8881 const X86Subtarget &Subtarget,
8882 SelectionDAG &DAG) {
8883 MVT VT = Op->getSimpleValueType(0);
8884 unsigned NumElems = VT.getVectorNumElements();
8885 unsigned ElemSize = VT.getScalarSizeInBits();
8886 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8887
8888 // Check that all elements have the same opcode.
8889 // TODO: Should we allow UNDEFS and if so how many?
8890 unsigned Opcode = Op->getOperand(0).getOpcode();
8891 for (unsigned i = 1; i < NumElems; ++i)
8892 if (Opcode != Op->getOperand(i).getOpcode())
8893 return SDValue();
8894
8895 // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
8896 bool IsShift = false;
8897 switch (Opcode) {
8898 default:
8899 return SDValue();
8900 case ISD::SHL:
8901 case ISD::SRL:
8902 case ISD::SRA:
8903 IsShift = true;
8904 break;
8905 case ISD::AND:
8906 case ISD::XOR:
8907 case ISD::OR:
8908 // Don't do this if the buildvector is a splat - we'd replace one
8909 // constant with an entire vector.
8910 if (Op->getSplatValue())
8911 return SDValue();
8912 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8913 return SDValue();
8914 break;
8915 }
8916
8917 // Collect elements.
8918 bool RHSAllConst = true;
8919 SmallVector<SDValue, 4> LHSElts, RHSElts;
8920 for (SDValue Elt : Op->ops()) {
8921 SDValue LHS = Elt.getOperand(0);
8922 SDValue RHS = Elt.getOperand(1);
8923 RHSAllConst &= isa<ConstantSDNode>(RHS);
8924 LHSElts.push_back(LHS);
8925 RHSElts.push_back(RHS);
8926 }
8927
8928 // Canonicalize shift amounts.
8929 if (IsShift) {
8930 // We expect the canonicalized RHS operand to be the constant.
8931 // TODO: Permit non-constant XOP/AVX2 cases?
8932 if (!RHSAllConst)
8933 return SDValue();
8934
8935 // Extend shift amounts.
8936 for (SDValue &Op1 : RHSElts)
8937 if (Op1.getValueSizeInBits() != ElemSize)
8938 Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType())
;
8939
8940 // Limit to shifts by uniform immediates.
8941 // TODO: Only accept vXi8/vXi64 special cases?
8942 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8943 if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8944 return SDValue();
8945 }
8946 assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
8947 [ElemSize](SDValue V) {
8948 return V.getValueSizeInBits() == ElemSize;
8949 }) &&
8950 "Element size mismatch");
8951
8952 // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
8953 // a load or RHS must be constant.
8954 SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
8955 /*IsAfterLegalize=*/true);
8956 SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
8957 /*IsAfterLegalize=*/true);
8958 if (!LHS && !RHS && !RHSAllConst)
8959 return SDValue();
8960
8961 if (!LHS)
8962 LHS = DAG.getBuildVector(VT, DL, LHSElts);
8963 if (!RHS)
8964 RHS = DAG.getBuildVector(VT, DL, RHSElts);
8965 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8966
8967 if (!IsShift)
8968 return Res;
8969
8970 // Immediately lower the shift to ensure the constant build vector doesn't
8971 // get converted to a constant pool before the shift is lowered.
8972 return LowerShift(Res, Subtarget, DAG);
8973}
8974
8975static bool isShuffleFoldableLoad(SDValue);
8976
8977/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8978/// representing a blend.
8980 X86Subtarget const &Subtarget,
8981 SelectionDAG &DAG) {
8982 MVT VT = BVOp->getSimpleValueType(0u);
8983
8984 if (VT != MVT::v4f64)
8985 return SDValue();
8986
8987 // Collect unique operands.
8988 auto UniqueOps = SmallSet<SDValue, 16u>();
8989 for (SDValue Op : BVOp->ops()) {
8990 if (isIntOrFPConstant(Op) || Op.isUndef())
8991 return SDValue();
8992 UniqueOps.insert(Op);
8993 }
8994
8995 // Candidate BUILD_VECTOR must have 2 unique operands.
8996 if (UniqueOps.size() != 2u)
8997 return SDValue();
8998
8999 SDValue Op0 = BVOp->getOperand(0u);
9000 UniqueOps.erase(Op0);
9001 SDValue Op1 = *UniqueOps.begin();
9002
9003 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
9004 isShuffleFoldableLoad(Op1)) {
9005 // Create shuffle mask.
9006 auto const NumElems = VT.getVectorNumElements();
9007 SmallVector<int, 16u> Mask(NumElems);
9008 for (auto I = 0u; I < NumElems; ++I) {
9009 SDValue Op = BVOp->getOperand(I);
9010 Mask[I] = Op == Op0 ? I : I + NumElems;
9011 }
9012 // Create shuffle of splats.
9013 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
9014 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
9015 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
9016 }
9017
9018 return SDValue();
9019}
9020
9021/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
9023 X86Subtarget const &Subtarget,
9024 SelectionDAG &DAG) {
9025 using namespace SDPatternMatch;
9026 MVT VT = BVOp->getSimpleValueType(0);
9027 MVT SVT = VT.getScalarType();
9028 unsigned NumElts = VT.getVectorNumElements();
9029 unsigned EltBits = SVT.getSizeInBits();
9030
9031 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9032 return SDValue();
9033
9034 unsigned WideBits = 2 * EltBits;
9035 MVT WideSVT = MVT::getIntegerVT(WideBits);
9036 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
9037 if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
9038 return SDValue();
9039
9041 for (unsigned I = 0; I != NumElts; I += 2) {
9042 SDValue Op0 = BVOp->getOperand(I + 0);
9043 SDValue Op1 = BVOp->getOperand(I + 1);
9044
9045 if (Op0.isUndef() && Op1.isUndef()) {
9046 WideOps.push_back(DAG.getUNDEF(WideSVT));
9047 continue;
9048 }
9049
9050 // TODO: Constant repacking?
9051
9052 // Merge scalars that have been split from the same source.
9053 SDValue X, Y;
9054 if (sd_match(Op0, m_Trunc(m_Value(X))) &&
9055 sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
9057 X.getValueType().bitsGE(WideSVT)) {
9058 if (X.getValueType().bitsGT(WideSVT))
9059 X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
9060 WideOps.push_back(X);
9061 continue;
9062 }
9063
9064 return SDValue();
9065 }
9066
9067 assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
9068 return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
9069}
9070
9071/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9072/// functionality to do this, so it's all zeros, all ones, or some derivation
9073/// that is cheap to calculate.
9075 SelectionDAG &DAG,
9076 const X86Subtarget &Subtarget) {
9077 MVT VT = Op.getSimpleValueType();
9078
9079 // Vectors containing all zeros can be matched by pxor and xorps.
9080 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9081 return Op;
9082
9083 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9084 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9085 // vpcmpeqd on 256-bit vectors.
9086 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9087 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9088 return Op;
9089
9090 return getOnesVector(VT, DAG, DL);
9091 }
9092
9093 return SDValue();
9094}
9095
9096/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9097/// from a vector of source values and a vector of extraction indices.
9098/// The vectors might be manipulated to match the type of the permute op.
9099static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9100 const SDLoc &DL, SelectionDAG &DAG,
9101 const X86Subtarget &Subtarget) {
9102 MVT ShuffleVT = VT;
9103 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9104 unsigned NumElts = VT.getVectorNumElements();
9105 unsigned SizeInBits = VT.getSizeInBits();
9106
9107 // Adjust IndicesVec to match VT size.
9108 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9109 "Illegal variable permute mask size");
9110 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
9111 // Narrow/widen the indices vector to the correct size.
9112 if (IndicesVec.getValueSizeInBits() > SizeInBits)
9113 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9114 NumElts * VT.getScalarSizeInBits());
9115 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
9116 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
9117 SDLoc(IndicesVec), SizeInBits);
9118 // Zero-extend the index elements within the vector.
9119 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9120 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
9121 IndicesVT, IndicesVec);
9122 }
9123 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9124
9125 // Handle SrcVec that don't match VT type.
9126 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9127 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9128 // Handle larger SrcVec by treating it as a larger permute.
9129 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9130 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9131 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9132 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9133 Subtarget, DAG, SDLoc(IndicesVec));
9134 SDValue NewSrcVec =
9135 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9136 if (NewSrcVec)
9137 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9138 return SDValue();
9139 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9140 // Widen smaller SrcVec to match VT.
9141 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9142 } else
9143 return SDValue();
9144 }
9145
9146 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9147 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9148 EVT SrcVT = Idx.getValueType();
9149 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9150 uint64_t IndexScale = 0;
9151 uint64_t IndexOffset = 0;
9152
9153 // If we're scaling a smaller permute op, then we need to repeat the
9154 // indices, scaling and offsetting them as well.
9155 // e.g. v4i32 -> v16i8 (Scale = 4)
9156 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9157 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9158 for (uint64_t i = 0; i != Scale; ++i) {
9159 IndexScale |= Scale << (i * NumDstBits);
9160 IndexOffset |= i << (i * NumDstBits);
9161 }
9162
9163 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9164 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9165 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9166 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9167 return Idx;
9168 };
9169
9170 unsigned Opcode = 0;
9171 switch (VT.SimpleTy) {
9172 default:
9173 break;
9174 case MVT::v16i8:
9175 if (Subtarget.hasSSSE3())
9176 Opcode = X86ISD::PSHUFB;
9177 break;
9178 case MVT::v8i16:
9179 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9180 Opcode = X86ISD::VPERMV;
9181 else if (Subtarget.hasSSSE3()) {
9182 Opcode = X86ISD::PSHUFB;
9183 ShuffleVT = MVT::v16i8;
9184 }
9185 break;
9186 case MVT::v4f32:
9187 case MVT::v4i32:
9188 if (Subtarget.hasAVX()) {
9189 Opcode = X86ISD::VPERMILPV;
9190 ShuffleVT = MVT::v4f32;
9191 } else if (Subtarget.hasSSSE3()) {
9192 Opcode = X86ISD::PSHUFB;
9193 ShuffleVT = MVT::v16i8;
9194 }
9195 break;
9196 case MVT::v2f64:
9197 case MVT::v2i64:
9198 if (Subtarget.hasAVX()) {
9199 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9200 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9201 Opcode = X86ISD::VPERMILPV;
9202 ShuffleVT = MVT::v2f64;
9203 } else if (Subtarget.hasSSE41()) {
9204 // SSE41 can compare v2i64 - select between indices 0 and 1.
9205 return DAG.getSelectCC(
9206 DL, IndicesVec,
9207 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9208 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9209 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9211 }
9212 break;
9213 case MVT::v32i8:
9214 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9215 Opcode = X86ISD::VPERMV;
9216 else if (Subtarget.hasXOP()) {
9217 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9218 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9219 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9220 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9221 return DAG.getNode(
9223 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9224 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9225 } else if (Subtarget.hasAVX()) {
9226 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9227 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9228 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9229 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9230 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9232 // Permute Lo and Hi and then select based on index range.
9233 // This works as SHUFB uses bits[3:0] to permute elements and we don't
9234 // care about the bit[7] as its just an index vector.
9235 SDValue Idx = Ops[2];
9236 EVT VT = Idx.getValueType();
9237 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9238 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9239 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9241 };
9242 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9243 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9244 PSHUFBBuilder);
9245 }
9246 break;
9247 case MVT::v16i16:
9248 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9249 Opcode = X86ISD::VPERMV;
9250 else if (Subtarget.hasAVX()) {
9251 // Scale to v32i8 and perform as v32i8.
9252 IndicesVec = ScaleIndices(IndicesVec, 2);
9253 return DAG.getBitcast(
9255 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9256 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9257 }
9258 break;
9259 case MVT::v8f32:
9260 case MVT::v8i32:
9261 if (Subtarget.hasAVX2())
9262 Opcode = X86ISD::VPERMV;
9263 else if (Subtarget.hasAVX()) {
9264 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9265 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9266 {0, 1, 2, 3, 0, 1, 2, 3});
9267 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9268 {4, 5, 6, 7, 4, 5, 6, 7});
9269 if (Subtarget.hasXOP())
9270 return DAG.getBitcast(
9271 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9272 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9273 // Permute Lo and Hi and then select based on index range.
9274 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9275 SDValue Res = DAG.getSelectCC(
9276 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9277 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9278 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9280 return DAG.getBitcast(VT, Res);
9281 }
9282 break;
9283 case MVT::v4i64:
9284 case MVT::v4f64:
9285 if (Subtarget.hasAVX512()) {
9286 if (!Subtarget.hasVLX()) {
9287 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9288 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9289 SDLoc(SrcVec));
9290 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9291 DAG, SDLoc(IndicesVec));
9292 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9293 DAG, Subtarget);
9294 return extract256BitVector(Res, 0, DAG, DL);
9295 }
9296 Opcode = X86ISD::VPERMV;
9297 } else if (Subtarget.hasAVX()) {
9298 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9299 SDValue LoLo =
9300 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9301 SDValue HiHi =
9302 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9303 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9304 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9305 if (Subtarget.hasXOP())
9306 return DAG.getBitcast(
9307 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9308 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9309 // Permute Lo and Hi and then select based on index range.
9310 // This works as VPERMILPD only uses index bit[1] to permute elements.
9311 SDValue Res = DAG.getSelectCC(
9312 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9313 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9314 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9316 return DAG.getBitcast(VT, Res);
9317 }
9318 break;
9319 case MVT::v64i8:
9320 if (Subtarget.hasVBMI())
9321 Opcode = X86ISD::VPERMV;
9322 break;
9323 case MVT::v32i16:
9324 if (Subtarget.hasBWI())
9325 Opcode = X86ISD::VPERMV;
9326 break;
9327 case MVT::v16f32:
9328 case MVT::v16i32:
9329 case MVT::v8f64:
9330 case MVT::v8i64:
9331 if (Subtarget.hasAVX512())
9332 Opcode = X86ISD::VPERMV;
9333 break;
9334 }
9335 if (!Opcode)
9336 return SDValue();
9337
9338 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9339 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9340 "Illegal variable permute shuffle type");
9341
9342 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9343 if (Scale > 1)
9344 IndicesVec = ScaleIndices(IndicesVec, Scale);
9345
9346 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9347 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9348
9349 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9350 SDValue Res = Opcode == X86ISD::VPERMV
9351 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9352 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9353 return DAG.getBitcast(VT, Res);
9354}
9355
9356// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9357// reasoned to be a permutation of a vector by indices in a non-constant vector.
9358// (build_vector (extract_elt V, (extract_elt I, 0)),
9359// (extract_elt V, (extract_elt I, 1)),
9360// ...
9361// ->
9362// (vpermv I, V)
9363//
9364// TODO: Handle undefs
9365// TODO: Utilize pshufb and zero mask blending to support more efficient
9366// construction of vectors with constant-0 elements.
9367static SDValue
9369 SelectionDAG &DAG,
9370 const X86Subtarget &Subtarget) {
9371 SDValue SrcVec, IndicesVec;
9372
9373 auto PeekThroughFreeze = [](SDValue N) {
9374 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9375 return N->getOperand(0);
9376 return N;
9377 };
9378 // Check for a match of the permute source vector and permute index elements.
9379 // This is done by checking that the i-th build_vector operand is of the form:
9380 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9381 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9382 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9383 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9384 return SDValue();
9385
9386 // If this is the first extract encountered in V, set the source vector,
9387 // otherwise verify the extract is from the previously defined source
9388 // vector.
9389 if (!SrcVec)
9390 SrcVec = Op.getOperand(0);
9391 else if (SrcVec != Op.getOperand(0))
9392 return SDValue();
9393 SDValue ExtractedIndex = Op->getOperand(1);
9394 // Peek through extends.
9395 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9396 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9397 ExtractedIndex = ExtractedIndex.getOperand(0);
9398 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9399 return SDValue();
9400
9401 // If this is the first extract from the index vector candidate, set the
9402 // indices vector, otherwise verify the extract is from the previously
9403 // defined indices vector.
9404 if (!IndicesVec)
9405 IndicesVec = ExtractedIndex.getOperand(0);
9406 else if (IndicesVec != ExtractedIndex.getOperand(0))
9407 return SDValue();
9408
9409 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9410 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9411 return SDValue();
9412 }
9413
9414 MVT VT = V.getSimpleValueType();
9415 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9416}
9417
9418SDValue
9419X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9420 SDLoc dl(Op);
9421
9422 MVT VT = Op.getSimpleValueType();
9423 MVT EltVT = VT.getVectorElementType();
9424 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9425 unsigned NumElems = Op.getNumOperands();
9426
9427 // Generate vectors for predicate vectors.
9428 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9429 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9430
9431 if (VT.getVectorElementType() == MVT::bf16 &&
9432 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9433 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9434
9435 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9436 return VectorCst;
9437
9438 unsigned EVTBits = EltVT.getSizeInBits();
9439 APInt UndefMask = APInt::getZero(NumElems);
9440 APInt FrozenUndefMask = APInt::getZero(NumElems);
9441 APInt ZeroMask = APInt::getZero(NumElems);
9442 APInt NonZeroMask = APInt::getZero(NumElems);
9443 bool IsAllConstants = true;
9444 bool OneUseFrozenUndefs = true;
9445 SmallSet<SDValue, 8> Values;
9446 unsigned NumConstants = NumElems;
9447 for (unsigned i = 0; i < NumElems; ++i) {
9448 SDValue Elt = Op.getOperand(i);
9449 if (Elt.isUndef()) {
9450 UndefMask.setBit(i);
9451 continue;
9452 }
9453 if (ISD::isFreezeUndef(Elt.getNode())) {
9454 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9455 FrozenUndefMask.setBit(i);
9456 continue;
9457 }
9458 Values.insert(Elt);
9459 if (!isIntOrFPConstant(Elt)) {
9460 IsAllConstants = false;
9461 NumConstants--;
9462 }
9463 if (X86::isZeroNode(Elt)) {
9464 ZeroMask.setBit(i);
9465 } else {
9466 NonZeroMask.setBit(i);
9467 }
9468 }
9469
9470 // All undef vector. Return an UNDEF.
9471 if (UndefMask.isAllOnes())
9472 return DAG.getUNDEF(VT);
9473
9474 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9475 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9476 return DAG.getFreeze(DAG.getUNDEF(VT));
9477
9478 // All undef/freeze(undef)/zero vector. Return a zero vector.
9479 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9480 return getZeroVector(VT, Subtarget, DAG, dl);
9481
9482 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9483 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9484 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9485 // and blend the FREEZE-UNDEF operands back in.
9486 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9487 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9488 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9489 SmallVector<int, 16> BlendMask(NumElems, -1);
9490 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9491 for (unsigned i = 0; i < NumElems; ++i) {
9492 if (UndefMask[i]) {
9493 BlendMask[i] = -1;
9494 continue;
9495 }
9496 BlendMask[i] = i;
9497 if (!FrozenUndefMask[i])
9498 Elts[i] = Op.getOperand(i);
9499 else
9500 BlendMask[i] += NumElems;
9501 }
9502 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9503 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9504 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9505 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9506 }
9507
9508 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9509
9510 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9511 // be better off lowering to a smaller build vector and padding with
9512 // undef/zero.
9513 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9515 unsigned UpperElems = NumElems / 2;
9516 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9517 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9518 if (NumUpperUndefsOrZeros >= UpperElems) {
9519 if (VT.is512BitVector() &&
9520 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9521 UpperElems = NumElems - (NumElems / 4);
9522 // If freeze(undef) is in any upper elements, force to zero.
9523 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9524 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9525 SDValue NewBV =
9526 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9527 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9528 }
9529 }
9530
9531 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9532 return AddSub;
9533 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9534 return HorizontalOp;
9535 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9536 return Broadcast;
9537 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9538 return BitOp;
9539 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9540 return Blend;
9541 if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
9542 return WideBV;
9543
9544 unsigned NumZero = ZeroMask.popcount();
9545 unsigned NumNonZero = NonZeroMask.popcount();
9546
9547 // If we are inserting one variable into a vector of non-zero constants, try
9548 // to avoid loading each constant element as a scalar. Load the constants as a
9549 // vector and then insert the variable scalar element. If insertion is not
9550 // supported, fall back to a shuffle to get the scalar blended with the
9551 // constants. Insertion into a zero vector is handled as a special-case
9552 // somewhere below here.
9553 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9554 FrozenUndefMask.isZero() &&
9557 // Create an all-constant vector. The variable element in the old
9558 // build vector is replaced by undef in the constant vector. Save the
9559 // variable scalar element and its index for use in the insertelement.
9560 LLVMContext &Context = *DAG.getContext();
9561 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9562 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9563 SDValue VarElt;
9564 SDValue InsIndex;
9565 for (unsigned i = 0; i != NumElems; ++i) {
9566 SDValue Elt = Op.getOperand(i);
9567 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9568 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9569 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9570 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9571 else if (!Elt.isUndef()) {
9572 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9573 "Expected one variable element in this vector");
9574 VarElt = Elt;
9575 InsIndex = DAG.getVectorIdxConstant(i, dl);
9576 }
9577 }
9578 Constant *CV = ConstantVector::get(ConstVecOps);
9579 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9580
9581 // The constants we just created may not be legal (eg, floating point). We
9582 // must lower the vector right here because we can not guarantee that we'll
9583 // legalize it before loading it. This is also why we could not just create
9584 // a new build vector here. If the build vector contains illegal constants,
9585 // it could get split back up into a series of insert elements.
9586 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9587 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9588 MachineFunction &MF = DAG.getMachineFunction();
9589 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9590 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9591 unsigned InsertC = InsIndex->getAsZExtVal();
9592 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9593 if (InsertC < NumEltsInLow128Bits)
9594 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9595
9596 // There's no good way to insert into the high elements of a >128-bit
9597 // vector, so use shuffles to avoid an extract/insert sequence.
9598 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9599 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9600 SmallVector<int, 8> ShuffleMask;
9601 unsigned NumElts = VT.getVectorNumElements();
9602 for (unsigned i = 0; i != NumElts; ++i)
9603 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9604 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9605 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9606 }
9607
9608 // Special case for single non-zero, non-undef, element.
9609 if (NumNonZero == 1) {
9610 unsigned Idx = NonZeroMask.countr_zero();
9611 SDValue Item = Op.getOperand(Idx);
9612
9613 // If we have a constant or non-constant insertion into the low element of
9614 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9615 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9616 // depending on what the source datatype is.
9617 if (Idx == 0) {
9618 if (NumZero == 0)
9619 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9620
9621 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9622 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9623 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9624 assert((VT.is128BitVector() || VT.is256BitVector() ||
9625 VT.is512BitVector()) &&
9626 "Expected an SSE value type!");
9627 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9628 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9629 // zero vector.
9630 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9631 }
9632
9633 // We can't directly insert an i8 or i16 into a vector, so zero extend
9634 // it to i32 first.
9635 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9636 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9637 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9638 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9639 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9640 return DAG.getBitcast(VT, Item);
9641 }
9642 }
9643
9644 // Is it a vector logical left shift?
9645 if (NumElems == 2 && Idx == 1 &&
9646 X86::isZeroNode(Op.getOperand(0)) &&
9647 !X86::isZeroNode(Op.getOperand(1))) {
9648 unsigned NumBits = VT.getSizeInBits();
9649 return getVShift(true, VT,
9651 VT, Op.getOperand(1)),
9652 NumBits/2, DAG, *this, dl);
9653 }
9654
9655 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9656 return SDValue();
9657
9658 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9659 // is a non-constant being inserted into an element other than the low one,
9660 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9661 // movd/movss) to move this into the low element, then shuffle it into
9662 // place.
9663 if (EVTBits == 32) {
9664 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9665 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9666 }
9667 }
9668
9669 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9670 if (Values.size() == 1) {
9671 if (EVTBits == 32) {
9672 // Instead of a shuffle like this:
9673 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9674 // Check if it's possible to issue this instead.
9675 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9676 unsigned Idx = NonZeroMask.countr_zero();
9677 SDValue Item = Op.getOperand(Idx);
9678 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9679 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9680 }
9681 return SDValue();
9682 }
9683
9684 // A vector full of immediates; various special cases are already
9685 // handled, so this is best done with a single constant-pool load.
9686 if (IsAllConstants)
9687 return SDValue();
9688
9689 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9690 return V;
9691
9692 // See if we can use a vector load to get all of the elements.
9693 {
9694 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9695 if (SDValue LD =
9696 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9697 return LD;
9698 }
9699
9700 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9701 // build_vector and broadcast it.
9702 // TODO: We could probably generalize this more.
9703 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9704 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9705 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9706 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9707 // Make sure all the even/odd operands match.
9708 for (unsigned i = 2; i != NumElems; ++i)
9709 if (Ops[i % 2] != Op.getOperand(i))
9710 return false;
9711 return true;
9712 };
9713 if (CanSplat(Op, NumElems, Ops)) {
9714 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9715 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9716 // Create a new build vector and cast to v2i64/v2f64.
9717 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9718 DAG.getBuildVector(NarrowVT, dl, Ops));
9719 // Broadcast from v2i64/v2f64 and cast to final VT.
9720 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9721 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9722 NewBV));
9723 }
9724 }
9725
9726 // For AVX-length vectors, build the individual 128-bit pieces and use
9727 // shuffles to put them in place.
9728 if (VT.getSizeInBits() > 128) {
9729 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9730
9731 // Build both the lower and upper subvector.
9732 SDValue Lower =
9733 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9735 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9736
9737 // Recreate the wider vector with the lower and upper part.
9738 return concatSubVectors(Lower, Upper, DAG, dl);
9739 }
9740
9741 // Let legalizer expand 2-wide build_vectors.
9742 if (EVTBits == 64) {
9743 if (NumNonZero == 1) {
9744 // One half is zero or undef.
9745 unsigned Idx = NonZeroMask.countr_zero();
9746 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9747 Op.getOperand(Idx));
9748 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9749 }
9750 return SDValue();
9751 }
9752
9753 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9754 if (EVTBits == 8 && NumElems == 16)
9755 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9756 NumZero, DAG, Subtarget))
9757 return V;
9758
9759 if (EltVT == MVT::i16 && NumElems == 8)
9760 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9761 NumZero, DAG, Subtarget))
9762 return V;
9763
9764 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9765 if (EVTBits == 32 && NumElems == 4)
9766 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9767 return V;
9768
9769 // If element VT is == 32 bits, turn it into a number of shuffles.
9770 if (NumElems == 4 && NumZero > 0) {
9771 SmallVector<SDValue, 8> Ops(NumElems);
9772 for (unsigned i = 0; i < 4; ++i) {
9773 bool isZero = !NonZeroMask[i];
9774 if (isZero)
9775 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9776 else
9777 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9778 }
9779
9780 for (unsigned i = 0; i < 2; ++i) {
9781 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9782 default: llvm_unreachable("Unexpected NonZero count");
9783 case 0:
9784 Ops[i] = Ops[i*2]; // Must be a zero vector.
9785 break;
9786 case 1:
9787 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9788 break;
9789 case 2:
9790 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9791 break;
9792 case 3:
9793 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9794 break;
9795 }
9796 }
9797
9798 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9799 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9800 int MaskVec[] = {
9801 Reverse1 ? 1 : 0,
9802 Reverse1 ? 0 : 1,
9803 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9804 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9805 };
9806 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9807 }
9808
9809 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9810
9811 // Check for a build vector from mostly shuffle plus few inserting.
9812 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9813 return Sh;
9814
9815 // For SSE 4.1, use insertps to put the high elements into the low element.
9816 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9818 if (!Op.getOperand(0).isUndef())
9819 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9820 else
9821 Result = DAG.getUNDEF(VT);
9822
9823 for (unsigned i = 1; i < NumElems; ++i) {
9824 if (Op.getOperand(i).isUndef()) continue;
9825 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9826 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9827 }
9828 return Result;
9829 }
9830
9831 // Otherwise, expand into a number of unpckl*, start by extending each of
9832 // our (non-undef) elements to the full vector width with the element in the
9833 // bottom slot of the vector (which generates no code for SSE).
9834 SmallVector<SDValue, 8> Ops(NumElems);
9835 for (unsigned i = 0; i < NumElems; ++i) {
9836 if (!Op.getOperand(i).isUndef())
9837 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9838 else
9839 Ops[i] = DAG.getUNDEF(VT);
9840 }
9841
9842 // Next, we iteratively mix elements, e.g. for v4f32:
9843 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9844 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9845 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9846 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9847 // Generate scaled UNPCKL shuffle mask.
9848 SmallVector<int, 16> Mask;
9849 for(unsigned i = 0; i != Scale; ++i)
9850 Mask.push_back(i);
9851 for (unsigned i = 0; i != Scale; ++i)
9852 Mask.push_back(NumElems+i);
9853 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9854
9855 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9856 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9857 }
9858 return Ops[0];
9859}
9860
9861// 256-bit AVX can use the vinsertf128 instruction
9862// to create 256-bit vectors from two other 128-bit ones.
9863// TODO: Detect subvector broadcast here instead of DAG combine?
9865 SelectionDAG &DAG,
9866 const X86Subtarget &Subtarget) {
9867 MVT ResVT = Op.getSimpleValueType();
9868 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9869 "Value type must be 256-/512-bit wide");
9870
9871 unsigned NumOperands = Op.getNumOperands();
9872 unsigned NumFreezeUndef = 0;
9873 unsigned NumZero = 0;
9874 unsigned NumNonZero = 0;
9875 unsigned NonZeros = 0;
9876 SmallSet<SDValue, 4> Undefs;
9877 for (unsigned i = 0; i != NumOperands; ++i) {
9878 SDValue SubVec = Op.getOperand(i);
9879 if (SubVec.isUndef())
9880 continue;
9881 if (ISD::isFreezeUndef(SubVec.getNode())) {
9882 // If the freeze(undef) has multiple uses then we must fold to zero.
9883 if (SubVec.hasOneUse()) {
9884 ++NumFreezeUndef;
9885 } else {
9886 ++NumZero;
9887 Undefs.insert(SubVec);
9888 }
9889 }
9890 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9891 ++NumZero;
9892 else {
9893 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9894 NonZeros |= 1 << i;
9895 ++NumNonZero;
9896 }
9897 }
9898
9899 // If we have more than 2 non-zeros, build each half separately.
9900 if (NumNonZero > 2) {
9901 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9902 ArrayRef<SDUse> Ops = Op->ops();
9903 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9904 Ops.slice(0, NumOperands/2));
9905 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9906 Ops.slice(NumOperands/2));
9907 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9908 }
9909
9910 // Otherwise, build it up through insert_subvectors.
9911 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9912 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9913 : DAG.getUNDEF(ResVT));
9914
9915 // Replace Undef operands with ZeroVector.
9916 for (SDValue U : Undefs)
9918 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9919
9920 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9921 unsigned NumSubElems = SubVT.getVectorNumElements();
9922 for (unsigned i = 0; i != NumOperands; ++i) {
9923 if ((NonZeros & (1 << i)) == 0)
9924 continue;
9925
9926 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9927 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9928 }
9929
9930 return Vec;
9931}
9932
9933// Returns true if the given node is a type promotion (by concatenating i1
9934// zeros) of the result of a node that already zeros all upper bits of
9935// k-register.
9936// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9938 const X86Subtarget &Subtarget,
9939 SelectionDAG & DAG) {
9940 MVT ResVT = Op.getSimpleValueType();
9941 unsigned NumOperands = Op.getNumOperands();
9942 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9943 "Unexpected number of operands in CONCAT_VECTORS");
9944
9945 uint64_t Zeros = 0;
9946 uint64_t NonZeros = 0;
9947 for (unsigned i = 0; i != NumOperands; ++i) {
9948 SDValue SubVec = Op.getOperand(i);
9949 if (SubVec.isUndef())
9950 continue;
9951 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9952 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9953 Zeros |= (uint64_t)1 << i;
9954 else
9955 NonZeros |= (uint64_t)1 << i;
9956 }
9957
9958 unsigned NumElems = ResVT.getVectorNumElements();
9959
9960 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9961 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9962 // insert_subvector will give us two kshifts.
9963 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9964 Log2_64(NonZeros) != NumOperands - 1) {
9965 unsigned Idx = Log2_64(NonZeros);
9966 SDValue SubVec = Op.getOperand(Idx);
9967 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9968 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9969 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9970 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9971 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9972 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9973 DAG.getVectorIdxConstant(0, dl));
9974 }
9975
9976 // If there are zero or one non-zeros we can handle this very simply.
9977 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9978 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9979 if (!NonZeros)
9980 return Vec;
9981 unsigned Idx = Log2_64(NonZeros);
9982 SDValue SubVec = Op.getOperand(Idx);
9983 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9984 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9985 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9986 }
9987
9988 if (NumOperands > 2) {
9989 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9990 ArrayRef<SDUse> Ops = Op->ops();
9991 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9992 Ops.slice(0, NumOperands / 2));
9993 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9994 Ops.slice(NumOperands / 2));
9995 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9996 }
9997
9998 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9999
10000 if (ResVT.getVectorNumElements() >= 16)
10001 return Op; // The operation is legal with KUNPCK
10002
10003 SDValue Vec =
10004 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
10005 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
10006 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10007 DAG.getVectorIdxConstant(NumElems / 2, dl));
10008}
10009
10011 const X86Subtarget &Subtarget,
10012 SelectionDAG &DAG) {
10013 SDLoc DL(Op);
10014 MVT VT = Op.getSimpleValueType();
10015 if (VT.getVectorElementType() == MVT::i1)
10016 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
10017
10018 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10019 // from two other 128-bit ones.
10020 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10021 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10022 (VT.is512BitVector() &&
10023 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
10024 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
10025}
10026
10027//===----------------------------------------------------------------------===//
10028// Vector shuffle lowering
10029//
10030// This is an experimental code path for lowering vector shuffles on x86. It is
10031// designed to handle arbitrary vector shuffles and blends, gracefully
10032// degrading performance as necessary. It works hard to recognize idiomatic
10033// shuffles and lower them to optimal instruction patterns without leaving
10034// a framework that allows reasonably efficient handling of all vector shuffle
10035// patterns.
10036//===----------------------------------------------------------------------===//
10037
10038/// Checks whether the vector elements referenced by two shuffle masks are
10039/// equivalent.
10040static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10041 int Idx, int ExpectedIdx) {
10042 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10043 ExpectedIdx < MaskSize && "Out of range element index");
10044 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10045 return false;
10046
10047 EVT VT = Op.getValueType();
10048 EVT ExpectedVT = ExpectedOp.getValueType();
10049
10050 // Sources must be vectors and match the mask's element count.
10051 if (!VT.isVector() || !ExpectedVT.isVector() ||
10052 (int)VT.getVectorNumElements() != MaskSize ||
10053 (int)ExpectedVT.getVectorNumElements() != MaskSize)
10054 return false;
10055
10056 // Exact match.
10057 if (Idx == ExpectedIdx && Op == ExpectedOp)
10058 return true;
10059
10060 switch (Op.getOpcode()) {
10061 case ISD::BUILD_VECTOR:
10062 // If the values are build vectors, we can look through them to find
10063 // equivalent inputs that make the shuffles equivalent.
10064 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10065 case ISD::BITCAST: {
10067 EVT SrcVT = Src.getValueType();
10068 if (Op == ExpectedOp && SrcVT.isVector()) {
10069 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
10070 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
10071 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10072 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10073 Idx / Scale, ExpectedIdx / Scale);
10074 }
10075 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
10076 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
10077 for (unsigned I = 0; I != Scale; ++I)
10078 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10079 (Idx * Scale) + I,
10080 (ExpectedIdx * Scale) + I))
10081 return false;
10082 return true;
10083 }
10084 }
10085 break;
10086 }
10087 case ISD::VECTOR_SHUFFLE: {
10088 auto *SVN = cast<ShuffleVectorSDNode>(Op);
10089 return Op == ExpectedOp &&
10090 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10091 }
10092 case X86ISD::VBROADCAST:
10094 return Op == ExpectedOp;
10096 if (Op == ExpectedOp) {
10097 auto *MemOp = cast<MemSDNode>(Op);
10098 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
10099 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10100 }
10101 break;
10102 case X86ISD::VPERMI: {
10103 if (Op == ExpectedOp) {
10105 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
10106 SDValue Src = Op.getOperand(0);
10107 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
10108 Mask[ExpectedIdx]);
10109 }
10110 break;
10111 }
10112 case X86ISD::HADD:
10113 case X86ISD::HSUB:
10114 case X86ISD::FHADD:
10115 case X86ISD::FHSUB:
10116 case X86ISD::PACKSS:
10117 case X86ISD::PACKUS:
10118 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10119 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10120 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10121 int NumElts = VT.getVectorNumElements();
10122 int NumLanes = VT.getSizeInBits() / 128;
10123 int NumEltsPerLane = NumElts / NumLanes;
10124 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10125 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10126 bool SameElt =
10127 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10128 return SameLane && SameElt;
10129 }
10130 break;
10131 }
10132
10133 return false;
10134}
10135
10136/// Tiny helper function to identify a no-op mask.
10137///
10138/// This is a somewhat boring predicate function. It checks whether the mask
10139/// array input, which is assumed to be a single-input shuffle mask of the kind
10140/// used by the X86 shuffle instructions (not a fully general
10141/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10142/// in-place shuffle are 'no-op's.
10144 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10145 assert(Mask[i] >= -1 && "Out of bound mask element!");
10146 if (Mask[i] >= 0 && Mask[i] != i)
10147 return false;
10148 }
10149 return true;
10150}
10151
10152/// Test whether there are elements crossing LaneSizeInBits lanes in this
10153/// shuffle mask.
10154///
10155/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10156/// and we routinely test for these.
10157static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10158 unsigned ScalarSizeInBits,
10159 ArrayRef<int> Mask) {
10160 assert(LaneSizeInBits && ScalarSizeInBits &&
10161 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10162 "Illegal shuffle lane size");
10163 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10164 int Size = Mask.size();
10165 for (int i = 0; i < Size; ++i)
10166 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10167 return true;
10168 return false;
10169}
10170
10171/// Test whether there are elements crossing 128-bit lanes in this
10172/// shuffle mask.
10174 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10175}
10176
10177/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10178/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10179/// better support 'repeated mask + lane permute' style shuffles.
10180static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10181 unsigned ScalarSizeInBits,
10182 ArrayRef<int> Mask) {
10183 assert(LaneSizeInBits && ScalarSizeInBits &&
10184 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10185 "Illegal shuffle lane size");
10186 int NumElts = Mask.size();
10187 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10188 int NumLanes = NumElts / NumEltsPerLane;
10189 if (NumLanes > 1) {
10190 for (int i = 0; i != NumLanes; ++i) {
10191 int SrcLane = -1;
10192 for (int j = 0; j != NumEltsPerLane; ++j) {
10193 int M = Mask[(i * NumEltsPerLane) + j];
10194 if (M < 0)
10195 continue;
10196 int Lane = (M % NumElts) / NumEltsPerLane;
10197 if (SrcLane >= 0 && SrcLane != Lane)
10198 return true;
10199 SrcLane = Lane;
10200 }
10201 }
10202 }
10203 return false;
10204}
10205
10206/// Test whether a shuffle mask is equivalent within each sub-lane.
10207///
10208/// This checks a shuffle mask to see if it is performing the same
10209/// lane-relative shuffle in each sub-lane. This trivially implies
10210/// that it is also not lane-crossing. It may however involve a blend from the
10211/// same lane of a second vector.
10212///
10213/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10214/// non-trivial to compute in the face of undef lanes. The representation is
10215/// suitable for use with existing 128-bit shuffles as entries from the second
10216/// vector have been remapped to [LaneSize, 2*LaneSize).
10217static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10218 ArrayRef<int> Mask,
10219 SmallVectorImpl<int> &RepeatedMask) {
10220 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10221 RepeatedMask.assign(LaneSize, -1);
10222 int Size = Mask.size();
10223 for (int i = 0; i < Size; ++i) {
10224 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10225 if (Mask[i] < 0)
10226 continue;
10227 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10228 // This entry crosses lanes, so there is no way to model this shuffle.
10229 return false;
10230
10231 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10232 // Adjust second vector indices to start at LaneSize instead of Size.
10233 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10234 : Mask[i] % LaneSize + LaneSize;
10235 if (RepeatedMask[i % LaneSize] < 0)
10236 // This is the first non-undef entry in this slot of a 128-bit lane.
10237 RepeatedMask[i % LaneSize] = LocalM;
10238 else if (RepeatedMask[i % LaneSize] != LocalM)
10239 // Found a mismatch with the repeated mask.
10240 return false;
10241 }
10242 return true;
10243}
10244
10245/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10246static bool
10248 SmallVectorImpl<int> &RepeatedMask) {
10249 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10250}
10251
10252static bool
10254 SmallVector<int, 32> RepeatedMask;
10255 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10256}
10257
10258/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10259static bool
10261 SmallVectorImpl<int> &RepeatedMask) {
10262 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10263}
10264
10265/// Test whether a target shuffle mask is equivalent within each sub-lane.
10266/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10267static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10268 unsigned EltSizeInBits,
10269 ArrayRef<int> Mask,
10270 SmallVectorImpl<int> &RepeatedMask) {
10271 int LaneSize = LaneSizeInBits / EltSizeInBits;
10272 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10273 int Size = Mask.size();
10274 for (int i = 0; i < Size; ++i) {
10275 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10276 if (Mask[i] == SM_SentinelUndef)
10277 continue;
10278 if (Mask[i] == SM_SentinelZero) {
10279 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10280 return false;
10281 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10282 continue;
10283 }
10284 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10285 // This entry crosses lanes, so there is no way to model this shuffle.
10286 return false;
10287
10288 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10289 // later vector indices to start at multiples of LaneSize instead of Size.
10290 int LaneM = Mask[i] / Size;
10291 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10292 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10293 // This is the first non-undef entry in this slot of a 128-bit lane.
10294 RepeatedMask[i % LaneSize] = LocalM;
10295 else if (RepeatedMask[i % LaneSize] != LocalM)
10296 // Found a mismatch with the repeated mask.
10297 return false;
10298 }
10299 return true;
10300}
10301
10302/// Test whether a target shuffle mask is equivalent within each sub-lane.
10303/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10304static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10305 ArrayRef<int> Mask,
10306 SmallVectorImpl<int> &RepeatedMask) {
10307 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10308 Mask, RepeatedMask);
10309}
10310
10311/// Checks whether a shuffle mask is equivalent to an explicit list of
10312/// arguments.
10313///
10314/// This is a fast way to test a shuffle mask against a fixed pattern:
10315///
10316/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10317///
10318/// It returns true if the mask is exactly as wide as the argument list, and
10319/// each element of the mask is either -1 (signifying undef) or the value given
10320/// in the argument.
10321static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10322 SDValue V1 = SDValue(),
10323 SDValue V2 = SDValue()) {
10324 int Size = Mask.size();
10325 if (Size != (int)ExpectedMask.size())
10326 return false;
10327
10328 for (int i = 0; i < Size; ++i) {
10329 assert(Mask[i] >= -1 && "Out of bound mask element!");
10330 int MaskIdx = Mask[i];
10331 int ExpectedIdx = ExpectedMask[i];
10332 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10333 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10334 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10335 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10336 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10337 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10338 return false;
10339 }
10340 }
10341 return true;
10342}
10343
10344/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10345///
10346/// The masks must be exactly the same width.
10347///
10348/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10349/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10350///
10351/// SM_SentinelZero is accepted as a valid negative index but must match in
10352/// both, or via a known bits test.
10354 ArrayRef<int> ExpectedMask,
10355 const SelectionDAG &DAG,
10356 SDValue V1 = SDValue(),
10357 SDValue V2 = SDValue()) {
10358 int Size = Mask.size();
10359 if (Size != (int)ExpectedMask.size())
10360 return false;
10361 assert(llvm::all_of(ExpectedMask,
10362 [Size](int M) {
10363 return M == SM_SentinelZero ||
10364 isInRange(M, 0, 2 * Size);
10365 }) &&
10366 "Illegal target shuffle mask");
10367
10368 // Check for out-of-range target shuffle mask indices.
10369 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10370 return false;
10371
10372 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10373 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10374 !V1.getValueType().isVector()))
10375 V1 = SDValue();
10376 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10377 !V2.getValueType().isVector()))
10378 V2 = SDValue();
10379
10380 APInt ZeroV1 = APInt::getZero(Size);
10381 APInt ZeroV2 = APInt::getZero(Size);
10382
10383 for (int i = 0; i < Size; ++i) {
10384 int MaskIdx = Mask[i];
10385 int ExpectedIdx = ExpectedMask[i];
10386 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10387 continue;
10388 // If we failed to match an expected SM_SentinelZero then early out.
10389 if (ExpectedIdx < 0)
10390 return false;
10391 if (MaskIdx == SM_SentinelZero) {
10392 // If we need this expected index to be a zero element, then update the
10393 // relevant zero mask and perform the known bits at the end to minimize
10394 // repeated computes.
10395 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10396 if (ExpectedV &&
10397 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10398 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10399 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10400 ZeroMask.setBit(BitIdx);
10401 continue;
10402 }
10403 }
10404 if (MaskIdx >= 0) {
10405 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10406 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10407 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10408 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10409 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10410 continue;
10411 }
10412 return false;
10413 }
10414 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10415 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10416}
10417
10418// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10419// instructions.
10421 const SelectionDAG &DAG) {
10422 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10423 return false;
10424
10425 SmallVector<int, 8> Unpcklwd;
10426 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10427 /* Unary = */ false);
10428 SmallVector<int, 8> Unpckhwd;
10429 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10430 /* Unary = */ false);
10431 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10432 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10433 return IsUnpackwdMask;
10434}
10435
10437 const SelectionDAG &DAG) {
10438 // Create 128-bit vector type based on mask size.
10439 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10440 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10441
10442 // We can't assume a canonical shuffle mask, so try the commuted version too.
10443 SmallVector<int, 4> CommutedMask(Mask);
10445
10446 // Match any of unary/binary or low/high.
10447 for (unsigned i = 0; i != 4; ++i) {
10448 SmallVector<int, 16> UnpackMask;
10449 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10450 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10451 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10452 return true;
10453 }
10454 return false;
10455}
10456
10457/// Return true if a shuffle mask chooses elements identically in its top and
10458/// bottom halves. For example, any splat mask has the same top and bottom
10459/// halves. If an element is undefined in only one half of the mask, the halves
10460/// are not considered identical.
10462 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10463 unsigned HalfSize = Mask.size() / 2;
10464 for (unsigned i = 0; i != HalfSize; ++i) {
10465 if (Mask[i] != Mask[i + HalfSize])
10466 return false;
10467 }
10468 return true;
10469}
10470
10471/// Get a 4-lane 8-bit shuffle immediate for a mask.
10472///
10473/// This helper function produces an 8-bit shuffle immediate corresponding to
10474/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10475/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10476/// example.
10477///
10478/// NB: We rely heavily on "undef" masks preserving the input lane.
10479static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10480 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10481 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10482 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10483 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10484 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10485
10486 // If the mask only uses one non-undef element, then fully 'splat' it to
10487 // improve later broadcast matching.
10488 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10489 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10490
10491 int FirstElt = Mask[FirstIndex];
10492 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10493 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10494
10495 unsigned Imm = 0;
10496 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10497 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10498 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10499 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10500 return Imm;
10501}
10502
10504 SelectionDAG &DAG) {
10505 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10506}
10507
10508// Canonicalize SHUFPD mask to improve chances of further folding.
10509// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10510static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10511 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10512 "Unexpected SHUFPD mask size");
10513 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10514 "Unexpected SHUFPD mask elements");
10515
10516 // If the mask only uses one non-undef element, then fully 'splat' it to
10517 // improve later broadcast matching.
10518 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10519 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10520 "All undef shuffle mask");
10521
10522 int FirstElt = Mask[FirstIndex];
10523 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10524 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10525 unsigned Imm = 0;
10526 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10527 Imm |= FirstElt << I;
10528 return Imm;
10529 }
10530
10531 // Attempt to keep any undef elements in place to improve chances of the
10532 // shuffle becoming a (commutative) blend.
10533 unsigned Imm = 0;
10534 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10535 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10536
10537 return Imm;
10538}
10539
10541 SelectionDAG &DAG) {
10542 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10543}
10544
10545// The Shuffle result is as follow:
10546// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10547// Each Zeroable's element correspond to a particular Mask's element.
10548// As described in computeZeroableShuffleElements function.
10549//
10550// The function looks for a sub-mask that the nonzero elements are in
10551// increasing order. If such sub-mask exist. The function returns true.
10552static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10553 ArrayRef<int> Mask, const EVT &VectorType,
10554 bool &IsZeroSideLeft) {
10555 int NextElement = -1;
10556 // Check if the Mask's nonzero elements are in increasing order.
10557 for (int i = 0, e = Mask.size(); i < e; i++) {
10558 // Checks if the mask's zeros elements are built from only zeros.
10559 assert(Mask[i] >= -1 && "Out of bound mask element!");
10560 if (Mask[i] < 0)
10561 return false;
10562 if (Zeroable[i])
10563 continue;
10564 // Find the lowest non zero element
10565 if (NextElement < 0) {
10566 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10567 IsZeroSideLeft = NextElement != 0;
10568 }
10569 // Exit if the mask's non zero elements are not in increasing order.
10570 if (NextElement != Mask[i])
10571 return false;
10572 NextElement++;
10573 }
10574 return true;
10575}
10576
10577static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10579 const X86Subtarget &Subtarget,
10580 unsigned Depth = 0);
10581
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// PSHUFB is a per-128-bit-lane byte shuffle: each result byte selects a
/// byte from the same lane of a single source, or zero when the control
/// byte's sign bit is set. Returns SDValue() if the mask would need both
/// inputs or would cross a 128-bit lane.
                                     ArrayRef<int> Mask, SDValue V1,
                                     SDValue V2, const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  // Byte shuffles are available for 128-bit ops with SSSE3, 256-bit with
  // AVX2 and 512-bit with AVX512BW.
  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  // The single source input; established by the first non-undef,
  // non-zeroable mask element.
  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    // Convert the element index into a per-lane byte index.
    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  // Perform the byte shuffle on a vXi8 view and bitcast back to VT.
  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
10635
10636static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10637 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10638 const SDLoc &dl);
10639
// X86 has dedicated shuffle that can be lowered to VEXPAND
// (zeroes are spread between the in-order source elements; the non-zero
// elements are "expanded" into the positions left unmasked).
                                    SDValue V2, ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  // Which side of the result holds the zeros (set by the helper below).
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  // Bits set in the expansion mask mark destination lanes that receive
  // source elements; clear bits are taken from the zero vector below.
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  // Turn the integer expansion mask into a vXi1 mask vector.
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
10663
// Try to match TargetMask against an UNPCKL/UNPCKH pattern of V1/V2.
// On success sets UnpackOpcode and may rewrite V1/V2 (replacing fully-undef
// operands with UNDEF, substituting zero vectors, or commuting the inputs).
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  // Classify the even (1) and odd (2) mask positions: all undef, and all
  // undef-or-zero, respectively.
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKL;
    // Replace fully-undef interleave positions with explicit UNDEF operands.
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      // Substitute zero vectors for the zeroable interleave positions.
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
10750
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
// Returns SDValue() if the mask matches neither pattern in either operand
// order.
                                      SDValue V2, ArrayRef<int> Mask,
                                      SelectionDAG &DAG) {
  // Try the mask directly against the unpack-lo pattern.
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  // Then against unpack-hi.
  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
10777
/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
                                         SDValue V2, ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  // Build the splat2 lo/hi reference masks to compare against.
  SmallVector<int, 32> Unpckl, Unpckh;
  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

  unsigned UnpackOpcode;
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    UnpackOpcode = X86ISD::UNPCKL;
  else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    UnpackOpcode = X86ISD::UNPCKH;
  else
    return SDValue();

  // This is a "natural" unpack operation (rather than the 128-bit sectored
  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
  // input in order to use the x86 instruction.
  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  V1 = DAG.getBitcast(VT, V1);
  // Unary unpack of the rearranged source.
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}
10803
10804// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10805// source into the lower elements and zeroing the upper elements.
10806static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10807 ArrayRef<int> Mask, const APInt &Zeroable,
10808 const X86Subtarget &Subtarget) {
10809 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10810 return false;
10811
10812 unsigned NumElts = Mask.size();
10813 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10814 unsigned MaxScale = 64 / EltSizeInBits;
10815
10816 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10817 unsigned SrcEltBits = EltSizeInBits * Scale;
10818 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10819 continue;
10820 unsigned NumSrcElts = NumElts / Scale;
10821 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10822 continue;
10823 unsigned UpperElts = NumElts - NumSrcElts;
10824 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10825 continue;
10826 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10827 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10828 DstVT = MVT::getIntegerVT(EltSizeInBits);
10829 if ((NumSrcElts * EltSizeInBits) >= 128) {
10830 // ISD::TRUNCATE
10831 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10832 } else {
10833 // X86ISD::VTRUNC
10834 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10835 }
10836 return true;
10837 }
10838
10839 return false;
10840}
10841
10842// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10843// element padding to the final DstVT.
10844static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10845 const X86Subtarget &Subtarget,
10846 SelectionDAG &DAG, bool ZeroUppers) {
10847 MVT SrcVT = Src.getSimpleValueType();
10848 MVT DstSVT = DstVT.getScalarType();
10849 unsigned NumDstElts = DstVT.getVectorNumElements();
10850 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10851 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10852
10853 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10854 return SDValue();
10855
10856 // Perform a direct ISD::TRUNCATE if possible.
10857 if (NumSrcElts == NumDstElts)
10858 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10859
10860 if (NumSrcElts > NumDstElts) {
10861 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10862 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10863 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10864 }
10865
10866 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10867 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10868 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10869 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10870 DstVT.getSizeInBits());
10871 }
10872
10873 // Non-VLX targets must truncate from a 512-bit type, so we need to
10874 // widen, truncate and then possibly extract the original subvector.
10875 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10876 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10877 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10878 }
10879
10880 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10881 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10882 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10883 if (DstVT != TruncVT)
10884 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10885 DstVT.getSizeInBits());
10886 return Trunc;
10887}
10888
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
// t25: v4i32 = truncate t2
// t41: v8i16 = bitcast t25
// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
// One can just use a single vpmovdw instruction, without avx512vl we need to
// use the zmm variant and extract the lower subvector, padding with zeroes.
// TODO: Merge with lowerShuffleAsVTRUNC.
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
  if (!Subtarget.hasAVX512())
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned MaxScale = 64 / EltSizeInBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned SrcEltBits = EltSizeInBits * Scale;
    unsigned NumSrcElts = NumElts / Scale;
    unsigned UpperElts = NumElts - NumSrcElts;
    // The low elements must step through the source by Scale and the
    // remaining elements must all be zeroable.
    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
      continue;

    // Attempt to find a matching source truncation, but as a fall back VLX
    // cases can use the VPMOV directly.
    SDValue Src = peekThroughBitcasts(V1);
    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getScalarValueSizeInBits() == SrcEltBits) {
      Src = Src.getOperand(0);
    } else if (Subtarget.hasVLX()) {
      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
      Src = DAG.getBitcast(SrcVT, Src);
      // Don't do this if PACKSS/PACKUS could perform it cheaper.
      if (Scale == 2 &&
          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
        return SDValue();
    } else
      return SDValue();

    // VPMOVWB is only available with avx512bw.
    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
      return SDValue();

    // Undef (rather than zero) uppers don't require the result's upper
    // elements to be zeroed.
    bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
    return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
  }

  return SDValue();
}
10953
// Attempt to match binary shuffle patterns as a truncate.
// Both inputs are concatenated (when the concat is cheap, or Offset == 0)
// and the combined vector is truncated, with any trailing elements coming
// from the mask's zero/undef tail.
                                    SDValue V2, ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unexpected VTRUNC type");
  // 256-bit results truncate from 512-bit sources, so require full-width
  // AVX512 register usage.
  if (!Subtarget.hasAVX512() ||
      (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned MaxScale = 64 / EltSizeInBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    // TODO: Support non-BWI VPMOVWB truncations?
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;

    // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
    // Bail if the V2 elements are undef.
    unsigned NumHalfSrcElts = NumElts / Scale;
    unsigned NumSrcElts = 2 * NumHalfSrcElts;
    for (unsigned Offset = 0; Offset != Scale; ++Offset) {
      if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
          isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
        continue;

      // The elements beyond the truncation must be undef/zero.
      unsigned UpperElts = NumElts - NumSrcElts;
      if (UpperElts > 0 &&
          !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
        continue;
      bool UndefUppers =
          UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);

      // As we're using both sources then we need to concat them together
      // and truncate from the double-sized src.
      MVT ConcatVT = VT.getDoubleNumVectorElementsVT();

      // For offset truncations, ensure that the concat is cheap.
      SDValue Src =
          combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
      if (!Src) {
        if (Offset)
          continue;
        Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
      }

      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
      Src = DAG.getBitcast(SrcVT, Src);

      // Shift the offset'd elements into place for the truncation.
      // TODO: Use getTargetVShiftByConstNode.
      if (Offset)
        Src = DAG.getNode(
            X86ISD::VSRLI, DL, SrcVT, Src,
            DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));

      return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
    }
  }

  return SDValue();
}
11022
11023/// Check whether a compaction lowering can be done by dropping even/odd
11024/// elements and compute how many times even/odd elements must be dropped.
11025///
11026/// This handles shuffles which take every Nth element where N is a power of
11027/// two. Example shuffle masks:
11028///
11029/// (even)
11030/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11031/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11032/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11033/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11034/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11035/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11036///
11037/// (odd)
11038/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
11039/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
11040///
11041/// Any of these lanes can of course be undef.
11042///
11043/// This routine only supports N <= 3.
11044/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11045/// for larger N.
11046///
11047/// \returns N above, or the number of times even/odd elements must be dropped
11048/// if there is such a number. Otherwise returns zero.
11049static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
11050 bool IsSingleInput) {
11051 // The modulus for the shuffle vector entries is based on whether this is
11052 // a single input or not.
11053 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11054 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11055 "We should only be called with masks with a power-of-2 size!");
11056
11057 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11058 int Offset = MatchEven ? 0 : 1;
11059
11060 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11061 // and 2^3 simultaneously. This is because we may have ambiguity with
11062 // partially undef inputs.
11063 bool ViableForN[3] = {true, true, true};
11064
11065 for (int i = 0, e = Mask.size(); i < e; ++i) {
11066 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11067 // want.
11068 if (Mask[i] < 0)
11069 continue;
11070
11071 bool IsAnyViable = false;
11072 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11073 if (ViableForN[j]) {
11074 uint64_t N = j + 1;
11075
11076 // The shuffle mask must be equal to (i * 2^N) % M.
11077 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
11078 IsAnyViable = true;
11079 else
11080 ViableForN[j] = false;
11081 }
11082 // Early exit if we exhaust the possible powers of two.
11083 if (!IsAnyViable)
11084 break;
11085 }
11086
11087 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11088 if (ViableForN[j])
11089 return j + 1;
11090
11091 // Return 0 as there is no viable power of two.
11092 return 0;
11093}
11094
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
// Checks for compaction shuffle masks if MaxStages > 1.
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 const SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 unsigned MaxStages = 1) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
         "Illegal maximum compaction");

  // Check whether (N1, N2) can be packed down to PackVT's element width,
  // preferring PACKUS (discarded upper bits known zero) over PACKSS
  // (discarded upper bits are sign-bit copies). On success the captured
  // V1/V2/SrcVT/PackOpcode outputs are all updated.
  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
    unsigned NumPackedBits = NumSrcBits - BitSize;
    N1 = peekThroughBitcasts(N1);
    N2 = peekThroughBitcasts(N2);
    unsigned NumBits1 = N1.getScalarValueSizeInBits();
    unsigned NumBits2 = N2.getScalarValueSizeInBits();
    bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
    bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
    // Sources must already be at the pack's source element width (or be
    // undef/zero, which works at any width).
    if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
        (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
      return false;
    // PACKUS for 32->16 needs SSE41; packing to 8-bit is available earlier.
    if (Subtarget.hasSSE41() || BitSize == 8) {
      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
      if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
          (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
        V1 = N1;
        V2 = N2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    // PACKSS: every bit being packed away must be a copy of the sign bit.
    bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
    bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
    if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
         DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
        (N2.isUndef() || IsZero2 || IsAllOnes2 ||
         DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
      V1 = N1;
      V2 = N2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };

  // Attempt to match against wider and wider compaction patterns.
  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

    // Try binary shuffle.
    SmallVector<int, 32> BinaryMask;
    createPackShuffleMask(VT, BinaryMask, false, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
      if (MatchPACK(V1, V2, PackVT))
        return true;

    // Try unary shuffle.
    SmallVector<int, 32> UnaryMask;
    createPackShuffleMask(VT, UnaryMask, true, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
      if (MatchPACK(V1, V1, PackVT))
        return true;
  }

  return false;
}
11169
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT PackVT;
  unsigned PackOpcode;
  unsigned SizeBits = VT.getSizeInBits();
  unsigned EltBits = VT.getScalarSizeInBits();
  // Permit multi-stage compactions from source elements up to 64 bits.
  unsigned MaxStages = Log2_32(64 / EltBits);
  if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                            Subtarget, MaxStages))
    return SDValue();

  unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
  unsigned NumStages = Log2_32(CurrentEltBits / EltBits);

  // Don't lower multi-stage packs on AVX512, truncation is better.
  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
    return SDValue();

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  unsigned MaxPackBits = 16;
  if (CurrentEltBits > 16 &&
      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
    MaxPackBits = 32;

  // Repeatedly pack down to the target size.
  SDValue Res;
  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    unsigned NumSrcElts = SizeBits / SrcEltBits;
    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
    MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
    MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
    Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
                      DAG.getBitcast(SrcVT, V2));
    // Subsequent stages pack the previous result with itself.
    V1 = V2 = Res;
    CurrentEltBits /= 2;
  }
  assert(Res && Res.getValueType() == VT &&
         "Failed to lower compaction shuffle");
  return Res;
}
11215
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable: the single non-zero source is ANDed
/// with a constant mask of all-ones/zero elements.
                                      SDValue V2, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  MVT MaskVT = VT;
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero, AllOnes;
  // Use f64 if i64 isn't legal.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
  }

  // The AND itself is performed in an integer type; the constants are built
  // in the (possibly floating-point) element type of the mask vector.
  MVT LogicVT = VT;
  if (EltVT.isFloatingPoint()) {
    Zero = DAG.getConstantFP(0.0, DL, EltVT);
    APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
    LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
  } else {
    Zero = DAG.getConstant(0, DL, EltVT);
    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  }

  // Build the mask: all-ones for elements passed through from the single
  // source input, zero everywhere that is zeroable.
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
  VMask = DAG.getBitcast(LogicVT, VMask);
  V = DAG.getBitcast(LogicVT, V);
  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
  return DAG.getBitcast(VT, And);
}
11268
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
                                       SDValue V2, ArrayRef<int> Mask,
                                       SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  // Build a per-element select mask: all-ones picks from V1, zero from V2.
  // Only in-place blends are supported (result element i must come from
  // element i of V1 or V2).
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  // Emit (V1 & Mask) | (V2 & ~Mask) via the shared bit-select helper.
  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
}
11291
11293 SDValue PreservedSrc,
11294 const X86Subtarget &Subtarget,
11295 SelectionDAG &DAG);
11296
                                 const APInt &Zeroable, bool &ForceV1Zero,
                                 bool &ForceV2Zero, uint64_t &BlendMask) {
  // Inputs that are entirely zero/undef let zeroable mask elements be
  // satisfied from that side by forcing it to a real zero vector.
  bool V1IsZeroOrUndef =
  bool V2IsZeroOrUndef =

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");

  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");

  // For 32/64-bit elements, if we only reference one input (plus any undefs),
  // then ensure the blend mask part for that lane just references that input.
  bool ForceWholeLaneMasks =
      VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // Keep track of the inputs used per lane.
    bool LaneV1InUse = false;
    bool LaneV2InUse = false;
    uint64_t LaneBlendMask = 0;
    for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
      int Elt = (Lane * NumEltsPerLane) + LaneElt;
      int M = Mask[Elt];
      if (M == SM_SentinelUndef)
        continue;
      // Element taken in place (or an equivalent element) from V1: the blend
      // bit stays clear.
      if (M == Elt || (0 <= M && M < NumElts &&
                       IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
        Mask[Elt] = Elt;
        LaneV1InUse = true;
        continue;
      }
      // Element taken in place (or an equivalent element) from V2: set the
      // blend bit.
      if (M == (Elt + NumElts) ||
          (NumElts <= M &&
           IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
        LaneBlendMask |= 1ull << LaneElt;
        Mask[Elt] = Elt + NumElts;
        LaneV2InUse = true;
        continue;
      }
      // Zeroable element: take it from whichever input is known zero/undef.
      if (Zeroable[Elt]) {
        if (V1IsZeroOrUndef) {
          ForceV1Zero = true;
          Mask[Elt] = Elt;
          LaneV1InUse = true;
          continue;
        }
        if (V2IsZeroOrUndef) {
          ForceV2Zero = true;
          LaneBlendMask |= 1ull << LaneElt;
          Mask[Elt] = Elt + NumElts;
          LaneV2InUse = true;
          continue;
        }
      }
      // Not representable as a blend.
      return false;
    }

    // If we only used V2 then splat the lane blend mask to avoid any demanded
    // elts from V1 in this lane (the V1 equivalent is implicit with a zero
    // blend mask bit).
    if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
      LaneBlendMask = (1ull << NumEltsPerLane) - 1;

    BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
  }
  return true;
}
11374
11375/// Try to emit a blend instruction for a shuffle.
11376///
11377/// This doesn't do any checks for the availability of instructions for blending
11378/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11379/// be matched in the backend with the type given. What it does check for is
11380/// that the shuffle mask is a blend, or convertible into a blend with zero.
/// Returns SDValue() (failure) when matchShuffleAsBlend rejects the mask.
// NOTE(review): the opening signature line (11381) is absent from this
// listing (extraction artifact); the parameter lines below continue it.
11382                                   SDValue V2, ArrayRef<int> Original,
11383                                   const APInt &Zeroable,
11384                                   const X86Subtarget &Subtarget,
11385                                   SelectionDAG &DAG) {
11386  uint64_t BlendMask = 0;
11387  bool ForceV1Zero = false, ForceV2Zero = false;
  // Local copy: matchShuffleAsBlend may canonicalize the mask in place.
11388  SmallVector<int, 64> Mask(Original);
11389  if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11390                           BlendMask))
11391    return SDValue();
11392
11393  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11394  if (ForceV1Zero)
11395    V1 = getZeroVector(VT, Subtarget, DAG, DL);
11396  if (ForceV2Zero)
11397    V2 = getZeroVector(VT, Subtarget, DAG, DL);
11398
11399  unsigned NumElts = VT.getVectorNumElements();
11400
  // The wider cases deliberately fall through: each stage only asserts the
  // extra ISA requirement for its width before sharing the BLENDI emission.
11401  switch (VT.SimpleTy) {
11402  case MVT::v4i64:
11403  case MVT::v8i32:
11404    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11405    [[fallthrough]];
11406  case MVT::v4f64:
11407  case MVT::v8f32:
11408    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11409    [[fallthrough]];
11410  case MVT::v2f64:
11411  case MVT::v2i64:
11412  case MVT::v4f32:
11413  case MVT::v4i32:
11414  case MVT::v8i16:
11415    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11416    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11417                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11418  case MVT::v16i16: {
11419    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11420    SmallVector<int, 8> RepeatedMask;
11421    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11422      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11423      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11424      BlendMask = 0;
11425      for (int i = 0; i < 8; ++i)
11426        if (RepeatedMask[i] >= 8)
11427          BlendMask |= 1ull << i;
11428      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11429                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11430    }
11431    // Use PBLENDW for lower/upper lanes and then blend lanes.
11432    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11433    // merge to VSELECT where useful.
11434    uint64_t LoMask = BlendMask & 0xFF;
11435    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    // Only worthwhile when one lane is a trivial (all-V1 or all-V2) blend.
11436    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11437      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11438                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
11439      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11440                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
      // Lane-blend: low lane of Lo, high lane of Hi.
11441      return DAG.getVectorShuffle(
11442          MVT::v16i16, DL, Lo, Hi,
11443          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11444    }
11445    [[fallthrough]];
11446  }
11447  case MVT::v32i8:
11448    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11449    [[fallthrough]];
11450  case MVT::v16i8: {
11451    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11452
11453    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11454    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11455                                               Subtarget, DAG))
11456      return Masked;
11457
    // With BWI+VLX use a k-register masked move instead of VPBLENDVB.
11458    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11459      MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11460      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11461      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11462    }
11463
11464    // If we have VPTERNLOG, we can use that as a bit blend.
11465    if (Subtarget.hasVLX())
11466      if (SDValue BitBlend =
11467              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11468        return BitBlend;
11469
11470    // Scale the blend by the number of bytes per element.
11471    int Scale = VT.getScalarSizeInBits() / 8;
11472
11473    // This form of blend is always done on bytes. Compute the byte vector
11474    // type.
11475    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11476
11477    // x86 allows load folding with blendvb from the 2nd source operand. But
11478    // we are still using LLVM select here (see comment below), so that's V1.
11479    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11480    // allow that load-folding possibility.
11481    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
      // NOTE(review): line 11482 is missing from this listing — presumably it
      // commutes the shuffle mask to match the swapped operands; confirm
      // against the upstream LLVM source.
11483      std::swap(V1, V2);
11484    }
11485
11486    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11487    // mix of LLVM's code generator and the x86 backend. We tell the code
11488    // generator that boolean values in the elements of an x86 vector register
11489    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11490    // mapping a select to operand #1, and 'false' mapping to operand #2. The
11491    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11492    // of the element (the remaining are ignored) and 0 in that high bit would
11493    // mean operand #1 while 1 in the high bit would mean operand #2. So while
11494    // the LLVM model for boolean values in vector elements gets the relevant
11495    // bit set, it is set backwards and over constrained relative to x86's
11496    // actual model.
    // Per-byte mask: -1 selects from V1, 0 selects from V2; undef mask
    // elements stay undef.
11497    SmallVector<SDValue, 32> VSELECTMask;
11498    for (int i = 0, Size = Mask.size(); i < Size; ++i)
11499      for (int j = 0; j < Scale; ++j)
11500        VSELECTMask.push_back(
11501            Mask[i] < 0
11502                ? DAG.getUNDEF(MVT::i8)
11503                : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11504
11505    V1 = DAG.getBitcast(BlendVT, V1);
11506    V2 = DAG.getBitcast(BlendVT, V2);
11507    return DAG.getBitcast(
11508        VT,
11509        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11510                      V1, V2));
11511  }
11512  case MVT::v16f32:
11513  case MVT::v8f64:
11514  case MVT::v8i64:
11515  case MVT::v16i32:
11516  case MVT::v32i16:
11517  case MVT::v64i8: {
11518    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11519    bool OptForSize = DAG.shouldOptForSize();
11520    if (!OptForSize) {
11521      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11522                                                 Subtarget, DAG))
11523        return Masked;
11524    }
11525
11526    // Otherwise load an immediate into a GPR, cast to k-register, and use a
11527    // masked move.
11528    MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11529    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11530    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11531  }
11532  default:
11533    llvm_unreachable("Not a supported integer vector type!");
11534  }
11535}
11536
11537/// Try to lower as a blend of elements from two inputs followed by
11538/// a single-input permutation.
11539///
11540/// This matches the pattern where we can blend elements from two inputs and
11541/// then reduce the shuffle to a single-input permutation.
/// Returns SDValue() if some result slot would need both inputs' element.
// NOTE(review): the signature line (11542) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
11543                                             SDValue V1, SDValue V2,
11544                                             ArrayRef<int> Mask,
11545                                             SelectionDAG &DAG,
11546                                             bool ImmBlends = false) {
11547  // We build up the blend mask while checking whether a blend is a viable way
11548  // to reduce the shuffle.
  // BlendMask[s] records which input supplies result slot s (in-place blend);
  // PermuteMask[i] then moves slot (Mask[i] % Size) to position i.
11549  SmallVector<int, 32> BlendMask(Mask.size(), -1);
11550  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11551
11552  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11553    if (Mask[i] < 0)
11554      continue;
11555
11556    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11557
11558    if (BlendMask[Mask[i] % Size] < 0)
11559      BlendMask[Mask[i] % Size] = Mask[i];
11560    else if (BlendMask[Mask[i] % Size] != Mask[i])
11561      return SDValue(); // Can't blend in the needed input!
11562
11563    PermuteMask[i] = Mask[i] % Size;
11564  }
11565
11566  // If only immediate blends, then bail if the blend mask can't be widened to
11567  // i16.
11568  unsigned EltSize = VT.getScalarSizeInBits();
11569  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11570    return SDValue();
11571
11572  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11573  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11574}
11575
11576/// Try to lower as an unpack of elements from two inputs followed by
11577/// a single-input permutation.
11578///
11579/// This matches the pattern where we can unpack elements from two inputs and
11580/// then reduce the shuffle to a single-input (wider) permutation.
// NOTE(review): the signature line (11581) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
11582                                             SDValue V1, SDValue V2,
11583                                             ArrayRef<int> Mask,
11584                                             SelectionDAG &DAG) {
11585  int NumElts = Mask.size();
11586  int NumLanes = VT.getSizeInBits() / 128;
11587  int NumLaneElts = NumElts / NumLanes;
11588  int NumHalfLaneElts = NumLaneElts / 2;
11589
11590  bool MatchLo = true, MatchHi = true;
11591  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11592
11593  // Determine UNPCKL/UNPCKH type and operand order.
11594  for (int Elt = 0; Elt != NumElts; ++Elt) {
11595    int M = Mask[Elt];
11596    if (M < 0)
11597      continue;
11598
11599    // Normalize the mask value depending on whether it's V1 or V2.
11600    int NormM = M;
    // Even result slots must come from Ops[0], odd slots from Ops[1] — the
    // interleave pattern UNPCK produces.
11601    SDValue &Op = Ops[Elt & 1];
11602    if (M < NumElts && (Op.isUndef() || Op == V1))
11603      Op = V1;
11604    else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11605      Op = V2;
11606      NormM -= NumElts;
11607    } else
11608      return SDValue();
11609
11610    bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11611    for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11612      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11613      MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11614      MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11615      if (MatchLoAnyLane || MatchHiAnyLane) {
11616        assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11617               "Failed to match UNPCKLO/UNPCKHI");
11618        break;
11619      }
11620    }
    // All elements must consistently use the low halves (UNPCKL) or the
    // high halves (UNPCKH) of their lanes.
11621    MatchLo &= MatchLoAnyLane;
11622    MatchHi &= MatchHiAnyLane;
11623    if (!MatchLo && !MatchHi)
11624      return SDValue();
11625  }
11626  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11627
11628  // Element indices have changed after unpacking. Calculate permute mask
11629  // so that they will be put back to the position as dictated by the
11630  // original shuffle mask indices.
11631  SmallVector<int, 32> PermuteMask(NumElts, -1);
11632  for (int Elt = 0; Elt != NumElts; ++Elt) {
11633    int M = Mask[Elt];
11634    if (M < 0)
11635      continue;
11636    int NormM = M;
11637    if (NumElts <= M)
11638      NormM -= NumElts;
11639    bool IsFirstOp = M < NumElts;
11640    int BaseMaskElt =
11641        NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
    // Ops[0]'s elements land at even interleave slots, Ops[1]'s at odd.
11642    if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11643      PermuteMask[Elt] = BaseMaskElt;
11644    else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11645      PermuteMask[Elt] = BaseMaskElt + 1;
11646    assert(PermuteMask[Elt] != -1 &&
11647           "Input mask element is defined but failed to assign permute mask");
11648  }
11649
11650  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11651  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11652  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11653}
11654
11655/// Try to lower a shuffle as a permute of the inputs followed by an
11656/// UNPCK instruction.
11657///
11658/// This specifically targets cases where we end up with alternating between
11659/// the two inputs, and so can permute them into something that feeds a single
11660/// UNPCK instruction. Note that this routine only targets integer vectors
11661/// because for floating point vectors we have a generalized SHUFPS lowering
11662/// strategy that handles everything that doesn't *exactly* match an unpack,
11663/// making this clever lowering unnecessary.
// NOTE(review): the signature line (11664) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
11665                                              SDValue V1, SDValue V2,
11666                                              ArrayRef<int> Mask,
11667                                              const X86Subtarget &Subtarget,
11668                                              SelectionDAG &DAG) {
11669  int Size = Mask.size();
11670  assert(Mask.size() >= 2 && "Single element masks are invalid.");
11671
11672  // This routine only supports 128-bit integer dual input vectors.
11673  if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11674    return SDValue();
11675
  // Count demanded elements coming from the low/high half of either input,
  // so we can pick between UNPCKL and UNPCKH below.
11676  int NumLoInputs =
11677      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11678  int NumHiInputs =
11679      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11680
11681  bool UnpackLo = NumLoInputs >= NumHiInputs;
11682
  // Attempt one candidate unpack granularity: ScalarSize-bit unpack elements,
  // each covering Scale original elements.
11683  auto TryUnpack = [&](int ScalarSize, int Scale) {
11684    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11685    SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11686
11687    for (int i = 0; i < Size; ++i) {
11688      if (Mask[i] < 0)
11689        continue;
11690
11691      // Each element of the unpack contains Scale elements from this mask.
11692      int UnpackIdx = i / Scale;
11693
11694      // We only handle the case where V1 feeds the first slots of the unpack.
11695      // We rely on canonicalization to ensure this is the case.
11696      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11697        return SDValue();
11698
11699      // Setup the mask for this input. The indexing is tricky as we have to
11700      // handle the unpack stride.
11701      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11702      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11703          Mask[i] % Size;
11704    }
11705
11706    // If we will have to shuffle both inputs to use the unpack, check whether
11707    // we can just unpack first and shuffle the result. If so, skip this unpack.
11708    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11709        !isNoopShuffleMask(V2Mask))
11710      return SDValue();
11711
11712    // Shuffle the inputs into place.
11713    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11714    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11715
11716    // Cast the inputs to the type we will use to unpack them.
11717    MVT UnpackVT =
11718        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11719    V1 = DAG.getBitcast(UnpackVT, V1);
11720    V2 = DAG.getBitcast(UnpackVT, V2);
11721
11722    // Unpack the inputs and cast the result back to the desired type.
11723    return DAG.getBitcast(
11724        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11725                        UnpackVT, V1, V2));
11726  };
11727
11728  // We try each unpack from the largest to the smallest to try and find one
11729  // that fits this mask.
11730  int OrigScalarSize = VT.getScalarSizeInBits();
11731  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11732    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11733      return Unpack;
11734
11735  // If we're shuffling with a zero vector then we're better off not doing
11736  // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
  // NOTE(review): lines 11737-11738 are missing from this listing —
  // presumably the all-zeros check on V1/V2 that the comment above describes;
  // confirm against the upstream LLVM source.
11739    return SDValue();
11740
11741  // If none of the unpack-rooted lowerings worked (or were profitable) try an
11742  // initial unpack.
11743  if (NumLoInputs == 0 || NumHiInputs == 0) {
11744    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11745           "We have to have *some* inputs!");
11746    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11747
11748    // FIXME: We could consider the total complexity of the permute of each
11749    // possible unpacking. Or at the least we should consider how many
11750    // half-crossings are created.
11751    // FIXME: We could consider commuting the unpacks.
11752
11753    SmallVector<int, 32> PermMask((unsigned)Size, -1);
11754    for (int i = 0; i < Size; ++i) {
11755      if (Mask[i] < 0)
11756        continue;
11757
11758      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11759
      // After unpack, V1's elements sit at even slots and V2's at odd slots.
11760      PermMask[i] =
11761          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11762    }
11763    return DAG.getVectorShuffle(
11764        VT, DL,
11765        DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11766                    V1, V2),
11767        DAG.getUNDEF(VT), PermMask);
11768  }
11769
11770  return SDValue();
11771}
11772
11773/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11774/// permuting the elements of the result in place.
/// Returns SDValue() when the target lacks the needed ISA, the mask crosses
/// 128-bit lanes, or the two inputs' demanded ranges overlap.
// NOTE(review): the signature line (11775) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
11776    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11777    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // PALIGNR availability by width: SSSE3 (128b), AVX2 (256b), BWI (512b).
11778  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11779      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11780      (VT.is512BitVector() && !Subtarget.hasBWI()))
11781    return SDValue();
11782
11783  // We don't currently support lane crossing permutes.
11784  if (is128BitLaneCrossingShuffleMask(VT, Mask))
11785    return SDValue();
11786
11787  int Scale = VT.getScalarSizeInBits() / 8;
11788  int NumLanes = VT.getSizeInBits() / 128;
11789  int NumElts = VT.getVectorNumElements();
11790  int NumEltsPerLane = NumElts / NumLanes;
11791
11792  // Determine range of mask elts.
  // Blend1/Blend2 track whether each input is only used in-place (a blend);
  // Range1/Range2 are the min/max in-lane indices demanded from V1/V2.
11793  bool Blend1 = true;
11794  bool Blend2 = true;
11795  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11796  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11797  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11798    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11799      int M = Mask[Lane + Elt];
11800      if (M < 0)
11801        continue;
11802      if (M < NumElts) {
11803        Blend1 &= (M == (Lane + Elt));
11804        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11805        M = M % NumEltsPerLane;
11806        Range1.first = std::min(Range1.first, M);
11807        Range1.second = std::max(Range1.second, M);
11808      } else {
11809        M -= NumElts;
11810        Blend2 &= (M == (Lane + Elt));
11811        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11812        M = M % NumEltsPerLane;
11813        Range2.first = std::min(Range2.first, M);
11814        Range2.second = std::max(Range2.second, M);
11815      }
11816    }
11817  }
11818
11819  // Bail if we don't need both elements.
11820  // TODO - it might be worth doing this for unary shuffles if the permute
11821  // can be widened.
11822  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11823      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11824    return SDValue();
11825
  // If one side is already a pure blend, wider types have cheaper lowerings.
11826  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11827    return SDValue();
11828
11829  // Rotate the 2 ops so we can access both ranges, then permute the result.
11830  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11831    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11832    SDValue Rotate = DAG.getBitcast(
11833        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11834                        DAG.getBitcast(ByteVT, Lo),
11835                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11836    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11837    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11838      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11839        int M = Mask[Lane + Elt];
11840        if (M < 0)
11841          continue;
        // Ofs distinguishes which input ended up in the upper rotated half.
11842        if (M < NumElts)
11843          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11844        else
11845          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11846      }
11847    }
11848    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11849  };
11850
11851  // Check if the ranges are small enough to rotate from either direction.
11852  if (Range2.second < Range1.first)
11853    return RotateAndPermute(V1, V2, Range1.first, 0);
11854  if (Range1.second < Range2.first)
11855    return RotateAndPermute(V2, V1, Range2.first, NumElts);
11856  return SDValue();
11857}
11858
// True when every defined mask element demands element 0 (a splat/broadcast
// of the first element).
// NOTE(review): the signature line (11859) is absent from this listing
// (extraction artifact); call sites below name this isBroadcastShuffleMask.
11860  return isUndefOrEqual(Mask, 0);
11861}
11862
// True when the mask is either an identity permutation or a broadcast of
// element 0 — both cases where the input needs no arbitrary shuffle.
// NOTE(review): the signature line (11863) is absent from this listing
// (extraction artifact); call sites name this isNoopOrBroadcastShuffleMask.
11864  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11865}
11866
11867/// Check if the Mask consists of the same element repeated multiple times.
/// Returns false if two different defined elements appear, or if more than
/// half of the mask is undef (not "repeated enough").
// NOTE(review): the signature line (11868) is absent from this listing
// (extraction artifact); call sites name this isSingleElementRepeatedMask.
11869  size_t NumUndefs = 0;
11870  std::optional<int> UniqueElt;
11871  for (int Elt : Mask) {
11872    if (Elt == SM_SentinelUndef) {
11873      NumUndefs++;
11874      continue;
11875    }
    // A second distinct defined element disqualifies the mask.
11876    if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11877      return false;
11878    UniqueElt = Elt;
11879  }
11880  // Make sure the element is repeated enough times by checking the number of
11881  // undefs is small.
11882  return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11883}
11884
11885/// Generic routine to decompose a shuffle and blend into independent
11886/// blends and permutes.
11887///
11888/// This matches the extremely common pattern for handling combined
11889/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11890/// operations. It will try to pick the best arrangement of shuffles and
11891/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
// NOTE(review): the signature line (11892) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
11893    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11894    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11895  int NumElts = Mask.size();
11896  int NumLanes = VT.getSizeInBits() / 128;
11897  int NumEltsPerLane = NumElts / NumLanes;
11898
11899  // Shuffle the input elements into the desired positions in V1 and V2 and
11900  // unpack/blend them together.
  // IsAlternating: V1 feeds only even slots and V2 only odd slots.
  // V1Zero/V2Zero: every element demanded from that input is zeroable.
11901  bool IsAlternating = true;
11902  bool V1Zero = true, V2Zero = true;
11903  SmallVector<int, 32> V1Mask(NumElts, -1);
11904  SmallVector<int, 32> V2Mask(NumElts, -1);
11905  SmallVector<int, 32> FinalMask(NumElts, -1);
11906  for (int i = 0; i < NumElts; ++i) {
11907    int M = Mask[i];
11908    if (M >= 0 && M < NumElts) {
11909      V1Mask[i] = M;
11910      FinalMask[i] = i;
11911      V1Zero &= Zeroable[i];
11912      IsAlternating &= (i & 1) == 0;
11913    } else if (M >= NumElts) {
11914      V2Mask[i] = M - NumElts;
11915      FinalMask[i] = i + NumElts;
11916      V2Zero &= Zeroable[i];
11917      IsAlternating &= (i & 1) == 1;
11918    }
11919  }
11920
11921  // If we effectively only demand the 0'th element of \p Input, and not only
11922  // as 0'th element, then broadcast said input,
11923  // and change \p InputMask to be a no-op (identity) mask.
11924  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11925                                         &DAG](SDValue &Input,
11926                                               MutableArrayRef<int> InputMask) {
11927    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
    // Pre-AVX2 only load-folded 32+ bit broadcasts are profitable.
11928    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11929                                 !X86::mayFoldLoad(Input, Subtarget)))
11930      return;
11931    if (isNoopShuffleMask(InputMask))
11932      return;
11933    assert(isBroadcastShuffleMask(InputMask) &&
11934           "Expected to demand only the 0'th element.");
    // NOTE(review): line 11935 is missing from this listing — presumably it
    // replaces Input with an X86ISD::VBROADCAST of its 0'th element; confirm
    // against the upstream LLVM source.
    // Rewrite the defined mask entries into an identity mask.
11936    for (auto I : enumerate(InputMask)) {
11937      int &InputMaskElt = I.value();
11938      if (InputMaskElt >= 0)
11939        InputMaskElt = I.index();
11940    }
11941  };
11942
11943  // Currently, we may need to produce one shuffle per input, and blend results.
11944  // It is possible that the shuffle for one of the inputs is already a no-op.
11945  // See if we can simplify non-no-op shuffles into broadcasts,
11946  // which we consider to be strictly better than an arbitrary shuffle.
11947  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
      // NOTE(review): line 11948 is missing from this listing — presumably
      // the matching isNoopOrBroadcastShuffleMask(V2Mask) condition; confirm
      // against the upstream LLVM source.
11949    canonicalizeBroadcastableInput(V1, V1Mask);
11950    canonicalizeBroadcastableInput(V2, V2Mask);
11951  }
11952
11953  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11954  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11955  // the shuffle may be able to fold with a load or other benefit. However, when
11956  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11957  // pre-shuffle first is a better strategy.
11958  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11959    // If we don't have blends, see if we can create a cheap unpack.
11960    if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11961        (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11962         is128BitUnpackShuffleMask(V2Mask, DAG)))
11963      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11964              DL, VT, V1, V2, Mask, Subtarget, DAG))
11965        return PermUnpack;
11966
11967    // Only prefer immediate blends to unpack/rotate.
11968    if (SDValue BlendPerm =
11969            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11970      return BlendPerm;
11971
11972    // If either input vector provides only a single element which is repeated
11973    // multiple times, unpacking from both input vectors would generate worse
11974    // code. e.g. for
11975    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11976    // it is better to process t4 first to create a vector of t4[0], then unpack
11977    // that vector with t2.
11978    if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
        // NOTE(review): line 11979 is missing from this listing — presumably
        // the matching !isSingleElementRepeatedMask(V2Mask) condition;
        // confirm against the upstream LLVM source.
11980      if (SDValue UnpackPerm =
11981              lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11982        return UnpackPerm;
11983
    // NOTE(review): line 11984 is missing from this listing — presumably the
    // `if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(` opener
    // for the call continued below; confirm against the upstream LLVM source.
11985            DL, VT, V1, V2, Mask, Subtarget, DAG))
11986      return RotatePerm;
11987
11988    // Unpack/rotate failed - try again with variable blends.
11989    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11990                                                          DAG))
11991      return BlendPerm;
11992
11993    if (VT.getScalarSizeInBits() >= 32)
11994      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11995              DL, VT, V1, V2, Mask, Subtarget, DAG))
11996        return PermUnpack;
11997  }
11998
11999  // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12000  // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12001  // TODO: It doesn't have to be alternating - but each lane mustn't have more
12002  // than half the elements coming from each source.
12003  if (IsAlternating && VT.getScalarSizeInBits() < 32) {
    // Re-derive the masks: compress each input into the low half of every
    // lane so a single UNPCKL-style FinalMask interleaves them.
12004    V1Mask.assign(NumElts, -1);
12005    V2Mask.assign(NumElts, -1);
12006    FinalMask.assign(NumElts, -1);
12007    for (int i = 0; i != NumElts; i += NumEltsPerLane)
12008      for (int j = 0; j != NumEltsPerLane; ++j) {
12009        int M = Mask[i + j];
12010        if (M >= 0 && M < NumElts) {
12011          V1Mask[i + (j / 2)] = M;
12012          FinalMask[i + j] = i + (j / 2);
12013        } else if (M >= NumElts) {
12014          V2Mask[i + (j / 2)] = M - NumElts;
12015          FinalMask[i + j] = i + (j / 2) + NumElts;
12016        }
12017      }
12018  }
12019
12020  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12021  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12022  return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12023}
12024
// Match Mask as a bit-rotate of NumSubElts-element groups and, on success,
// set RotateVT to the widened integer vector type to rotate in and return
// the rotate amount in bits; returns -1 when no such rotate exists.
12025 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12026                                    const X86Subtarget &Subtarget,
12027                                    ArrayRef<int> Mask) {
12028  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12029  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12030
12031  // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12032  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
  // Groups may never exceed 64 bits (the widest rotate element).
12033  int MaxSubElts = 64 / EltSizeInBits;
12034  unsigned RotateAmt, NumSubElts;
12035  if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
12036                                          MaxSubElts, NumSubElts, RotateAmt))
12037    return -1;
12038  unsigned NumElts = Mask.size();
  // Widen: one rotate element per NumSubElts original elements.
12039  MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12040  RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12041  return RotateAmt;
12042}
12043
12044/// Lower shuffle using X86ISD::VROTLI rotations.
/// Returns SDValue() if the mask is not a bit-rotate or the lowering is not
/// profitable for the target.
// NOTE(review): the signature line (12045) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
12046                                       ArrayRef<int> Mask,
12047                                       const X86Subtarget &Subtarget,
12048                                       SelectionDAG &DAG) {
12049  // Only XOP + AVX512 targets have bit rotation instructions.
12050  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12051  bool IsLegal =
12052      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12053  if (!IsLegal && Subtarget.hasSSE3())
12054    return SDValue();
12055
12056  MVT RotateVT;
12057  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12058                                          Subtarget, Mask);
12059  if (RotateAmt < 0)
12060    return SDValue();
12061
12062  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12063  // expanded to OR(SRL,SHL), will be more efficient, but if they can
12064  // widen to vXi16 or more then existing lowering should will be better.
12065  if (!IsLegal) {
    // Multiples of 16 bits are whole-i16 shuffles — leave to other lowerings.
12066    if ((RotateAmt % 16) == 0)
12067      return SDValue();
12068    // TODO: Use getTargetVShiftByConstNode.
    // Emulate the rotate as OR(SHL(x, amt), SRL(x, width - amt)).
12069    unsigned ShlAmt = RotateAmt;
12070    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12071    V1 = DAG.getBitcast(RotateVT, V1);
12072    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12073                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12074    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12075                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12076    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12077    return DAG.getBitcast(VT, Rot);
12078  }
12079
12080  SDValue Rot =
12081      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12082                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12083  return DAG.getBitcast(VT, Rot);
12084}
12085
12086/// Try to match a vector shuffle as an element rotation.
12087///
12088/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
/// On success, returns the rotation amount in elements and rewrites V1/V2
/// (by reference) to the low/high rotated inputs; returns -1 on failure.
// NOTE(review): the signature line (12089) is absent from this listing
// (extraction artifact); the parameter line below continues it.
12090                                       ArrayRef<int> Mask) {
12091  int NumElts = Mask.size();
12092
12093  // We need to detect various ways of spelling a rotation:
12094  //   [11, 12, 13, 14, 15,  0,  1,  2]
12095  //   [-1, 12, 13, 14, -1, -1,  1, -1]
12096  //   [-1, -1, -1, -1, -1, -1,  1,  2]
12097  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
12098  //   [-1,  4,  5,  6, -1, -1,  9, -1]
12099  //   [-1,  4,  5,  6, -1, -1, -1, -1]
12100  int Rotation = 0;
12101  SDValue Lo, Hi;
12102  for (int i = 0; i < NumElts; ++i) {
12103    int M = Mask[i];
12104    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12105           "Unexpected mask index.");
12106    if (M < 0)
12107      continue;
12108
12109    // Determine where a rotated vector would have started.
12110    int StartIdx = i - (M % NumElts);
12111    if (StartIdx == 0)
12112      // The identity rotation isn't interesting, stop.
12113      return -1;
12114
12115    // If we found the tail of a vector the rotation must be the missing
12116    // front. If we found the head of a vector, it must be how much of the
12117    // head.
12118    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12119
    // Every defined element must imply the same rotation amount.
12120    if (Rotation == 0)
12121      Rotation = CandidateRotation;
12122    else if (Rotation != CandidateRotation)
12123      // The rotations don't match, so we can't match this mask.
12124      return -1;
12125
12126    // Compute which value this mask is pointing at.
12127    SDValue MaskV = M < NumElts ? V1 : V2;
12128
12129    // Compute which of the two target values this index should be assigned
12130    // to. This reflects whether the high elements are remaining or the low
12131    // elements are remaining.
12132    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12133
12134    // Either set up this value if we've not encountered it before, or check
12135    // that it remains consistent.
12136    if (!TargetV)
12137      TargetV = MaskV;
12138    else if (TargetV != MaskV)
12139      // This may be a rotation, but it pulls from the inputs in some
12140      // unsupported interleaving.
12141      return -1;
12142  }
12143
12144  // Check that we successfully analyzed the mask, and normalize the results.
12145  assert(Rotation != 0 && "Failed to locate a viable rotation!");
12146  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // If only one side was demanded, rotate it against itself.
12147  if (!Lo)
12148    Lo = Hi;
12149  else if (!Hi)
12150    Hi = Lo;
12151
12152  V1 = Lo;
12153  V2 = Hi;
12154
12155  return Rotation;
12156}
12157
12158/// Try to lower a vector shuffle as a byte rotation.
12159///
12160/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12161/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12162/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12163/// try to generically lower a vector shuffle through such an pattern. It
12164/// does not check for the profitability of lowering either as PALIGNR or
12165/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12166/// This matches shuffle vectors that look like:
12167///
12168///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12169///
12170/// Essentially it concatenates V1 and V2, shifts right by some number of
12171/// elements, and takes the low elements as the result. Note that while this is
12172/// specified as a *right shift* because x86 is little-endian, it is a *left
12173/// rotate* of the vector lanes.
/// Returns the rotation amount in BYTES, or -1 if the mask doesn't match.
// NOTE(review): the signature line (12174) is absent from this listing
// (extraction artifact); the parameter line below continues it.
12175                                    ArrayRef<int> Mask) {
12176  // Don't accept any shuffles with zero elements.
12177  if (isAnyZero(Mask))
12178    return -1;
12179
12180  // PALIGNR works on 128-bit lanes.
12181  SmallVector<int, 16> RepeatedMask;
12182  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12183    return -1;
12184
12185  int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12186  if (Rotation <= 0)
12187    return -1;
12188
12189  // PALIGNR rotates bytes, so we need to scale the
12190  // rotation based on how many bytes are in the vector lane.
12191  int NumElts = RepeatedMask.size();
12192  int Scale = 16 / NumElts;
12193  return Rotation * Scale;
12194}
12195
// Lower a shuffle as a byte rotation: PALIGNR on SSSE3+, or a
// PSLLDQ/PSRLDQ/POR sequence on plain SSE2 (128-bit v16i8 only).
// Returns SDValue() if the mask is not a valid byte rotation.
// NOTE(review): the signature line (12196) is absent from this listing
// (extraction artifact); the parameter lines below continue it.
12197                                        SDValue V2, ArrayRef<int> Mask,
12198                                        const X86Subtarget &Subtarget,
12199                                        SelectionDAG &DAG) {
12200  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12201
  // matchShuffleAsByteRotate may rewrite Lo/Hi to the rotated operand order.
12202  SDValue Lo = V1, Hi = V2;
12203  int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12204  if (ByteRotation <= 0)
12205    return SDValue();
12206
12207  // Cast the inputs to i8 vector of correct length to match PALIGNR or
12208  // PSLLDQ/PSRLDQ.
12209  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12210  Lo = DAG.getBitcast(ByteVT, Lo);
12211  Hi = DAG.getBitcast(ByteVT, Hi);
12212
12213  // SSSE3 targets can use the palignr instruction.
12214  if (Subtarget.hasSSSE3()) {
12215    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12216           "512-bit PALIGNR requires BWI instructions");
12217    return DAG.getBitcast(
12218        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12219                        DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12220  }
12221
12222  assert(VT.is128BitVector() &&
12223         "Rotate-based lowering only supports 128-bit lowering!");
12224  assert(Mask.size() <= 16 &&
12225         "Can shuffle at most 16 bytes in a 128-bit vector!");
12226  assert(ByteVT == MVT::v16i8 &&
12227         "SSE2 rotate lowering only needed for v16i8!");
12228
12229  // Default SSE2 implementation
  // Emulate the rotate: shift Lo left and Hi right so their kept bytes are
  // disjoint, then OR the halves together.
12230  int LoByteShift = 16 - ByteRotation;
12231  int HiByteShift = ByteRotation;
12232
12233  SDValue LoShift =
12234      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12235                  DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12236  SDValue HiShift =
12237      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12238                  DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12239  return DAG.getBitcast(VT,
12240                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12241}
12242
12243/// Try to lower a vector shuffle as a dword/qword rotation.
12244///
12245/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12246/// rotation of the concatenation of two vectors; this routine will
12247/// try to generically lower a vector shuffle through such a pattern.
12248///
12249/// Essentially it concatenates V1 and V2, shifts right by some number of
12250/// elements, and takes the low elements as the result. Note that while this is
12251/// specified as a *right shift* because x86 is little-endian, it is a *left
12252/// rotate* of the vector lanes.
12254 SDValue V2, ArrayRef<int> Mask,
12255 const APInt &Zeroable,
12256 const X86Subtarget &Subtarget,
12257 SelectionDAG &DAG) {
12258 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12259 "Only 32-bit and 64-bit elements are supported!");
12260
12261 // 128/256-bit vectors are only supported with VLX.
12262 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12263 && "VLX required for 128/256-bit vectors");
12264
// First try a plain two-input element rotation.
12265 SDValue Lo = V1, Hi = V2;
12266 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12267 if (0 < Rotation)
12268 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12269 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12270
12271 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12272 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12273 // TODO: We can probably make this more aggressive and use shift-pairs like
12274 // lowerShuffleAsByteShiftMask.
12275 unsigned NumElts = Mask.size();
12276 unsigned ZeroLo = Zeroable.countr_one();
12277 unsigned ZeroHi = Zeroable.countl_one();
12278 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12279 if (!ZeroLo && !ZeroHi)
12280 return SDValue();
12281
// Rotate a single source against a zero vector so the low ZeroLo elements of
// the result come from the zero vector.
12282 if (ZeroLo) {
12283 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12284 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12285 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12286 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12287 getZeroVector(VT, Subtarget, DAG, DL),
12288 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12289 }
12290
// Likewise for ZeroHi zeroable elements at the top of the result.
12291 if (ZeroHi) {
12292 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12293 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12294 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12295 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12296 getZeroVector(VT, Subtarget, DAG, DL), Src,
12297 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12298 }
12299
12300 return SDValue();
12301}
12302
12303/// Try to lower a vector shuffle as a byte shift sequence.
/// Matches masks that are zero at one or both ends with a single sequential
/// run from one source in between, and zeros the ends with VSHLDQ/VSRLDQ
/// pairs (or a triple on pre-SSSE3 when both ends are zero).
12305 SDValue V2, ArrayRef<int> Mask,
12306 const APInt &Zeroable,
12307 const X86Subtarget &Subtarget,
12308 SelectionDAG &DAG) {
12309 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12310 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12311
12312 // We need a shuffle that has zeros at one/both ends and a sequential
12313 // shuffle from one source within.
12314 unsigned ZeroLo = Zeroable.countr_one();
12315 unsigned ZeroHi = Zeroable.countl_one();
12316 if (!ZeroLo && !ZeroHi)
12317 return SDValue();
12318
12319 unsigned NumElts = Mask.size();
12320 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12321 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12322 return SDValue();
12323
// Scale converts element counts into byte-shift immediates.
12324 unsigned Scale = VT.getScalarSizeInBits() / 8;
// The inner run must come entirely from a single source.
12325 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12326 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12327 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12328 return SDValue();
12329
12330 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12331 Res = DAG.getBitcast(MVT::v16i8, Res);
12332
12333 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12334 // inner sequential set of elements, possibly offset:
12335 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12336 // 01234567 --> 4567zzzz --> zzzzz456
12337 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12338 if (ZeroLo == 0) {
12339 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12340 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12341 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12342 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12343 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12344 } else if (ZeroHi == 0) {
12345 unsigned Shift = Mask[ZeroLo] % NumElts;
12346 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12347 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12348 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12349 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12350 } else if (!Subtarget.hasSSSE3()) {
12351 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12352 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12353 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12354 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12355 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12356 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12357 Shift += Mask[ZeroLo] % NumElts;
12358 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12359 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12360 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12361 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12362 } else
12363 return SDValue();
12364
12365 return DAG.getBitcast(VT, Res);
12366}
12367
12368/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12369///
12370/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12371/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12372/// matches elements from one of the input vectors shuffled to the left or
12373/// right with zeroable elements 'shifted in'. It handles both the strictly
12374/// bit-wise element shifts and the byte shift across an entire 128-bit double
12375/// quad word lane.
12376///
12377/// PSHL : (little-endian) left bit shift.
12378/// [ zz, 0, zz, 2 ]
12379/// [ -1, 4, zz, -1 ]
12380/// PSRL : (little-endian) right bit shift.
12381/// [ 1, zz, 3, zz]
12382/// [ -1, -1, 7, zz]
12383/// PSLLDQ : (little-endian) left byte shift
12384/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12385/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12386/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12387/// PSRLDQ : (little-endian) right byte shift
12388/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12389/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12390/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12391static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12392 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12393 int MaskOffset, const APInt &Zeroable,
12394 const X86Subtarget &Subtarget) {
12395 int Size = Mask.size();
12396 unsigned SizeInBits = Size * ScalarSizeInBits;
12397
12398 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12399 for (int i = 0; i < Size; i += Scale)
12400 for (int j = 0; j < Shift; ++j)
12401 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12402 return false;
12403
12404 return true;
12405 };
12406
12407 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12408 for (int i = 0; i != Size; i += Scale) {
12409 unsigned Pos = Left ? i + Shift : i;
12410 unsigned Low = Left ? i : i + Shift;
12411 unsigned Len = Scale - Shift;
12412 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12413 return -1;
12414 }
12415
12416 int ShiftEltBits = ScalarSizeInBits * Scale;
12417 bool ByteShift = ShiftEltBits > 64;
12418 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12419 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12420 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12421
12422 // Normalize the scale for byte shifts to still produce an i64 element
12423 // type.
12424 Scale = ByteShift ? Scale / 2 : Scale;
12425
12426 // We need to round trip through the appropriate type for the shift.
12427 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12428 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12429 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12430 return ShiftAmt;
12431 };
12432
12433 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12434 // keep doubling the size of the integer elements up to that. We can
12435 // then shift the elements of the integer vector by whole multiples of
12436 // their width within the elements of the larger integer vector. Test each
12437 // multiple to see if we can find a match with the moved element indices
12438 // and that the shifted in elements are all zeroable.
12439 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12440 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12441 for (int Shift = 1; Shift != Scale; ++Shift)
12442 for (bool Left : {true, false})
12443 if (CheckZeros(Shift, Scale, Left)) {
12444 int ShiftAmt = MatchShift(Shift, Scale, Left);
12445 if (0 < ShiftAmt)
12446 return ShiftAmt;
12447 }
12448
12449 // no match
12450 return -1;
12451}
12452
/// Lower a shuffle as a zero-filling logical shift of V1 or V2.
/// Returns an empty SDValue on failure, or when BitwiseOnly excludes the
/// matched whole-lane byte shifts.
12454 SDValue V2, ArrayRef<int> Mask,
12455 const APInt &Zeroable,
12456 const X86Subtarget &Subtarget,
12457 SelectionDAG &DAG, bool BitwiseOnly) {
12458 int Size = Mask.size();
12459 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12460
12461 MVT ShiftVT;
12462 SDValue V = V1;
12463 unsigned Opcode;
12464
12465 // Try to match shuffle against V1 shift.
12466 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12467 Mask, 0, Zeroable, Subtarget);
12468
12469 // If V1 failed, try to match shuffle against V2 shift.
12470 if (ShiftAmt < 0) {
12471 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12472 Mask, Size, Zeroable, Subtarget);
12473 V = V2;
12474 }
12475
12476 if (ShiftAmt < 0)
12477 return SDValue();
12478
// The caller may only want per-element bit shifts, not byte shifts.
12479 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12480 return SDValue();
12481
12482 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12483 "Illegal integer vector type");
// Round-trip through the type the shift instruction operates on.
12484 V = DAG.getBitcast(ShiftVT, V);
12485 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12486 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12487 return DAG.getBitcast(VT, V);
12488}
12489
12490// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12491// Remainder of lower half result is zero and upper half is all undef.
12492static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12493 ArrayRef<int> Mask, uint64_t &BitLen,
12494 uint64_t &BitIdx, const APInt &Zeroable) {
12495 int Size = Mask.size();
12496 int HalfSize = Size / 2;
12497 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12498 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12499
12500 // Upper half must be undefined.
12501 if (!isUndefUpperHalf(Mask))
12502 return false;
12503
12504 // Determine the extraction length from the part of the
12505 // lower half that isn't zeroable.
12506 int Len = HalfSize;
12507 for (; Len > 0; --Len)
12508 if (!Zeroable[Len - 1])
12509 break;
12510 assert(Len > 0 && "Zeroable shuffle mask");
12511
12512 // Attempt to match first Len sequential elements from the lower half.
12513 SDValue Src;
12514 int Idx = -1;
12515 for (int i = 0; i != Len; ++i) {
12516 int M = Mask[i];
12517 if (M == SM_SentinelUndef)
12518 continue;
12519 SDValue &V = (M < Size ? V1 : V2);
12520 M = M % Size;
12521
12522 // The extracted elements must start at a valid index and all mask
12523 // elements must be in the lower half.
12524 if (i > M || M >= HalfSize)
12525 return false;
12526
12527 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12528 Src = V;
12529 Idx = M - i;
12530 continue;
12531 }
12532 return false;
12533 }
12534
12535 if (!Src || Idx < 0)
12536 return false;
12537
12538 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12539 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12540 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12541 V1 = Src;
12542 return true;
12543}
12544
12545// INSERTQ: Extract lowest Len elements from lower half of second source and
12546// insert over first source, starting at Idx.
12547// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12548static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12549 ArrayRef<int> Mask, uint64_t &BitLen,
12550 uint64_t &BitIdx) {
12551 int Size = Mask.size();
12552 int HalfSize = Size / 2;
12553 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12554
12555 // Upper half must be undefined.
12556 if (!isUndefUpperHalf(Mask))
12557 return false;
12558
12559 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12560 SDValue Base;
12561
12562 // Attempt to match first source from mask before insertion point.
12563 if (isUndefInRange(Mask, 0, Idx)) {
12564 /* EMPTY */
12565 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12566 Base = V1;
12567 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12568 Base = V2;
12569 } else {
12570 continue;
12571 }
12572
12573 // Extend the extraction length looking to match both the insertion of
12574 // the second source and the remaining elements of the first.
12575 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12576 SDValue Insert;
12577 int Len = Hi - Idx;
12578
12579 // Match insertion.
12580 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12581 Insert = V1;
12582 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12583 Insert = V2;
12584 } else {
12585 continue;
12586 }
12587
12588 // Match the remaining elements of the lower half.
12589 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12590 /* EMPTY */
12591 } else if ((!Base || (Base == V1)) &&
12592 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12593 Base = V1;
12594 } else if ((!Base || (Base == V2)) &&
12595 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12596 Size + Hi)) {
12597 Base = V2;
12598 } else {
12599 continue;
12600 }
12601
12602 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12603 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12604 V1 = Base;
12605 V2 = Insert;
12606 return true;
12607 }
12608 }
12609
12610 return false;
12611}
12612
12613/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
/// Returns an empty SDValue if neither instruction matches the mask.
12615 SDValue V2, ArrayRef<int> Mask,
12616 const APInt &Zeroable, SelectionDAG &DAG) {
12617 uint64_t BitLen, BitIdx;
// The matchers update V1/V2 in place to the operands the node should use.
12618 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12619 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12620 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12621 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12622
12623 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
// The matcher may leave V1/V2 null when the corresponding elements are all
// undef, so materialize explicit UNDEFs here.
12624 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12625 V2 ? V2 : DAG.getUNDEF(VT),
12626 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12627 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12628
12629 return SDValue();
12630}
12631
12632/// Lower a vector shuffle as an any/signed/zero extension.
12633///
12634/// Given a specific number of elements, element bit width, and extension
12635/// stride, produce an extension based on the available
12636/// features of the subtarget. The extended elements are consecutive and
12637/// can start from an offset element index in the input; to
12638/// avoid excess shuffling the offset must either be in the bottom lane
12639/// or at the start of a higher lane. All extended elements must be from
12640/// the same lane.
12642 int Scale, int Offset,
12643 unsigned ExtOpc, SDValue InputV,
12644 ArrayRef<int> Mask,
12645 const X86Subtarget &Subtarget,
12646 SelectionDAG &DAG) {
12647 assert(Scale > 1 && "Need a scale to extend.");
12648 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12649 int EltBits = VT.getScalarSizeInBits();
12650 int NumElements = VT.getVectorNumElements();
12651 int NumEltsPerLane = 128 / EltBits;
12652 int OffsetLane = Offset / NumEltsPerLane;
12653 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12654 "Only 8, 16, and 32 bit elements can be extended.");
12655 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12656 assert(0 <= Offset && "Extension offset must be positive.");
12657 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12658 "Extension offset must be in the first lane or start an upper lane.");
12659
12660 // Check that an index is in same lane as the base offset.
12661 auto SafeOffset = [&](int Idx) {
12662 return OffsetLane == (Idx / NumEltsPerLane);
12663 };
12664
12665 // Shift along an input so that the offset base moves to the first element.
12666 auto ShuffleOffset = [&](SDValue V) {
12667 if (!Offset)
12668 return V;
12669
12670 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12671 for (int i = 0; i * Scale < NumElements; ++i) {
12672 int SrcIdx = i + Offset;
12673 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12674 }
12675 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12676 };
12677
12678 // Found a valid a/zext mask! Try various lowering strategies based on the
12679 // input type and available ISA extensions.
12680 if (Subtarget.hasSSE41()) {
12681 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12682 // PUNPCK will catch this in a later shuffle match.
12683 if (Offset && Scale == 2 && VT.is128BitVector())
12684 return SDValue();
12685 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12686 NumElements / Scale);
12687 InputV = DAG.getBitcast(VT, InputV);
12688 InputV = ShuffleOffset(InputV);
12689 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12690 return DAG.getBitcast(VT, InputV);
12691 }
12692
12693 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12694 InputV = DAG.getBitcast(VT, InputV);
12695 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12696
12697 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12698 if (ExtOpc == ISD::SIGN_EXTEND)
12699 return SDValue();
12700
12701 // For any extends we can cheat for larger element sizes and use shuffle
12702 // instructions that can fold with a load and/or copy.
12703 if (AnyExt && EltBits == 32) {
12704 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12705 -1};
12706 return DAG.getBitcast(
12707 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12708 DAG.getBitcast(MVT::v4i32, InputV),
12709 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12710 }
12711 if (AnyExt && EltBits == 16 && Scale > 2) {
12712 int PSHUFDMask[4] = {Offset / 2, -1,
12713 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12714 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12715 DAG.getBitcast(MVT::v4i32, InputV),
12716 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12717 int PSHUFWMask[4] = {1, -1, -1, -1};
12718 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12719 return DAG.getBitcast(
12720 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12721 DAG.getBitcast(MVT::v8i16, InputV),
12722 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12723 }
12724
12725 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12726 // to 64-bits.
12727 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12728 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12729 assert(VT.is128BitVector() && "Unexpected vector width!");
12730
12731 int LoIdx = Offset * EltBits;
12732 SDValue Lo = DAG.getBitcast(
12733 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12734 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12735 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12736
12737 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12738 return DAG.getBitcast(VT, Lo);
12739
12740 int HiIdx = (Offset + 1) * EltBits;
12741 SDValue Hi = DAG.getBitcast(
12742 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12743 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12744 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12745 return DAG.getBitcast(VT,
12746 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12747 }
12748
12749 // If this would require more than 2 unpack instructions to expand, use
12750 // pshufb when available. We can only use more than 2 unpack instructions
12751 // when zero extending i8 elements which also makes it easier to use pshufb.
12752 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12753 assert(NumElements == 16 && "Unexpected byte vector width!");
12754 SDValue PSHUFBMask[16];
12755 for (int i = 0; i < 16; ++i) {
12756 int Idx = Offset + (i / Scale);
// 0x80 in a PSHUFB mask zeroes the destination byte; undef for any-extend.
12757 if ((i % Scale == 0 && SafeOffset(Idx))) {
12758 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12759 continue;
12760 }
12761 PSHUFBMask[i] =
12762 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12763 }
12764 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12765 return DAG.getBitcast(
12766 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12767 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12768 }
12769
12770 // If we are extending from an offset, ensure we start on a boundary that
12771 // we can unpack from.
12772 int AlignToUnpack = Offset % (NumElements / Scale);
12773 if (AlignToUnpack) {
12774 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12775 for (int i = AlignToUnpack; i < NumElements; ++i)
12776 ShMask[i - AlignToUnpack] = i;
12777 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12778 Offset -= AlignToUnpack;
12779 }
12780
12781 // Otherwise emit a sequence of unpacks.
// Each unpack interleaves with zeros (or undef for any-extend), doubling the
// element width and halving Scale until the target width is reached.
12782 do {
12783 unsigned UnpackLoHi = X86ISD::UNPCKL;
12784 if (Offset >= (NumElements / 2)) {
12785 UnpackLoHi = X86ISD::UNPCKH;
12786 Offset -= (NumElements / 2);
12787 }
12788
12789 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12790 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12791 : getZeroVector(InputVT, Subtarget, DAG, DL);
12792 InputV = DAG.getBitcast(InputVT, InputV);
12793 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12794 Scale /= 2;
12795 EltBits *= 2;
12796 NumElements /= 2;
12797 } while (Scale > 1);
12798 return DAG.getBitcast(VT, InputV);
12799}
12800
12801/// Try to lower a vector shuffle as a zero extension on any microarch.
12802///
12803/// This routine will try to do everything in its power to cleverly lower
12804/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12805/// check for the profitability of this lowering, it tries to aggressively
12806/// match this pattern. It will use all of the micro-architectural details it
12807/// can to emit an efficient lowering. It handles both blends with all-zero
12808/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12809/// masking out later).
12810///
12811/// The reason we have dedicated lowering for zext-style shuffles is that they
12812/// are both incredibly common and often quite performance sensitive.
12814 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12815 const APInt &Zeroable, const X86Subtarget &Subtarget,
12816 SelectionDAG &DAG) {
12817 int Bits = VT.getSizeInBits();
12818 int NumLanes = Bits / 128;
12819 int NumElements = VT.getVectorNumElements();
12820 int NumEltsPerLane = NumElements / NumLanes;
12821 assert(VT.getScalarSizeInBits() <= 32 &&
12822 "Exceeds 32-bit integer zero extension limit");
12823 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12824
12825 // Define a helper function to check a particular ext-scale and lower to it if
12826 // valid.
12827 auto Lower = [&](int Scale) -> SDValue {
12828 SDValue InputV;
12829 bool AnyExt = true;
12830 int Offset = 0;
12831 int Matches = 0;
// Scan the mask: stride elements must be zeroable (else only any-extend),
// and base elements must be consecutive indices from a single input.
12832 for (int i = 0; i < NumElements; ++i) {
12833 int M = Mask[i];
12834 if (M < 0)
12835 continue; // Valid anywhere but doesn't tell us anything.
12836 if (i % Scale != 0) {
12837 // Each of the extended elements need to be zeroable.
12838 if (!Zeroable[i])
12839 return SDValue();
12840
12841 // We no longer are in the anyext case.
12842 AnyExt = false;
12843 continue;
12844 }
12845
12846 // Each of the base elements needs to be consecutive indices into the
12847 // same input vector.
12848 SDValue V = M < NumElements ? V1 : V2;
12849 M = M % NumElements;
12850 if (!InputV) {
12851 InputV = V;
12852 Offset = M - (i / Scale);
12853 } else if (InputV != V)
12854 return SDValue(); // Flip-flopping inputs.
12855
12856 // Offset must start in the lowest 128-bit lane or at the start of an
12857 // upper lane.
12858 // FIXME: Is it ever worth allowing a negative base offset?
12859 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12860 (Offset % NumEltsPerLane) == 0))
12861 return SDValue();
12862
12863 // If we are offsetting, all referenced entries must come from the same
12864 // lane.
12865 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12866 return SDValue();
12867
12868 if ((M % NumElements) != (Offset + (i / Scale)))
12869 return SDValue(); // Non-consecutive strided elements.
12870 Matches++;
12871 }
12872
12873 // If we fail to find an input, we have a zero-shuffle which should always
12874 // have already been handled.
12875 // FIXME: Maybe handle this here in case during blending we end up with one?
12876 if (!InputV)
12877 return SDValue();
12878
12879 // If we are offsetting, don't extend if we only match a single input, we
12880 // can always do better by using a basic PSHUF or PUNPCK.
12881 if (Offset != 0 && Matches < 2)
12882 return SDValue();
12883
12884 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12885 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12886 InputV, Mask, Subtarget, DAG);
12887 };
12888
12889 // The widest scale possible for extending is to a 64-bit integer.
12890 assert(Bits % 64 == 0 &&
12891 "The number of bits in a vector must be divisible by 64 on x86!");
12892 int NumExtElements = Bits / 64;
12893
12894 // Each iteration, try extending the elements half as much, but into twice as
12895 // many elements.
12896 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12897 assert(NumElements % NumExtElements == 0 &&
12898 "The input vector size must be divisible by the extended size.");
12899 if (SDValue V = Lower(NumElements / NumExtElements))
12900 return V;
12901 }
12902
12903 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12904 if (Bits != 128)
12905 return SDValue();
12906
12907 // Returns one of the source operands if the shuffle can be reduced to a
12908 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12909 auto CanZExtLowHalf = [&]() {
12910 for (int i = NumElements / 2; i != NumElements; ++i)
12911 if (!Zeroable[i])
12912 return SDValue();
12913 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12914 return V1;
12915 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12916 return V2;
12917 return SDValue();
12918 };
12919
12920 if (SDValue V = CanZExtLowHalf()) {
12921 V = DAG.getBitcast(MVT::v2i64, V);
12922 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12923 return DAG.getBitcast(VT, V);
12924 }
12925
12926 // No viable ext lowering found.
12927 return SDValue();
12928}
12929
12930/// Try to get a scalar value for a specific element of a vector.
12931///
12932/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
/// Returns an empty SDValue when no same-width scalar can be recovered.
12934 SelectionDAG &DAG) {
12935 MVT VT = V.getSimpleValueType();
12936 MVT EltVT = VT.getVectorElementType();
12937 V = peekThroughBitcasts(V);
12938
12939 // If the bitcasts shift the element size, we can't extract an equivalent
12940 // element from it.
12941 MVT NewVT = V.getSimpleValueType();
12942 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12943 return SDValue();
12944
// SCALAR_TO_VECTOR only defines element 0; BUILD_VECTOR defines them all.
12945 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12946 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12947 // Ensure the scalar operand is the same size as the destination.
12948 // FIXME: Add support for scalar truncation where possible.
12949 SDValue S = V.getOperand(Idx);
12950 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12951 return DAG.getBitcast(EltVT, S);
12952 }
12953
12954 return SDValue();
12955}
12956
12957/// Helper to test for a load that can be folded with x86 shuffles.
12958///
12959/// This is particularly important because the set of instructions varies
12960/// significantly based on whether the operand is a load or not.
12962 return V.hasOneUse() &&
// NOTE(review): the second operand of this && (the actual load
// classification) is on a line not visible in this listing; only the
// one-use requirement is shown here. Confirm against the full source.
12964}
12965
12966template<typename T>
12967static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12968 T EltVT = VT.getScalarType();
12969 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12970 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12971}
12972
12973/// Try to lower insertion of a single element into a zero vector.
12974///
12975/// This is a common pattern that we have especially efficient patterns to lower
12976/// across all subtarget feature sets.
/// Lowers a shuffle that inserts a single element taken from V2 into an
/// otherwise zeroable (or already-in-place) V1, returning the lowered node or
/// an empty SDValue if the pattern does not match.
/// NOTE(review): the declaration line carrying the function name (original
/// line 12977) is missing from this rendering.
12978 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12979 const APInt &Zeroable, const X86Subtarget &Subtarget,
12980 SelectionDAG &DAG) {
12981 MVT ExtVT = VT;
12982 MVT EltVT = VT.getVectorElementType();
12983 unsigned NumElts = VT.getVectorNumElements();
12984 unsigned EltBits = VT.getScalarSizeInBits();
12985
 // Soft-promoted half types cannot be handled by this path.
12986 if (isSoftF16(EltVT, Subtarget))
12987 return SDValue();
12988
 // Find the (first) mask lane that reads from V2 (indices >= Mask.size()).
12989 int V2Index =
12990 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12991 Mask.begin();
12992 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
 // V1 is "zeroable" if every lane other than the insertion lane is known
 // zero or undef.
12993 bool IsV1Zeroable = true;
12994 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12995 if (i != V2Index && !Zeroable[i]) {
12996 IsV1Zeroable = false;
12997 break;
12998 }
12999
13000 // Bail if a non-zero V1 isn't used in place.
13001 if (!IsV1Zeroable) {
13002 SmallVector<int, 8> V1Mask(Mask);
13003 V1Mask[V2Index] = -1;
13004 if (!isNoopShuffleMask(V1Mask))
13005 return SDValue();
13006 }
13007
13008 // Check for a single input from a SCALAR_TO_VECTOR node.
13009 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13010 // all the smarts here sunk into that routine. However, the current
13011 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13012 // vector shuffle lowering is dead.
13013 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13014 DAG);
13015 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13016 // We need to zext the scalar if it is smaller than an i32.
13017 V2S = DAG.getBitcast(EltVT, V2S);
13018 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13019 // Using zext to expand a narrow element won't work for non-zero
13020 // insertions. But we can use a masked constant vector if we're
13021 // inserting V2 into the bottom of V1.
13022 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
13023 return SDValue();
13024
13025 // Zero-extend directly to i32.
13026 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13027 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13028
13029 // If we're inserting into a constant, mask off the inserted index
13030 // and OR with the zero-extended scalar.
13031 if (!IsV1Zeroable) {
13032 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
13033 Bits[V2Index] = APInt::getZero(EltBits);
13034 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
13035 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
13036 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13037 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
13038 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13039 }
13040 }
13041 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13042 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13043 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
13044 // Either not inserting from the low element of the input or the input
13045 // element size is too small to use VZEXT_MOVL to clear the high bits.
13046 return SDValue();
13047 }
13048
13049 if (!IsV1Zeroable) {
13050 // If V1 can't be treated as a zero vector we have fewer options to lower
13051 // this. We can't support integer vectors or non-zero targets cheaply.
13052 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13053 if (!VT.isFloatingPoint() || V2Index != 0)
13054 return SDValue();
13055 if (!VT.is128BitVector())
13056 return SDValue();
13057
13058 // Otherwise, use MOVSD, MOVSS or MOVSH.
13059 unsigned MovOpc = 0;
13060 if (EltVT == MVT::f16)
13061 MovOpc = X86ISD::MOVSH;
13062 else if (EltVT == MVT::f32)
13063 MovOpc = X86ISD::MOVSS;
13064 else if (EltVT == MVT::f64)
13065 MovOpc = X86ISD::MOVSD;
13066 else
13067 llvm_unreachable("Unsupported floating point element type to handle!");
13068 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13069 }
13070
13071 // This lowering only works for the low element with floating point vectors.
13072 if (VT.isFloatingPoint() && V2Index != 0)
13073 return SDValue();
13074
 // Clear the high elements of the (possibly widened) scalar-to-vector value.
13075 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13076 if (ExtVT != VT)
13077 V2 = DAG.getBitcast(VT, V2);
13078
13079 if (V2Index != 0) {
13080 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13081 // the desired position. Otherwise it is more efficient to do a vector
13082 // shift left. We know that we can do a vector shift left because all
13083 // the inputs are zero.
13084 if (VT.isFloatingPoint() || NumElts <= 4) {
13085 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13086 V2Shuffle[V2Index] = 0;
13087 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13088 } else {
 // Byte shift: V2Index * EltBits / 8 converts the lane offset to bytes.
13089 V2 = DAG.getBitcast(MVT::v16i8, V2);
13090 V2 = DAG.getNode(
13091 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13092 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
13093 V2 = DAG.getBitcast(VT, V2);
13094 }
13095 }
13096 return V2;
13097}
13098
13099/// Try to lower broadcast of a single - truncated - integer element,
13100/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13101///
13102/// This assumes we have AVX2.
/// Returns the lowered X86ISD::VBROADCAST node, or an empty SDValue when the
/// source is not a suitable scalar_to_vector/build_vector of wider integers.
/// NOTE(review): the declaration line with the function name (original line
/// 13103) is missing from this rendering.
13104 int BroadcastIdx,
13105 const X86Subtarget &Subtarget,
13106 SelectionDAG &DAG) {
13107 assert(Subtarget.hasAVX2() &&
13108 "We can only lower integer broadcasts with AVX2!");
13109
13110 MVT EltVT = VT.getVectorElementType();
13111 MVT V0VT = V0.getSimpleValueType();
13112
13113 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13114 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13115
13116 MVT V0EltVT = V0VT.getVectorElementType();
13117 if (!V0EltVT.isInteger())
13118 return SDValue();
13119
13120 const unsigned EltSize = EltVT.getSizeInBits();
13121 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13122
13123 // This is only a truncation if the original element type is larger.
13124 if (V0EltSize <= EltSize)
13125 return SDValue();
13126
13127 assert(((V0EltSize % EltSize) == 0) &&
13128 "Scalar type sizes must all be powers of 2 on x86!");
13129
 // Scale maps broadcast lanes of VT onto the wider lanes of V0VT.
13130 const unsigned V0Opc = V0.getOpcode();
13131 const unsigned Scale = V0EltSize / EltSize;
13132 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13133
 // SCALAR_TO_VECTOR only defines lane 0; BUILD_VECTOR defines every lane.
13134 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13135 V0Opc != ISD::BUILD_VECTOR)
13136 return SDValue();
13137
13138 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13139
13140 // If we're extracting non-least-significant bits, shift so we can truncate.
13141 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13142 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13143 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13144 if (const int OffsetIdx = BroadcastIdx % Scale)
13145 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13146 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13147
13148 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13149 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13150}
13151
13152/// Test whether this can be lowered with a single SHUFPS instruction.
13153///
13154/// This is used to disable more specialized lowerings when the shufps lowering
13155/// will happen to be efficient.
/// \param Mask a 4-element shuffle mask with elements in [-1, 8).
/// NOTE(review): the declaration line with the function name (original line
/// 13156) is missing from this rendering.
13157 // This routine only handles 128-bit shufps.
13158 assert(Mask.size() == 4 && "Unsupported mask size!");
13159 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13160 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13161 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13162 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13163
13164 // To lower with a single SHUFPS we need to have the low half and high half
13165 // each requiring a single input.
 // (Mask[i] < 4 selects from the first input, >= 4 from the second.)
13166 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13167 return false;
13168 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13169 return false;
13170
13171 return true;
13172}
13173
13174/// Test whether the specified input (0 or 1) is in-place blended by the
13175/// given mask.
13176///
13177/// This returns true if the elements from a particular input are already in the
13178/// slot required by the given mask and require no permutation.
/// NOTE(review): the declaration line with the function name (original line
/// 13179) is missing from this rendering.
13180 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13181 int Size = Mask.size();
 // Mask[i] / Size identifies the input; Mask[i] % Size is the source lane.
13182 for (int i = 0; i < Size; ++i)
13183 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13184 return false;
13185
13186 return true;
13187}
13188
13189/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
13190/// the given mask.
13191///
/// Returns true when every mask lane that reads from \p Input reads the single
/// lane \p BroadcastableElement (default 0), i.e. that input could be a splat.
/// NOTE(review): the declaration line with the function name (original line
/// 13192) is missing from this rendering.
13193 int BroadcastableElement = 0) {
13194 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13195 int Size = Mask.size();
13196 for (int i = 0; i < Size; ++i)
13197 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
13198 Mask[i] % Size != BroadcastableElement)
13199 return false;
13200 return true;
13201}
13202
13203/// If we are extracting two 128-bit halves of a vector and shuffling the
13204/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13205/// multi-shuffle lowering.
/// Returns the narrowed result of the wide shuffle, or an empty SDValue when
/// the operands are not matching extracts of the same 256-bit source.
/// NOTE(review): the declaration line with the function name (original line
/// 13206) is missing from this rendering.
13207 SDValue N1, ArrayRef<int> Mask,
13208 SelectionDAG &DAG) {
13209 MVT VT = N0.getSimpleValueType();
13210 assert((VT.is128BitVector() &&
13211 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13212 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13213
13214 // Check that both sources are extracts of the same source vector.
 // NOTE(review): the opcode check for N1 (original line 13216) is missing
 // from this rendering.
13215 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13217 N0.getOperand(0) != N1.getOperand(0) ||
13218 !N0.hasOneUse() || !N1.hasOneUse())
13219 return SDValue();
13220
13221 SDValue WideVec = N0.getOperand(0);
13222 MVT WideVT = WideVec.getSimpleValueType();
13223 if (!WideVT.is256BitVector())
13224 return SDValue();
13225
13226 // Match extracts of each half of the wide source vector. Commute the shuffle
13227 // if the extract of the low half is N1.
13228 unsigned NumElts = VT.getVectorNumElements();
13229 SmallVector<int, 4> NewMask(Mask);
13230 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13231 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
 // NOTE(review): the commute call on original line 13233 is missing from
 // this rendering.
13232 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13234 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13235 return SDValue();
13236
13237 // Final bailout: if the mask is simple, we are better off using an extract
13238 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13239 // because that avoids a constant load from memory.
13240 if (NumElts == 4 &&
13241 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13242 return SDValue();
13243
13244 // Extend the shuffle mask with undef elements.
13245 NewMask.append(NumElts, -1);
13246
13247 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13248 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13249 NewMask);
13250 // This is free: ymm -> xmm.
13251 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13252 DAG.getVectorIdxConstant(0, DL));
13253}
13254
13255/// Try to lower broadcast of a single element.
13256///
13257/// For convenience, this code also bundles all of the subtarget feature set
13258/// filtering. While a little annoying to re-dispatch on type here, there isn't
13259/// a convenient way to factor it out.
/// Returns the broadcast node (MOVDDUP, VBROADCAST or VBROADCAST_LOAD), or an
/// empty SDValue when no broadcast lowering applies.
/// NOTE(review): the declaration line with the function name (original line
/// 13260) is missing from this rendering.
13261 SDValue V2, ArrayRef<int> Mask,
13262 const X86Subtarget &Subtarget,
13263 SelectionDAG &DAG) {
 // Feature gating: MOVDDUP needs SSE3; f32/f64 broadcasts need AVX;
 // integer/f16 broadcasts need AVX2.
13264 MVT EltVT = VT.getVectorElementType();
13265 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13266 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13267 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13268 return SDValue();
13269
13270 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13271 // we can only broadcast from a register with AVX2.
 // NOTE(review): the two halves of the Opcode ternary (original lines
 // 13274-13275, selecting between MOVDDUP and VBROADCAST) are missing from
 // this rendering.
13272 unsigned NumEltBits = VT.getScalarSizeInBits();
13273 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13276 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13277
13278 // Check that the mask is a broadcast.
13279 int BroadcastIdx = getSplatIndex(Mask);
13280 if (BroadcastIdx < 0) {
13281 // Check for hidden broadcast.
13282 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13283 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13284 return SDValue();
13285 BroadcastIdx = 0;
13286 }
13287 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13288 "a sorted mask where the broadcast "
13289 "comes from V1.");
13290 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13291
13292 // Go up the chain of (vector) values to find a scalar load that we can
13293 // combine with the broadcast.
13294 // TODO: Combine this logic with findEltLoadSrc() used by
13295 // EltsFromConsecutiveLoads().
 // BitOffset tracks the broadcast element's bit position inside V as we peek
 // through bitcasts, concats, extracts and inserts.
13296 int BitOffset = BroadcastIdx * NumEltBits;
13297 SDValue V = V1;
13298 for (;;) {
13299 switch (V.getOpcode()) {
13300 case ISD::BITCAST: {
13301 V = V.getOperand(0);
13302 continue;
13303 }
13304 case ISD::CONCAT_VECTORS: {
13305 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13306 int OpIdx = BitOffset / OpBitWidth;
13307 V = V.getOperand(OpIdx);
13308 BitOffset %= OpBitWidth;
13309 continue;
13310 }
 // NOTE(review): the case label on original line 13311 (presumably
 // ISD::EXTRACT_SUBVECTOR, per the comment below) is missing from this
 // rendering.
13312 // The extraction index adds to the existing offset.
13313 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13314 unsigned Idx = V.getConstantOperandVal(1);
13315 unsigned BeginOffset = Idx * EltBitWidth;
13316 BitOffset += BeginOffset;
13317 V = V.getOperand(0);
13318 continue;
13319 }
13320 case ISD::INSERT_SUBVECTOR: {
13321 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13322 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13323 int Idx = (int)V.getConstantOperandVal(2);
13324 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13325 int BeginOffset = Idx * EltBitWidth;
13326 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
 // Follow into the inserted subvector only if it covers the element.
13327 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13328 BitOffset -= BeginOffset;
13329 V = VInner;
13330 } else {
13331 V = VOuter;
13332 }
13333 continue;
13334 }
13335 }
13336 break;
13337 }
13338 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13339 BroadcastIdx = BitOffset / NumEltBits;
13340
13341 // Do we need to bitcast the source to retrieve the original broadcast index?
13342 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13343
13344 // Check if this is a broadcast of a scalar. We special case lowering
13345 // for scalars so that we can more effectively fold with loads.
13346 // If the original value has a larger element type than the shuffle, the
13347 // broadcast element is in essence truncated. Make that explicit to ease
13348 // folding.
13349 if (BitCastSrc && VT.isInteger())
13350 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13351 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13352 return TruncBroadcast;
13353
13354 // Also check the simpler case, where we can directly reuse the scalar.
13355 if (!BitCastSrc &&
13356 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13357 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13358 V = V.getOperand(BroadcastIdx);
13359
13360 // If we can't broadcast from a register, check that the input is a load.
13361 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13362 return SDValue();
13363 } else if (ISD::isNormalLoad(V.getNode()) &&
13364 cast<LoadSDNode>(V)->isSimple()) {
13365 // We do not check for one-use of the vector load because a broadcast load
13366 // is expected to be a win for code size, register pressure, and possibly
13367 // uops even if the original vector load is not eliminated.
13368
13369 // Reduce the vector load and shuffle to a broadcasted scalar load.
13370 auto *Ld = cast<LoadSDNode>(V);
13371 SDValue BaseAddr = Ld->getBasePtr();
13372 MVT SVT = VT.getScalarType();
13373 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13374 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
 // NOTE(review): the address computation on original line 13376 (the call
 // building NewAddr from BaseAddr + Offset) is missing from this rendering.
13375 SDValue NewAddr =
13377
13378 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13379 // than MOVDDUP.
13380 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13381 if (Opcode == X86ISD::VBROADCAST) {
13382 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13383 SDValue Ops[] = {Ld->getChain(), NewAddr};
 // NOTE(review): the MachineMemOperand construction on original line
 // 13386 and the chain update on line 13388 are missing from this
 // rendering.
13384 V = DAG.getMemIntrinsicNode(
13385 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13387 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13389 return DAG.getBitcast(VT, V);
13390 }
13391 assert(SVT == MVT::f64 && "Unexpected VT!");
 // NOTE(review): original lines 13393 and 13395 (memory-operand
 // construction and chain update for the narrowed load) are missing from
 // this rendering.
13392 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13394 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13396 } else if (!BroadcastFromReg) {
13397 // We can't broadcast from a vector register.
13398 return SDValue();
13399 } else if (BitOffset != 0) {
13400 // We can only broadcast from the zero-element of a vector register,
13401 // but it can be advantageous to broadcast from the zero-element of a
13402 // subvector.
13403 if (!VT.is256BitVector() && !VT.is512BitVector())
13404 return SDValue();
13405
13406 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13407 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13408 return SDValue();
13409
13410 // If we are broadcasting an element from the lowest 128-bit subvector, try
13411 // to move the element in position.
13412 if (BitOffset < 128 && NumActiveElts > 1 &&
13413 V.getScalarValueSizeInBits() == NumEltBits) {
13414 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13415 "Unexpected bit-offset");
13416 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13417 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13418 V = extractSubVector(V, 0, DAG, DL, 128);
13419 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13420 } else {
13421 // Only broadcast the zero-element of a 128-bit subvector.
13422 if ((BitOffset % 128) != 0)
13423 return SDValue();
13424
13425 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13426 "Unexpected bit-offset");
13427 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13428 "Unexpected vector size");
13429 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13430 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13431 }
13432 }
13433
13434 // On AVX we can use VBROADCAST directly for scalar sources.
13435 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13436 V = DAG.getBitcast(MVT::f64, V);
13437 if (Subtarget.hasAVX()) {
13438 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13439 return DAG.getBitcast(VT, V);
13440 }
13441 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13442 }
13443
13444 // If this is a scalar, do the broadcast on this type and bitcast.
13445 if (!V.getValueType().isVector()) {
13446 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13447 "Unexpected scalar size");
 // NOTE(review): the second argument of getVectorVT (original line 13449,
 // presumably the element count) is missing from this rendering.
13448 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13450 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13451 }
13452
13453 // We only support broadcasting from 128-bit vectors to minimize the
13454 // number of patterns we need to deal with in isel. So extract down to
13455 // 128-bits, removing as many bitcasts as possible.
 // NOTE(review): the extraction statement on original line 13457 is missing
 // from this rendering.
13456 if (V.getValueSizeInBits() > 128)
13458
13459 // Otherwise cast V to a vector with the same element type as VT, but
13460 // possibly narrower than VT. Then perform the broadcast.
13461 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13462 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13463 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13464}
13465
13466// Check for whether we can use INSERTPS to perform the shuffle. We only use
13467// INSERTPS when the V1 elements are already in the correct locations
13468// because otherwise we can just always use two SHUFPS instructions which
13469// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13470// perform INSERTPS if a single V1 element is out of place and all V2
13471// elements are zeroable.
// On success V1, V2 and InsertPSMask are updated in place and true is
// returned; otherwise false.
// NOTE(review): the declaration line with the function name (original line
// 13472) is missing from this rendering.
13473 unsigned &InsertPSMask,
13474 const APInt &Zeroable,
13475 ArrayRef<int> Mask, SelectionDAG &DAG) {
13476 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13477 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13478 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13479
13480 // Attempt to match INSERTPS with one element from VA or VB being
13481 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13482 // are updated.
13483 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13484 ArrayRef<int> CandidateMask) {
13485 unsigned ZMask = 0;
13486 int VADstIndex = -1;
13487 int VBDstIndex = -1;
13488 bool VAUsedInPlace = false;
13489
13490 for (int i = 0; i < 4; ++i) {
13491 // Synthesize a zero mask from the zeroable elements (includes undefs).
13492 if (Zeroable[i]) {
13493 ZMask |= 1 << i;
13494 continue;
13495 }
13496
13497 // Flag if we use any VA inputs in place.
13498 if (i == CandidateMask[i]) {
13499 VAUsedInPlace = true;
13500 continue;
13501 }
13502
13503 // We can only insert a single non-zeroable element.
13504 if (VADstIndex >= 0 || VBDstIndex >= 0)
13505 return false;
13506
13507 if (CandidateMask[i] < 4) {
13508 // VA input out of place for insertion.
13509 VADstIndex = i;
13510 } else {
13511 // VB input for insertion.
13512 VBDstIndex = i;
13513 }
13514 }
13515
13516 // Don't bother if we have no (non-zeroable) element for insertion.
13517 if (VADstIndex < 0 && VBDstIndex < 0)
13518 return false;
13519
13520 // Determine element insertion src/dst indices. The src index is from the
13521 // start of the inserted vector, not the start of the concatenated vector.
13522 unsigned VBSrcIndex = 0;
13523 if (VADstIndex >= 0) {
13524 // If we have a VA input out of place, we use VA as the V2 element
13525 // insertion and don't use the original V2 at all.
13526 VBSrcIndex = CandidateMask[VADstIndex];
13527 VBDstIndex = VADstIndex;
13528 VB = VA;
13529 } else {
13530 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13531 }
13532
13533 // If no V1 inputs are used in place, then the result is created only from
13534 // the zero mask and the V2 insertion - so remove V1 dependency.
13535 if (!VAUsedInPlace)
13536 VA = DAG.getUNDEF(MVT::v4f32);
13537
13538 // Update V1, V2 and InsertPSMask accordingly.
13539 V1 = VA;
13540 V2 = VB;
13541
13542 // Insert the V2 element into the desired position.
 // INSERTPS immediate layout: bits [7:6] = source lane, [5:4] = dest lane,
 // [3:0] = zero mask.
13543 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13544 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13545 return true;
13546 };
13547
13548 if (matchAsInsertPS(V1, V2, Mask))
13549 return true;
13550
13551 // Commute and try again.
 // NOTE(review): the mask-commuting call on original line 13553 is missing
 // from this rendering.
13552 SmallVector<int, 4> CommutedMask(Mask);
13554 if (matchAsInsertPS(V2, V1, CommutedMask))
13555 return true;
13556
13557 return false;
13558}
13559
// Lower a v4f32 shuffle to a single INSERTPS node if the mask matches;
// otherwise return an empty SDValue.
// NOTE(review): the declaration line with the function name (original line
// 13560) is missing from this rendering.
13561 ArrayRef<int> Mask, const APInt &Zeroable,
13562 SelectionDAG &DAG) {
13563 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13564 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13565
13566 // Attempt to match the insertps pattern.
13567 unsigned InsertPSMask = 0;
13568 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13569 return SDValue();
13570
13571 // Insert the V2 element into the desired position.
13572 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13573 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13574}
13575
13576/// Handle lowering of 2-lane 64-bit floating point shuffles.
13577///
13578/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13579/// support for floating point shuffles but not integer shuffles. These
13580/// instructions will incur a domain crossing penalty on some chips though so
13581/// it is better to avoid lowering through this for integer vectors where
13582/// possible.
/// NOTE(review): the declaration line with the function name (original line
/// 13583) is missing from this rendering.
13584 const APInt &Zeroable, SDValue V1, SDValue V2,
13585 const X86Subtarget &Subtarget,
13586 SelectionDAG &DAG) {
13587 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13588 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13589 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13590
13591 if (V2.isUndef()) {
13592 // Check for being able to broadcast a single element.
13593 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13594 Mask, Subtarget, DAG))
13595 return Broadcast;
13596
13597 // Straight shuffle of a single input vector. Simulate this by using the
13598 // single input as both of the "inputs" to this instruction..
 // Two-bit SHUFPD immediate: one selector bit per lane.
13599 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13600
13601 if (Subtarget.hasAVX()) {
13602 // If we have AVX, we can use VPERMILPS which will allow folding a load
13603 // into the shuffle.
13604 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13605 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13606 }
13607
13608 return DAG.getNode(
13609 X86ISD::SHUFP, DL, MVT::v2f64,
13610 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13611 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13612 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13613 }
13614 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13615 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13616 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13617 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13618
13619 if (Subtarget.hasAVX2())
13620 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13621 return Extract;
13622
13623 // When loading a scalar and then shuffling it into a vector we can often do
13624 // the insertion cheaply.
 // NOTE(review): the statement opener on original line 13625 (presumably
 // "if (SDValue Insertion = lowerShuffleAsElementInsertion(") is missing
 // from this rendering.
13626 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13627 return Insertion;
13628 // Try inverting the insertion since for v2 masks it is easy to do and we
13629 // can't reliably sort the mask one way or the other.
 // XOR with 2 swaps which input a lane refers to (V1 <-> V2).
13630 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13631 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
 // NOTE(review): the statement opener on original line 13632 is likewise
 // missing from this rendering.
13633 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13634 return Insertion;
13635
13636 // Try to use one of the special instruction patterns to handle two common
13637 // blend patterns if a zero-blend above didn't work.
13638 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13639 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13640 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13641 // We can either use a special instruction to load over the low double or
13642 // to move just the low double.
13643 return DAG.getNode(
13644 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13645 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13646
13647 if (Subtarget.hasSSE41())
13648 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13649 Zeroable, Subtarget, DAG))
13650 return Blend;
13651
13652 // Use dedicated unpack instructions for masks that match their pattern.
13653 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13654 return V;
13655
13656 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13657 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13658 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13659}
13660
13661/// Handle lowering of 2-lane 64-bit integer shuffles.
13662///
13663/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13664/// the integer unit to minimize domain crossing penalties. However, for blends
13665/// it falls back to the floating point shuffle operation with appropriate bit
13666/// casting.
/// NOTE(review): the declaration line with the function name (original line
/// 13667) is missing from this rendering.
13668 const APInt &Zeroable, SDValue V1, SDValue V2,
13669 const X86Subtarget &Subtarget,
13670 SelectionDAG &DAG) {
13671 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13672 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13673 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13674
13675 if (V2.isUndef()) {
13676 // Check for being able to broadcast a single element.
13677 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13678 Mask, Subtarget, DAG))
13679 return Broadcast;
13680
13681 // Straight shuffle of a single input vector. For everything from SSE2
13682 // onward this has a single fast instruction with no scary immediates.
13683 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13684 V1 = DAG.getBitcast(MVT::v4i32, V1);
 // Each v2i64 lane expands to two consecutive v4i32 lanes.
13685 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13686 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13687 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13688 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13689 return DAG.getBitcast(
13690 MVT::v2i64,
13691 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13692 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13693 }
13694 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13695 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13696 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13697 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13698
13699 if (Subtarget.hasAVX2())
13700 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13701 return Extract;
13702
13703 // Try to use shift instructions.
13704 if (SDValue Shift =
13705 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13706 DAG, /*BitwiseOnly*/ false))
13707 return Shift;
13708
13709 // When loading a scalar and then shuffling it into a vector we can often do
13710 // the insertion cheaply.
 // NOTE(review): the statement opener on original line 13711 (presumably
 // "if (SDValue Insertion = lowerShuffleAsElementInsertion(") is missing
 // from this rendering.
13712 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13713 return Insertion;
13714 // Try inverting the insertion since for v2 masks it is easy to do and we
13715 // can't reliably sort the mask one way or the other.
13716 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
 // NOTE(review): the statement opener on original line 13717 is likewise
 // missing from this rendering.
13718 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13719 return Insertion;
13720
13721 // We have different paths for blend lowering, but they all must use the
13722 // *exact* same predicate.
13723 bool IsBlendSupported = Subtarget.hasSSE41();
13724 if (IsBlendSupported)
13725 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13726 Zeroable, Subtarget, DAG))
13727 return Blend;
13728
13729 // Use dedicated unpack instructions for masks that match their pattern.
13730 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13731 return V;
13732
13733 // Try to use byte rotation instructions.
13734 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13735 if (Subtarget.hasSSSE3()) {
13736 if (Subtarget.hasVLX())
13737 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13738 Zeroable, Subtarget, DAG))
13739 return Rotate;
13740
13741 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13742 Subtarget, DAG))
13743 return Rotate;
13744 }
13745
13746 // If we have direct support for blends, we should lower by decomposing into
13747 // a permute. That will be faster than the domain cross.
13748 if (IsBlendSupported)
13749 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13750 Zeroable, Subtarget, DAG);
13751
13752 // We implement this with SHUFPD which is pretty lame because it will likely
13753 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13754 // However, all the alternatives are still more cycles and newer chips don't
13755 // have this problem. It would be really nice if x86 had better shuffles here.
13756 V1 = DAG.getBitcast(MVT::v2f64, V1);
13757 V2 = DAG.getBitcast(MVT::v2f64, V2);
13758 return DAG.getBitcast(MVT::v2i64,
13759 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13760}
13761
13763/// Lower a vector shuffle using the SHUFPS instruction.
13764///
13765/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13766/// It makes no assumptions about whether this is the *best* lowering, it simply
13767/// uses it.
/// Cases are split on the number of mask lanes taken from V2 (0..3); the
/// three-element case recurses with operands and mask commuted.
/// NOTE(review): the declaration line with the function name (original line
/// 13767's continuation) is missing from this rendering.
13768 ArrayRef<int> Mask, SDValue V1,
13769 SDValue V2, SelectionDAG &DAG) {
13770 SDValue LowV = V1, HighV = V2;
13771 SmallVector<int, 4> NewMask(Mask);
13772 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13773
13774 if (NumV2Elements == 1) {
13775 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13776
13777 // Compute the index adjacent to V2Index and in the same half by toggling
13778 // the low bit.
13779 int V2AdjIndex = V2Index ^ 1;
13780
13781 if (Mask[V2AdjIndex] < 0) {
13782 // Handles all the cases where we have a single V2 element and an undef.
13783 // This will only ever happen in the high lanes because we commute the
13784 // vector otherwise.
13785 if (V2Index < 2)
13786 std::swap(LowV, HighV);
13787 NewMask[V2Index] -= 4;
13788 } else {
13789 // Handle the case where the V2 element ends up adjacent to a V1 element.
13790 // To make this work, blend them together as the first step.
13791 int V1Index = V2AdjIndex;
13792 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13793 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13794 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13795
13796 // Now proceed to reconstruct the final blend as we have the necessary
13797 // high or low half formed.
13798 if (V2Index < 2) {
13799 LowV = V2;
13800 HighV = V1;
13801 } else {
13802 HighV = V2;
13803 }
13804 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13805 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13806 }
13807 } else if (NumV2Elements == 2) {
13808 if (Mask[0] < 4 && Mask[1] < 4) {
13809 // Handle the easy case where we have V1 in the low lanes and V2 in the
13810 // high lanes.
13811 NewMask[2] -= 4;
13812 NewMask[3] -= 4;
13813 } else if (Mask[2] < 4 && Mask[3] < 4) {
13814 // We also handle the reversed case because this utility may get called
13815 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13816 // arrange things in the right direction.
13817 NewMask[0] -= 4;
13818 NewMask[1] -= 4;
13819 HighV = V1;
13820 LowV = V2;
13821 } else {
13822 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13823 // trying to place elements directly, just blend them and set up the final
13824 // shuffle to place them.
13825
13826 // The first two blend mask elements are for V1, the second two are for
13827 // V2.
13828 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13829 Mask[2] < 4 ? Mask[2] : Mask[3],
13830 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13831 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13832 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13833 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13834
13835 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13836 // a blend.
13837 LowV = HighV = V1;
13838 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13839 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13840 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13841 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13842 }
13843 } else if (NumV2Elements == 3) {
13844 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13845 // we can get here due to other paths (e.g repeated mask matching) that we
13846 // don't want to do another round of lowerVECTOR_SHUFFLE.
 // NOTE(review): the mask-commuting statement on original line 13847 is
 // missing from this rendering; NewMask is expected to be commuted before
 // the recursive call below.
13848 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13849 }
13850 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13851 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13852}
13853
13854/// Lower 4-lane 32-bit floating point shuffles.
13855///
13856/// Uses instructions exclusively from the floating point unit to minimize
13857/// domain crossing penalties, as these are sufficient to implement all v4f32
13858/// shuffles.
// NOTE(review): source line 13859 was lost in this extraction; it is the
// opening of this definition (presumably
// 'static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,')
// -- confirm against the upstream file.
13860 const APInt &Zeroable, SDValue V1, SDValue V2,
13861 const X86Subtarget &Subtarget,
13862 SelectionDAG &DAG) {
13863 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13864 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13865 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13866
// With SSE4.1, try a straight blend first; it handles many two-input cases.
13867 if (Subtarget.hasSSE41())
13868 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13869 Zeroable, Subtarget, DAG))
13870 return Blend;
13871
// Mask elements >= 4 select lanes from V2.
13872 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13873
13874 if (NumV2Elements == 0) {
13875 // Check for being able to broadcast a single element.
13876 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13877 Mask, Subtarget, DAG))
13878 return Broadcast;
13879
13880 // Use even/odd duplicate instructions for masks that match their pattern.
13881 if (Subtarget.hasSSE3()) {
13882 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13883 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13884 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13885 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13886 }
13887
13888 if (Subtarget.hasAVX()) {
13889 // If we have AVX, we can use VPERMILPS which will allow folding a load
13890 // into the shuffle.
13891 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13892 getV4X86ShuffleImm8ForMask(Mask, DL, DAG))
13893 }
13894
13895 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13896 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13897 if (!Subtarget.hasSSE2()) {
13898 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13899 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13900 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13901 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13902 }
13903
13904 // Otherwise, use a straight shuffle of a single input vector. We pass the
13905 // input vector to both operands to simulate this with a SHUFPS.
13906 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13907 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13908 }
13909
13910 if (Subtarget.hasSSE2())
// NOTE(review): source line 13911 was lost in this extraction between the
// surrounding lines; it opens the conditional, presumably
// 'if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(' -- confirm upstream.
13912 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
// The zero/any-extend is matched in the integer domain, so cast back.
13913 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13914 return ZExt;
13915 }
13916
13917 if (Subtarget.hasAVX2())
13918 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13919 return Extract;
13920
13921 // There are special ways we can lower some single-element blends. However, we
13922 // have custom ways we can lower more complex single-element blends below that
13923 // we defer to if both this and BLENDPS fail to match, so restrict this to
13924 // when the V2 input is targeting element 0 of the mask -- that is the fast
13925 // case here.
13926 if (NumV2Elements == 1 && Mask[0] >= 4)
// NOTE(review): source line 13927 was lost in this extraction; it is the call
// opening, presumably 'if (SDValue V = lowerShuffleAsElementInsertion(' --
// confirm against the upstream file.
13928 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13929 return V;
13930
13931 if (Subtarget.hasSSE41()) {
13932 // Use INSERTPS if we can complete the shuffle efficiently.
13933 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13934 return V;
13935
// Only decompose into blend+permute when a single SHUFPS can't do the job.
13936 if (!isSingleSHUFPSMask(Mask))
13937 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13938 V2, Mask, DAG))
13939 return BlendPerm;
13940 }
13941
13942 // Use low/high mov instructions. These are only valid in SSE1 because
13943 // otherwise they are widened to v2f64 and never get here.
13944 if (!Subtarget.hasSSE2()) {
13945 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13946 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13947 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13948 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13949 }
13950
13951 // Use dedicated unpack instructions for masks that match their pattern.
13952 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13953 return V;
13954
13955 // Otherwise fall back to a SHUFPS lowering strategy.
13956 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13957}
13958
13959/// Lower 4-lane i32 vector shuffles.
13960///
13961/// We try to handle these with integer-domain shuffles where we can, but for
13962/// blends we use the floating point domain blend instructions.
// NOTE(review): source line 13963 was lost in this extraction; it is the
// opening of this definition (presumably
// 'static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,')
// -- confirm against the upstream file.
13964 const APInt &Zeroable, SDValue V1, SDValue V2,
13965 const X86Subtarget &Subtarget,
13966 SelectionDAG &DAG) {
13967 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13968 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13969 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13970
13971 // Whenever we can lower this as a zext, that instruction is strictly faster
13972 // than any alternative. It also allows us to fold memory operands into the
13973 // shuffle in many cases.
13974 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13975 Zeroable, Subtarget, DAG))
13976 return ZExt;
13977
// Mask elements >= 4 select lanes from V2.
13978 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13979
13980 // Try to use shift instructions if fast.
13981 if (Subtarget.preferLowerShuffleAsShift()) {
13982 if (SDValue Shift =
13983 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13984 Subtarget, DAG, /*BitwiseOnly*/ true))
13985 return Shift;
13986 if (NumV2Elements == 0)
13987 if (SDValue Rotate =
13988 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13989 return Rotate;
13990 }
13991
13992 if (NumV2Elements == 0) {
13993 // Try to use broadcast unless the mask only has one non-undef element.
13994 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13995 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13996 Mask, Subtarget, DAG))
13997 return Broadcast;
13998 }
13999
14000 // Straight shuffle of a single input vector. For everything from SSE2
14001 // onward this has a single fast instruction with no scary immediates.
14002 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14003 // but we aren't actually going to use the UNPCK instruction because doing
14004 // so prevents folding a load into this instruction or making a copy.
14005 const int UnpackLoMask[] = {0, 0, 1, 1};
14006 const int UnpackHiMask[] = {2, 2, 3, 3};
14007 if (!isSingleElementRepeatedMask(Mask)) {
14008 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14009 Mask = UnpackLoMask;
14010 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14011 Mask = UnpackHiMask;
14012 }
14013
14014 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14015 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14016 }
14017
14018 if (Subtarget.hasAVX2())
14019 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14020 return Extract;
14021
14022 // Try to use shift instructions.
14023 if (SDValue Shift =
14024 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
14025 DAG, /*BitwiseOnly*/ false))
14026 return Shift;
14027
14028 // There are special ways we can lower some single-element blends.
14029 if (NumV2Elements == 1)
// NOTE(review): source line 14030 was lost in this extraction; it is the call
// opening, presumably 'if (SDValue V = lowerShuffleAsElementInsertion(' --
// confirm against the upstream file.
14031 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14032 return V;
14033
14034 // We have different paths for blend lowering, but they all must use the
14035 // *exact* same predicate.
14036 bool IsBlendSupported = Subtarget.hasSSE41();
14037 if (IsBlendSupported)
14038 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14039 Zeroable, Subtarget, DAG))
14040 return Blend;
14041
14042 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14043 Zeroable, Subtarget, DAG))
14044 return Masked;
14045
14046 // Use dedicated unpack instructions for masks that match their pattern.
14047 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
14048 return V;
14049
14050 // Try to use byte rotation instructions.
14051 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14052 if (Subtarget.hasSSSE3()) {
14053 if (Subtarget.hasVLX())
14054 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14055 Zeroable, Subtarget, DAG))
14056 return Rotate;
14057
14058 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14059 Subtarget, DAG))
14060 return Rotate;
14061 }
14062
14063 // Assume that a single SHUFPS is faster than an alternative sequence of
14064 // multiple instructions (even if the CPU has a domain penalty).
14065 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14066 if (!isSingleSHUFPSMask(Mask)) {
14067 // If we have direct support for blends, we should lower by decomposing into
14068 // a permute. That will be faster than the domain cross.
14069 if (IsBlendSupported)
14070 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14071 Zeroable, Subtarget, DAG);
14072
14073 // Try to lower by permuting the inputs into an unpack instruction.
14074 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14075 Mask, Subtarget, DAG))
14076 return Unpack;
14077 }
14078
14079 // We implement this with SHUFPS because it can blend from two vectors.
14080 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14081 // up the inputs, bypassing domain shift penalties that we would incur if we
14082 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14083 // relevant.
14084 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14085 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14086 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14087 return DAG.getBitcast(MVT::v4i32, ShufPS);
14088}
14089
14090/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14091/// shuffle lowering, and the most complex part.
14092///
14093/// The lowering strategy is to try to form pairs of input lanes which are
14094/// targeted at the same half of the final vector, and then use a dword shuffle
14095/// to place them onto the right half, and finally unpack the paired lanes into
14096/// their final position.
14097///
14098/// The exact breakdown of how to form these dword pairs and align them on the
14099/// correct sides is really tricky. See the comments within the function for
14100/// more of the details.
14101///
14102/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14103/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14104/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14105/// vector, form the analogous 128-bit 8-element Mask.
14107 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14108 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14109 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14110 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14111
14112 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14113 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14114 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14115
14116 // Attempt to directly match PSHUFLW or PSHUFHW.
14117 if (isUndefOrInRange(LoMask, 0, 4) &&
14118 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14119 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14120 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14121 }
14122 if (isUndefOrInRange(HiMask, 4, 8) &&
14123 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14124 for (int i = 0; i != 4; ++i)
14125 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14126 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14127 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14128 }
14129
14130 SmallVector<int, 4> LoInputs;
14131 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14132 array_pod_sort(LoInputs.begin(), LoInputs.end());
14133 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14134 SmallVector<int, 4> HiInputs;
14135 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14136 array_pod_sort(HiInputs.begin(), HiInputs.end());
14137 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14138 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14139 int NumHToL = LoInputs.size() - NumLToL;
14140 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14141 int NumHToH = HiInputs.size() - NumLToH;
14142 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14143 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14144 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14145 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14146
14147 // If we are shuffling values from one half - check how many different DWORD
14148 // pairs we need to create. If only 1 or 2 then we can perform this as a
14149 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14150 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14151 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14152 V = DAG.getNode(ShufWOp, DL, VT, V,
14153 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14154 V = DAG.getBitcast(PSHUFDVT, V);
14155 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14156 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14157 return DAG.getBitcast(VT, V);
14158 };
14159
14160 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14161 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14162 SmallVector<std::pair<int, int>, 4> DWordPairs;
14163 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14164
14165 // Collect the different DWORD pairs.
14166 for (int DWord = 0; DWord != 4; ++DWord) {
14167 int M0 = Mask[2 * DWord + 0];
14168 int M1 = Mask[2 * DWord + 1];
14169 M0 = (M0 >= 0 ? M0 % 4 : M0);
14170 M1 = (M1 >= 0 ? M1 % 4 : M1);
14171 if (M0 < 0 && M1 < 0)
14172 continue;
14173
14174 bool Match = false;
14175 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14176 auto &DWordPair = DWordPairs[j];
14177 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14178 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14179 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14180 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14181 PSHUFDMask[DWord] = DOffset + j;
14182 Match = true;
14183 break;
14184 }
14185 }
14186 if (!Match) {
14187 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14188 DWordPairs.push_back(std::make_pair(M0, M1));
14189 }
14190 }
14191
14192 if (DWordPairs.size() <= 2) {
14193 DWordPairs.resize(2, std::make_pair(-1, -1));
14194 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14195 DWordPairs[1].first, DWordPairs[1].second};
14196 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
14197 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
14198 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
14199 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
14200 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14201 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14202 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14203 }
14204 if ((NumHToL + NumHToH) == 0)
14205 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14206 if ((NumLToL + NumLToH) == 0)
14207 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14208 }
14209 }
14210
14211 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14212 // such inputs we can swap two of the dwords across the half mark and end up
14213 // with <=2 inputs to each half in each half. Once there, we can fall through
14214 // to the generic code below. For example:
14215 //
14216 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14217 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14218 //
14219 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14220 // and an existing 2-into-2 on the other half. In this case we may have to
14221 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14222 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14223 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14224 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14225 // half than the one we target for fixing) will be fixed when we re-enter this
14226 // path. We will also combine away any sequence of PSHUFD instructions that
14227 // result into a single instruction. Here is an example of the tricky case:
14228 //
14229 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14230 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14231 //
14232 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14233 //
14234 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14235 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14236 //
14237 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14238 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14239 //
14240 // The result is fine to be handled by the generic logic.
14241 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14242 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14243 int AOffset, int BOffset) {
14244 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14245 "Must call this with A having 3 or 1 inputs from the A half.");
14246 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14247 "Must call this with B having 1 or 3 inputs from the B half.");
14248 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14249 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14250
14251 bool ThreeAInputs = AToAInputs.size() == 3;
14252
14253 // Compute the index of dword with only one word among the three inputs in
14254 // a half by taking the sum of the half with three inputs and subtracting
14255 // the sum of the actual three inputs. The difference is the remaining
14256 // slot.
14257 int ADWord = 0, BDWord = 0;
14258 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14259 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14260 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14261 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14262 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14263 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14264 int TripleNonInputIdx =
14265 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14266 TripleDWord = TripleNonInputIdx / 2;
14267
14268 // We use xor with one to compute the adjacent DWord to whichever one the
14269 // OneInput is in.
14270 OneInputDWord = (OneInput / 2) ^ 1;
14271
14272 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14273 // and BToA inputs. If there is also such a problem with the BToB and AToB
14274 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14275 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14276 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14277 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14278 // Compute how many inputs will be flipped by swapping these DWords. We
14279 // need
14280 // to balance this to ensure we don't form a 3-1 shuffle in the other
14281 // half.
14282 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14283 llvm::count(AToBInputs, 2 * ADWord + 1);
14284 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14285 llvm::count(BToBInputs, 2 * BDWord + 1);
14286 if ((NumFlippedAToBInputs == 1 &&
14287 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14288 (NumFlippedBToBInputs == 1 &&
14289 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14290 // We choose whether to fix the A half or B half based on whether that
14291 // half has zero flipped inputs. At zero, we may not be able to fix it
14292 // with that half. We also bias towards fixing the B half because that
14293 // will more commonly be the high half, and we have to bias one way.
14294 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14295 ArrayRef<int> Inputs) {
14296 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14297 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14298 // Determine whether the free index is in the flipped dword or the
14299 // unflipped dword based on where the pinned index is. We use this bit
14300 // in an xor to conditionally select the adjacent dword.
14301 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14302 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14303 if (IsFixIdxInput == IsFixFreeIdxInput)
14304 FixFreeIdx += 1;
14305 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14306 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14307 "We need to be changing the number of flipped inputs!");
14308 int PSHUFHalfMask[] = {0, 1, 2, 3};
14309 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14310 V = DAG.getNode(
14311 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14312 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14313 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14314
14315 for (int &M : Mask)
14316 if (M >= 0 && M == FixIdx)
14317 M = FixFreeIdx;
14318 else if (M >= 0 && M == FixFreeIdx)
14319 M = FixIdx;
14320 };
14321 if (NumFlippedBToBInputs != 0) {
14322 int BPinnedIdx =
14323 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14324 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14325 } else {
14326 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14327 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14328 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14329 }
14330 }
14331 }
14332
14333 int PSHUFDMask[] = {0, 1, 2, 3};
14334 PSHUFDMask[ADWord] = BDWord;
14335 PSHUFDMask[BDWord] = ADWord;
14336 V = DAG.getBitcast(
14337 VT,
14338 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14339 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14340
14341 // Adjust the mask to match the new locations of A and B.
14342 for (int &M : Mask)
14343 if (M >= 0 && M/2 == ADWord)
14344 M = 2 * BDWord + M % 2;
14345 else if (M >= 0 && M/2 == BDWord)
14346 M = 2 * ADWord + M % 2;
14347
14348 // Recurse back into this routine to re-compute state now that this isn't
14349 // a 3 and 1 problem.
14350 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14351 };
14352 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14353 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14354 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14355 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14356
14357 // At this point there are at most two inputs to the low and high halves from
14358 // each half. That means the inputs can always be grouped into dwords and
14359 // those dwords can then be moved to the correct half with a dword shuffle.
14360 // We use at most one low and one high word shuffle to collect these paired
14361 // inputs into dwords, and finally a dword shuffle to place them.
14362 int PSHUFLMask[4] = {-1, -1, -1, -1};
14363 int PSHUFHMask[4] = {-1, -1, -1, -1};
14364 int PSHUFDMask[4] = {-1, -1, -1, -1};
14365
14366 // First fix the masks for all the inputs that are staying in their
14367 // original halves. This will then dictate the targets of the cross-half
14368 // shuffles.
14369 auto fixInPlaceInputs =
14370 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14371 MutableArrayRef<int> SourceHalfMask,
14372 MutableArrayRef<int> HalfMask, int HalfOffset) {
14373 if (InPlaceInputs.empty())
14374 return;
14375 if (InPlaceInputs.size() == 1) {
14376 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14377 InPlaceInputs[0] - HalfOffset;
14378 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14379 return;
14380 }
14381 if (IncomingInputs.empty()) {
14382 // Just fix all of the in place inputs.
14383 for (int Input : InPlaceInputs) {
14384 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14385 PSHUFDMask[Input / 2] = Input / 2;
14386 }
14387 return;
14388 }
14389
14390 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14391 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14392 InPlaceInputs[0] - HalfOffset;
14393 // Put the second input next to the first so that they are packed into
14394 // a dword. We find the adjacent index by toggling the low bit.
14395 int AdjIndex = InPlaceInputs[0] ^ 1;
14396 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14397 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14398 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14399 };
14400 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14401 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14402
14403 // Now gather the cross-half inputs and place them into a free dword of
14404 // their target half.
14405 // FIXME: This operation could almost certainly be simplified dramatically to
14406 // look more like the 3-1 fixing operation.
14407 auto moveInputsToRightHalf = [&PSHUFDMask](
14408 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14409 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14410 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14411 int DestOffset) {
14412 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14413 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14414 };
14415 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14416 int Word) {
14417 int LowWord = Word & ~1;
14418 int HighWord = Word | 1;
14419 return isWordClobbered(SourceHalfMask, LowWord) ||
14420 isWordClobbered(SourceHalfMask, HighWord);
14421 };
14422
14423 if (IncomingInputs.empty())
14424 return;
14425
14426 if (ExistingInputs.empty()) {
14427 // Map any dwords with inputs from them into the right half.
14428 for (int Input : IncomingInputs) {
14429 // If the source half mask maps over the inputs, turn those into
14430 // swaps and use the swapped lane.
14431 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14432 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14433 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14434 Input - SourceOffset;
14435 // We have to swap the uses in our half mask in one sweep.
14436 for (int &M : HalfMask)
14437 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14438 M = Input;
14439 else if (M == Input)
14440 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14441 } else {
14442 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14443 Input - SourceOffset &&
14444 "Previous placement doesn't match!");
14445 }
14446 // Note that this correctly re-maps both when we do a swap and when
14447 // we observe the other side of the swap above. We rely on that to
14448 // avoid swapping the members of the input list directly.
14449 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14450 }
14451
14452 // Map the input's dword into the correct half.
14453 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14454 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14455 else
14456 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14457 Input / 2 &&
14458 "Previous placement doesn't match!");
14459 }
14460
14461 // And just directly shift any other-half mask elements to be same-half
14462 // as we will have mirrored the dword containing the element into the
14463 // same position within that half.
14464 for (int &M : HalfMask)
14465 if (M >= SourceOffset && M < SourceOffset + 4) {
14466 M = M - SourceOffset + DestOffset;
14467 assert(M >= 0 && "This should never wrap below zero!");
14468 }
14469 return;
14470 }
14471
14472 // Ensure we have the input in a viable dword of its current half. This
14473 // is particularly tricky because the original position may be clobbered
14474 // by inputs being moved and *staying* in that half.
14475 if (IncomingInputs.size() == 1) {
14476 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14477 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14478 SourceOffset;
14479 SourceHalfMask[InputFixed - SourceOffset] =
14480 IncomingInputs[0] - SourceOffset;
14481 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14482 IncomingInputs[0] = InputFixed;
14483 }
14484 } else if (IncomingInputs.size() == 2) {
14485 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14486 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14487 // We have two non-adjacent or clobbered inputs we need to extract from
14488 // the source half. To do this, we need to map them into some adjacent
14489 // dword slot in the source mask.
14490 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14491 IncomingInputs[1] - SourceOffset};
14492
14493 // If there is a free slot in the source half mask adjacent to one of
14494 // the inputs, place the other input in it. We use (Index XOR 1) to
14495 // compute an adjacent index.
14496 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14497 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14498 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14499 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14500 InputsFixed[1] = InputsFixed[0] ^ 1;
14501 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14502 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14503 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14504 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14505 InputsFixed[0] = InputsFixed[1] ^ 1;
14506 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14507 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14508 // The two inputs are in the same DWord but it is clobbered and the
14509 // adjacent DWord isn't used at all. Move both inputs to the free
14510 // slot.
14511 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14512 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14513 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14514 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14515 } else {
14516 // The only way we hit this point is if there is no clobbering
14517 // (because there are no off-half inputs to this half) and there is no
14518 // free slot adjacent to one of the inputs. In this case, we have to
14519 // swap an input with a non-input.
14520 for (int i = 0; i < 4; ++i)
14521 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14522 "We can't handle any clobbers here!");
14523 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14524 "Cannot have adjacent inputs here!");
14525
14526 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14527 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14528
14529 // We also have to update the final source mask in this case because
14530 // it may need to undo the above swap.
14531 for (int &M : FinalSourceHalfMask)
14532 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14533 M = InputsFixed[1] + SourceOffset;
14534 else if (M == InputsFixed[1] + SourceOffset)
14535 M = (InputsFixed[0] ^ 1) + SourceOffset;
14536
14537 InputsFixed[1] = InputsFixed[0] ^ 1;
14538 }
14539
14540 // Point everything at the fixed inputs.
14541 for (int &M : HalfMask)
14542 if (M == IncomingInputs[0])
14543 M = InputsFixed[0] + SourceOffset;
14544 else if (M == IncomingInputs[1])
14545 M = InputsFixed[1] + SourceOffset;
14546
14547 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14548 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14549 }
14550 } else {
14551 llvm_unreachable("Unhandled input size!");
14552 }
14553
14554 // Now hoist the DWord down to the right half.
14555 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14556 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14557 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14558 for (int &M : HalfMask)
14559 for (int Input : IncomingInputs)
14560 if (M == Input)
14561 M = FreeDWord * 2 + Input % 2;
14562 };
14563 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14564 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14565 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14566 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14567
14568 // Now enact all the shuffles we've computed to move the inputs into their
14569 // target half.
14570 if (!isNoopShuffleMask(PSHUFLMask))
14571 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14572 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14573 if (!isNoopShuffleMask(PSHUFHMask))
14574 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14575 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14576 if (!isNoopShuffleMask(PSHUFDMask))
14577 V = DAG.getBitcast(
14578 VT,
14579 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14580 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14581
14582 // At this point, each half should contain all its inputs, and we can then
14583 // just shuffle them into their final position.
14584 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14585 "Failed to lift all the high half inputs to the low mask!");
14586 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14587 "Failed to lift all the low half inputs to the high mask!");
14588
14589 // Do a half shuffle for the low mask.
14590 if (!isNoopShuffleMask(LoMask))
14591 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14592 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14593
14594 // Do a half shuffle with the high mask after shifting its values down.
14595 for (int &M : HiMask)
14596 if (M >= 0)
14597 M -= 4;
14598 if (!isNoopShuffleMask(HiMask))
14599 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14600 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14601
14602 return V;
14603}
14604
14605/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14606/// blend if only one input is used.
///
/// Builds one per-byte PSHUFB control vector for each input: a control byte
/// with the high bit set (0x80) forces that destination byte to zero, so each
/// input's shuffle independently zeroes the bytes sourced from the other
/// input. If both inputs end up used, the two shuffled results are combined
/// with a plain OR (the zeroed lanes make the OR act as a blend). V1InUse and
/// V2InUse are written to tell the caller which inputs were actually needed.
14608    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14609    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14611         "Lane crossing shuffle masks not supported");
14612
14613  int NumBytes = VT.getSizeInBits() / 8;
14614  int Size = Mask.size();
  // Each logical mask element covers Scale consecutive bytes.
14615  int Scale = NumBytes / Size;
14616
14617  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14618  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14619  V1InUse = false;
14620  V2InUse = false;
14621
14622  for (int i = 0; i < NumBytes; ++i) {
14623    int M = Mask[i / Scale];
14624    if (M < 0)
14625      continue;
14626
    // PSHUFB zeroes any byte whose control has the sign bit set.
14627    const int ZeroMask = 0x80;
14628    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14629    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    // Elements known zeroable are zeroed in both shuffles.
14630    if (Zeroable[i / Scale])
14631      V1Idx = V2Idx = ZeroMask;
14632
14633    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14634    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14635    V1InUse |= (ZeroMask != V1Idx);
14636    V2InUse |= (ZeroMask != V2Idx);
14637  }
14638
  // PSHUFB operates on v16i8/v32i8/... so bitcast to the byte vector type.
14639  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14640  if (V1InUse)
14641    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14642                     DAG.getBuildVector(ShufVT, DL, V1Mask));
14643  if (V2InUse)
14644    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14645                     DAG.getBuildVector(ShufVT, DL, V2Mask));
14646
14647  // If we need shuffled inputs from both, blend the two.
14648  SDValue V;
14649  if (V1InUse && V2InUse)
14650    V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14651  else
14652    V = V1InUse ? V1 : V2;
14653
14654  // Cast the result back to the correct type.
14655  return DAG.getBitcast(VT, V);
14656}
14657
14658/// Generic lowering of 8-lane i16 shuffles.
14659///
14660/// This handles both single-input shuffles and combined shuffle/blends with
14661/// two inputs. The single input shuffles are immediately delegated to
14662/// a dedicated lowering routine.
14663///
14664/// The blends are lowered in one of three fundamental ways. If there are few
14665/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14666/// of the input is significantly cheaper when lowered as an interleaving of
14667/// the two inputs, try to interleave them. Otherwise, blend the low and high
14668/// halves of the inputs separately (making them have relatively few inputs)
14669/// and then concatenate them.
14671                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14672                                 const X86Subtarget &Subtarget,
14673                                 SelectionDAG &DAG) {
14674  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14675  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14676  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14677
14678  // Whenever we can lower this as a zext, that instruction is strictly faster
14679  // than any alternative.
14680  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14681                                                   Zeroable, Subtarget, DAG))
14682    return ZExt;
14683
14684  // Try to use lower using a truncation.
14685  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14686                                        Subtarget, DAG))
14687    return V;
14688
  // Count the mask elements that select from V2 (indices 8..15).
14689  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14690
  // Single-input path: a series of cheap dedicated lowerings before falling
  // back to the general single-input v8i16 routine.
14691  if (NumV2Inputs == 0) {
14692    // Try to use shift instructions.
14693    if (SDValue Shift =
14694            lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14695                                Subtarget, DAG, /*BitwiseOnly*/ false))
14696      return Shift;
14697
14698    // Check for being able to broadcast a single element.
14699    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14700                                                    Mask, Subtarget, DAG))
14701      return Broadcast;
14702
14703    // Try to use bit rotation instructions.
14704    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14705                                                 Subtarget, DAG))
14706      return Rotate;
14707
14708    // Use dedicated unpack instructions for masks that match their pattern.
14709    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14710      return V;
14711
14712    // Use dedicated pack instructions for masks that match their pattern.
14713    if (SDValue V =
14714            lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14715      return V;
14716
14717    // Try to use byte rotation instructions.
14718    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14719                                                  Subtarget, DAG))
14720      return Rotate;
14721
14722    // Make a copy of the mask so it can be modified.
14723    SmallVector<int, 8> MutableMask(Mask);
14724    return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14725                                               Subtarget, DAG);
14726  }
14727
14728  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14729         "All single-input shuffles should be canonicalized to be V1-input "
14730         "shuffles.");
14731
14732  // Try to use shift instructions.
14733  if (SDValue Shift =
14734          lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14735                              DAG, /*BitwiseOnly*/ false))
14736    return Shift;
14737
14738  // See if we can use SSE4A Extraction / Insertion.
14739  if (Subtarget.hasSSE4A())
14740    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14741                                          Zeroable, DAG))
14742      return V;
14743
14744  // There are special ways we can lower some single-element blends.
14745  if (NumV2Inputs == 1)
14747            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14748      return V;
14749
14750  // We have different paths for blend lowering, but they all must use the
14751  // *exact* same predicate.
14752  bool IsBlendSupported = Subtarget.hasSSE41();
14753  if (IsBlendSupported)
14754    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14755                                            Zeroable, Subtarget, DAG))
14756      return Blend;
14757
14758  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14759                                             Zeroable, Subtarget, DAG))
14760    return Masked;
14761
14762  // Use dedicated unpack instructions for masks that match their pattern.
14763  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14764    return V;
14765
14766  // Use dedicated pack instructions for masks that match their pattern.
14767  if (SDValue V =
14768          lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14769    return V;
14770
14771  // Try to use lower using a truncation.
14772  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14773                                       Subtarget, DAG))
14774    return V;
14775
14776  // Try to use byte rotation instructions.
14777  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14778                                                Subtarget, DAG))
14779    return Rotate;
14780
14781  if (SDValue BitBlend =
14782          lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14783    return BitBlend;
14784
14785  // Try to use byte shift instructions to mask.
14786  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14787                                              Zeroable, Subtarget, DAG))
14788    return V;
14789
14790  // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
  // NumEvenDrops is the log2 stride of a mask that keeps every (1<<N)-th even
  // element, i.e. a truncation-style compaction.
14791  int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14792  if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14793      !Subtarget.hasVLX()) {
14794    // Check if this is part of a 256-bit vector truncation.
14795    unsigned PackOpc = 0;
14796    if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
      // Blend the concatenated inputs against zero so the PACKUS inputs are
      // already zero-extended dwords, then split back into 128-bit halves.
14799      SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14800      V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14801                         getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14802                         DAG.getTargetConstant(0xEE, DL, MVT::i8));
14803      V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14804      V1 = extract128BitVector(V1V2, 0, DAG, DL);
14805      V2 = extract128BitVector(V1V2, 4, DAG, DL);
14806      PackOpc = X86ISD::PACKUS;
14807    } else if (Subtarget.hasSSE41()) {
      // Clear the upper 16 bits of each dword so PACKUSDW won't saturate.
14808      SmallVector<SDValue, 4> DWordClearOps(4,
14809                                            DAG.getConstant(0, DL, MVT::i32));
14810      for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14811        DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14812      SDValue DWordClearMask =
14813          DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14814      V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14815                       DWordClearMask);
14816      V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14817                       DWordClearMask);
14818      PackOpc = X86ISD::PACKUS;
14819    } else if (!Subtarget.hasSSSE3()) {
      // Pre-SSE4.1: sign-extend each dword's low word (shl+sra by 16) so
      // PACKSS reproduces the low words exactly.
14820      SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14821      V1 = DAG.getBitcast(MVT::v4i32, V1);
14822      V2 = DAG.getBitcast(MVT::v4i32, V2);
14823      V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14824      V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14825      V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14826      V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14827      PackOpc = X86ISD::PACKSS;
14828    }
14829    if (PackOpc) {
14830      // Now pack things back together.
14831      SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14832      if (NumEvenDrops == 2) {
14833        Result = DAG.getBitcast(MVT::v4i32, Result);
14834        Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14835      }
14836      return Result;
14837    }
14838  }
14839
14840  // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14841  int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14842  if (NumOddDrops == 1) {
14843    bool HasSSE41 = Subtarget.hasSSE41();
    // Shift the odd words down into the even positions, then pack. With
    // SSE4.1 a logical shift + PACKUS suffices; otherwise an arithmetic
    // shift + PACKSS preserves the values through saturation.
14844    V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14845                     DAG.getBitcast(MVT::v4i32, V1),
14846                     DAG.getTargetConstant(16, DL, MVT::i8));
14847    V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14848                     DAG.getBitcast(MVT::v4i32, V2),
14849                     DAG.getTargetConstant(16, DL, MVT::i8));
14850    return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14851                       MVT::v8i16, V1, V2);
14852  }
14853
14854  // Try to lower by permuting the inputs into an unpack instruction.
14855  if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14856                                                      Mask, Subtarget, DAG))
14857    return Unpack;
14858
14859  // If we can't directly blend but can use PSHUFB, that will be better as it
14860  // can both shuffle and set up the inefficient blend.
14861  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14862    bool V1InUse, V2InUse;
14863    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14864                                        Zeroable, DAG, V1InUse, V2InUse);
14865  }
14866
14867  // We can always bit-blend if we have to so the fallback strategy is to
14868  // decompose into single-input permutes and blends/unpacks.
14869  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14870                                              Zeroable, Subtarget, DAG);
14871}
14872
14873/// Lower 8-lane 16-bit floating point shuffles.
///
/// With AVX512-FP16 a few native f16 lowerings (broadcast, single-element
/// insertion) are tried first; everything else is handled by bitcasting to
/// v8i16 and reusing the integer shuffle lowering, which is bit-exact for
/// half-precision data.
14875                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14876                                 const X86Subtarget &Subtarget,
14877                                 SelectionDAG &DAG) {
14878  assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14879  assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14880  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  // Count mask elements sourced from V2 (indices 8..15).
14881  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14882
14883  if (Subtarget.hasFP16()) {
14884    if (NumV2Elements == 0) {
14885      // Check for being able to broadcast a single element.
14886      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14887                                                      Mask, Subtarget, DAG))
14888        return Broadcast;
14889    }
14890    if (NumV2Elements == 1 && Mask[0] >= 8)
14892              DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14893        return V;
14894  }
14895
  // Fall back to the v8i16 lowering on the same bits.
14896  V1 = DAG.getBitcast(MVT::v8i16, V1);
14897  V2 = DAG.getBitcast(MVT::v8i16, V2);
14898  return DAG.getBitcast(MVT::v8f16,
14899                        DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14900}
14901
14902// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14903// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14904// the active subvector is extracted.
//
// The variable-permute control vector is built from the (possibly commuted)
// mask as a constant integer vector of the same element count as VT.
14906                                    ArrayRef<int> OriginalMask, SDValue V1,
14907                                    SDValue V2, const X86Subtarget &Subtarget,
14908                                    SelectionDAG &DAG) {
14909  // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14910  SmallVector<int, 32> Mask(OriginalMask);
14911  if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14912      !isShuffleFoldableLoad(V2)) {
14914    std::swap(V1, V2);
14915  }
14916
14917  MVT MaskVT = VT.changeTypeToInteger();
14918  SDValue MaskNode;
14919  MVT ShuffleVT = VT;
  // Without VLX the variable permutes only exist at 512 bits, so widen the
  // operands (and the mask) and shuffle in the wider type.
14920  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14921    V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14922    V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14923    ShuffleVT = V1.getSimpleValueType();
14924
14925    // Adjust mask to correct indices for the second input.
14926    int NumElts = VT.getVectorNumElements();
14927    unsigned Scale = 512 / VT.getSizeInBits();
14928    SmallVector<int, 32> AdjustedMask(Mask);
14929    for (int &M : AdjustedMask)
14930      if (NumElts <= M)
        // V2 elements now start Scale*NumElts into the concatenation.
14931        M += (Scale - 1) * NumElts;
14932    MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14933    MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14934  } else {
14935    MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14936  }
14937
14938  SDValue Result;
14939  if (V2.isUndef())
14940    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14941  else
14942    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14943
  // If we widened, pull the original-width result back out of the low lanes.
14944  if (VT != ShuffleVT)
14945    Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14946
14947  return Result;
14948}
14949
14950/// Generic lowering of v16i8 shuffles.
14951///
14952/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14953/// detect any complexity reducing interleaving. If that doesn't help, it uses
14954/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14955/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14956/// back together.
14958                                 const APInt &Zeroable, SDValue V1, SDValue V2,
14959                                 const X86Subtarget &Subtarget,
14960                                 SelectionDAG &DAG) {
14961  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14962  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14963  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14964
14965  // Try to use shift instructions.
14966  if (SDValue Shift =
14967          lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14968                              DAG, /*BitwiseOnly*/ false))
14969    return Shift;
14970
14971  // Try to use byte rotation instructions.
14972  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14973                                                Subtarget, DAG))
14974    return Rotate;
14975
14976  // Use dedicated pack instructions for masks that match their pattern.
14977  if (SDValue V =
14978          lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14979    return V;
14980
14981  // Try to use a zext lowering.
14982  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14983                                                   Zeroable, Subtarget, DAG))
14984    return ZExt;
14985
14986  // Try to use lower using a truncation.
14987  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14988                                        Subtarget, DAG))
14989    return V;
14990
14991  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14992                                       Subtarget, DAG))
14993    return V;
14994
14995  // See if we can use SSE4A Extraction / Insertion.
14996  if (Subtarget.hasSSE4A())
14997    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14998                                          Zeroable, DAG))
14999      return V;
15000
  // Count the mask elements that select from V2 (indices 16..31).
15001  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15002
15003  // For single-input shuffles, there are some nicer lowering tricks we can use.
15004  if (NumV2Elements == 0) {
15005    // Check for being able to broadcast a single element.
15006    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15007                                                    Mask, Subtarget, DAG))
15008      return Broadcast;
15009
15010    // Try to use bit rotation instructions.
15011    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15012                                                 Subtarget, DAG))
15013      return Rotate;
15014
15015    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15016      return V;
15017
15018    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15019    // Notably, this handles splat and partial-splat shuffles more efficiently.
15020    // However, it only makes sense if the pre-duplication shuffle simplifies
15021    // things significantly. Currently, this means we need to be able to
15022    // express the pre-duplication shuffle as an i16 shuffle.
15023    //
15024    // FIXME: We should check for other patterns which can be widened into an
15025    // i16 shuffle as well.
    // A mask is widenable when each byte pair either agrees or has an
    // undef half, so the pair can be served by a single duplicated byte.
15026    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15027      for (int i = 0; i < 16; i += 2)
15028        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15029          return false;
15030
15031      return true;
15032    };
15033    auto tryToWidenViaDuplication = [&]() -> SDValue {
15034      if (!canWidenViaDuplication(Mask))
15035        return SDValue();
      // Collect the distinct sorted inputs from the low and high halves.
15036      SmallVector<int, 4> LoInputs;
15037      copy_if(Mask, std::back_inserter(LoInputs),
15038              [](int M) { return M >= 0 && M < 8; });
15039      array_pod_sort(LoInputs.begin(), LoInputs.end());
15040      LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
15041      SmallVector<int, 4> HiInputs;
15042      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15043      array_pod_sort(HiInputs.begin(), HiInputs.end());
15044      HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
15045
      // Target whichever half already holds the majority of the inputs and
      // move the rest into it with a pre-duplication i16 shuffle.
15046      bool TargetLo = LoInputs.size() >= HiInputs.size();
15047      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15048      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15049
15050      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15052      for (int I : InPlaceInputs) {
15053        PreDupI16Shuffle[I/2] = I/2;
15054        LaneMap[I] = I;
15055      }
15056      int j = TargetLo ? 0 : 4, je = j + 4;
15057      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15058        // Check if j is already a shuffle of this input. This happens when
15059        // there are two adjacent bytes after we move the low one.
15060        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15061          // If we haven't yet mapped the input, search for a slot into which
15062          // we can map it.
15063          while (j < je && PreDupI16Shuffle[j] >= 0)
15064            ++j;
15065
15066          if (j == je)
15067            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15068            return SDValue();
15069
15070          // Map this input with the i16 shuffle.
15071          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15072        }
15073
15074        // Update the lane map based on the mapping we ended up with.
15075        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15076      }
15077      V1 = DAG.getBitcast(
15078          MVT::v16i8,
15079          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15080                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15081
15082      // Unpack the bytes to form the i16s that will be shuffled into place.
15083      bool EvenInUse = false, OddInUse = false;
15084      for (int i = 0; i < 16; i += 2) {
15085        EvenInUse |= (Mask[i + 0] >= 0);
15086        OddInUse |= (Mask[i + 1] >= 0);
15087        if (EvenInUse && OddInUse)
15088          break;
15089      }
15090      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15091                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15092                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15093
      // Build the post-duplication i16 shuffle from the recorded lane map.
15094      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15095      for (int i = 0; i < 16; ++i)
15096        if (Mask[i] >= 0) {
15097          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15098          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15099          if (PostDupI16Shuffle[i / 2] < 0)
15100            PostDupI16Shuffle[i / 2] = MappedMask;
15101          else
15102            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15103                   "Conflicting entries in the original shuffle!");
15104        }
15105      return DAG.getBitcast(
15106          MVT::v16i8,
15107          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15108                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15109    };
15110    if (SDValue V = tryToWidenViaDuplication())
15111      return V;
15112  }
15113
15114  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15115                                             Zeroable, Subtarget, DAG))
15116    return Masked;
15117
15118  // Use dedicated unpack instructions for masks that match their pattern.
15119  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15120    return V;
15121
15122  // Try to use byte shift instructions to mask.
15123  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15124                                              Zeroable, Subtarget, DAG))
15125    return V;
15126
15127  // Check for compaction patterns.
15128  bool IsSingleInput = V2.isUndef();
15129  int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
15130
15131  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15132  // with PSHUFB. It is important to do this before we attempt to generate any
15133  // blends but after all of the single-input lowerings. If the single input
15134  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15135  // want to preserve that and we can DAG combine any longer sequences into
15136  // a PSHUFB in the end. But once we start blending from multiple inputs,
15137  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15138  // and there are *very* few patterns that would actually be faster than the
15139  // PSHUFB approach because of its ability to zero lanes.
15140  //
15141  // If the mask is a binary compaction, we can more efficiently perform this
15142  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15143  //
15144  // FIXME: The only exceptions to the above are blends which are exact
15145  // interleavings with direct instructions supporting them. We currently don't
15146  // handle those well here.
15147  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15148    bool V1InUse = false;
15149    bool V2InUse = false;
15150
15152        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15153
15154    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15155    // do so. This avoids using them to handle blends-with-zero which is
15156    // important as a single pshufb is significantly faster for that.
15157    if (V1InUse && V2InUse) {
15158      if (Subtarget.hasSSE41())
15159        if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15160                                                Zeroable, Subtarget, DAG))
15161          return Blend;
15162
15163      // We can use an unpack to do the blending rather than an or in some
15164      // cases. Even though the or may be (very minorly) more efficient, we
15165      // preference this lowering because there are common cases where part of
15166      // the complexity of the shuffles goes away when we do the final blend as
15167      // an unpack.
15168      // FIXME: It might be worth trying to detect if the unpack-feeding
15169      // shuffles will both be pshufb, in which case we shouldn't bother with
15170      // this.
15172              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15173        return Unpack;
15174
15175      // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15176      if (Subtarget.hasVBMI())
15177        return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15178                                     DAG);
15179
15180      // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15181      if (Subtarget.hasXOP()) {
15182        SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15183        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15184      }
15185
15186      // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15187      // PALIGNR will be cheaper than the second PSHUFB+OR.
15189              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15190        return V;
15191    }
15192
15193    return PSHUFB;
15194  }
15195
15196  // There are special ways we can lower some single-element blends.
15197  if (NumV2Elements == 1)
15199            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15200      return V;
15201
15202  if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15203    return Blend;
15204
15205  // Check whether a compaction lowering can be done. This handles shuffles
15206  // which take every Nth element for some even N. See the helper function for
15207  // details.
15208  //
15209  // We special case these as they can be particularly efficiently handled with
15210  // the PACKUSB instruction on x86 and they show up in common patterns of
15211  // rearranging bytes to truncate wide elements.
15212  if (NumEvenDrops) {
15213    // NumEvenDrops is the power of two stride of the elements. Another way of
15214    // thinking about it is that we need to drop the even elements this many
15215    // times to get the original input.
15216
15217    // First we need to zero all the dropped bytes.
15218    assert(NumEvenDrops <= 3 &&
15219           "No support for dropping even elements more than 3 times.");
    // Build a v8i16 AND mask keeping only the low byte of every surviving
    // word so that PACKUS cannot saturate.
15220    SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15221    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15222      WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15223    SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15224    V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15225                     WordClearMask);
15226    if (!IsSingleInput)
15227      V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15228                       WordClearMask);
15229
15230    // Now pack things back together.
15231    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15232                                 IsSingleInput ? V1 : V2);
    // Each extra drop level needs one more self-pack to halve again.
15233    for (int i = 1; i < NumEvenDrops; ++i) {
15234      Result = DAG.getBitcast(MVT::v8i16, Result);
15235      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15236    }
15237    return Result;
15238  }
15239
  // Odd-element compaction: shift the odd bytes down into the even byte of
  // each word, then PACKUS the halves back together.
15240  int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15241  if (NumOddDrops == 1) {
15242    V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15243                     DAG.getBitcast(MVT::v8i16, V1),
15244                     DAG.getTargetConstant(8, DL, MVT::i8));
15245    if (!IsSingleInput)
15246      V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15247                       DAG.getBitcast(MVT::v8i16, V2),
15248                       DAG.getTargetConstant(8, DL, MVT::i8));
15249    return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15250                       IsSingleInput ? V1 : V2);
15251  }
15252
15253  // Handle multi-input cases by blending/unpacking single-input shuffles.
15254  if (NumV2Elements > 0)
15255    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15256                                                Zeroable, Subtarget, DAG);
15257
15258  // The fallback path for single-input shuffles widens this into two v8i16
15259  // vectors with unpacks, shuffles those, and then pulls them back together
15260  // with a pack.
15261  SDValue V = V1;
15262
  // Split the 16-byte mask into two 8-element word-level blend masks.
15263  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15264  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15265  for (int i = 0; i < 16; ++i)
15266    if (Mask[i] >= 0)
15267      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15268
15269  SDValue VLoHalf, VHiHalf;
15270  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15271  // them out and avoid using UNPCK{L,H} to extract the elements of V as
15272  // i16s.
15273  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15274      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15275    // Use a mask to drop the high bytes.
15276    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15277    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15278                          DAG.getConstant(0x00FF, DL, MVT::v8i16));
15279
15280    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15281    VHiHalf = DAG.getUNDEF(MVT::v8i16);
15282
15283    // Squash the masks to point directly into VLoHalf.
15284    for (int &M : LoBlendMask)
15285      if (M >= 0)
15286        M /= 2;
15287    for (int &M : HiBlendMask)
15288      if (M >= 0)
15289        M /= 2;
15290  } else {
15291    // Otherwise just unpack the low half of V into VLoHalf and the high half into
15292    // VHiHalf so that we can blend them as i16s.
15293    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15294
15295    VLoHalf = DAG.getBitcast(
15296        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15297    VHiHalf = DAG.getBitcast(
15298        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15299  }
15300
  // Shuffle each widened half at i16 granularity, then PACKUS discards the
  // zeroed high bytes and recombines into the final v16i8 result.
15301  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15302  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15303
15304  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15305}
15306
/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // There is no dedicated bf16 shuffle lowering; bitcast to v8i16 (same bit
  // width and element count) and reuse the integer shuffle lowering.
  if (VT == MVT::v8bf16) {
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
  }

  // Dispatch on the concrete 128-bit type; each routine exploits
  // type-specific instruction-selection tricks.
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f16:
    return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}
15343
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
///
/// If \p SimpleOnly is set, fail (return an empty SDValue) unless each half of
/// the mask can be lowered while reading only the low halves of the inputs.
                                    SDValue V2, ArrayRef<int> Mask,
                                    SelectionDAG &DAG, bool SimpleOnly) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
  if (VT == MVT::v8f32) {
    SDValue BC1 = peekThroughBitcasts(V1);
    SDValue BC2 = peekThroughBitcasts(V2);
    if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
      if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
                                               DAG, SimpleOnly))
        return DAG.getBitcast(VT, Split);
    }
  }

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

  // Use splitVector/extractSubVector so that split build-vectors just build two
  // narrower build vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    SDValue LoV, HiV;
    std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.

  // Determine which of the four input half-vectors (lo/hi of V1/V2) a
  // half-width mask actually reads.
  auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
                                   bool &UseHiV1, bool &UseLoV2,
                                   bool &UseHiV2) {
    UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
      }
    }
  };

  // In SimpleOnly mode, reject any half blend that would need to extract a
  // high half of either input.
  auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
    if (!SimpleOnly)
      return true;

    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

    return !(UseHiV1 || UseHiV2);
  };

  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.
    assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };

  if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
    return SDValue();

  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
15480
/// Either split a vector in halves or decompose the shuffles and the
/// blend/unpack.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
                                          SDValue V2, ArrayRef<int> Mask,
                                          const APInt &Zeroable,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  // Returns true if every defined mask element reads a single (equivalent)
  // element from each of V1 and V2.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if ((M - Size) != V2BroadcastIdx &&
                 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx &&
                 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
                                                Subtarget, DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                /*SimpleOnly*/ false);

  // Without AVX2, if we can freely split the subvectors then we're better off
  // performing half width shuffles.
  if (!Subtarget.hasAVX2()) {
    SDValue BC1 = peekThroughBitcasts(V1);
    SDValue BC2 = peekThroughBitcasts(V2);
    bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
                          DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
    bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
                          DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
    if (SplatOrSplitV1 && SplatOrSplitV2)
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                  /*SimpleOnly*/ false);
  }

  // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
  // requires that the decomposed single-input shuffles don't end up here.
  return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
                                              Subtarget, DAG);
}
15557
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
                                                 SDValue V1, SDValue V2,
                                                 ArrayRef<int> Mask,
                                                 SelectionDAG &DAG) {
  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");

  // Per-result-element masks for the two lane-permute operands, plus the
  // final SHUFPD element-selection mask.
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  int SHUFPDMask[4] = {-1, -1, -1, -1};

  // As SHUFPD uses a single LHS/RHS element per lane, we can always
  // perform the shuffle once the lanes have been shuffled in place.
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Even result elements come from LHS, odd ones from RHS. Within the
    // destination lane, place the source element in the slot matching its
    // own parity so SHUFPD can select it with bit (M & 1).
    int LaneBase = i & ~1;
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPDMask[i] = M & 1;
  }

  SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
  SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
                     getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
}
15587
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  // Sub-128-bit-lane permutes are only attempted with AVX2 and a single input.
  bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();

  /// Attempts to find a sublane permute with the given size
  /// that gets all elements into their target lanes.
  ///
  /// If successful, fills CrossLaneMask and InLaneMask and returns true.
  /// If unsuccessful, returns false and may overwrite InLaneMask.
  auto getSublanePermute = [&](int NumSublanes) -> SDValue {
    int NumSublanesPerLane = NumSublanes / NumLanes;
    int NumEltsPerSublane = NumElts / NumSublanes;

    SmallVector<int, 16> CrossLaneMask;
    SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
    // CrossLaneMask but one entry == one sublane.
    SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
    // Tracks which cross-lane results the in-lane mask actually reads.
    APInt DemandedCrossLane = APInt::getZero(NumElts);

    for (int i = 0; i != NumElts; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;

      int SrcSublane = M / NumEltsPerSublane;
      int DstLane = i / NumEltsPerLane;

      // We only need to get the elements into the right lane, not sublane.
      // So search all sublanes that make up the destination lane.
      bool Found = false;
      int DstSubStart = DstLane * NumSublanesPerLane;
      int DstSubEnd = DstSubStart + NumSublanesPerLane;
      for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
        if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
          continue;

        Found = true;
        CrossLaneMaskLarge[DstSublane] = SrcSublane;
        int DstSublaneOffset = DstSublane * NumEltsPerSublane;
        InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
        DemandedCrossLane.setBit(InLaneMask[i]);
        break;
      }
      if (!Found)
        return SDValue();
    }

    // Fill CrossLaneMask using CrossLaneMaskLarge.
    narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);

    if (!CanUseSublanes) {
      // If we're only shuffling a single lowest lane and the rest are identity
      // then don't bother.
      // TODO - isShuffleMaskInputInPlace could be extended to something like
      // this.
      int NumIdentityLanes = 0;
      bool OnlyShuffleLowestLane = true;
      for (int i = 0; i != NumLanes; ++i) {
        int LaneOffset = i * NumEltsPerLane;
        if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
                                       i * NumEltsPerLane))
          NumIdentityLanes++;
        else if (CrossLaneMask[LaneOffset] != 0)
          OnlyShuffleLowestLane = false;
      }
      if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
        return SDValue();
    }

    // Simplify CrossLaneMask based on the actual demanded elements.
    if (V1.hasOneUse())
      for (int i = 0; i != NumElts; ++i)
        if (!DemandedCrossLane[i])
          CrossLaneMask[i] = SM_SentinelUndef;

    // Avoid returning the same shuffle operation. For example,
    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
    // undef:v16i16
    if (CrossLaneMask == Mask || InLaneMask == Mask)
      return SDValue();

    SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
    return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
                                InLaneMask);
  };

  // First attempt a solution with full lanes.
  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
    return V;

  // The rest of the solutions use sublanes.
  if (!CanUseSublanes)
    return SDValue();

  // Then attempt a solution with 64-bit sublanes (vpermq).
  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
    return V;

  // If that doesn't work and we have fast variable cross-lane shuffle,
  // attempt 32-bit sublanes (vpermd).
  if (!Subtarget.hasFastVariableCrossLaneShuffle())
    return SDValue();

  return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
15705
15706/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
15707static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15708 SmallVector<int> &InLaneMask) {
15709 int Size = Mask.size();
15710 InLaneMask.assign(Mask.begin(), Mask.end());
15711 for (int i = 0; i < Size; ++i) {
15712 int &M = InLaneMask[i];
15713 if (M < 0)
15714 continue;
15715 if (((M % Size) / LaneSize) != (i / LaneSize))
15716 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15717 }
15718}
15719
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
/// This lowering strategy results in four instructions in the worst case for a
/// single-input cross lane shuffle which is lower than any other fully general
/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
/// shuffle pattern should be handled prior to trying this lowering.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // Only do this if the elements aren't all from the lower lane,
  // otherwise we're (probably) better off doing a split.
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool AllLanes;
  if (!Subtarget.hasAVX2()) {
    // Without AVX2 only count lanes whose elements actually cross lanes.
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneCrossing[0] && LaneCrossing[1];
  } else {
    // With AVX2 it is enough that both source lanes are referenced at all.
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneUsed[0] && LaneUsed[1];
  }

  // TODO - we could support shuffling V2 in the Flipped input.
  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int> InLaneMask;
  computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
         "In-lane shuffle mask expected");

  // If we're not using both lanes in each lane and the inlane mask is not
  // repeating, then we're better off splitting.
  if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                /*SimpleOnly*/ false);

  // Flip the lanes, and shuffle the results which should now be in-lane.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped =
      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
15784
/// Handle lowering 2-lane 128-bit shuffles.
///
/// The (widened) mask indexes 128-bit halves of the two inputs; \p Zeroable
/// marks result elements known to be zero.
                                  SDValue V2, ArrayRef<int> Mask,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  if (V2.isUndef()) {
    // Attempt to match VBROADCAST*128 subvector broadcast load.
    bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
    bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
    if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
      MVT MemVT = VT.getHalfNumVectorElementsVT();
      unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
                                            VT, MemVT, Ld, Ofs, DAG))
        return BcstLd;
    }

    // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
    if (Subtarget.hasAVX2())
      return SDValue();
  }

  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
    return SDValue();

  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;

  // Try to use an insert into a zero vector.
  if (WidenedMask[0] == 0 && IsHighZero) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getVectorIdxConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getVectorIdxConstant(0, DL));
  }

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // the zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                          Subtarget, DAG))
    return Blend;

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsLowZero && !IsHighZero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
    if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
        SDValue SubVec =
            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
                        DAG.getVectorIdxConstant(0, DL));
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                           DAG.getVectorIdxConstant(2, DL));
      }
    }

    // Try to use SHUF128 if possible.
    if (Subtarget.hasVLX()) {
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.

  // The immediate permute control byte looks like this:
  // [1:0] - select 128 bits from sources for low half of destination
  // [2] - ignore
  // [3] - zero low half of destination
  // [5:4] - select 128 bits from sources for high half of destination
  // [6] - ignore
  // [7] - zero high half of destination

  assert((WidenedMask[0] >= 0 || IsLowZero) &&
         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

  // Check the immediate mask and replace unused sources with undef.
  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
    V1 = DAG.getUNDEF(VT);
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
    V2 = DAG.getUNDEF(VT);

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
15898
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = 128 / VT.getScalarSizeInBits();
  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
  // For each destination lane: the one or two source lanes it reads from.
  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

  // First pass will try to fill in the RepeatMask from lanes that need two
  // sources.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;
      // Determine which of the possible input lanes (NumLanes from each source)
      // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out
      // sources we can't do anything.
      int LaneSrc = M / NumLaneElts;
      int Src;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
        Src = 0;
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
        Src = 1;
      else
        return SDValue();

      Srcs[Src] = LaneSrc;
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    }

    // If this lane has two sources, see if it fits with the repeat mask so far.
    if (Srcs[1] < 0)
      continue;

    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];

    // Two masks are compatible when no position has conflicting defined
    // elements.
    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
      assert(M1.size() == M2.size() && "Unexpected mask size");
      for (int i = 0, e = M1.size(); i != e; ++i)
        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
          return false;
      return true;
    };

    // Fold the defined elements of Mask into MergedMask (which must agree).
    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
               "Unexpected mask element");
        MergedMask[i] = M;
      }
    };

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Didn't find a match. Swap the operands and try again.
    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Couldn't find a match with the operands in either order.
    return SDValue();
  }

  // Now handle any lanes with only one source.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // If this lane has already been processed, skip it.
    if (LaneSrcs[Lane][0] >= 0)
      continue;

    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;

      // If RepeatMask isn't defined yet we can define it ourself.
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;

      if (RepeatMask[i] < NumElts) {
        if (RepeatMask[i] != M % NumLaneElts)
          return SDValue();
        LaneSrcs[Lane][0] = M / NumLaneElts;
      } else {
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
          return SDValue();
        LaneSrcs[Lane][1] = M / NumLaneElts;
      }
    }

    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
      return SDValue();
  }

  // Build the first lane-fixing shuffle: move each destination lane's first
  // source lane into place.
  SmallVector<int, 16> NewMask(NumElts, -1);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV1) &&
      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
    return SDValue();

  // Likewise for each destination lane's second source lane.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV2) &&
      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
    return SDValue();

  // Finally, apply the repeated in-lane mask to the lane-fixed operands.
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0) {
      NewMask[i] = -1;
      continue;
    }
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
      continue;

    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
  }
  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
16071
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
/// lower half of each input operand is accessed.
static bool
                   int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");

  // Exactly one half of the result must be undef to allow narrowing.
  bool UndefLower = isUndefLowerHalf(Mask);
  bool UndefUpper = isUndefUpperHalf(Mask);
  if (UndefLower == UndefUpper)
    return false;

  unsigned HalfNumElts = HalfMask.size();
  // Only the defined half of the mask is examined from here on.
  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
  HalfIdx1 = -1;
  HalfIdx2 = -1;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + MaskIndexOffset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      // Second distinct half: bias the index to select the second operand.
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return false;
  }

  return true;
}
16127
16128/// Given the output values from getHalfShuffleMask(), create a half width
16129/// shuffle of extracted vectors followed by an insert back to full width.
// NOTE(review): line 16130 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,`
// — confirm against llvm-project.
16131 ArrayRef<int> HalfMask, int HalfIdx1,
16132 int HalfIdx2, bool UndefLower,
16133 SelectionDAG &DAG, bool UseConcat = false) {
16134 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16135 assert(V1.getValueType().isSimple() && "Expecting only simple types");
16136
16137 MVT VT = V1.getSimpleValueType();
16138 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16139 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16140
// Map a half-index (0/1 = lower/upper of V1, 2/3 = lower/upper of V2, -1 =
// undef) to the corresponding extracted subvector.
16141 auto getHalfVector = [&](int HalfIdx) {
16142 if (HalfIdx < 0)
16143 return DAG.getUNDEF(HalfVT);
16144 SDValue V = (HalfIdx < 2 ? V1 : V2);
16145 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16147 DAG.getVectorIdxConstant(HalfIdx, DL));
16148 };
16149
16150 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16151 SDValue Half1 = getHalfVector(HalfIdx1);
16152 SDValue Half2 = getHalfVector(HalfIdx2);
16153 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16154 if (UseConcat) {
// Build the result as a concat of the shuffled half with undef, placing the
// live half high or low depending on which half of the result is undef.
16155 SDValue Op0 = V;
16156 SDValue Op1 = DAG.getUNDEF(HalfVT);
16157 if (UndefLower)
16158 std::swap(Op0, Op1);
16159 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16160 }
16161
16162 unsigned Offset = UndefLower ? HalfNumElts : 0;
16163 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
// NOTE(review): line 16164 is missing here; upstream it supplies the final
// insert index, `DAG.getVectorIdxConstant(Offset, DL));` — confirm.
16165}
16166
16167/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16168/// This allows for fast cases such as subvector extraction/insertion
16169/// or shuffling smaller vector types which can lower more efficiently.
// NOTE(review): line 16170 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,`
// — confirm against llvm-project.
16171 SDValue V2, ArrayRef<int> Mask,
16172 const X86Subtarget &Subtarget,
16173 SelectionDAG &DAG) {
16174 assert((VT.is256BitVector() || VT.is512BitVector()) &&
16175 "Expected 256-bit or 512-bit vector");
16176
16177 bool UndefLower = isUndefLowerHalf(Mask);
16178 if (!UndefLower && !isUndefUpperHalf(Mask))
16179 return SDValue();
16180
16181 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16182 "Completely undef shuffle mask should have been simplified already");
16183
16184 // Upper half is undef and lower half is whole upper subvector.
16185 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16186 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16187 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16188 if (!UndefLower &&
16189 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16190 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16191 DAG.getVectorIdxConstant(HalfNumElts, DL));
16192 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16193 DAG.getVectorIdxConstant(0, DL));
16194 }
16195
16196 // Lower half is undef and upper half is whole lower subvector.
16197 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16198 if (UndefLower &&
16199 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16200 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16201 DAG.getVectorIdxConstant(0, DL))
16202 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16203 DAG.getVectorIdxConstant(HalfNumElts, DL));
16204 }
16205
// See if the live half can be expressed as a narrow shuffle of at most two
// extracted half vectors.
16206 int HalfIdx1, HalfIdx2;
16207 SmallVector<int, 8> HalfMask(HalfNumElts);
16208 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16209 return SDValue();
16210
16211 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16212
16213 // Only shuffle the halves of the inputs when useful.
16214 unsigned NumLowerHalves =
16215 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16216 unsigned NumUpperHalves =
16217 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16218 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16219
16220 // Determine the larger pattern of undef/halves, then decide if it's worth
16221 // splitting the shuffle based on subtarget capabilities and types.
16222 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16223 if (!UndefLower) {
16224 // XXXXuuuu: no insert is needed.
16225 // Always extract lowers when setting lower - these are all free subreg ops.
16226 if (NumUpperHalves == 0)
16227 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16228 UndefLower, DAG);
16229
16230 if (NumUpperHalves == 1) {
16231 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16232 if (Subtarget.hasAVX2()) {
16233 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16234 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16235 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16236 (!isSingleSHUFPSMask(HalfMask) ||
16237 Subtarget.hasFastVariableCrossLaneShuffle()))
16238 return SDValue();
16239 // If this is an unary shuffle (assume that the 2nd operand is
16240 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16241 // are better off extracting the upper half of 1 operand and using a
16242 // narrow shuffle.
16243 if (EltWidth == 64 && V2.isUndef())
16244 return SDValue();
16245 // If this is an unary vXi8 shuffle with inplace halves, then perform as
16246 // full width pshufb, and then merge.
16247 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16248 return SDValue();
16249 }
16250 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16251 if (Subtarget.hasAVX512() && VT.is512BitVector())
16252 return SDValue();
16253 // Extract + narrow shuffle is better than the wide alternative.
16254 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16255 UndefLower, DAG);
16256 }
16257
16258 // Don't extract both uppers, instead shuffle and then extract.
16259 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16260 return SDValue();
16261 }
16262
16263 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16264 if (NumUpperHalves == 0) {
16265 // AVX2 has efficient 64-bit element cross-lane shuffles.
16266 // TODO: Refine to account for unary shuffle, splat, and other masks?
16267 if (Subtarget.hasAVX2() && EltWidth == 64)
16268 return SDValue();
16269 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16270 if (Subtarget.hasAVX512() && VT.is512BitVector())
16271 return SDValue();
16272 // Narrow shuffle + insert is better than the wide alternative.
16273 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16274 UndefLower, DAG);
16275 }
16276
16277 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16278 return SDValue();
16279}
16280
16281/// Handle case where shuffle sources are coming from the same 128-bit lane and
16282/// every lane can be represented as the same repeating mask - allowing us to
16283/// shuffle the sources with the repeating shuffle and then permute the result
16284/// to the destination lanes.
// NOTE(review): line 16285 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(` — confirm
// against llvm-project.
16286 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16287 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16288 int NumElts = VT.getVectorNumElements();
16289 int NumLanes = VT.getSizeInBits() / 128;
16290 int NumLaneElts = NumElts / NumLanes;
16291
16292 // On AVX2 we may be able to just shuffle the lowest elements and then
16293 // broadcast the result.
16294 if (Subtarget.hasAVX2()) {
16295 for (unsigned BroadcastSize : {16, 32, 64}) {
16296 if (BroadcastSize <= VT.getScalarSizeInBits())
16297 continue;
16298 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16299
16300 // Attempt to match a repeating pattern every NumBroadcastElts,
16301 // accounting for UNDEFs but only references the lowest 128-bit
16302 // lane of the inputs.
16303 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16304 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16305 for (int j = 0; j != NumBroadcastElts; ++j) {
16306 int M = Mask[i + j];
16307 if (M < 0)
16308 continue;
16309 int &R = RepeatMask[j];
// Reject sources outside the lowest 128-bit lane of either input, and any
// conflicting (non-repeating) element.
16310 if (0 != ((M % NumElts) / NumLaneElts))
16311 return false;
16312 if (0 <= R && R != M)
16313 return false;
16314 R = M;
16315 }
16316 return true;
16317 };
16318
16319 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16320 if (!FindRepeatingBroadcastMask(RepeatMask))
16321 continue;
16322
16323 // Shuffle the (lowest) repeated elements in place for broadcast.
16324 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16325
16326 // Shuffle the actual broadcast.
16327 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16328 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16329 for (int j = 0; j != NumBroadcastElts; ++j)
16330 BroadcastMask[i + j] = j;
16331
16332 // Avoid returning the same shuffle operation. For example,
16333 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16334 if (BroadcastMask == Mask)
16335 return SDValue();
16336
16337 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16338 BroadcastMask);
16339 }
16340 }
16341
16342 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16343 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16344 return SDValue();
16345
16346 // Bail if we already have a repeated lane shuffle mask.
16347 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16348 return SDValue();
16349
16350 // Helper to look for repeated mask in each split sublane, and that those
16351 // sublanes can then be permuted into place.
16352 auto ShuffleSubLanes = [&](int SubLaneScale) {
16353 int NumSubLanes = NumLanes * SubLaneScale;
16354 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16355
16356 // Check that all the sources are coming from the same lane and see if we
16357 // can form a repeating shuffle mask (local to each sub-lane). At the same
16358 // time, determine the source sub-lane for each destination sub-lane.
16359 int TopSrcSubLane = -1;
16360 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16361 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16362 SubLaneScale,
16363 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16364
16365 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16366 // Extract the sub-lane mask, check that it all comes from the same lane
16367 // and normalize the mask entries to come from the first lane.
16368 int SrcLane = -1;
16369 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16370 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16371 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16372 if (M < 0)
16373 continue;
16374 int Lane = (M % NumElts) / NumLaneElts;
16375 if ((0 <= SrcLane) && (SrcLane != Lane))
16376 return SDValue();
16377 SrcLane = Lane;
// LocalM keeps the V1/V2 distinction (second operand offset by NumElts)
// while folding the element index down into lane 0.
16378 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16379 SubLaneMask[Elt] = LocalM;
16380 }
16381
16382 // Whole sub-lane is UNDEF.
16383 if (SrcLane < 0)
16384 continue;
16385
16386 // Attempt to match against the candidate repeated sub-lane masks.
16387 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Masks are compatible if they agree on every element both define.
16388 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16389 for (int i = 0; i != NumSubLaneElts; ++i) {
16390 if (M1[i] < 0 || M2[i] < 0)
16391 continue;
16392 if (M1[i] != M2[i])
16393 return false;
16394 }
16395 return true;
16396 };
16397
16398 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16399 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16400 continue;
16401
16402 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16403 for (int i = 0; i != NumSubLaneElts; ++i) {
16404 int M = SubLaneMask[i];
16405 if (M < 0)
16406 continue;
16407 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16408 "Unexpected mask element");
16409 RepeatedSubLaneMask[i] = M;
16410 }
16411
16412 // Track the top most source sub-lane - by setting the remaining to
16413 // UNDEF we can greatly simplify shuffle matching.
16414 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16415 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16416 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16417 break;
16418 }
16419
16420 // Bail if we failed to find a matching repeated sub-lane mask.
16421 if (Dst2SrcSubLanes[DstSubLane] < 0)
16422 return SDValue();
16423 }
16424 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16425 "Unexpected source lane");
16426
16427 // Create a repeating shuffle mask for the entire vector.
16428 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16429 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16430 int Lane = SubLane / SubLaneScale;
16431 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16432 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16433 int M = RepeatedSubLaneMask[Elt];
16434 if (M < 0)
16435 continue;
16436 int Idx = (SubLane * NumSubLaneElts) + Elt;
16437 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16438 }
16439 }
16440
16441 // Shuffle each source sub-lane to its destination.
16442 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16443 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16444 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16445 if (SrcSubLane < 0)
16446 continue;
16447 for (int j = 0; j != NumSubLaneElts; ++j)
16448 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16449 }
16450
16451 // Avoid returning the same shuffle operation.
16452 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16453 if (RepeatedMask == Mask || SubLaneMask == Mask)
16454 return SDValue();
16455
16456 SDValue RepeatedShuffle =
16457 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16458
16459 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16460 SubLaneMask);
16461 };
16462
16463 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16464 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16465 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16466 // Otherwise we can only permute whole 128-bit lanes.
16467 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16468 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16469 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16470 MinSubLaneScale = 2;
16471 MaxSubLaneScale =
16472 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16473 }
16474 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16475 MinSubLaneScale = MaxSubLaneScale = 4;
16476
// Try the cheapest (coarsest) sub-lane granularity first.
16477 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16478 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16479 return Shuffle;
16480
16481 return SDValue();
16482}
16483
// NOTE(review): line 16484 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,` —
// confirm against llvm-project.
16485 bool &ForceV1Zero, bool &ForceV2Zero,
16486 unsigned &ShuffleImm, ArrayRef<int> Mask,
16487 const APInt &Zeroable) {
16488 int NumElts = VT.getVectorNumElements();
16489 assert(VT.getScalarSizeInBits() == 64 &&
16490 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16491 "Unexpected data type for VSHUFPD");
16492 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16493 "Illegal shuffle mask");
16494
// ZeroLane[0]/[1] track whether every even/odd result element is zeroable,
// in which case that whole operand can be replaced with a zero vector.
16495 bool ZeroLane[2] = { true, true };
16496 for (int i = 0; i < NumElts; ++i)
16497 ZeroLane[i & 1] &= Zeroable[i];
16498
16499 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16500 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
16501 bool IsSHUFPD = true;
16502 bool IsCommutable = true;
16503 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16504 for (int i = 0; i < NumElts; ++i) {
16505 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16506 continue;
16507 if (Mask[i] < 0)
16508 return false;
// Val is the allowed element pair when even results read V1 and odd results
// read V2; CommutVal is the pair for the operand-swapped form.
16509 int Val = (i & 6) + NumElts * (i & 1);
16510 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16511 if (Mask[i] < Val || Mask[i] > Val + 1)
16512 IsSHUFPD = false;
16513 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16514 IsCommutable = false;
// The SHUFPD immediate bit selects the low/high element of the pair.
16515 SHUFPDMask[i] = Mask[i] % 2;
16516 }
16517
16518 if (!IsSHUFPD && !IsCommutable)
16519 return false;
16520
16521 if (!IsSHUFPD && IsCommutable)
16522 std::swap(V1, V2);
16523
16524 ForceV1Zero = ZeroLane[0];
16525 ForceV2Zero = ZeroLane[1];
16526 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16527 return true;
16528}
16529
// NOTE(review): line 16530 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,`
// — confirm against llvm-project.
16531 SDValue V2, ArrayRef<int> Mask,
16532 const APInt &Zeroable,
16533 const X86Subtarget &Subtarget,
16534 SelectionDAG &DAG) {
16535 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16536 "Unexpected data type for VSHUFPD");
16537
// Attempt to match a (possibly commuted) SHUFPD pattern; on success the
// immediate and per-operand forced-zero flags are filled in.
16538 unsigned Immediate = 0;
16539 bool ForceV1Zero = false, ForceV2Zero = false;
16540 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16541 Mask, Zeroable))
16542 return SDValue();
16543
16544 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16545 if (ForceV1Zero)
16546 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16547 if (ForceV2Zero)
16548 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16549
16550 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16551 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16552}
16553
16554// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16555// by zeroable elements in the remaining 24 elements. Turn this into two
16556// vmovqb instructions shuffled together.
// NOTE(review): line 16557 (hyperlinked identifier) is missing from this
// rendering; upstream it carries the function name and leading parameters
// (a v32i8 VTRUNC+unpack lowering helper) — confirm against llvm-project.
16558 SDValue V1, SDValue V2,
16559 ArrayRef<int> Mask,
16560 const APInt &Zeroable,
16561 SelectionDAG &DAG) {
16562 assert(VT == MVT::v32i8 && "Unexpected type!");
16563
16564 // The first 8 indices should be every 8th element.
16565 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16566 return SDValue();
16567
16568 // Remaining elements need to be zeroable.
16569 if (Zeroable.countl_one() < (Mask.size() - 8))
16570 return SDValue();
16571
// Truncate each input's 64-bit elements down to bytes (vmovqb-style).
16572 V1 = DAG.getBitcast(MVT::v4i64, V1);
16573 V2 = DAG.getBitcast(MVT::v4i64, V2);
16574
16575 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16576 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16577
16578 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16579 // the upper bits of the result using an unpckldq.
16580 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16581 { 0, 1, 2, 3, 16, 17, 18, 19,
16582 4, 5, 6, 7, 20, 21, 22, 23 });
16583 // Insert the unpckldq into a zero vector to widen to v32i8.
16584 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16585 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16586 DAG.getVectorIdxConstant(0, DL));
16587}
16588
16589// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16590// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16591// =>
16592// ul = unpckl v1, v2
16593// uh = unpckh v1, v2
16594// a = vperm ul, uh
16595// b = vperm ul, uh
16596//
16597// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16598// and permute. We cannot directly match v3 because it is split into two
16599// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16600// pair of 256-bit shuffles and makes sure the masks are consecutive.
16601//
16602// Once unpck and permute nodes are created, the permute corresponding to this
16603// shuffle is returned, while the other permute replaces the other half of the
16604// shuffle in the selection dag.
// NOTE(review): line 16605 (hyperlinked identifier) is missing from this
// rendering; upstream it carries the function name and leading parameters
// (the interleave-pair UNPCK+permute lowering helper) — confirm against
// llvm-project.
16606 SDValue V1, SDValue V2,
16607 ArrayRef<int> Mask,
16608 SelectionDAG &DAG) {
16609 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16610 VT != MVT::v32i8)
16611 return SDValue();
16612 // <B0, B1, B0+1, B1+1, ..., >
16613 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16614 unsigned Begin1) {
16615 size_t Size = Mask.size();
16616 assert(Size % 2 == 0 && "Expected even mask size");
16617 for (unsigned I = 0; I < Size; I += 2) {
16618 if (Mask[I] != (int)(Begin0 + I / 2) ||
16619 Mask[I + 1] != (int)(Begin1 + I / 2))
16620 return false;
16621 }
16622 return true;
16623 };
16624 // Check which half is this shuffle node
16625 int NumElts = VT.getVectorNumElements();
16626 size_t FirstQtr = NumElts / 2;
16627 size_t ThirdQtr = NumElts + NumElts / 2;
16628 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16629 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16630 if (!IsFirstHalf && !IsSecondHalf)
16631 return SDValue();
16632
16633 // Find the intersection between shuffle users of V1 and V2.
16634 SmallVector<SDNode *, 2> Shuffles;
16635 for (SDNode *User : V1->users())
16636 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16637 User->getOperand(1) == V2)
16638 Shuffles.push_back(User);
16639 // Limit user size to two for now.
16640 if (Shuffles.size() != 2)
16641 return SDValue();
16642 // Find out which half of the 512-bit shuffles is each smaller shuffle
16643 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16644 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16645 SDNode *FirstHalf;
16646 SDNode *SecondHalf;
16647 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16648 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16649 FirstHalf = Shuffles[0];
16650 SecondHalf = Shuffles[1];
16651 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16652 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16653 FirstHalf = Shuffles[1];
16654 SecondHalf = Shuffles[0];
16655 } else {
16656 return SDValue();
16657 }
16658 // Lower into unpck and perm. Return the perm of this shuffle and replace
16659 // the other.
// VPERM2X128 imm 0x20 selects the low 128-bit halves of both operands,
// 0x31 selects the high halves.
16660 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16661 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16662 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16663 DAG.getTargetConstant(0x20, DL, MVT::i8));
16664 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16665 DAG.getTargetConstant(0x31, DL, MVT::i8));
16666 if (IsFirstHalf) {
16667 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16668 return Perm1;
16669 }
16670 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16671 return Perm2;
16672}
16673
16674/// Handle lowering of 4-lane 64-bit floating point shuffles.
16675///
16676/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16677/// isn't available.
// NOTE(review): line 16678 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,` —
// confirm against llvm-project.
16679 const APInt &Zeroable, SDValue V1, SDValue V2,
16680 const X86Subtarget &Subtarget,
16681 SelectionDAG &DAG) {
16682 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16683 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16684 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16685
16686 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16687 Subtarget, DAG))
16688 return V;
16689
16690 if (V2.isUndef()) {
16691 // Check for being able to broadcast a single element.
16692 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16693 Mask, Subtarget, DAG))
16694 return Broadcast;
16695
16696 // Use low duplicate instructions for masks that match their pattern.
16697 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16698 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16699
16700 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16701 // Non-half-crossing single input shuffles can be lowered with an
16702 // interleaved permutation.
16703 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16704 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16705 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16706 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16707 }
16708
16709 // With AVX2 we have direct support for this permutation.
16710 if (Subtarget.hasAVX2())
16711 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16712 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16713
16714 // Try to create an in-lane repeating shuffle mask and then shuffle the
16715 // results into the target lanes.
// NOTE(review): line 16716 is missing here; upstream it is most likely
// `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(` — confirm.
16717 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16718 return V;
16719
16720 // Try to permute the lanes and then use a per-lane permute.
16721 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16722 Mask, DAG, Subtarget))
16723 return V;
16724
16725 // Otherwise, fall back.
16726 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16727 DAG, Subtarget);
16728 }
16729
16730 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16731 Zeroable, Subtarget, DAG))
16732 return Blend;
16733
16734 // Use dedicated unpack instructions for masks that match their pattern.
16735 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16736 return V;
16737
16738 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16739 Zeroable, Subtarget, DAG))
16740 return Op;
16741
16742 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16743 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16744 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16745 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16746
16747 // If we have lane crossing shuffles AND they don't all come from the lower
16748 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16749 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16750 // canonicalize to a blend of splat which isn't necessary for this combine.
16751 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16752 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16753 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16754 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16755 (!Subtarget.hasAVX2() ||
16756 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16757 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16758
16759 // If we have one input in place, then we can permute the other input and
16760 // blend the result.
16761 if (V1IsInPlace || V2IsInPlace)
16762 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16763 Zeroable, Subtarget, DAG);
16764
16765 // Try to create an in-lane repeating shuffle mask and then shuffle the
16766 // results into the target lanes.
// NOTE(review): line 16767 is missing here; upstream it is most likely
// `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(` — confirm.
16768 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16769 return V;
16770
16771 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16772 // shuffle. However, if we have AVX2 and either inputs are already in place,
16773 // we will be able to shuffle even across lanes the other input in a single
16774 // instruction so skip this pattern.
16775 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
// NOTE(review): line 16776 is missing here; upstream it is most likely
// `if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(` — confirm.
16777 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779
16780 // If we have VLX support, we can use VEXPAND.
16781 if (Subtarget.hasVLX())
16782 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16783 Zeroable, Subtarget, DAG))
16784 return V;
16785
16786 // If we have AVX2 then we always want to lower with a blend because an v4 we
16787 // can fully permute the elements.
16788 if (Subtarget.hasAVX2())
16789 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16790 Zeroable, Subtarget, DAG);
16791
16792 // Otherwise fall back on generic lowering.
16793 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16794 Subtarget, DAG);
16795}
16796
16797/// Handle lowering of 4-lane 64-bit integer shuffles.
16798///
16799/// This routine is only called when we have AVX2 and thus a reasonable
16800/// instruction set for v4i64 shuffling..
// NOTE(review): line 16801 (hyperlinked identifier) is missing from this
// rendering; upstream it reads
// `static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,` —
// confirm against llvm-project.
16802 const APInt &Zeroable, SDValue V1, SDValue V2,
16803 const X86Subtarget &Subtarget,
16804 SelectionDAG &DAG) {
16805 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16806 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16807 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16808 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16809
16810 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16811 Subtarget, DAG))
16812 return V;
16813
16814 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16815 Zeroable, Subtarget, DAG))
16816 return Blend;
16817
16818 // Check for being able to broadcast a single element.
16819 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16820 Subtarget, DAG))
16821 return Broadcast;
16822
16823 // Try to use shift instructions if fast.
16824 if (Subtarget.preferLowerShuffleAsShift())
16825 if (SDValue Shift =
16826 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16827 Subtarget, DAG, /*BitwiseOnly*/ true))
16828 return Shift;
16829
16830 if (V2.isUndef()) {
16831 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16832 // can use lower latency instructions that will operate on both lanes.
16833 SmallVector<int, 2> RepeatedMask;
16834 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the 2-element repeated mask to a 4-element PSHUFD mask on v8i32.
16835 SmallVector<int, 4> PSHUFDMask;
16836 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16837 return DAG.getBitcast(
16838 MVT::v4i64,
16839 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16840 DAG.getBitcast(MVT::v8i32, V1),
16841 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16842 }
16843
16844 // AVX2 provides a direct instruction for permuting a single input across
16845 // lanes.
16846 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16847 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16848 }
16849
16850 // Try to use shift instructions.
16851 if (SDValue Shift =
16852 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16853 DAG, /*BitwiseOnly*/ false))
16854 return Shift;
16855
16856 // If we have VLX support, we can use VALIGN or VEXPAND.
16857 if (Subtarget.hasVLX()) {
16858 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16859 Zeroable, Subtarget, DAG))
16860 return Rotate;
16861
16862 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16863 Zeroable, Subtarget, DAG))
16864 return V;
16865 }
16866
16867 // Try to use PALIGNR.
16868 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16869 Subtarget, DAG))
16870 return Rotate;
16871
16872 // Use dedicated unpack instructions for masks that match their pattern.
16873 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16874 return V;
16875
16876 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16877 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16878
16879 // If we have one input in place, then we can permute the other input and
16880 // blend the result.
16881 if (V1IsInPlace || V2IsInPlace)
16882 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16883 Zeroable, Subtarget, DAG);
16884
16885 // Try to create an in-lane repeating shuffle mask and then shuffle the
16886 // results into the target lanes.
// NOTE(review): line 16887 is missing here; upstream it is most likely
// `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(` — confirm.
16888 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16889 return V;
16890
16891 // Try to lower to PERMQ(BLENDD(V1,V2)).
16892 if (SDValue V =
16893 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16894 return V;
16895
16896 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16897 // shuffle. However, if we have AVX2 and either inputs are already in place,
16898 // we will be able to shuffle even across lanes the other input in a single
16899 // instruction so skip this pattern.
16900 if (!V1IsInPlace && !V2IsInPlace)
// NOTE(review): line 16901 is missing here; upstream it is most likely
// `if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(` — confirm.
16902 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16903 return Result;
16904
16905 // Otherwise fall back on generic blend lowering.
16906 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16907 Zeroable, Subtarget, DAG);
16908}
16909
16910/// Handle lowering of 8-lane 32-bit floating point shuffles.
16911///
16912/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16913/// isn't available.
16915 const APInt &Zeroable, SDValue V1, SDValue V2,
16916 const X86Subtarget &Subtarget,
16917 SelectionDAG &DAG) {
16918 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16919 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16920 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16921
16922 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16923 Zeroable, Subtarget, DAG))
16924 return Blend;
16925
16926 // Check for being able to broadcast a single element.
16927 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16928 Subtarget, DAG))
16929 return Broadcast;
16930
16931 if (!Subtarget.hasAVX2()) {
16932 SmallVector<int> InLaneMask;
16933 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16934
16935 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16936 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16937 /*SimpleOnly*/ true))
16938 return R;
16939 }
16940 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16941 Zeroable, Subtarget, DAG))
16942 return DAG.getBitcast(MVT::v8f32, ZExt);
16943
16944 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16945 // options to efficiently lower the shuffle.
16946 SmallVector<int, 4> RepeatedMask;
16947 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16948 assert(RepeatedMask.size() == 4 &&
16949 "Repeated masks must be half the mask width!");
16950
16951 // Use even/odd duplicate instructions for masks that match their pattern.
16952 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16953 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16954 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16955 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16956
16957 if (V2.isUndef())
16958 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16959 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16960
16961 // Use dedicated unpack instructions for masks that match their pattern.
16962 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16963 return V;
16964
16965 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16966 // have already handled any direct blends.
16967 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16968 }
16969
16970 // Try to create an in-lane repeating shuffle mask and then shuffle the
16971 // results into the target lanes.
16973 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16974 return V;
16975
16976 // If we have a single input shuffle with different shuffle patterns in the
16977 // two 128-bit lanes use the variable mask to VPERMILPS.
16978 if (V2.isUndef()) {
16979 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16980 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16981 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16982 }
16983 if (Subtarget.hasAVX2()) {
16984 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16985 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16986 }
16987 // Otherwise, fall back.
16988 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16989 DAG, Subtarget);
16990 }
16991
16992 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16993 // shuffle.
16995 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16996 return Result;
16997
16998 // If we have VLX support, we can use VEXPAND.
16999 if (Subtarget.hasVLX())
17000 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
17001 Zeroable, Subtarget, DAG))
17002 return V;
17003
17004 // Try to match an interleave of two v8f32s and lower them as unpck and
17005 // permutes using ymms. This needs to go before we try to split the vectors.
17006 // Don't attempt on AVX1 if we're likely to split vectors anyway.
17007 if ((Subtarget.hasAVX2() ||
17010 !Subtarget.hasAVX512())
17011 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
17012 Mask, DAG))
17013 return V;
17014
17015 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17016 // since after split we get a more efficient code using vpunpcklwd and
17017 // vpunpckhwd instrs than vblend.
17018 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17019 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17020 Subtarget, DAG);
17021
17022 // If we have AVX2 then we always want to lower with a blend because at v8 we
17023 // can fully permute the elements.
17024 if (Subtarget.hasAVX2())
17025 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17026 Zeroable, Subtarget, DAG);
17027
17028 // Otherwise fall back on generic lowering.
17029 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17030 Subtarget, DAG);
17031}
17032
17033/// Handle lowering of 8-lane 32-bit integer shuffles.
17034///
17035/// This routine is only called when we have AVX2 and thus a reasonable
17036/// instruction set for v8i32 shuffling..
17038 const APInt &Zeroable, SDValue V1, SDValue V2,
17039 const X86Subtarget &Subtarget,
17040 SelectionDAG &DAG) {
17041 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17042 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17043 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17044 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17045
17046 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
17047
17048 // Whenever we can lower this as a zext, that instruction is strictly faster
17049 // than any alternative. It also allows us to fold memory operands into the
17050 // shuffle in many cases.
17051 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17052 Zeroable, Subtarget, DAG))
17053 return ZExt;
17054
17055 // Try to match an interleave of two v8i32s and lower them as unpck and
17056 // permutes using ymms. This needs to go before we try to split the vectors.
17057 if (!Subtarget.hasAVX512())
17058 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
17059 Mask, DAG))
17060 return V;
17061
17062 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17063 // since after split we get a more efficient code than vblend by using
17064 // vpunpcklwd and vpunpckhwd instrs.
17065 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
17066 !Subtarget.hasAVX512())
17067 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17068 Subtarget, DAG);
17069
17070 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17071 Zeroable, Subtarget, DAG))
17072 return Blend;
17073
17074 // Check for being able to broadcast a single element.
17075 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17076 Subtarget, DAG))
17077 return Broadcast;
17078
17079 // Try to use shift instructions if fast.
17080 if (Subtarget.preferLowerShuffleAsShift()) {
17081 if (SDValue Shift =
17082 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17083 Subtarget, DAG, /*BitwiseOnly*/ true))
17084 return Shift;
17085 if (NumV2Elements == 0)
17086 if (SDValue Rotate =
17087 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17088 return Rotate;
17089 }
17090
17091 // If the shuffle mask is repeated in each 128-bit lane we can use more
17092 // efficient instructions that mirror the shuffles across the two 128-bit
17093 // lanes.
17094 SmallVector<int, 4> RepeatedMask;
17095 bool Is128BitLaneRepeatedShuffle =
17096 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17097 if (Is128BitLaneRepeatedShuffle) {
17098 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17099 if (V2.isUndef())
17100 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17101 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17102
17103 // Use dedicated unpack instructions for masks that match their pattern.
17104 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
17105 return V;
17106 }
17107
17108 // Try to use shift instructions.
17109 if (SDValue Shift =
17110 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
17111 DAG, /*BitwiseOnly*/ false))
17112 return Shift;
17113
17114 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17115 if (SDValue Rotate =
17116 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17117 return Rotate;
17118
17119 // If we have VLX support, we can use VALIGN or EXPAND.
17120 if (Subtarget.hasVLX()) {
17121 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17122 Zeroable, Subtarget, DAG))
17123 return Rotate;
17124
17125 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
17126 Zeroable, Subtarget, DAG))
17127 return V;
17128 }
17129
17130 // Try to use byte rotation instructions.
17131 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17132 Subtarget, DAG))
17133 return Rotate;
17134
17135 // Try to create an in-lane repeating shuffle mask and then shuffle the
17136 // results into the target lanes.
17138 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17139 return V;
17140
17141 if (V2.isUndef()) {
17142 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17143 // because that should be faster than the variable permute alternatives.
17144 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
17145 return V;
17146
17147 // If the shuffle patterns aren't repeated but it's a single input, directly
17148 // generate a cross-lane VPERMD instruction.
17149 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17150 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17151 }
17152
17153 // Assume that a single SHUFPS is faster than an alternative sequence of
17154 // multiple instructions (even if the CPU has a domain penalty).
17155 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17156 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17157 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17158 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17159 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17160 CastV1, CastV2, DAG);
17161 return DAG.getBitcast(MVT::v8i32, ShufPS);
17162 }
17163
17164 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17165 // shuffle.
17167 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17168 return Result;
17169
17170 // Otherwise fall back on generic blend lowering.
17171 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17172 Zeroable, Subtarget, DAG);
17173}
17174
17175/// Handle lowering of 16-lane 16-bit integer shuffles.
17176///
17177/// This routine is only called when we have AVX2 and thus a reasonable
17178/// instruction set for v16i16 shuffling..
17180 const APInt &Zeroable, SDValue V1, SDValue V2,
17181 const X86Subtarget &Subtarget,
17182 SelectionDAG &DAG) {
17183 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17184 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17185 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17186 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17187
17188 // Whenever we can lower this as a zext, that instruction is strictly faster
17189 // than any alternative. It also allows us to fold memory operands into the
17190 // shuffle in many cases.
17192 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17193 return ZExt;
17194
17195 // Check for being able to broadcast a single element.
17196 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17197 Subtarget, DAG))
17198 return Broadcast;
17199
17200 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17201 Zeroable, Subtarget, DAG))
17202 return Blend;
17203
17204 // Use dedicated unpack instructions for masks that match their pattern.
17205 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
17206 return V;
17207
17208 // Use dedicated pack instructions for masks that match their pattern.
17209 if (SDValue V =
17210 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17211 return V;
17212
17213 // Try to use lower using a truncation.
17214 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17215 Subtarget, DAG))
17216 return V;
17217
17218 // Try to use shift instructions.
17219 if (SDValue Shift =
17220 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17221 Subtarget, DAG, /*BitwiseOnly*/ false))
17222 return Shift;
17223
17224 // Try to use byte rotation instructions.
17225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17226 Subtarget, DAG))
17227 return Rotate;
17228
17229 // Try to create an in-lane repeating shuffle mask and then shuffle the
17230 // results into the target lanes.
17232 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17233 return V;
17234
17235 if (V2.isUndef()) {
17236 // Try to use bit rotation instructions.
17237 if (SDValue Rotate =
17238 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17239 return Rotate;
17240
17241 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17242 // because that should be faster than the variable permute alternatives.
17243 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17244 return V;
17245
17246 // There are no generalized cross-lane shuffle operations available on i16
17247 // element types.
17248 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17250 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17251 return V;
17252
17253 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17254 DAG, Subtarget);
17255 }
17256
17257 SmallVector<int, 8> RepeatedMask;
17258 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17259 // As this is a single-input shuffle, the repeated mask should be
17260 // a strictly valid v8i16 mask that we can pass through to the v8i16
17261 // lowering to handle even the v16 case.
17263 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17264 }
17265 }
17266
17267 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17268 Zeroable, Subtarget, DAG))
17269 return PSHUFB;
17270
17271 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17272 if (Subtarget.hasBWI())
17273 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17274
17275 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17276 // shuffle.
17278 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17279 return Result;
17280
17281 // Try to permute the lanes and then use a per-lane permute.
17283 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17284 return V;
17285
17286 // Try to match an interleave of two v16i16s and lower them as unpck and
17287 // permutes using ymms.
17288 if (!Subtarget.hasAVX512())
17289 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17290 Mask, DAG))
17291 return V;
17292
17293 // Otherwise fall back on generic lowering.
17294 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17295 Subtarget, DAG);
17296}
17297
17298/// Handle lowering of 32-lane 8-bit integer shuffles.
17299///
17300/// This routine is only called when we have AVX2 and thus a reasonable
17301/// instruction set for v32i8 shuffling..
17303 const APInt &Zeroable, SDValue V1, SDValue V2,
17304 const X86Subtarget &Subtarget,
17305 SelectionDAG &DAG) {
17306 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17307 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17308 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17309 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17310
17311 // Whenever we can lower this as a zext, that instruction is strictly faster
17312 // than any alternative. It also allows us to fold memory operands into the
17313 // shuffle in many cases.
17314 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17315 Zeroable, Subtarget, DAG))
17316 return ZExt;
17317
17318 // Check for being able to broadcast a single element.
17319 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17320 Subtarget, DAG))
17321 return Broadcast;
17322
17323 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17324 Zeroable, Subtarget, DAG))
17325 return Blend;
17326
17327 // Use dedicated unpack instructions for masks that match their pattern.
17328 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17329 return V;
17330
17331 // Use dedicated pack instructions for masks that match their pattern.
17332 if (SDValue V =
17333 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17334 return V;
17335
17336 // Try to use lower using a truncation.
17337 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17338 Subtarget, DAG))
17339 return V;
17340
17341 // Try to use shift instructions.
17342 if (SDValue Shift =
17343 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17344 DAG, /*BitwiseOnly*/ false))
17345 return Shift;
17346
17347 // Try to use byte rotation instructions.
17348 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17349 Subtarget, DAG))
17350 return Rotate;
17351
17352 // Try to use bit rotation instructions.
17353 if (V2.isUndef())
17354 if (SDValue Rotate =
17355 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17356 return Rotate;
17357
17358 // Try to create an in-lane repeating shuffle mask and then shuffle the
17359 // results into the target lanes.
17361 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17362 return V;
17363
17364 // There are no generalized cross-lane shuffle operations available on i8
17365 // element types.
17366 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17367 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17368 // because that should be faster than the variable permute alternatives.
17369 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17370 return V;
17371
17373 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17374 return V;
17375
17376 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17377 DAG, Subtarget);
17378 }
17379
17380 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17381 Zeroable, Subtarget, DAG))
17382 return PSHUFB;
17383
17384 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17385 if (Subtarget.hasVBMI())
17386 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17387
17388 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17389 // shuffle.
17391 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17392 return Result;
17393
17394 // Try to permute the lanes and then use a per-lane permute.
17396 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17397 return V;
17398
17399 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
17400 // by zeroable elements in the remaining 24 elements. Turn this into two
17401 // vmovqb instructions shuffled together.
17402 if (Subtarget.hasVLX())
17403 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17404 Mask, Zeroable, DAG))
17405 return V;
17406
17407 // Try to match an interleave of two v32i8s and lower them as unpck and
17408 // permutes using ymms.
17409 if (!Subtarget.hasAVX512())
17410 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17411 Mask, DAG))
17412 return V;
17413
17414 // Otherwise fall back on generic lowering.
17415 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17416 Subtarget, DAG);
17417}
17418
17419/// High-level routine to lower various 256-bit x86 vector shuffles.
17420///
17421/// This routine either breaks down the specific type of a 256-bit x86 vector
17422/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17423/// together based on the available instructions.
17425 SDValue V1, SDValue V2, const APInt &Zeroable,
17426 const X86Subtarget &Subtarget,
17427 SelectionDAG &DAG) {
17428 // If we have a single input to the zero element, insert that into V1 if we
17429 // can do so cheaply.
17430 int NumElts = VT.getVectorNumElements();
17431 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17432
17433 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17435 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17436 return Insertion;
17437
17438 // Handle special cases where the lower or upper half is UNDEF.
17439 if (SDValue V =
17440 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17441 return V;
17442
17443 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17444 // can check for those subtargets here and avoid much of the subtarget
17445 // querying in the per-vector-type lowering routines. With AVX1 we have
17446 // essentially *zero* ability to manipulate a 256-bit vector with integer
17447 // types. Since we'll use floating point types there eventually, just
17448 // immediately cast everything to a float and operate entirely in that domain.
17449 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17450 int ElementBits = VT.getScalarSizeInBits();
17451 if (ElementBits < 32) {
17452 // No floating point type available, if we can't use the bit operations
17453 // for masking/blending then decompose into 128-bit vectors.
17454 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17455 Subtarget, DAG))
17456 return V;
17457 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17458 return V;
17459 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17460 }
17461
17462 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17464 V1 = DAG.getBitcast(FpVT, V1);
17465 V2 = DAG.getBitcast(FpVT, V2);
17466 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17467 }
17468
17469 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17470 V1 = DAG.getBitcast(MVT::v16i16, V1);
17471 V2 = DAG.getBitcast(MVT::v16i16, V2);
17472 return DAG.getBitcast(VT,
17473 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17474 }
17475
17476 switch (VT.SimpleTy) {
17477 case MVT::v4f64:
17478 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17479 case MVT::v4i64:
17480 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17481 case MVT::v8f32:
17482 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17483 case MVT::v8i32:
17484 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17485 case MVT::v16i16:
17486 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17487 case MVT::v32i8:
17488 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17489
17490 default:
17491 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17492 }
17493}
17494
17495/// Try to lower a vector shuffle as a 128-bit shuffles.
17497 const APInt &Zeroable, SDValue V1, SDValue V2,
17498 const X86Subtarget &Subtarget,
17499 SelectionDAG &DAG) {
17500 assert(VT.getScalarSizeInBits() == 64 &&
17501 "Unexpected element type size for 128bit shuffle.");
17502
17503 // To handle 256 bit vector requires VLX and most probably
17504 // function lowerV2X128VectorShuffle() is better solution.
17505 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17506
17507 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17508 SmallVector<int, 4> Widened128Mask;
17509 if (!canWidenShuffleElements(Mask, Widened128Mask))
17510 return SDValue();
17511 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17512
17513 // Try to use an insert into a zero vector.
17514 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17515 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17516 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17517 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17518 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17519 DAG.getVectorIdxConstant(0, DL));
17520 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17521 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17522 DAG.getVectorIdxConstant(0, DL));
17523 }
17524
17525 // Check for patterns which can be matched with a single insert of a 256-bit
17526 // subvector.
17527 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17528 if (OnlyUsesV1 ||
17529 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17530 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17531 SDValue SubVec =
17532 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17533 DAG.getVectorIdxConstant(0, DL));
17534 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17535 DAG.getVectorIdxConstant(4, DL));
17536 }
17537
17538 // See if this is an insertion of the lower 128-bits of V2 into V1.
17539 bool IsInsert = true;
17540 int V2Index = -1;
17541 for (int i = 0; i < 4; ++i) {
17542 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17543 if (Widened128Mask[i] < 0)
17544 continue;
17545
17546 // Make sure all V1 subvectors are in place.
17547 if (Widened128Mask[i] < 4) {
17548 if (Widened128Mask[i] != i) {
17549 IsInsert = false;
17550 break;
17551 }
17552 } else {
17553 // Make sure we only have a single V2 index and its the lowest 128-bits.
17554 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17555 IsInsert = false;
17556 break;
17557 }
17558 V2Index = i;
17559 }
17560 }
17561 if (IsInsert && V2Index >= 0) {
17562 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17563 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17564 DAG.getVectorIdxConstant(0, DL));
17565 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17566 }
17567
17568 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17569 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17570 // possible we at least ensure the lanes stay sequential to help later
17571 // combines.
17572 SmallVector<int, 2> Widened256Mask;
17573 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17574 Widened128Mask.clear();
17575 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17576 }
17577
17578 // Try to lower to vshuf64x2/vshuf32x4.
17579 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17580 int PermMask[4] = {-1, -1, -1, -1};
17581 // Ensure elements came from the same Op.
17582 for (int i = 0; i < 4; ++i) {
17583 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17584 if (Widened128Mask[i] < 0)
17585 continue;
17586
17587 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17588 unsigned OpIndex = i / 2;
17589 if (Ops[OpIndex].isUndef())
17590 Ops[OpIndex] = Op;
17591 else if (Ops[OpIndex] != Op)
17592 return SDValue();
17593
17594 PermMask[i] = Widened128Mask[i] % 4;
17595 }
17596
17597 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17598 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17599}
17600
17601/// Handle lowering of 8-lane 64-bit floating point shuffles.
17603 const APInt &Zeroable, SDValue V1, SDValue V2,
17604 const X86Subtarget &Subtarget,
17605 SelectionDAG &DAG) {
17606 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17607 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17608 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17609
17610 if (V2.isUndef()) {
17611 // Use low duplicate instructions for masks that match their pattern.
17612 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17613 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17614
17615 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17616 // Non-half-crossing single input shuffles can be lowered with an
17617 // interleaved permutation.
17618 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17619 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17620 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17621 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17622 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17623 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17624 }
17625
17626 SmallVector<int, 4> RepeatedMask;
17627 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17628 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17629 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17630 }
17631
17632 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17633 V2, Subtarget, DAG))
17634 return Shuf128;
17635
17636 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17637 return Unpck;
17638
17639 // Check if the blend happens to exactly fit that of SHUFPD.
17640 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17641 Zeroable, Subtarget, DAG))
17642 return Op;
17643
17644 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17645 Subtarget, DAG))
17646 return V;
17647
17648 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17649 Zeroable, Subtarget, DAG))
17650 return Blend;
17651
17652 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17653}
17654
/// Handle lowering of 16-lane 32-bit floating point shuffles.
// NOTE(review): the opening signature line (original line 17656, presumably
// "static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
// Strategy: prefer cheap fixed-immediate forms (MOVS*DUP/VPERMILPI/UNPCK/
// SHUFPS) when the mask repeats per 128-bit lane, then blends, zero-extends,
// VPERMILPV, VEXPAND, and finally the fully-general variable PERMV.
17657 const APInt &Zeroable, SDValue V1, SDValue V2,
17658 const X86Subtarget &Subtarget,
17659 SelectionDAG &DAG) {
17660 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17661 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17662 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17663
17664 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17665 // options to efficiently lower the shuffle.
17666 SmallVector<int, 4> RepeatedMask;
17667 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17668 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17669
17670 // Use even/odd duplicate instructions for masks that match their pattern.
17671 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17672 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17673 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17674 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17675
17676 if (V2.isUndef())
17677 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17678 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG))
17679
17680 // Use dedicated unpack instructions for masks that match their pattern.
17681 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17682 return V;
17683
17684 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17685 Zeroable, Subtarget, DAG))
17686 return Blend;
17687
17688 // Otherwise, fall back to a SHUFPS sequence.
17689 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17690 }
17691
17692 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17693 Zeroable, Subtarget, DAG))
17694 return Blend;
17695
// NOTE(review): line 17696 (the "if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend("
// header of this statement, judging by the bound name "ZExt") is missing here.
17697 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17698 return DAG.getBitcast(MVT::v16f32, ZExt);
17699
17700 // Try to create an in-lane repeating shuffle mask and then shuffle the
17701 // results into the target lanes.
// NOTE(review): line 17702 (the "if (SDValue V = ..." header of this
// statement) is missing from the extraction.
17703 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17704 return V;
17705
17706 // If we have a single input shuffle with different shuffle patterns in the
17707 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17708 if (V2.isUndef() &&
17709 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17710 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17711 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17712 }
17713
17714 // If we have AVX512F support, we can use VEXPAND.
17715 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17716 Zeroable, Subtarget, DAG))
17717 return V;
17718
// Last resort: fully general variable-mask permute.
17719 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17720}
17721
/// Handle lowering of 8-lane 64-bit integer shuffles.
// NOTE(review): the opening signature line (original line 17723, presumably
// "static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
// Strategy: bitwise shifts first when preferred by the subtarget, then
// lane-repeated immediate forms (PSHUFD/VPERMI), 128-bit-lane permutes,
// shifts/VALIGN/PALIGNR rotations, UNPCK, VEXPAND, blends, and finally PERMV.
17724 const APInt &Zeroable, SDValue V1, SDValue V2,
17725 const X86Subtarget &Subtarget,
17726 SelectionDAG &DAG) {
17727 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17728 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17729 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17730
17731 // Try to use shift instructions if fast.
17732 if (Subtarget.preferLowerShuffleAsShift())
17733 if (SDValue Shift =
17734 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17735 Subtarget, DAG, /*BitwiseOnly*/ true))
17736 return Shift;
17737
17738 if (V2.isUndef()) {
17739 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17740 // can use lower latency instructions that will operate on all four
17741 // 128-bit lanes.
17742 SmallVector<int, 2> Repeated128Mask;
17743 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17744 SmallVector<int, 4> PSHUFDMask;
// Express the 2 x i64 per-lane mask as a 4 x i32 mask so PSHUFD can do it.
17745 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17746 return DAG.getBitcast(
17747 MVT::v8i64,
17748 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17749 DAG.getBitcast(MVT::v16i32, V1),
17750 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17751 }
17752
17753 SmallVector<int, 4> Repeated256Mask;
17754 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17755 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17756 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17757 }
17758
17759 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17760 V2, Subtarget, DAG))
17761 return Shuf128;
17762
17763 // Try to use shift instructions.
17764 if (SDValue Shift =
17765 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17766 DAG, /*BitwiseOnly*/ false))
17767 return Shift;
17768
17769 // Try to use VALIGN.
17770 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17771 Zeroable, Subtarget, DAG))
17772 return Rotate;
17773
17774 // Try to use PALIGNR.
17775 if (Subtarget.hasBWI())
17776 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17777 Subtarget, DAG))
17778 return Rotate;
17779
17780 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17781 return Unpck;
17782
17783 // If we have AVX512F support, we can use VEXPAND.
17784 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17785 Subtarget, DAG))
17786 return V;
17787
17788 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17789 Zeroable, Subtarget, DAG))
17790 return Blend;
17791
// Last resort: fully general variable-mask permute.
17792 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17793}
17794
/// Handle lowering of 16-lane 32-bit integer shuffles.
// NOTE(review): the opening signature line (original line 17796, presumably
// "static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
17797 const APInt &Zeroable, SDValue V1, SDValue V2,
17798 const X86Subtarget &Subtarget,
17799 SelectionDAG &DAG) {
17800 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17801 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17802 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17803
// Count how many mask elements select from V2 (indices >= 16 refer to V2).
17804 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17805
17806 // Whenever we can lower this as a zext, that instruction is strictly faster
17807 // than any alternative. It also allows us to fold memory operands into the
17808 // shuffle in many cases.
// NOTE(review): line 17809 (the "if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend("
// header of this statement, judging by the bound name "ZExt") is missing here.
17810 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17811 return ZExt;
17812
17813 // Try to use shift instructions if fast.
17814 if (Subtarget.preferLowerShuffleAsShift()) {
17815 if (SDValue Shift =
17816 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17817 Subtarget, DAG, /*BitwiseOnly*/ true))
17818 return Shift;
17819 if (NumV2Elements == 0)
17820 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17821 Subtarget, DAG))
17822 return Rotate;
17823 }
17824
17825 // If the shuffle mask is repeated in each 128-bit lane we can use more
17826 // efficient instructions that mirror the shuffles across the four 128-bit
17827 // lanes.
17828 SmallVector<int, 4> RepeatedMask;
17829 bool Is128BitLaneRepeatedShuffle =
17830 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17831 if (Is128BitLaneRepeatedShuffle) {
17832 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17833 if (V2.isUndef())
17834 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17835 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17836
17837 // Use dedicated unpack instructions for masks that match their pattern.
17838 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17839 return V;
17840 }
17841
17842 // Try to use shift instructions.
17843 if (SDValue Shift =
17844 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17845 Subtarget, DAG, /*BitwiseOnly*/ false))
17846 return Shift;
17847
17848 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17849 if (SDValue Rotate =
17850 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17851 return Rotate;
17852
17853 // Try to use VALIGN.
17854 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17855 Zeroable, Subtarget, DAG))
17856 return Rotate;
17857
17858 // Try to use byte rotation instructions.
17859 if (Subtarget.hasBWI())
17860 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17861 Subtarget, DAG))
17862 return Rotate;
17863
17864 // Assume that a single SHUFPS is faster than using a permv shuffle.
17865 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17866 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17867 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17868 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17869 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17870 CastV1, CastV2, DAG);
17871 return DAG.getBitcast(MVT::v16i32, ShufPS);
17872 }
17873
17874 // Try to create an in-lane repeating shuffle mask and then shuffle the
17875 // results into the target lanes.
// NOTE(review): line 17876 (the "if (SDValue V = ..." header of this
// statement) is missing from the extraction.
17877 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17878 return V;
17879
17880 // If we have AVX512F support, we can use VEXPAND.
17881 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17882 Zeroable, Subtarget, DAG))
17883 return V;
17884
17885 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17886 Zeroable, Subtarget, DAG))
17887 return Blend;
17888
// Last resort: fully general variable-mask permute.
17889 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17890}
17891
/// Handle lowering of 32-lane 16-bit integer shuffles.
// NOTE(review): the opening signature line (original line 17893, presumably
// "static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
17894 const APInt &Zeroable, SDValue V1, SDValue V2,
17895 const X86Subtarget &Subtarget,
17896 SelectionDAG &DAG) {
17897 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17898 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17899 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17900 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17901
17902 // Whenever we can lower this as a zext, that instruction is strictly faster
17903 // than any alternative. It also allows us to fold memory operands into the
17904 // shuffle in many cases.
// NOTE(review): line 17905 (the "if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend("
// header of this statement, judging by the bound name "ZExt") is missing here.
17906 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17907 return ZExt;
17908
17909 // Use dedicated unpack instructions for masks that match their pattern.
17910 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17911 return V;
17912
17913 // Use dedicated pack instructions for masks that match their pattern.
17914 if (SDValue V =
17915 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17916 return V;
17917
17918 // Try to use shift instructions.
17919 if (SDValue Shift =
17920 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17921 Subtarget, DAG, /*BitwiseOnly*/ false))
17922 return Shift;
17923
17924 // Try to use byte rotation instructions.
17925 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17926 Subtarget, DAG))
17927 return Rotate;
17928
17929 if (V2.isUndef()) {
17930 // Try to use bit rotation instructions.
17931 if (SDValue Rotate =
17932 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17933 return Rotate;
17934
17935 SmallVector<int, 8> RepeatedMask;
17936 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17937 // As this is a single-input shuffle, the repeated mask should be
17938 // a strictly valid v8i16 mask that we can pass through to the v8i16
17939 // lowering to handle even the v32 case.
17940 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17941 RepeatedMask, Subtarget, DAG);
17942 }
17943 }
17944
17945 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17946 Zeroable, Subtarget, DAG))
17947 return Blend;
17948
17949 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17950 Zeroable, Subtarget, DAG))
17951 return PSHUFB;
17952
17953 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17954 // shuffle.
17955 if (!V2.isUndef())
// NOTE(review): line 17956 (the "if (SDValue Result = ..." header of this
// statement) is missing from the extraction.
17957 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17958 return Result;
17959
// Last resort: fully general variable-mask permute.
17960 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17961}
17962
/// Handle lowering of 64-lane 8-bit integer shuffles.
// NOTE(review): the opening signature line (original line 17964, presumably
// "static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
17965 const APInt &Zeroable, SDValue V1, SDValue V2,
17966 const X86Subtarget &Subtarget,
17967 SelectionDAG &DAG) {
17968 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17969 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17970 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17971 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17972
17973 // Whenever we can lower this as a zext, that instruction is strictly faster
17974 // than any alternative. It also allows us to fold memory operands into the
17975 // shuffle in many cases.
// NOTE(review): line 17976 (the "if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend("
// header of this statement, judging by the bound name "ZExt") is missing here.
17977 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17978 return ZExt;
17979
17980 // Use dedicated unpack instructions for masks that match their pattern.
17981 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17982 return V;
17983
17984 // Use dedicated pack instructions for masks that match their pattern.
17985 if (SDValue V =
17986 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17987 return V;
17988
17989 // Try to use shift instructions.
17990 if (SDValue Shift =
17991 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17992 DAG, /*BitwiseOnly*/ false))
17993 return Shift;
17994
17995 // Try to use byte rotation instructions.
17996 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17997 Subtarget, DAG))
17998 return Rotate;
17999
18000 // Try to use bit rotation instructions.
18001 if (V2.isUndef())
18002 if (SDValue Rotate =
18003 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18004 return Rotate;
18005
18006 // Lower as AND if possible.
18007 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18008 Zeroable, Subtarget, DAG))
18009 return Masked;
18010
18011 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18012 Zeroable, Subtarget, DAG))
18013 return PSHUFB;
18014
18015 // Try to create an in-lane repeating shuffle mask and then shuffle the
18016 // results into the target lanes.
// NOTE(review): line 18017 (the "if (SDValue V = ..." header of this
// statement) is missing from the extraction.
18018 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18019 return V;
18020
// NOTE(review): line 18021 (the "if (SDValue Result = ..." header of this
// statement) is missing from the extraction.
18022 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18023 return Result;
18024
18025 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18026 Zeroable, Subtarget, DAG))
18027 return Blend;
18028
18029 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18030 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18031 // PALIGNR will be cheaper than the second PSHUFB+OR.
18032 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18033 Mask, Subtarget, DAG))
18034 return V;
18035
18036 // If we can't directly blend but can use PSHUFB, that will be better as it
18037 // can both shuffle and set up the inefficient blend.
18038 bool V1InUse, V2InUse;
18039 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18040 DAG, V1InUse, V2InUse);
18041 }
18042
18043 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18044 // shuffle.
18045 if (!V2.isUndef())
// NOTE(review): line 18046 (the "if (SDValue Result = ..." header of this
// statement) is missing from the extraction.
18047 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18048 return Result;
18049
18050 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18051 if (Subtarget.hasVBMI())
18052 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18053
// Without VBMI there is no 512-bit byte permute; split into 256-bit halves.
18054 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18055}
18056
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
// NOTE(review): the opening signature line (original line 18062, presumably
// "static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
18063 MVT VT, SDValue V1, SDValue V2,
18064 const APInt &Zeroable,
18065 const X86Subtarget &Subtarget,
18066 SelectionDAG &DAG) {
18067 assert(Subtarget.hasAVX512() &&
18068 "Cannot lower 512-bit vectors w/ basic ISA!");
18069
18070 // If we have a single input to the zero element, insert that into V1 if we
18071 // can do so cheaply.
18072 int NumElts = Mask.size();
18073 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18074
18075 if (NumV2Elements == 1 && Mask[0] >= NumElts)
// NOTE(review): line 18076 (the "if (SDValue Insertion = ..." header of this
// statement, judging by the bound name "Insertion") is missing here.
18077 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18078 return Insertion;
18079
18080 // Handle special cases where the lower or upper half is UNDEF.
18081 if (SDValue V =
18082 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18083 return V;
18084
18085 // Check for being able to broadcast a single element.
18086 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18087 Subtarget, DAG))
18088 return Broadcast;
18089
18090 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18091 // Try using bit ops for masking and blending before falling back to
18092 // splitting.
18093 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18094 Subtarget, DAG))
18095 return V;
18096 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18097 return V;
18098
18099 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18100 }
18101
18102 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18103 if (!Subtarget.hasBWI())
18104 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
18105 /*SimpleOnly*/ false);
18106
// Reuse the v32i16 lowering for half/bfloat16 via bitcasts.
18107 V1 = DAG.getBitcast(MVT::v32i16, V1);
18108 V2 = DAG.getBitcast(MVT::v32i16, V2);
18109 return DAG.getBitcast(VT,
18110 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18111 }
18112
18113 // Dispatch to each element type for lowering. If we don't have support for
18114 // specific element type shuffles at 512 bits, immediately split them and
18115 // lower them. Each lowering routine of a given type is allowed to assume that
18116 // the requisite ISA extensions for that element type are available.
18117 switch (VT.SimpleTy) {
18118 case MVT::v8f64:
18119 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18120 case MVT::v16f32:
18121 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18122 case MVT::v8i64:
18123 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18124 case MVT::v16i32:
18125 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18126 case MVT::v32i16:
18127 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18128 case MVT::v64i8:
18129 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18130
18131 default:
18132 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18133 }
18134}
18135
// NOTE(review): the opening signature line (original line 18136, presumably
// "static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
// Matches a unary vXi1 shuffle where every defined element moves left by the
// same positive amount (i.e. element i reads source element i + ShiftAmt),
// and lowers it as a mask-register right shift (KSHIFTR). Undef elements are
// don't-cares. Returns SDValue() if the mask is not such a shift.
18137 MVT VT, SDValue V1, SDValue V2,
18138 const X86Subtarget &Subtarget,
18139 SelectionDAG &DAG) {
18140 // Shuffle should be unary.
18141 if (!V2.isUndef())
18142 return SDValue();
18143
18144 int ShiftAmt = -1;
18145 int NumElts = Mask.size();
18146 for (int i = 0; i != NumElts; ++i) {
18147 int M = Mask[i];
18148 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18149 "Unexpected mask index.");
18150 if (M < 0)
18151 continue;
18152
18153 // The first non-undef element determines our shift amount.
18154 if (ShiftAmt < 0) {
18155 ShiftAmt = M - i;
18156 // Need to be shifting right.
18157 if (ShiftAmt <= 0)
18158 return SDValue();
18159 }
18160 // All non-undef elements must shift by the same amount.
18161 if (ShiftAmt != M - i)
18162 return SDValue();
18163 }
18164 assert(ShiftAmt >= 0 && "All undef?");
18165
18166 // Great we found a shift right.
// Widen to a legal mask-register width before shifting, then extract the
// original-width subvector from element 0.
18167 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
18168 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
18169 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18170 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18171 DAG.getVectorIdxConstant(0, DL));
18172}
18173
18174// Determine if this shuffle can be implemented with a KSHIFT instruction.
18175// Returns the shift amount if possible or -1 if not. This is a simplified
18176// version of matchShuffleAsShift.
18177static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18178 int MaskOffset, const APInt &Zeroable) {
18179 int Size = Mask.size();
18180
18181 auto CheckZeros = [&](int Shift, bool Left) {
18182 for (int j = 0; j < Shift; ++j)
18183 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18184 return false;
18185
18186 return true;
18187 };
18188
18189 auto MatchShift = [&](int Shift, bool Left) {
18190 unsigned Pos = Left ? Shift : 0;
18191 unsigned Low = Left ? 0 : Shift;
18192 unsigned Len = Size - Shift;
18193 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18194 };
18195
18196 for (int Shift = 1; Shift != Size; ++Shift)
18197 for (bool Left : {true, false})
18198 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18200 return Shift;
18201 }
18202
18203 return -1;
18204}
18205
18206
18207 // Lower vXi1 vector shuffles.
18208 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
18209 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
18210 // vector, shuffle and then truncate it back.
// NOTE(review): the opening signature line (original line 18211, presumably
// "static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was lost in extraction; the parameter list below is its continuation.
18212 MVT VT, SDValue V1, SDValue V2,
18213 const APInt &Zeroable,
18214 const X86Subtarget &Subtarget,
18215 SelectionDAG &DAG) {
18216 assert(Subtarget.hasAVX512() &&
18217 "Cannot lower 512-bit vectors w/o basic ISA!");
18218
18219 int NumElts = Mask.size();
18220 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18221
18222 // Try to recognize shuffles that are just padding a subvector with zeros.
// SubvecElts counts the leading run of elements that form an identity copy
// from a single source; Src is 0 for V1, 1 for V2.
18223 int SubvecElts = 0;
18224 int Src = -1;
18225 for (int i = 0; i != NumElts; ++i) {
18226 if (Mask[i] >= 0) {
18227 // Grab the source from the first valid mask. All subsequent elements need
18228 // to use this same source.
18229 if (Src < 0)
18230 Src = Mask[i] / NumElts;
18231 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18232 break;
18233 }
18234
18235 ++SubvecElts;
18236 }
18237 assert(SubvecElts != NumElts && "Identity shuffle?");
18238
18239 // Clip to a power 2.
18240 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18241
18242 // Make sure the number of zeroable bits in the top at least covers the bits
18243 // not covered by the subvector.
18244 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18245 assert(Src >= 0 && "Expected a source!");
18246 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18247 SDValue Extract =
18248 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18249 DAG.getVectorIdxConstant(0, DL));
// Insert the extracted run into an all-zero vector at element 0.
18250 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18251 DAG.getConstant(0, DL, VT), Extract,
18252 DAG.getVectorIdxConstant(0, DL));
18253 }
18254
18255 // Try a simple shift right with undef elements. Later we'll try with zeros.
18256 if (SDValue Shift =
18257 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18258 return Shift;
18259
18260 // Try to match KSHIFTs.
18261 unsigned Offset = 0;
18262 for (SDValue V : {V1, V2}) {
18263 unsigned Opcode;
18264 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18265 if (ShiftAmt >= 0) {
18266 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18267 MVT WideVT = Res.getSimpleValueType();
18268 // Widened right shifts need two shifts to ensure we shift in zeroes.
18269 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18270 int WideElts = WideVT.getVectorNumElements();
18271 // Shift left to put the original vector in the MSBs of the new size.
18272 Res =
18273 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18274 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18275 // Increase the shift amount to account for the left shift.
18276 ShiftAmt += WideElts - NumElts;
18277 }
18278
18279 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18280 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18281 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18282 DAG.getVectorIdxConstant(0, DL));
18283 }
18284 Offset += NumElts; // Increment for next iteration.
18285 }
18286
18287 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18288 // ops instead.
18289 // TODO: What other unary shuffles would benefit from this?
18290 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18291 SDValue Op0 = V1.getOperand(0);
18292 SDValue Op1 = V1.getOperand(1);
// NOTE(review): line 18293 is missing from the extraction; it presumably
// declared the condition code "CC" (from V1's third SETCC operand) used
// below.
18294 EVT OpVT = Op0.getValueType();
18295 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18296 return DAG.getSetCC(
18297 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18298 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18299 }
18300
// Fall back: pick a SIMD type to sign-extend the i1 vector into, shuffle
// there, then convert back to a mask.
18301 MVT ExtVT;
18302 switch (VT.SimpleTy) {
18303 default:
18304 llvm_unreachable("Expected a vector of i1 elements");
18305 case MVT::v2i1:
18306 ExtVT = MVT::v2i64;
18307 break;
18308 case MVT::v4i1:
18309 ExtVT = MVT::v4i32;
18310 break;
18311 case MVT::v8i1:
18312 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18313 // shuffle.
18314 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18315 break;
18316 case MVT::v16i1:
18317 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18318 // 256-bit operation available.
18319 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18320 break;
18321 case MVT::v32i1:
18322 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18323 // 256-bit operation available.
18324 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18325 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18326 break;
18327 case MVT::v64i1:
18328 // Fall back to scalarization. FIXME: We can do better if the shuffle
18329 // can be partitioned cleanly.
18330 if (!Subtarget.useBWIRegs())
18331 return SDValue();
18332 ExtVT = MVT::v64i8;
18333 break;
18334 }
18335
18336 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18337 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18338
18339 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18340 // i1 was sign extended we can use X86ISD::CVT2MASK.
18341 int NumElems = VT.getVectorNumElements();
18342 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18343 (Subtarget.hasDQI() && (NumElems < 32)))
// Sign-extended lanes are 0 or -1, so "0 > lane" recovers the mask bits.
18344 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18345 Shuffle, ISD::SETGT);
18346
18347 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18348}
18349
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
// NOTE(review): the signature line (original line 18352, presumably
// "static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {")
// was lost in extraction; the body below is its continuation.
// Returns true when swapping V1/V2 (and remapping the mask) would yield a
// more canonical shuffle, using a lexicographic tie-break chain described
// in the comments below.
18353 int NumElements = Mask.size();
18354
18355 int NumV1Elements = 0, NumV2Elements = 0;
18356 for (int M : Mask)
18357 if (M < 0)
18358 continue;
18359 else if (M < NumElements)
18360 ++NumV1Elements;
18361 else
18362 ++NumV2Elements;
18363
18364 // Commute the shuffle as needed such that more elements come from V1 than
18365 // V2. This allows us to match the shuffle pattern strictly on how many
18366 // elements come from V1 without handling the symmetric cases.
18367 if (NumV2Elements > NumV1Elements)
18368 return true;
18369
18370 assert(NumV1Elements > 0 && "No V1 indices");
18371
18372 if (NumV2Elements == 0)
18373 return false;
18374
18375 // When the number of V1 and V2 elements are the same, try to minimize the
18376 // number of uses of V2 in the low half of the vector. When that is tied,
18377 // ensure that the sum of indices for V1 is equal to or lower than the sum
18378 // indices for V2. When those are equal, try to ensure that the number of odd
18379 // indices for V1 is lower than the number of odd indices for V2.
18380 if (NumV1Elements == NumV2Elements) {
18381 int LowV1Elements = 0, LowV2Elements = 0;
18382 for (int M : Mask.slice(0, NumElements / 2))
18383 if (M >= NumElements)
18384 ++LowV2Elements;
18385 else if (M >= 0)
18386 ++LowV1Elements;
18387 if (LowV2Elements > LowV1Elements)
18388 return true;
18389 if (LowV2Elements == LowV1Elements) {
18390 int SumV1Indices = 0, SumV2Indices = 0;
18391 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18392 if (Mask[i] >= NumElements)
18393 SumV2Indices += i;
18394 else if (Mask[i] >= 0)
18395 SumV1Indices += i;
18396 if (SumV2Indices < SumV1Indices)
18397 return true;
18398 if (SumV2Indices == SumV1Indices) {
18399 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18400 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18401 if (Mask[i] >= NumElements)
18402 NumV2OddIndices += i % 2;
18403 else if (Mask[i] >= 0)
18404 NumV1OddIndices += i % 2;
18405 if (NumV2OddIndices < NumV1OddIndices)
18406 return true;
18407 }
18408 }
18409 }
18410
18411 return false;
18412}
18413
// NOTE(review): the opening signature line (original line 18414, presumably
// "static bool canCombineAsMaskOperation(SDValue V,") was lost in
// extraction; the parameter list below is its continuation.
// Returns true if V is a single-use arithmetic/logic node whose result
// could instead be produced with an AVX-512 masked operation, in which case
// the caller avoids widening the shuffle (see lowerVECTOR_SHUFFLE).
18415 const X86Subtarget &Subtarget) {
18416 if (!Subtarget.hasAVX512())
18417 return false;
18418
18419 if (!V.getValueType().isSimple())
18420 return false;
18421
18422 MVT VT = V.getSimpleValueType().getScalarType();
// i8/i16 masked ops require BWI.
18423 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18424 return false;
18425
18426 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18427 // are preferable to blendw/blendvb/masked-mov.
18428 if ((VT == MVT::i16 || VT == MVT::i8) &&
18429 V.getSimpleValueType().getSizeInBits() < 512)
18430 return false;
18431
18432 auto HasMaskOperation = [&](SDValue V) {
18433 // TODO: Currently we only check limited opcode. We probably extend
18434 // it to all binary operation by checking TLI.isBinOp().
18435 switch (V->getOpcode()) {
18436 default:
18437 return false;
18438 case ISD::ADD:
18439 case ISD::SUB:
18440 case ISD::AND:
18441 case ISD::XOR:
18442 case ISD::OR:
18443 case ISD::SMAX:
18444 case ISD::SMIN:
18445 case ISD::UMAX:
18446 case ISD::UMIN:
18447 case ISD::ABS:
18448 case ISD::SHL:
18449 case ISD::SRL:
18450 case ISD::SRA:
18451 case ISD::MUL:
18452 break;
18453 }
// Only fold when the op has a single use, so masking it can't affect
// other users of the unmasked result.
18454 if (!V->hasOneUse())
18455 return false;
18456
18457 return true;
18458 };
18459
18460 if (HasMaskOperation(V))
18461 return true;
18462
18463 return false;
18464}
18465
18466// Forward declaration.
18469 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18470 const X86Subtarget &Subtarget);
18471
18472 /// Top-level lowering for x86 vector shuffles.
18473///
18474/// This handles decomposition, canonicalization, and lowering of all x86
18475/// vector shuffles. Most of the specific lowering strategies are encapsulated
18476/// above in helper routines. The canonicalization attempts to widen shuffles
18477/// to involve fewer lanes of wider elements, consolidate symmetric patterns
// Top-level lowering for ISD::VECTOR_SHUFFLE on x86. Canonicalizes the
// shuffle (undef operand handling, out-of-range mask entries, zeroable
// elements, element widening, constant-splat canonicalization, commutation)
// and then dispatches to the 128/256/512-bit or i1-mask specific routines.
// NOTE(review): this listing was scraped from a rendered page — every code
// line carries its upstream line number, and lines made up solely of
// hyperlinked identifiers were dropped (upstream 18479, 18481, 18553-18554,
// 18591, 18610-18611, 18622-18623). Verify against upstream
// X86ISelLowering.cpp before relying on this text.
18478/// s.t. only one of the two inputs needs to be tested, etc.
// NOTE(review): upstream line 18479 (the function signature, presumably
// `static SDValue lowerVECTOR_SHUFFLE(...)`) and 18481 (the cast defining
// SVOp from Op) appear to have been dropped by the scrape.
18480 SelectionDAG &DAG) {
18482 ArrayRef<int> OrigMask = SVOp->getMask();
18483 SDValue V1 = Op.getOperand(0);
18484 SDValue V2 = Op.getOperand(1);
18485 MVT VT = Op.getSimpleValueType();
18486 int NumElements = VT.getVectorNumElements();
18487 SDLoc DL(Op);
18488 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18489
// 64-bit vectors would be MMX, which this path does not handle (i1 masks
// excepted).
18490 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18491 "Can't lower MMX shuffles");
18492
18493 bool V1IsUndef = V1.isUndef();
18494 bool V2IsUndef = V2.isUndef();
18495 if (V1IsUndef && V2IsUndef)
18496 return DAG.getUNDEF(VT);
18497
18498 // When we create a shuffle node we put the UNDEF node to second operand,
18499 // but in some cases the first operand may be transformed to UNDEF.
18500 // In this case we should just commute the node.
18501 if (V1IsUndef)
18502 return DAG.getCommutedVectorShuffle(*SVOp);
18503
18504 // Check for non-undef masks pointing at an undef vector and make the masks
18505 // undef as well. This makes it easier to match the shuffle based solely on
18506 // the mask.
18507 if (V2IsUndef &&
18508 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18509 SmallVector<int, 8> NewMask(OrigMask);
18510 for (int &M : NewMask)
18511 if (M >= NumElements)
18512 M = -1;
18513 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18514 }
18515
18516 // Check for illegal shuffle mask element index values.
18517 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
// MaskUpperLimit is only used inside the assert below; the (void) cast
// silences -Wunused-variable in release (NDEBUG) builds.
18518 (void)MaskUpperLimit;
18519 assert(llvm::all_of(OrigMask,
18520 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18521 "Out of bounds shuffle index");
18522
18523 // We actually see shuffles that are entirely re-arrangements of a set of
18524 // zero inputs. This mostly happens while decomposing complex shuffles into
18525 // simple ones. Directly lower these as a buildvector of zeros.
18526 APInt KnownUndef, KnownZero;
18527 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18528
18529 APInt Zeroable = KnownUndef | KnownZero;
18530 if (Zeroable.isAllOnes())
18531 return getZeroVector(VT, Subtarget, DAG, DL);
18532
18533 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18534
18535 // Try to collapse shuffles into using a vector type with fewer elements but
18536 // wider element types. We cap this to not form integers or floating point
18537 // elements wider than 64 bits. It does not seem beneficial to form i128
18538 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18539 SmallVector<int, 16> WidenedMask;
18540 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18541 !canCombineAsMaskOperation(V1, Subtarget) &&
18542 !canCombineAsMaskOperation(V2, Subtarget) &&
18543 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18544 // Shuffle mask widening should not interfere with a broadcast opportunity
18545 // by obfuscating the operands with bitcasts.
18546 // TODO: Avoid lowering directly from this top-level function: make this
18547 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18548 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18549 Subtarget, DAG))
18550 return Broadcast;
18551
// NOTE(review): upstream lines 18553-18554 (the two arms of this ternary,
// which select the doubled-width float or integer element type) appear to
// have been dropped by the scrape.
18552 MVT NewEltVT = VT.isFloatingPoint()
18555 int NewNumElts = NumElements / 2;
18556 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18557 // Make sure that the new vector type is legal. For example, v2f64 isn't
18558 // legal on SSE1.
18559 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18560 if (V2IsZero) {
18561 // Modify the new Mask to take all zeros from the all-zero vector.
18562 // Choose indices that are blend-friendly.
18563 bool UsedZeroVector = false;
18564 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18565 "V2's non-undef elements are used?!");
18566 for (int i = 0; i != NewNumElts; ++i)
18567 if (WidenedMask[i] == SM_SentinelZero) {
18568 WidenedMask[i] = i + NewNumElts;
18569 UsedZeroVector = true;
18570 }
18571 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18572 // some elements to be undef.
18573 if (UsedZeroVector)
18574 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18575 }
18576 V1 = DAG.getBitcast(NewVT, V1);
18577 V2 = DAG.getBitcast(NewVT, V2);
18578 return DAG.getBitcast(
18579 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18580 }
18581 }
18582
18583 SmallVector<SDValue> Ops = {V1, V2};
18584 SmallVector<int> Mask(OrigMask);
18585
18586 // Canonicalize the shuffle with any horizontal ops inputs.
18587 // Don't attempt this if the shuffle can still be widened as we may lose
18588 // whole lane shuffle patterns.
18589 // NOTE: This may update Ops and Mask.
18590 if (!canWidenShuffleElements(Mask)) {
// NOTE(review): upstream line 18591 (the `if (SDValue HOp = ...` call that
// this continuation line belongs to — likely
// canonicalizeShuffleMaskWithHorizOp) appears to have been dropped.
18592 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18593 return DAG.getBitcast(VT, HOp);
18594
18595 V1 = DAG.getBitcast(VT, Ops[0]);
18596 V2 = DAG.getBitcast(VT, Ops[1]);
18597 assert(NumElements == (int)Mask.size() &&
18598 "canonicalizeShuffleMaskWithHorizOp "
18599 "shouldn't alter the shuffle mask size");
18600 }
18601
18602 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18603 // These will be materialized uniformly anyway, so make splat matching easier.
18604 // TODO: Allow all int constants?
18605 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18606 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18607 BitVector Undefs;
18608 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
// NOTE(review): upstream lines 18610-18611 (the rest of this condition —
// presumably the zeros/ones/fp-constant classification of Splat) appear
// to have been dropped by the scrape.
18609 if (Undefs.any() &&
18612 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18613 }
18614 }
18615 }
18616 return V;
18617 };
18618 V1 = CanonicalizeConstant(V1);
18619 V2 = CanonicalizeConstant(V2);
18620
18621 // Commute the shuffle if it will improve canonicalization.
// NOTE(review): upstream lines 18622-18623 (the commute test and the mask
// commutation call this block's swap belongs to) appear to have been
// dropped by the scrape.
18624 std::swap(V1, V2);
18625 }
18626
18627 // For each vector width, delegate to a specialized lowering routine.
18628 if (VT.is128BitVector())
18629 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18630
18631 if (VT.is256BitVector())
18632 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18633
18634 if (VT.is512BitVector())
18635 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18636
18637 if (Is1BitVector)
18638 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18639
// Every legal x86 vector width is handled above.
18640 llvm_unreachable("Unimplemented!");
18641}
18642
// Custom lowering for ISD::VECTOR_COMPRESS: widens illegal 128/256-bit
// inputs so a legal AVX512 vpcompress can be used instead of the generic
// expansion. 32/64-bit elements are widened to a 512-bit vector and the
// result re-extracted; small-integer-element vectors are any-extended to
// 512 bits and the compressed result truncated back.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 18645 (the function signature,
// presumably `static SDValue lowerVECTOR_COMPRESS(SDValue Op, const
// X86Subtarget &Subtarget,`) appears to have been dropped. Verify against
// upstream X86ISelLowering.cpp.
18643// As legal vpcompress instructions depend on various AVX512 extensions, try to
18644// convert illegal vector sizes to legal ones to avoid expansion.
18646 SelectionDAG &DAG) {
18647 assert(Subtarget.hasAVX512() &&
18648 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18649
18650 SDLoc DL(Op);
18651 SDValue Vec = Op.getOperand(0);
18652 SDValue Mask = Op.getOperand(1);
18653 SDValue Passthru = Op.getOperand(2);
18654
18655 EVT VecVT = Vec.getValueType();
18656 EVT ElementVT = VecVT.getVectorElementType();
18657 unsigned NumElements = VecVT.getVectorNumElements();
18658 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18659 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18660
18661 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18662 // compressed as 512-bit vectors in AVX512F.
18663 if (NumVecBits != 128 && NumVecBits != 256)
18664 return SDValue();
18665
18666 if (NumElementBits == 32 || NumElementBits == 64) {
18667 unsigned NumLargeElements = 512 / NumElementBits;
18668 MVT LargeVecVT =
18669 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18670 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18671
// Widen the data, mask, and passthru to 512 bits. The new mask lanes are
// zeroed so the extra elements are never selected by the compress.
18672 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18673 DAG, DL);
18674 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18675 Subtarget, DAG, DL);
18676 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18677 : widenSubVector(LargeVecVT, Passthru,
18678 /*ZeroNewElements=*/false,
18679 Subtarget, DAG, DL);
18680
18681 SDValue Compressed =
18682 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
// Take back the original-width prefix of the 512-bit compressed result.
18683 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18684 DAG.getConstant(0, DL, MVT::i64));
18685 }
18686
18687 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18688 VecVT == MVT::v16i16) {
// NOTE(review): "LageElementVT" is an upstream typo for "LargeElementVT";
// harmless since it is a local name.
18689 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18690 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18691
// Any-extend elements to 32/64 bits (upper bits don't matter — they are
// truncated away below), compress, then truncate back to the source type.
18692 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18693 Passthru = Passthru.isUndef()
18694 ? DAG.getUNDEF(LargeVecVT)
18695 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18696
18697 SDValue Compressed =
18698 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18699 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18700 }
18701
// Unsupported element/vector combination: fall back to default expansion.
18702 return SDValue();
18703}
18704
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 18706 (the function signature) and
// lines 18716-18718 (presumably the constant-condition check and the
// construction of `Mask` used below) appear to have been dropped. Verify
// against upstream X86ISelLowering.cpp.
18705/// Try to lower a VSELECT instruction to a vector shuffle.
18707 const X86Subtarget &Subtarget,
18708 SelectionDAG &DAG) {
18709 SDValue Cond = Op.getOperand(0);
18710 SDValue LHS = Op.getOperand(1);
18711 SDValue RHS = Op.getOperand(2);
18712 MVT VT = Op.getSimpleValueType();
18713
18714 // Only non-legal VSELECTs reach this lowering, convert those into generic
18715 // shuffles and re-use the shuffle lowering path for blends.
18719 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18720 }
18721
// Condition was not a constant blend mask: caller falls back to other
// lowering strategies.
18722 return SDValue();
18723}
18724
// Custom lowering for ISD::VSELECT: tries (in order) a soft-f16 bitcast,
// a blend-style shuffle, i1-mask passthrough, 512-bit mask conversion,
// condition-width adjustment, pre-split of 256-bit selects without AVX2,
// and finally per-type legality via the switch at the bottom.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream lines 18741-18743 (presumably the
// all-constant-operands check guarding the `return SDValue()` below)
// appear to have been dropped. Verify against upstream X86ISelLowering.cpp.
18725SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18726 SDValue Cond = Op.getOperand(0);
18727 SDValue LHS = Op.getOperand(1);
18728 SDValue RHS = Op.getOperand(2);
18729
18730 SDLoc dl(Op);
18731 MVT VT = Op.getSimpleValueType();
// Without native FP16, select on f16 vectors is done on the integer
// bit-pattern and bitcast back.
18732 if (isSoftF16(VT, Subtarget)) {
18733 MVT NVT = VT.changeVectorElementTypeToInteger();
18734 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18735 DAG.getBitcast(NVT, LHS),
18736 DAG.getBitcast(NVT, RHS)));
18737 }
18738
18739 // A vselect where all conditions and data are constants can be optimized into
18740 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18744 return SDValue();
18745
18746 // Try to lower this to a blend-style vector shuffle. This can handle all
18747 // constant condition cases.
18748 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18749 return BlendOp;
18750
18751 // If this VSELECT has a vector if i1 as a mask, it will be directly matched
18752 // with patterns on the mask registers on AVX-512.
18753 MVT CondVT = Cond.getSimpleValueType();
18754 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18755 if (CondEltSize == 1)
18756 return Op;
18757
18758 // Variable blends are only legal from SSE4.1 onward.
18759 if (!Subtarget.hasSSE41())
18760 return SDValue();
18761
18762 unsigned EltSize = VT.getScalarSizeInBits();
18763 unsigned NumElts = VT.getVectorNumElements();
18764
18765 // Expand v32i16/v64i8 without BWI.
18766 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18767 return SDValue();
18768
18769 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18770 // into an i1 condition so that we can use the mask-based 512-bit blend
18771 // instructions.
18772 if (VT.getSizeInBits() == 512) {
18773 // Build a mask by testing the condition against zero.
18774 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18775 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18776 DAG.getConstant(0, dl, CondVT),
18777 ISD::SETNE);
18778 // Now return a new VSELECT using the mask.
18779 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18780 }
18781
18782 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18783 if (CondEltSize != EltSize) {
18784 // If we don't have a sign splat, rely on the expansion.
18785 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18786 return SDValue();
18787
// Condition is an all-sign-bits value: resize it to match the data
// element width and retry as a normal VSELECT.
18788 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18789 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18790 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18791 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18792 }
18793
18794 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18795 // are free to split, then better to split before expanding the
18796 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18797 // TODO: This is very similar to narrowVectorSelect.
18798 // TODO: Add Load splitting to isFreeToSplitVector ?
18799 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18800 !Subtarget.hasXOP()) {
18801 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18802 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18803 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18804 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18805 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18806 if (FreeCond && (FreeLHS || FreeRHS))
18807 return splitVectorOp(Op, DAG, dl);
18808 }
18809
18810 // Only some types will be legal on some subtargets. If we can emit a legal
18811 // VSELECT-matching blend, return Op, and but if we need to expand, return
18812 // a null value.
18813 switch (VT.SimpleTy) {
18814 default:
18815 // Most of the vector types have blends past SSE4.1.
18816 return Op;
18817
18818 case MVT::v32i8:
18819 // The byte blends for AVX vectors were introduced only in AVX2.
18820 if (Subtarget.hasAVX2())
18821 return Op;
18822
18823 return SDValue();
18824
18825 case MVT::v8i16:
18826 case MVT::v16i16:
18827 case MVT::v8f16:
18828 case MVT::v16f16: {
18829 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18830 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18831 Cond = DAG.getBitcast(CastVT, Cond);
18832 LHS = DAG.getBitcast(CastVT, LHS);
18833 RHS = DAG.getBitcast(CastVT, RHS);
18834 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18835 return DAG.getBitcast(VT, Select);
18836 }
18837 }
18838}
18839
// SSE4.1-specific EXTRACT_VECTOR_ELT lowering: maps 8-bit extracts to
// PEXTRB, f32 extracts to EXTRACTPS-friendly forms (only when profitable),
// and passes i32/i64 extracts through unchanged.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 18840 (the function signature,
// presumably `static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
// SelectionDAG &DAG)`), line 18847 (the condition guarding the first
// `return SDValue()`), and lines 18853-18854 (the condition guarding the
// idx-0 TRUNCATE fast path) appear to have been dropped. Verify against
// upstream X86ISelLowering.cpp.
18841 MVT VT = Op.getSimpleValueType();
18842 SDValue Vec = Op.getOperand(0);
18843 SDValue Idx = Op.getOperand(1);
18844 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18845 SDLoc dl(Op);
18846
18848 return SDValue();
18849
18850 if (VT.getSizeInBits() == 8) {
18851 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18852 // we're going to zero extend the register or fold the store.
18855 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18856 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18857 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18858
18859 unsigned IdxVal = Idx->getAsZExtVal();
18860 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18861 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18862 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18863 }
18864
18865 if (VT == MVT::f32) {
18866 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18867 // the result back to FR32 register. It's only worth matching if the
18868 // result has a single use which is a store or a bitcast to i32. And in
18869 // the case of a store, it's not worth it if the index is a constant 0,
18870 // because a MOVSSmr can be used instead, which is smaller and faster.
18871 if (!Op.hasOneUse())
18872 return SDValue();
18873 SDNode *User = *Op.getNode()->user_begin();
18874 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18875 (User->getOpcode() != ISD::BITCAST ||
18876 User->getValueType(0) != MVT::i32))
18877 return SDValue();
// Extract through the integer domain; the f32 bit-pattern is unchanged.
18878 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18879 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18880 return DAG.getBitcast(MVT::f32, Extract);
18881 }
18882
// i32/i64 extracts are directly legal (PEXTRD/PEXTRQ patterns).
18883 if (VT == MVT::i32 || VT == MVT::i64)
18884 return Op;
18885
18886 return SDValue();
18887}
18888
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 18891 (the function signature,
// presumably `static SDValue ExtractBitFromMaskVector(SDValue Op,
// SelectionDAG &DAG,`) and line 18911 (the definition of `IntVT` used in
// the return below) appear to have been dropped. Verify against upstream
// X86ISelLowering.cpp.
18889/// Extract one bit from mask vector, like v16i1 or v8i1.
18890/// AVX-512 feature.
18892 const X86Subtarget &Subtarget) {
18893 SDValue Vec = Op.getOperand(0);
18894 SDLoc dl(Vec);
18895 MVT VecVT = Vec.getSimpleValueType();
18896 SDValue Idx = Op.getOperand(1);
18897 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18898 MVT EltVT = Op.getSimpleValueType();
18899
18900 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18901 "Unexpected vector type in ExtractBitFromMaskVector");
18902
18903 // variable index can't be handled in mask registers,
18904 // extend vector to VR512/128
18905 if (!IdxC) {
18906 unsigned NumElts = VecVT.getVectorNumElements();
18907 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18908 // than extending to 128/256bit.
18909 if (NumElts == 1) {
18910 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18912 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18913 }
// Sign-extend the i1 elements to a 128-bit integer vector, extract there,
// and truncate the scalar result back to the requested element type.
18914 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18915 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18916 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18917 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18918 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18919 }
18920
18921 unsigned IdxVal = IdxC->getZExtValue();
18922 if (IdxVal == 0) // the operation is legal
18923 return Op;
18924
18925 // Extend to natively supported kshift.
18926 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18927
18928 // Use kshiftr instruction to move to the lower element.
18929 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18930 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18931
// Bit now sits in lane 0, where the extract is directly legal.
18932 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18933 DAG.getVectorIdxConstant(0, dl));
18934}
18935
// Walks all users of vector node N and accumulates which elements are
// actually extracted (via PEXTRB/PEXTRW or through bitcasts of N); any
// user it cannot analyze makes the result conservatively all-ones.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 18937 (the function signature,
// presumably `static APInt getExtractedDemandedElts(SDNode *N) {`) and
// lines 18945-18946 (the opening of the guard that bails when the extract
// index is not a constant) appear to have been dropped. Verify against
// upstream X86ISelLowering.cpp.
18936// Helper to find all the extracted elements from a vector.
18938 MVT VT = N->getSimpleValueType(0);
18939 unsigned NumElts = VT.getVectorNumElements();
18940 APInt DemandedElts = APInt::getZero(NumElts);
18941 for (SDNode *User : N->users()) {
18942 switch (User->getOpcode()) {
18943 case X86ISD::PEXTRB:
18944 case X86ISD::PEXTRW:
18947 DemandedElts.setAllBits();
18948 return DemandedElts;
18949 }
18950 DemandedElts.setBit(User->getConstantOperandVal(1));
18951 break;
18952 case ISD::BITCAST: {
// Non-simple or non-vector bitcast destinations can't be tracked —
// assume every element is demanded.
18953 if (!User->getValueType(0).isSimple() ||
18954 !User->getValueType(0).isVector()) {
18955 DemandedElts.setAllBits();
18956 return DemandedElts;
18957 }
// Recurse through the bitcast and rescale its demanded-element mask to
// this node's element count.
18958 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18959 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18960 break;
18961 }
18962 default:
18963 DemandedElts.setAllBits();
18964 return DemandedElts;
18965 }
18966 }
18967 return DemandedElts;
18968}
18969
// Custom lowering for ISD::EXTRACT_VECTOR_ELT. i1 mask vectors go through
// ExtractBitFromMaskVector; variable indices are left for the
// via-memory expansion; 256/512-bit sources are first narrowed to the
// containing 128-bit chunk; the remaining 128-bit cases use PEXTRW, the
// SSE4.1 helper, sub-byte extraction from a shared DWORD/WORD, or a
// shuffle-to-lane-0 followed by a legal lane-0 extract.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number (content otherwise appears contiguous for this
// function). Verify against upstream X86ISelLowering.cpp before reuse.
18970SDValue
18971X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18972 SelectionDAG &DAG) const {
18973 SDLoc dl(Op);
18974 SDValue Vec = Op.getOperand(0);
18975 MVT VecVT = Vec.getSimpleValueType();
18976 SDValue Idx = Op.getOperand(1);
18977 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18978
18979 if (VecVT.getVectorElementType() == MVT::i1)
18980 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18981
18982 if (!IdxC) {
18983 // Its more profitable to go through memory (1 cycles throughput)
18984 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18985 // IACA tool was used to get performance estimation
18986 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18987 //
18988 // example : extractelement <16 x i8> %a, i32 %i
18989 //
18990 // Block Throughput: 3.00 Cycles
18991 // Throughput Bottleneck: Port5
18992 //
18993 // | Num Of | Ports pressure in cycles | |
18994 // | Uops | 0 - DV | 5 | 6 | 7 | |
18995 // ---------------------------------------------
18996 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18997 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18998 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18999 // Total Num Of Uops: 4
19000 //
19001 //
19002 // Block Throughput: 1.00 Cycles
19003 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19004 //
19005 // | | Ports pressure in cycles | |
19006 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19007 // ---------------------------------------------------------
19008 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19009 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19010 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19011 // Total Num Of Uops: 4
19012
// Returning SDValue() lets the generic legalizer expand via a stack slot.
19013 return SDValue();
19014 }
19015
19016 unsigned IdxVal = IdxC->getZExtValue();
19017
19018 // If this is a 256-bit vector result, first extract the 128-bit vector and
19019 // then extract the element from the 128-bit vector.
19020 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19021 // Get the 128-bit vector.
19022 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19023 MVT EltVT = VecVT.getVectorElementType();
19024
19025 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19026 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19027
19028 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19029 // this can be done with a mask.
19030 IdxVal &= ElemsPerChunk - 1;
19031 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19032 DAG.getVectorIdxConstant(IdxVal, dl));
19033 }
19034
19035 assert(VecVT.is128BitVector() && "Unexpected vector length");
19036
19037 MVT VT = Op.getSimpleValueType();
19038
19039 if (VT == MVT::i16) {
19040 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19041 // we're going to zero extend the register or fold the store (SSE41 only).
19042 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19043 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19044 if (Subtarget.hasFP16())
19045 return Op;
19046
// Extract the containing i32 lane and truncate; avoids a PEXTRW.
19047 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19048 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19049 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19050 }
19051
19052 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19053 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19054 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19055 }
19056
19057 if (Subtarget.hasSSE41())
19058 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19059 return Res;
19060
19061 // Only extract a single element from a v16i8 source - determine the common
19062 // DWORD/WORD that all extractions share, and extract the sub-byte.
19063 // TODO: Add QWORD MOVQ extraction?
19064 if (VT == MVT::i8) {
19065 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
19066 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
19067
19068 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19069 int DWordIdx = IdxVal / 4;
// Low mask 15 == bytes 0-3, i.e. all demanded bytes live in DWORD 0.
19070 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19071 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19072 DAG.getBitcast(MVT::v4i32, Vec),
19073 DAG.getVectorIdxConstant(DWordIdx, dl));
19074 int ShiftVal = (IdxVal % 4) * 8;
19075 if (ShiftVal != 0)
19076 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19077 DAG.getConstant(ShiftVal, dl, MVT::i8));
19078 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19079 }
19080
19081 int WordIdx = IdxVal / 2;
// All demanded bytes fall inside the two-byte WORD at WordIdx.
19082 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19083 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19084 DAG.getBitcast(MVT::v8i16, Vec),
19085 DAG.getVectorIdxConstant(WordIdx, dl));
19086 int ShiftVal = (IdxVal % 2) * 8;
19087 if (ShiftVal != 0)
19088 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19089 DAG.getConstant(ShiftVal, dl, MVT::i8));
19090 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19091 }
19092 }
19093
19094 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19095 if (IdxVal == 0)
19096 return Op;
19097
19098 // Shuffle the element to the lowest element, then movss or movsh.
19099 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19100 Mask[0] = static_cast<int>(IdxVal);
19101 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19102 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19103 DAG.getVectorIdxConstant(0, dl));
19104 }
19105
19106 if (VT.getSizeInBits() == 64) {
19107 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19108 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19109 // to match extract_elt for f64.
19110 if (IdxVal == 0)
19111 return Op;
19112
19113 // UNPCKHPD the element to the lowest double word, then movsd.
19114 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19115 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19116 int Mask[2] = { 1, -1 };
19117 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19118 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19119 DAG.getVectorIdxConstant(0, dl));
19120 }
19121
// No special handling possible: defer to default expansion.
19122 return SDValue();
19123}
19124
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream line 19127 (the function signature,
// presumably `static SDValue InsertBitToMaskVector(SDValue Op,
// SelectionDAG &DAG,`) appears to have been dropped. Verify against
// upstream X86ISelLowering.cpp.
19125/// Insert one bit to mask vector, like v16i1 or v8i1.
19126/// AVX-512 feature.
19128 const X86Subtarget &Subtarget) {
19129 SDLoc dl(Op);
19130 SDValue Vec = Op.getOperand(0);
19131 SDValue Elt = Op.getOperand(1);
19132 SDValue Idx = Op.getOperand(2);
19133 MVT VecVT = Vec.getSimpleValueType();
19134
19135 if (!isa<ConstantSDNode>(Idx)) {
19136 // Non constant index. Extend source and destination,
19137 // insert element and then truncate the result.
19138 unsigned NumElts = VecVT.getVectorNumElements();
// Pick a 128-bit-total integer element type when possible, i8 otherwise.
19139 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19140 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19141 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19142 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19143 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19144 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19145 }
19146
19147 // Copy into a k-register, extract to v1i1 and insert_subvector.
19148 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19149 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19150}
19151
// Custom lowering for ISD::INSERT_VECTOR_ELT. Handles i1 mask inserts via
// InsertBitToMaskVector, bf16 via the integer domain, variable indices via
// a compare+select against an index splat (when profitable), constant-0/-1
// element inserts as OR/blend shuffles, 256/512-bit vectors via a 128-bit
// chunk round-trip (or broadcast+blend), and the remaining 128-bit cases
// via movw/movd/movq-style moves, PINSRB/PINSRW, BLENDI, or INSERTPS.
// NOTE(review): this listing is a scrape that prefixes each line with its
// upstream line number; upstream lines 19201 (the tail of the
// getSelectCC call, presumably the SETEQ condition-code argument), 19312
// and 19316 (the `Opc = ` assignments for PINSRW/PINSRB) appear to have
// been dropped. Verify against upstream X86ISelLowering.cpp.
19152SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19153 SelectionDAG &DAG) const {
19154 MVT VT = Op.getSimpleValueType();
19155 MVT EltVT = VT.getVectorElementType();
19156 unsigned NumElts = VT.getVectorNumElements();
19157 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19158
19159 if (EltVT == MVT::i1)
19160 return InsertBitToMaskVector(Op, DAG, Subtarget);
19161
19162 SDLoc dl(Op);
19163 SDValue N0 = Op.getOperand(0);
19164 SDValue N1 = Op.getOperand(1);
19165 SDValue N2 = Op.getOperand(2);
19166 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19167
// bf16 has no native insert: do the insert on the same-width integer
// vector and bitcast back.
19168 if (EltVT == MVT::bf16) {
19169 MVT IVT = VT.changeVectorElementTypeToInteger();
19170 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
19171 DAG.getBitcast(IVT, N0),
19172 DAG.getBitcast(MVT::i16, N1), N2);
19173 return DAG.getBitcast(VT, Res);
19174 }
19175
19176 if (!N2C) {
19177 // Variable insertion indices, usually we're better off spilling to stack,
19178 // but AVX512 can use a variable compare+select by comparing against all
19179 // possible vector indices, and FP insertion has less gpr->simd traffic.
19180 if (!(Subtarget.hasBWI() ||
19181 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19182 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
19183 return SDValue();
19184
19185 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19186 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19187 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19188 return SDValue();
19189
19190 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19191 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19192 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19193
19194 SmallVector<SDValue, 16> RawIndices;
19195 for (unsigned I = 0; I != NumElts; ++I)
19196 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19197 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19198
19199 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19200 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19202 }
19203
// Out-of-range constant index: let the generic code deal with it.
19204 if (N2C->getAPIntValue().uge(NumElts))
19205 return SDValue();
19206 uint64_t IdxVal = N2C->getZExtValue();
19207
19208 bool IsZeroElt = X86::isZeroNode(N1);
19209 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19210
19211 if (IsZeroElt || IsAllOnesElt) {
19212 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
19213 // We don't deal with i8 0 since it appears to be handled elsewhere.
19214 if (IsAllOnesElt &&
19215 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
19216 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
19217 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
19218 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19219 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19220 CstVectorElts[IdxVal] = OnesCst;
19221 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19222 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19223 }
19224 // See if we can do this more efficiently with a blend shuffle with a
19225 // rematerializable vector.
19226 if (Subtarget.hasSSE41() &&
19227 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19228 SmallVector<int, 8> BlendMask;
19229 for (unsigned i = 0; i != NumElts; ++i)
19230 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19231 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19232 : getOnesVector(VT, DAG, dl);
19233 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19234 }
19235 }
19236
19237 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19238 // into that, and then insert the subvector back into the result.
19239 if (VT.is256BitVector() || VT.is512BitVector()) {
19240 // With a 256-bit vector, we can insert into the zero element efficiently
19241 // using a blend if we have AVX or AVX2 and the right data type.
19242 if (VT.is256BitVector() && IdxVal == 0) {
19243 // TODO: It is worthwhile to cast integer to floating point and back
19244 // and incur a domain crossing penalty if that's what we'll end up
19245 // doing anyway after extracting to a 128-bit vector.
19246 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19247 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19248 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19249 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19250 DAG.getTargetConstant(1, dl, MVT::i8));
19251 }
19252 }
19253
19254 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19255 assert(isPowerOf2_32(NumEltsIn128) &&
19256 "Vectors will always have power-of-two number of elements.");
19257
19258 // If we are not inserting into the low 128-bit vector chunk,
19259 // then prefer the broadcast+blend sequence.
19260 // FIXME: relax the profitability check iff all N1 uses are insertions.
19261 if (IdxVal >= NumEltsIn128 &&
19262 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19263 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19264 X86::mayFoldLoad(N1, Subtarget)))) {
19265 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19266 SmallVector<int, 8> BlendMask;
19267 for (unsigned i = 0; i != NumElts; ++i)
19268 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19269 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19270 }
19271
19272 // Get the desired 128-bit vector chunk.
19273 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19274
19275 // Insert the element into the desired chunk.
19276 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19277 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19278
19279 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19280 DAG.getVectorIdxConstant(IdxIn128, dl));
19281
19282 // Insert the changed part back into the bigger vector
19283 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19284 }
19285 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19286
19287 // This will be just movw/movd/movq/movsh/movss/movsd.
19288 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19289 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19290 EltVT == MVT::f16 || EltVT == MVT::i64) {
19291 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19292 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19293 }
19294
19295 // We can't directly insert an i8 or i16 into a vector, so zero extend
19296 // it to i32 first.
19297 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19298 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19299 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19300 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19301 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19302 return DAG.getBitcast(VT, N1);
19303 }
19304 }
19305
19306 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
19307 // argument. SSE41 required for pinsrb.
19308 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19309 unsigned Opc;
19310 if (VT == MVT::v8i16) {
19311 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19313 } else {
19314 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19315 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19317 }
19318
19319 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19320 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19321 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19322 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19323 }
19324
19325 if (Subtarget.hasSSE41()) {
19326 if (EltVT == MVT::f32) {
19327 // Bits [7:6] of the constant are the source select. This will always be
19328 // zero here. The DAG Combiner may combine an extract_elt index into
19329 // these bits. For example (insert (extract, 3), 2) could be matched by
19330 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19331 // Bits [5:4] of the constant are the destination select. This is the
19332 // value of the incoming immediate.
19333 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19334 // combine either bitwise AND or insert of float 0.0 to set these bits.
19335
19336 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19337 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19338 // If this is an insertion of 32-bits into the low 32-bits of
19339 // a vector, we prefer to generate a blend with immediate rather
19340 // than an insertps. Blends are simpler operations in hardware and so
19341 // will always have equal or better performance than insertps.
19342 // But if optimizing for size and there's a load folding opportunity,
19343 // generate insertps because blendps does not have a 32-bit memory
19344 // operand form.
19345 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19346 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19347 DAG.getTargetConstant(1, dl, MVT::i8));
19348 }
19349 // Create this as a scalar to vector..
19350 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19351 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19352 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19353 }
19354
19355 // PINSR* works with constant index.
19356 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19357 return Op;
19358 }
19359
// Anything else (e.g. v16i8 without SSE4.1): fall back to expansion.
19360 return SDValue();
19361}
19362
// Lower ISD::FLDEXP (result = X * 2^Exp) using the AVX-512 SCALEF/SCALEFS
// nodes where the target supports them; returns SDValue() to fall back to
// generic expansion for unhandled types.
static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);   // mantissa operand
  MVT XTy = X.getSimpleValueType();
  SDValue Exp = Op.getOperand(1); // integer exponent operand

  switch (XTy.SimpleTy) {
  default:
    // Unhandled type: let generic lowering deal with it.
    return SDValue();
  case MVT::f16:
    // Without native FP16 arithmetic, promote the scalar to f32 first.
    if (!Subtarget.hasFP16())
      X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
    [[fallthrough]];
  case MVT::f32:
  case MVT::f64: {
    // Scalars: build a 128-bit vector around the scalar, use the scalar
    // SCALEFS form, extract lane 0 and round back to the original type.
    MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
                              128 / X.getSimpleValueType().getSizeInBits());
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
    SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
    SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
    SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
    return DAG.getFPExtendOrRound(Final, DL, XTy);
  }
  case MVT::v4f32:
  case MVT::v2f64:
  case MVT::v8f32:
  case MVT::v4f64:
  case MVT::v16f32:
  case MVT::v8f64:
    // 512-bit vectors (or narrower ones when VLX is available) map directly
    // onto SCALEF; otherwise fall through to the widen-to-512 path below.
    if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    break;
  case MVT::v8f16:
  case MVT::v16f16:
    if (Subtarget.hasFP16()) {
      if (Subtarget.hasVLX()) {
        Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
        return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
      }
      // FP16 but no VLX: widen to 512 bits below.
      break;
    }
    // No FP16 arithmetic: promote the lanes to f32 and the exponent to a
    // matching integer vector, then widen to 512 bits below.
    X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
    Exp = DAG.getSExtOrTrunc(Exp, DL,
                             X.getSimpleValueType().changeTypeToInteger());
    break;
  case MVT::v32f16:
    if (Subtarget.hasFP16()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    // No FP16: split in half and lower each half independently.
    return splitVectorOp(Op, DAG, DL);
  }
  // Widen narrow vectors to 512 bits so the plain AVX-512 SCALEF applies,
  // then extract the original-width subvector from the result.
  SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
  SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
  // NOTE(review): this SINT_TO_FP result is never used below (SCALEF consumes
  // WideExp, not Exp), and its result type is taken from WideExp, which is an
  // integer vector here — confirm whether the int->fp converted, widened
  // exponent was intended to feed SCALEF instead.
  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp);
  SDValue Scalef =
      DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
  SDValue Final =
      DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
  return DAG.getFPExtendOrRound(Final, DL, XTy);
}
19428
// Lower SCALAR_TO_VECTOR for integer SSE types. (The opening signature line
// was lost in extraction; the function takes Op and the subtarget.)
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps and simplifies
  // further combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
                     OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector into the low lanes of an undef wide vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
  // tblgen.
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
    return Op;

  // Otherwise widen the scalar to i32, build a v4i32, and bitcast to the
  // requested integer vector type.
  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
19464
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
                                     SelectionDAG &DAG) {
  // Only vXi1 mask vectors reach this custom lowering; everything else is
  // handled generically or by patterns.
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}
19474
// Lower EXTRACT_SUBVECTOR of vXi1 mask vectors via KSHIFTR. (Signature start
// lost in extraction.)
                                       SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  uint64_t IdxVal = Op.getConstantOperandVal(1);

  if (IdxVal == 0) // the operation is legal
    return Op;

  // Widen the mask to a width with a natively supported kshift.
  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);

  // Shift the wanted bits down to the LSB positions.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  // Extract the low subvector, which now holds the requested elements.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
                     DAG.getVectorIdxConstant(0, dl));
}
19497
19498// Returns the appropriate wrapper opcode for a global reference.
19499unsigned X86TargetLowering::getGlobalWrapperKind(
19500 const GlobalValue *GV, const unsigned char OpFlags) const {
19501 // References to absolute symbols are never PC-relative.
19502 if (GV && GV->isAbsoluteSymbolRef())
19503 return X86ISD::Wrapper;
19504
19505 // The following OpFlags under RIP-rel PIC use RIP.
19506 if (Subtarget.isPICStyleRIPRel() &&
19507 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19508 OpFlags == X86II::MO_DLLIMPORT))
19509 return X86ISD::WrapperRIP;
19510
19511 // GOTPCREL references must always use RIP.
19512 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19513 return X86ISD::WrapperRIP;
19514
19515 return X86ISD::Wrapper;
19516}
19517
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}
19547
19548SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19549 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19550
19551 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19552 // global base reg.
19553 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19554
19555 EVT PtrVT = Op.getValueType();
19556 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19557 SDLoc DL(JT);
19558 Result =
19559 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19560
19561 // With PIC, the address is actually $g + Offset.
19562 if (OpFlag)
19563 Result =
19564 DAG.getNode(ISD::ADD, DL, PtrVT,
19565 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19566
19567 return Result;
19568}
19569
19570SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19571 SelectionDAG &DAG) const {
19572 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19573}
19574
19575SDValue
19576X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19577 // Create the TargetBlockAddressAddress node.
19578 unsigned char OpFlags =
19579 Subtarget.classifyBlockAddressReference();
19580 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19581 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19582 SDLoc dl(Op);
19583 EVT PtrVT = Op.getValueType();
19584 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19585 Result =
19586 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19587
19588 // With PIC, the address is actually $g + Offset.
19589 if (isGlobalRelativeToPICBase(OpFlags)) {
19590 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19591 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19592 }
19593
19594 return Result;
19595}
19596
/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall,
                                                 bool *IsImpCall) const {
  // Unpack the global address or external symbol.
  SDLoc dl(Op);
  const GlobalValue *GV = nullptr;
  int64_t Offset = 0;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = G->getGlobal();
    Offset = G->getOffset();
  } else {
    const auto *ES = cast<ExternalSymbolSDNode>(Op);
    ExternalSym = ES->getSymbol();
  }

  // Calculate some flags for address lowering.
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  // PIC-base-relative references need a $g + offset add below; stub
  // references need an extra load through the stub/GOT slot.
  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  bool NeedsLoad = isGlobalStubReference(OpFlags);

  EVT PtrVT = Op.getValueType();

  if (GV) {
    // Create a target global address if this is a global. If possible, fold the
    // offset into the global address reference. Otherwise, ADD it on later.
    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
    // allowed because if the address of foo is 0, the ELF R_X86_64_32
    // relocation will compute to a negative value, which is invalid.
    int64_t GlobalOffset = 0;
    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
      std::swap(GlobalOffset, Offset);
    }
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
  } else {
    // If this is not a global address, this must be an external symbol.
    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
  }

  // If this is a direct call, avoid the wrapper if we don't need to do any
  // loads or adds. This allows SDAG ISel to match direct calls.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;

  // If Import Call Optimization is enabled and this is an imported function
  // then make a note of it and return the global address without wrapping.
  if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
      Mod.getModuleFlag("import-call-optimization")) {
    assert(ForCall && "Should only enable import call optimization if we are "
                      "lowering a call");
    *IsImpCall = true;
    return Result;
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (HasPICReg) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (NeedsLoad)
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getSignedConstant(Offset, dl, PtrVT));

  return Result;
}
19683
19684SDValue
19685X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19686 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19687}
19688
// Build the X86ISD::TLSDESC / TLSBASEADDR / TLSADDR call sequence used by the
// general- and local-dynamic TLS models. The address comes back in ReturnReg;
// for TLSDESC the thread-pointer base is loaded and added on top.
// (Signature start lost in extraction.)
                           const EVT PtrVT, unsigned ReturnReg,
                           unsigned char OperandFlags,
                           bool LoadGlobalBaseReg = false,
                           bool LocalDynamic = false) {
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA;
  bool UseTLSDESC = DAG.getTarget().useTLSDESC();
  SDValue Chain = DAG.getEntryNode();
  SDValue Ret;
  if (LocalDynamic && UseTLSDESC) {
    TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
    // Reuse existing GetTLSADDR node if we can find it.
    if (TGA->hasOneUse()) {
      // TLSDESC uses TGA.
      SDNode *TLSDescOp = *TGA->user_begin();
      assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
             "Unexpected TLSDESC DAG");
      // CALLSEQ_END uses TGA via a chain and glue.
      auto *CallSeqEndOp = TLSDescOp->getGluedUser();
      assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
             "Unexpected TLSDESC DAG");
      // CopyFromReg uses CALLSEQ_END via a chain and glue.
      auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
      assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
             "Unexpected TLSDESC DAG");
      Ret = SDValue(CopyFromRegOp, 0);
    }
  } else {
    TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                     GA->getOffset(), OperandFlags);
  }

  // No reusable node was found: emit the call sequence ourselves.
  if (!Ret) {
    X86ISD::NodeType CallType = UseTLSDESC     ? X86ISD::TLSDESC
                                : LocalDynamic ? X86ISD::TLSBASEADDR

    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
    if (LoadGlobalBaseReg) {
      // 32-bit PIC calling convention: the GOT base must be in EBX.
      SDValue InGlue;
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
                               InGlue);
      InGlue = Chain.getValue(1);
      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
    } else {
      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
    }
    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);

    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
    MFI.setHasCalls(true);

    SDValue Glue = Chain.getValue(1);
    Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
  }

  if (!UseTLSDESC)
    return Ret;

  // TLSDESC yields an offset from the thread pointer: load %fs:0 (64-bit)
  // or %gs:0 (32-bit) and add it to the returned value.
  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
  unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;

  SDValue Offset =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));
  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
}
19761
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
                                const EVT PtrVT) {
  // 32-bit GD: result comes back in EAX and the call needs the GOT base
  // loaded into EBX.
  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
                    /*LoadGlobalBaseReg=*/true);
}
19769
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
                                const EVT PtrVT) {
  // LP64: the TLS address is returned in RAX; no global base register needed.
  return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
}
19776
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
static SDValue
                                const EVT PtrVT) {
  // X32 (ILP32 on x86-64): 32-bit pointers, so the result lands in EAX.
  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
19783
19785 SelectionDAG &DAG, const EVT PtrVT,
19786 bool Is64Bit, bool Is64BitLP64) {
19787 SDLoc dl(GA);
19788
19789 // Get the start address of the TLS block for this module.
19793
19794 SDValue Base;
19795 if (Is64Bit) {
19796 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19797 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19798 /*LoadGlobalBaseReg=*/false,
19799 /*LocalDynamic=*/true);
19800 } else {
19801 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19802 /*LoadGlobalBaseReg=*/true,
19803 /*LocalDynamic=*/true);
19804 }
19805
19806 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19807 // of Base.
19808
19809 // Build x@dtpoff.
19810 unsigned char OperandFlags = X86II::MO_DTPOFF;
19811 unsigned WrapperKind = X86ISD::Wrapper;
19812 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19813 GA->getValueType(0),
19814 GA->getOffset(), OperandFlags);
19815 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19816
19817 // Add x@dtpoff with the base.
19818 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19819}
19820
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// (Signature start lost in extraction.)
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    // Initial-exec offsets live in the GOT: add the PIC base on 32-bit PIC,
    // then load the actual thread-pointer offset.
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
19875
// Dispatch TLS address lowering by platform: ELF uses the model-specific
// helpers above, Darwin uses TLSCALL, Windows uses the implicit TLS layout.
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = Op.getValueType();
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
      if (Subtarget.is64Bit()) {
        if (Subtarget.isTarget64BitLP64())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
      }
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
                                         Subtarget.isTarget64BitLP64());
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = 0;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32) {
      OpFlag = X86II::MO_TLVP_PIC_BASE;
      WrapperKind = X86ISD::Wrapper;
    } else {
      OpFlag = X86II::MO_TLVP;
      WrapperKind = X86ISD::WrapperRIP;
    }
    SDLoc DL(Op);
                                               GA->getValueType(0),
                                               GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    // ; from TEB
    // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
    // mov rcx, qword [rdx+rcx*8]
    // mov eax, .tls$:tlsvar
    // [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
        Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      // Scale the index by the pointer size to index the TLS slot array.
      const DataLayout &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}
20021
// NOTE(review): the enclosing signature was lost in extraction; the body
// decides whether a TLS global's address can be folded into an addressing
// mode (%fs-relative) — TLS models that resolve to a static thread-pointer
// offset say yes, dynamic models say no.
  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
    const TargetMachine &TM = getTargetMachine();
    TLSModel::Model Model = TM.getTLSModel(&GV);
    switch (Model) {
      // We can include the %fs segment register in addressing modes.
      return true;
      // These models do not result in %fs relative addresses unless
      // TLS descriptors are used.
      //
      // Even in the case of TLS descriptors we currently have no way to model
      // the difference between %fs access and the computations needed for the
      // offset and returning `true` for TLS-desc currently duplicates both
      // which is detrimental :-/
      return false;
    }
  }
  return false;
}
20045
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
  SDValue Lo, Hi;
  // Generic TLI expansion produces the low/high halves; merge them into the
  // two-result node callers expect.
  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
20054
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict FP nodes carry the chain as operand 0; the value follows it.
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  // Only applies to scalar i64 -> f32/f64 on 32-bit targets with DQI.
  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  // Using 256-bit to ensure result is 128-bits for f32 case.
  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
  MVT VecVT = MVT::getVectorVT(VT, NumElts);

  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
  if (IsStrict) {
    // Preserve the chain for strict-FP semantics.
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getVectorIdxConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getVectorIdxConstant(0, dl));
}
20097
// Try to use a packed vector operation to handle i64 on 32-bit targets.
                                  const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  // For strict nodes the chain is operand 0 and the value is operand 1.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  // Only applies to scalar i64 -> f16 on 32-bit targets.
  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  assert(Subtarget.hasFP16() && "Expected FP16");

  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  if (IsStrict) {
    // Preserve the chain for strict-FP semantics.
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getVectorIdxConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getVectorIdxConstant(0, dl));
}
20133
20134static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20135 const X86Subtarget &Subtarget) {
20136 switch (Opcode) {
20137 case ISD::SINT_TO_FP:
20138 // TODO: Handle wider types with AVX/AVX512.
20139 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20140 return false;
20141 // CVTDQ2PS or (V)CVTDQ2PD
20142 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20143
20144 case ISD::UINT_TO_FP:
20145 // TODO: Handle wider types and i64 elements.
20146 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20147 return false;
20148 // VCVTUDQ2PS or VCVTUDQ2PD
20149 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20150
20151 default:
20152 return false;
20153 }
20154}
20155
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  SDValue Extract = Cast.getOperand(0);
  MVT DestVT = Cast.getSimpleValueType();
  // Only handle extracts with a constant element index.
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  // See if we have a 128-bit vector cast op for this type of cast.
  SDValue VecOp = Extract.getOperand(0);
  MVT FromVT = VecOp.getSimpleValueType();
  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
    return SDValue();

  // If we are extracting from a non-zero element, first shuffle the source
  // vector to allow extracting from element zero.
  if (!isNullConstant(Extract.getOperand(1))) {
    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
    Mask[0] = Extract.getConstantOperandVal(1);
    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
  }
  // If the source vector is wider than 128-bits, extract the low part. Do not
  // create an unnecessarily wide vector cast op.
  if (FromVT != Vec128VT)
    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
                     DAG.getVectorIdxConstant(0, DL));
}
20197
20198 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20199 /// try to vectorize the cast ops. This will avoid an expensive round-trip
20200 /// between XMM and GPR.
20201 static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
20202 SelectionDAG &DAG,
20203 const X86Subtarget &Subtarget) {
// Match a scalar fp-to-int feeding this int-to-fp; vector results are
// handled by other lowering paths.
20204 SDValue CastToInt = CastToFP.getOperand(0);
20205 MVT VT = CastToFP.getSimpleValueType();
20206 if ((CastToInt.getOpcode() != ISD::FP_TO_SINT &&
20207 CastToInt.getOpcode() != ISD::FP_TO_UINT) ||
20208 VT.isVector())
20209 return SDValue();
20210 
20211 MVT IntVT = CastToInt.getSimpleValueType();
20212 SDValue X = CastToInt.getOperand(0);
20213 MVT SrcVT = X.getSimpleValueType();
20214 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20215 return SDValue();
20216 
20217 // See if we have 128-bit vector cast instructions for this type of cast.
20218 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20219 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20220 (IntVT != MVT::i32 && IntVT != MVT::i64))
20221 return SDValue();
20222 
20223 unsigned SrcSize = SrcVT.getSizeInBits();
20224 unsigned IntSize = IntVT.getSizeInBits();
20225 unsigned VTSize = VT.getSizeInBits();
20226 bool IsUnsigned = CastToInt.getOpcode() == ISD::FP_TO_UINT;
// When the element sizes on either side of a conversion differ, the vector
// element counts differ too, so the generic ISD opcodes cannot be used;
// fall back to the X86 packed-conversion nodes that allow that mismatch.
20227 unsigned ToIntOpcode =
20228 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20229 unsigned ToFPOpcode =
20230 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20231 unsigned Width = 128;
20232 
20233 if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20234 // AVX512DQ+VLX
// Unsigned conversions are directly available, so use the unsigned forms.
20235 if (IsUnsigned) {
20236 ToIntOpcode =
20237 SrcSize != IntSize ? X86ISD::CVTTP2UI : (unsigned)ISD::FP_TO_UINT;
20238 ToFPOpcode =
20239 IntSize != VTSize ? X86ISD::CVTUI2P : (unsigned)ISD::UINT_TO_FP;
20240 }
20241 } else {
20242 if (IsUnsigned || IntVT == MVT::i64) {
20243 // SSE2 can only perform f64/f32 <-> i32 signed.
20244 if (!Subtarget.useAVX512Regs() || !Subtarget.hasDQI())
20245 return SDValue();
20246 
20247 // Need to extend width for AVX512DQ without AVX512VL.
20248 Width = 512;
20249 ToIntOpcode = CastToInt.getOpcode();
20250 ToFPOpcode = IsUnsigned ? ISD::UINT_TO_FP : ISD::SINT_TO_FP;
20251 }
20252 }
20253 
20254 MVT VecSrcVT, VecIntVT, VecVT;
20255 unsigned NumElts;
20256 unsigned SrcElts, VTElts;
20257 // Some conversions are only legal with uniform vector sizes on AVX512DQ.
// At 512 bits, pick one element count that fits both sides; at 128 bits,
// each type independently fills an XMM register.
20258 if (Width == 512) {
20259 NumElts = std::min(Width / IntSize, Width / SrcSize);
20260 SrcElts = NumElts;
20261 VTElts = NumElts;
20262 } else {
20263 NumElts = Width / IntSize;
20264 SrcElts = Width / SrcSize;
20265 VTElts = Width / VTSize;
20266 }
20267 VecIntVT = MVT::getVectorVT(IntVT, NumElts);
20268 VecSrcVT = MVT::getVectorVT(SrcVT, SrcElts);
20269 VecVT = MVT::getVectorVT(VT, VTElts);
20270 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20271 //
20272 // We are not defining the high elements (for example, zero them) because
20273 // that could nullify any performance advantage that we hoped to gain from
20274 // this vector op hack. We do not expect any adverse effects (like denorm
20275 // penalties) with cast ops.
20276 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20277 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20278 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20279 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20280 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20281 }
20282
// lowerINT_TO_FP_vXi64: custom lowering for [STRICT_][S|U]INT_TO_FP with a
// v2i64/v4i64 source (see the asserts below and the callers).
// NOTE(review): the signature line was lost in extraction; the first visible
// line below is a parameter.
20284 SelectionDAG &DAG,
20285 const X86Subtarget &Subtarget) {
20286 bool IsStrict = Op->isStrictFPOpcode();
20287 MVT VT = Op->getSimpleValueType(0);
20288 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20289 
20290 if (Subtarget.hasDQI()) {
20291 assert(!Subtarget.hasVLX() && "Unexpected features");
20292 
20293 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20294 Src.getSimpleValueType() == MVT::v4i64) &&
20295 "Unsupported custom type");
20296 
20297 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20298 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20299 "Unexpected VT!");
20300 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20301 
20302 // Need to concat with zero vector for strict fp to avoid spurious
20303 // exceptions.
20304 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20305 : DAG.getUNDEF(MVT::v8i64);
20306 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20307 DAG.getVectorIdxConstant(0, DL));
20308 SDValue Res, Chain;
20309 if (IsStrict) {
20310 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20311 {Op->getOperand(0), Src});
20312 Chain = Res.getValue(1);
20313 } else {
20314 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20315 }
20316 
// Pull the originally-requested narrow result back out of the wide op.
20317 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20318 DAG.getVectorIdxConstant(0, DL));
20319 
20320 if (IsStrict)
20321 return DAG.getMergeValues({Res, Chain}, DL);
20322 return Res;
20323 }
20324 
20325 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20326 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
// Without DQI, only the unsigned vXi64 -> v4f32 case is expanded here.
20327 if (VT != MVT::v4f32 || IsSigned)
20328 return SDValue();
20329 
20330 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20331 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
// For elements with the sign bit set (i.e. >= 2^63 as unsigned), halve the
// value with round-to-odd, (Src >> 1) | (Src & 1), so it fits in a signed
// i64; the result is doubled again below.
20332 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20333 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20334 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20335 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20336 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
// Convert each i64 element with a scalar signed conversion.
20337 SmallVector<SDValue, 4> SignCvts(4);
20338 SmallVector<SDValue, 4> Chains(4);
20339 for (int i = 0; i != 4; ++i) {
20340 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20341 DAG.getVectorIdxConstant(i, DL));
20342 if (IsStrict) {
20343 SignCvts[i] =
20344 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20345 {Op.getOperand(0), Elt});
20346 Chains[i] = SignCvts[i].getValue(1);
20347 } else {
20348 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20349 }
20350 }
20351 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20352 
// Undo the earlier halving for the big inputs by doubling the converted
// value (fadd x, x).
20353 SDValue Slow, Chain;
20354 if (IsStrict) {
20355 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20356 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20357 {Chain, SignCvt, SignCvt});
20358 Chain = Slow.getValue(1);
20359 } else {
20360 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20361 }
20362 
// Select the doubled (slow) result only for lanes that were halved.
20363 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20364 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20365 
20366 if (IsStrict)
20367 return DAG.getMergeValues({Cvt, Chain}, DL);
20368 
20369 return Cvt;
20370 }
20371
// promoteXINT_TO_FP: lower an int-to-fp whose result element type is a soft
// f16 by performing the conversion at f32 and rounding the result back down.
// NOTE(review): the signature line was lost in extraction; the first visible
// line below is the trailing parameter.
20373 SelectionDAG &DAG) {
20374 bool IsStrict = Op->isStrictFPOpcode();
20375 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20376 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20377 MVT VT = Op.getSimpleValueType();
// NVT: same shape as VT but with f32 elements, where the conversion is done.
20378 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20379 
20380 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20381 if (IsStrict)
20382 return DAG.getNode(
20383 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20384 {Chain,
20385 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20386 Rnd});
20387 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20388 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20389 }
20390
20391static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20392 const X86Subtarget &Subtarget) {
20393 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20394 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20395 return true;
20396 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20397 return true;
20398 }
20399 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20400 return true;
20401 if (Subtarget.useAVX512Regs()) {
20402 if (VT == MVT::v16i32)
20403 return true;
20404 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20405 return true;
20406 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20407 return true;
20408 }
20409 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20410 (VT == MVT::v2i64 || VT == MVT::v4i64))
20411 return true;
20412 return false;
20413}
20414
// Custom lowering entry point for [STRICT_]SINT_TO_FP. Returns Op unchanged
// when the conversion is actually legal, a replacement DAG otherwise, or
// SDValue() to let generic legalization expand it.
20415 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20416 SelectionDAG &DAG) const {
20417 bool IsStrict = Op->isStrictFPOpcode();
20418 unsigned OpNo = IsStrict ? 1 : 0;
20419 SDValue Src = Op.getOperand(OpNo);
20420 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20421 MVT SrcVT = Src.getSimpleValueType();
20422 MVT VT = Op.getSimpleValueType();
20423 SDLoc dl(Op);
20424 
// Soft f16 results are promoted via f32; natively supported conversions are
// returned unchanged so the caller accepts them as Legal.
20425 if (isSoftF16(VT, Subtarget))
20426 return promoteXINT_TO_FP(Op, dl, DAG);
20427 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20428 return Op;
20429 
20430 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20431 return LowerWin64_INT128_TO_FP(Op, DAG);
20432 
// Try to keep scalar casts that source from vectors in the vector domain to
// avoid XMM<->GPR round-trips.
20433 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20434 return Extract;
20435 
20436 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20437 return R;
20438 
20439 if (SrcVT.isVector()) {
20440 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20441 // Note: Since v2f64 is a legal type. We don't need to zero extend the
20442 // source for strict FP.
20443 if (IsStrict)
20444 return DAG.getNode(
20445 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20446 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20447 DAG.getUNDEF(SrcVT))});
20448 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20449 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20450 DAG.getUNDEF(SrcVT)));
20451 }
20452 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20453 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20454 
20455 return SDValue();
20456 }
20457 
20458 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20459 "Unknown SINT_TO_FP to lower!");
20460 
20461 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20462 
20463 // These are really Legal; return the operand so the caller accepts it as
20464 // Legal.
20465 if (SrcVT == MVT::i32 && UseSSEReg)
20466 return Op;
20467 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20468 return Op;
20469 
20470 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20471 return V;
20472 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20473 return V;
20474 
20475 // SSE doesn't have an i16 conversion so we need to promote.
20476 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20477 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20478 if (IsStrict)
20479 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20480 {Chain, Ext});
20481 
20482 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20483 }
20484 
20485 if (VT == MVT::f128 || !Subtarget.hasX87())
20486 return SDValue();
20487 
// Remaining cases go through x87 FILD via a stack temporary.
20488 SDValue ValueToStore = Src;
20489 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20490 // Bitcasting to f64 here allows us to do a single 64-bit store from
20491 // an SSE register, avoiding the store forwarding penalty that would come
20492 // with two 32-bit stores.
20493 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20494 
20495 unsigned Size = SrcVT.getStoreSize();
20496 Align Alignment(Size);
20497 MachineFunction &MF = DAG.getMachineFunction();
20498 auto PtrVT = getPointerTy(MF.getDataLayout());
20499 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20500 MachinePointerInfo MPI =
// NOTE(review): the initializer line was lost in extraction here (presumably
// MachinePointerInfo::getFixedStack(MF, SSFI) — confirm against upstream).
20502 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20503 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20504 std::pair<SDValue, SDValue> Tmp =
20505 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20506 
20507 if (IsStrict)
20508 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20509 
20510 return Tmp.first;
20511 }
20512
// Emit an x87 FILD load of SrcVT integer data from Pointer, producing a
// DstVT floating-point value. Returns {result, chain}. When DstVT lives in
// an SSE register, the x87 value is spilled and reloaded through a stack
// slot because FILD only produces x87 results.
20513 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20514 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20515 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20516 // Build the FILD
// Load into f80 when the final destination is an SSE register; otherwise
// FILD can produce DstVT directly.
20517 SDVTList Tys;
20518 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20519 if (useSSE)
20520 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20521 else
20522 Tys = DAG.getVTList(DstVT, MVT::Other);
20523 
20524 SDValue FILDOps[] = {Chain, Pointer};
20525 SDValue Result =
20526 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20527 Alignment, MachineMemOperand::MOLoad);
20528 Chain = Result.getValue(1);
20529 
20530 if (useSSE) {
// Round-trip through a stack slot: FST the x87 value at DstVT precision,
// then reload it into an SSE register.
// NOTE(review): the MachineFunction declaration line (MF) was lost in
// extraction here.
20532 unsigned SSFISize = DstVT.getStoreSize();
20533 int SSFI =
20534 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20535 auto PtrVT = getPointerTy(MF.getDataLayout());
20536 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20537 Tys = DAG.getVTList(MVT::Other);
20538 SDValue FSTOps[] = {Chain, Result, StackSlot};
// NOTE(review): the lines creating StoreMMO (a MachineMemOperand for the
// stack store) were lost in extraction; only trailing arguments remain.
20541 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20542 
20543 Chain =
20544 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20545 Result = DAG.getLoad(
20546 DstVT, DL, Chain, StackSlot,
// NOTE(review): the MachinePointerInfo argument line was lost in extraction.
20548 Chain = Result.getValue(1);
20549 }
20550 
20551 return { Result, Chain };
20552 }
20553
20554/// Horizontal vector math instructions may be slower than normal math with
20555/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20556/// implementation, and likely shuffle complexity of the alternate sequence.
20557static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20558 const X86Subtarget &Subtarget) {
20559 bool IsOptimizingSize = DAG.shouldOptForSize();
20560 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20561 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20562}
20563
20564 /// 64-bit unsigned integer to double expansion.
// LowerUINT_TO_FP_i64. NOTE(review): the signature line was lost in
// extraction; the first visible line below is a parameter.
20566 SelectionDAG &DAG,
20567 const X86Subtarget &Subtarget) {
20568 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20569 // when converting 0 when rounding toward negative infinity. Caller will
20570 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20571 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20572 // This algorithm is not obvious. Here it is what we're trying to output:
20573 /*
20574 movq %rax, %xmm0
20575 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20576 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20577 #ifdef __SSE3__
20578 haddpd %xmm0, %xmm0
20579 #else
20580 pshufd $0x4e, %xmm0, %xmm1
20581 addpd %xmm1, %xmm0
20582 #endif
20583 */
20584 
20585 LLVMContext *Context = DAG.getContext();
20586 
20587 // Build some magic constants.
// 0x43300000 / 0x45300000 are the high words of the doubles 2^52 and 2^84;
// interleaving them with the input's 32-bit halves forms the biased values.
20588 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20589 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20590 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20591 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20592 
// NOTE(review): the declaration line of CV1 (a small vector of Constant*)
// was lost in extraction here.
20594 CV1.push_back(
20595 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20596 APInt(64, 0x4330000000000000ULL))));
20597 CV1.push_back(
20598 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20599 APInt(64, 0x4530000000000000ULL))));
20600 Constant *C1 = ConstantVector::get(CV1);
20601 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20602 
20603 // Load the 64-bit value into an XMM register.
20604 SDValue XR1 =
20605 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20606 SDValue CLod0 = DAG.getLoad(
20607 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
// NOTE(review): the MachinePointerInfo argument line was lost in extraction.
20609 SDValue Unpck1 =
20610 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20611 
20612 SDValue CLod1 = DAG.getLoad(
20613 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
// NOTE(review): the MachinePointerInfo argument line was lost in extraction.
20615 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20616 // TODO: Are there any fast-math-flags to propagate here?
// After the subtract, lane 0 holds the low 32 bits as a double and lane 1
// holds the high 32 bits scaled by 2^32; their sum is the full value.
20617 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20618 SDValue Result;
20619 
// Sum the two lanes, using haddpd when available and profitable.
20620 if (Subtarget.hasSSE3() &&
20621 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20622 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20623 } else {
20624 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20625 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20626 }
20627 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20628 DAG.getVectorIdxConstant(0, dl));
20629 return Result;
20630 }
20631
20632 /// 32-bit unsigned integer to float expansion.
// LowerUINT_TO_FP_i32. NOTE(review): the signature line was lost in
// extraction; the first visible line below is a parameter.
20634 SelectionDAG &DAG,
20635 const X86Subtarget &Subtarget) {
20636 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20637 // FP constant to bias correct the final result.
// 0x4330000000000000 is the double 2^52.
20638 SDValue Bias = DAG.getConstantFP(
20639 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20640 
20641 // Load the 32-bit value into an XMM register.
20642 SDValue Load =
20643 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20644 
20645 // Zero out the upper parts of the register.
20646 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20647 
20648 // Or the load with the bias.
// This forms the double 2^52 + x exactly — x fits in the low 32 bits of the
// 52-bit mantissa — so subtracting the bias below recovers x.
20649 SDValue Or = DAG.getNode(
20650 ISD::OR, dl, MVT::v2i64,
20651 DAG.getBitcast(MVT::v2i64, Load),
20652 DAG.getBitcast(MVT::v2i64,
20653 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20654 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20655 DAG.getBitcast(MVT::v2f64, Or),
20656 DAG.getVectorIdxConstant(0, dl));
20657 
20658 if (Op.getNode()->isStrictFPOpcode()) {
20659 // Subtract the bias.
20660 // TODO: Are there any fast-math-flags to propagate here?
20661 SDValue Chain = Op.getOperand(0);
20662 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20663 {Chain, Or, Bias});
20664 
20665 if (Op.getValueType() == Sub.getValueType())
20666 return Sub;
20667 
20668 // Handle final rounding.
20669 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20670 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20671 
20672 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20673 }
20674 
20675 // Subtract the bias.
20676 // TODO: Are there any fast-math-flags to propagate here?
20677 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20678 
20679 // Handle final rounding.
20680 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20681 }
20682
// lowerUINT_TO_FP_v2i32: custom lowering for [STRICT_]UINT_TO_FP from v2i32
// to v2f64. NOTE(review): the signature line was lost in extraction; the
// first visible line below is a parameter.
20684 SelectionDAG &DAG,
20685 const X86Subtarget &Subtarget) {
20686 if (Op.getSimpleValueType() != MVT::v2f64)
20687 return SDValue();
20688 
20689 bool IsStrict = Op->isStrictFPOpcode();
20690 
20691 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20692 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20693 
20694 if (Subtarget.hasAVX512()) {
20695 if (!Subtarget.hasVLX()) {
20696 // Let generic type legalization widen this.
20697 if (!IsStrict)
20698 return SDValue();
20699 // Otherwise pad the integer input with 0s and widen the operation.
20700 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20701 DAG.getConstant(0, DL, MVT::v2i32));
20702 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20703 {Op.getOperand(0), N0});
20704 SDValue Chain = Res.getValue(1);
20705 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20706 DAG.getVectorIdxConstant(0, DL));
20707 return DAG.getMergeValues({Res, Chain}, DL);
20708 }
20709 
20710 // Legalize to v4i32 type.
20711 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20712 DAG.getUNDEF(MVT::v2i32));
20713 if (IsStrict)
20714 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20715 {Op.getOperand(0), N0});
20716 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20717 }
20718 
20719 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20720 // This gives us the floating point equivalent of 2^52 + the i32 integer
20721 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20722 // point leaving just our i32 integers in double format.
20723 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20724 SDValue VBias = DAG.getConstantFP(
20725 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20726 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20727 DAG.getBitcast(MVT::v2i64, VBias));
20728 Or = DAG.getBitcast(MVT::v2f64, Or);
20729 
20730 if (IsStrict)
20731 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20732 {Op.getOperand(0), Or, VBias});
20733 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20734 }
20735
// lowerUINT_TO_FP_vXi32: custom lowering for [STRICT_]UINT_TO_FP from
// v4i32/v8i32 sources. NOTE(review): the signature line was lost in
// extraction; the first visible line below is a parameter.
20737 SelectionDAG &DAG,
20738 const X86Subtarget &Subtarget) {
20739 bool IsStrict = Op->isStrictFPOpcode();
20740 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20741 MVT VecIntVT = V.getSimpleValueType();
20742 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20743 "Unsupported custom type");
20744 
20745 if (Subtarget.hasAVX512()) {
20746 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20747 assert(!Subtarget.hasVLX() && "Unexpected features");
20748 MVT VT = Op->getSimpleValueType(0);
20749 
20750 // v8i32->v8f64 is legal with AVX512 so just return it.
20751 if (VT == MVT::v8f64)
20752 return Op;
20753 
20754 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20755 VT == MVT::v8f16) &&
20756 "Unexpected VT!");
20757 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20758 MVT WideIntVT = MVT::v16i32;
20759 if (VT == MVT::v4f64) {
20760 WideVT = MVT::v8f64;
20761 WideIntVT = MVT::v8i32;
20762 }
20763 
20764 // Need to concat with zero vector for strict fp to avoid spurious
20765 // exceptions.
20766 SDValue Tmp =
20767 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20768 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20769 DAG.getVectorIdxConstant(0, DL));
20770 SDValue Res, Chain;
20771 if (IsStrict) {
20772 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20773 {Op->getOperand(0), V});
20774 Chain = Res.getValue(1);
20775 } else {
20776 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20777 }
20778 
// Pull the originally-requested narrow result back out of the wide op.
20779 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20780 DAG.getVectorIdxConstant(0, DL));
20781 
20782 if (IsStrict)
20783 return DAG.getMergeValues({Res, Chain}, DL);
20784 return Res;
20785 }
20786 
// AVX (no AVX512): v4i32 -> v4f64 via the 2^52 mantissa-bias trick on
// zero-extended i64 lanes, with the bias broadcast-loaded from the pool.
20787 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20788 Op->getSimpleValueType(0) == MVT::v4f64) {
20789 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20790 Constant *Bias = ConstantFP::get(
20791 *DAG.getContext(),
20792 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20793 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20794 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20795 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20796 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20797 SDValue VBias = DAG.getMemIntrinsicNode(
20798 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
// NOTE(review): the MachinePointerInfo/alignment argument lines were lost
// in extraction here.
20801 
20802 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20803 DAG.getBitcast(MVT::v4i64, VBias));
20804 Or = DAG.getBitcast(MVT::v4f64, Or);
20805 
20806 if (IsStrict)
20807 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20808 {Op.getOperand(0), Or, VBias});
20809 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20810 }
20811 
20812 // The algorithm is the following:
20813 // #ifdef __SSE4_1__
20814 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20815 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20816 // (uint4) 0x53000000, 0xaa);
20817 // #else
20818 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20819 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20820 // #endif
20821 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20822 // return (float4) lo + fhi;
20823 
20824 bool Is128 = VecIntVT == MVT::v4i32;
20825 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20826 // If we convert to something else than the supported type, e.g., to v4f64,
20827 // abort early.
20828 if (VecFloatVT != Op->getSimpleValueType(0))
20829 return SDValue();
20830 
20831 // In the #ifdef/#else code, we have in common:
20832 // - The vector of constants:
20833 // -- 0x4b000000
20834 // -- 0x53000000
20835 // - A shift:
20836 // -- v >> 16
20837 
20838 // Create the splat vector for 0x4b000000.
20839 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20840 // Create the splat vector for 0x53000000.
20841 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20842 
20843 // Create the right shift.
20844 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20845 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20846 
20847 SDValue Low, High;
20848 if (Subtarget.hasSSE41()) {
20849 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20850 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20851 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20852 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20853 // Low will be bitcasted right away, so do not bother bitcasting back to its
20854 // original type.
20855 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20856 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20857 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20858 // (uint4) 0x53000000, 0xaa);
20859 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20860 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20861 // High will be bitcasted right away, so do not bother bitcasting back to
20862 // its original type.
20863 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20864 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20865 } else {
20866 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20867 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20868 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20869 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20870 
20871 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20872 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20873 }
20874 
20875 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20876 SDValue VecCstFSub = DAG.getConstantFP(
20877 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20878 
20879 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20880 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20881 // constant, we avoid reassociation in MachineCombiner when reassoc is
20882 // enabled. See PR24512.
20883 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20884 // TODO: Are there any fast-math-flags to propagate here?
20885 // (float4) lo;
20886 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20887 // return (float4) lo + fhi;
20888 if (IsStrict) {
20889 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20890 {Op.getOperand(0), HighBitcast, VecCstFSub});
20891 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20892 {FHigh.getValue(1), LowBitcast, FHigh});
20893 }
20894 
20895 SDValue FHigh =
20896 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20897 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20898 }
20899
// lowerUINT_TO_FP_vec: dispatch custom UINT_TO_FP lowering by source vector
// type. NOTE(review): the signature line was lost in extraction; the first
// visible line below is the trailing parameter.
20901 const X86Subtarget &Subtarget) {
20902 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20903 SDValue N0 = Op.getOperand(OpNo);
20904 MVT SrcVT = N0.getSimpleValueType();
20905 
// Only the types marked Custom during target setup can reach this switch.
20906 switch (SrcVT.SimpleTy) {
20907 default:
20908 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20909 case MVT::v2i32:
20910 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20911 case MVT::v4i32:
20912 case MVT::v8i32:
20913 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20914 case MVT::v2i64:
20915 case MVT::v4i64:
20916 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20917 }
20918 }
20919
20920SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20921 SelectionDAG &DAG) const {
20922 bool IsStrict = Op->isStrictFPOpcode();
20923 unsigned OpNo = IsStrict ? 1 : 0;
20924 SDValue Src = Op.getOperand(OpNo);
20925 SDLoc dl(Op);
20926 auto PtrVT = getPointerTy(DAG.getDataLayout());
20927 MVT SrcVT = Src.getSimpleValueType();
20928 MVT DstVT = Op->getSimpleValueType(0);
20929 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20930
20931 // Bail out when we don't have native conversion instructions.
20932 if (DstVT == MVT::f128)
20933 return SDValue();
20934
20935 if (isSoftF16(DstVT, Subtarget))
20936 return promoteXINT_TO_FP(Op, dl, DAG);
20937 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20938 return Op;
20939
20940 if (SDValue V = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20941 return V;
20942
20943 if (DstVT.isVector())
20944 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20945
20946 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20947 return LowerWin64_INT128_TO_FP(Op, DAG);
20948
20949 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20950 return Extract;
20951
20952 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20953 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20954 // Conversions from unsigned i32 to f32/f64 are legal,
20955 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20956 return Op;
20957 }
20958
20959 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20960 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20961 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20962 if (IsStrict)
20963 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20964 {Chain, Src});
20965 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20966 }
20967
20968 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20969 return V;
20970 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20971 return V;
20972
20973 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20974 // infinity. It produces -0.0, so disable under strictfp.
20975 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20976 !IsStrict)
20977 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20978 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20979 // negative infinity. So disable under strictfp. Using FILD instead.
20980 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20981 !IsStrict)
20982 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20983 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20984 (DstVT == MVT::f32 || DstVT == MVT::f64))
20985 return SDValue();
20986
20987 // Make a 64-bit buffer, and use it to build an FILD.
20988 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20989 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20990 Align SlotAlign(8);
20991 MachinePointerInfo MPI =
20993 if (SrcVT == MVT::i32) {
20994 SDValue OffsetSlot =
20995 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20996 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20997 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20998 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20999 std::pair<SDValue, SDValue> Tmp =
21000 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21001 if (IsStrict)
21002 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21003
21004 return Tmp.first;
21005 }
21006
21007 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21008 SDValue ValueToStore = Src;
21009 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21010 // Bitcasting to f64 here allows us to do a single 64-bit store from
21011 // an SSE register, avoiding the store forwarding penalty that would come
21012 // with two 32-bit stores.
21013 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21014 }
21015 SDValue Store =
21016 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21017 // For i64 source, we need to add the appropriate power of 2 if the input
21018 // was negative. We must be careful to do the computation in x87 extended
21019 // precision, not in SSE.
21020 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21021 SDValue Ops[] = {Store, StackSlot};
21022 SDValue Fild =
21023 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21024 SlotAlign, MachineMemOperand::MOLoad);
21025 Chain = Fild.getValue(1);
21026
21027 // Check whether the sign bit is set.
21028 SDValue SignSet = DAG.getSetCC(
21029 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21030 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21031
21032 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21033 APInt FF(64, 0x5F80000000000000ULL);
21034 SDValue FudgePtr =
21035 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21036 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21037
21038 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21039 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21040 SDValue Four = DAG.getIntPtrConstant(4, dl);
21041 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21042 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21043
21044 // Load the value out, extending it from f32 to f80.
21045 SDValue Fudge = DAG.getExtLoad(
21046 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21048 CPAlignment);
21049 Chain = Fudge.getValue(1);
21050 // Extend everything to 80 bits to force it to be done on x87.
21051 // TODO: Are there any fast-math-flags to propagate here?
21052 if (IsStrict) {
21053 unsigned Opc = ISD::STRICT_FADD;
21054 // Windows needs the precision control changed to 80bits around this add.
21055 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21057
21058 SDValue Add =
21059 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21060 // STRICT_FP_ROUND can't handle equal types.
21061 if (DstVT == MVT::f80)
21062 return Add;
21063 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21064 {Add.getValue(1), Add,
21065 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
21066 }
21067 unsigned Opc = ISD::FADD;
21068 // Windows needs the precision control changed to 80bits around this add.
21069 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21071
21072 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
21073 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21074 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21075}
21076
21077// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21078// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21079// just return an SDValue().
21080// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21081// to i16, i32 or i64, and we lower it to a legal sequence and return the
21082// result.
21083SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21084 bool IsSigned,
21085 SDValue &Chain) const {
21086 bool IsStrict = Op->isStrictFPOpcode();
21087 SDLoc DL(Op);
21088
21089 EVT DstTy = Op.getValueType();
21090 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21091 EVT TheVT = Value.getValueType();
21092 auto PtrVT = getPointerTy(DAG.getDataLayout());
21093
21094 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21095 // f16 must be promoted before using the lowering in this routine.
21096 // fp128 does not use this lowering.
21097 return SDValue();
21098 }
21099
21100 // If using FIST to compute an unsigned i64, we'll need some fixup
21101 // to handle values above the maximum signed i64. A FIST is always
21102 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21103 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21104
21105 // FIXME: This does not generate an invalid exception if the input does not
21106 // fit in i32. PR44019
21107 if (!IsSigned && DstTy != MVT::i64) {
21108 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21109 // The low 32 bits of the fist result will have the correct uint32 result.
21110 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21111 DstTy = MVT::i64;
21112 }
21113
21114 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21115 DstTy.getSimpleVT() >= MVT::i16 &&
21116 "Unknown FP_TO_INT to lower!");
21117
21118 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21119 // stack slot.
21120 MachineFunction &MF = DAG.getMachineFunction();
21121 unsigned MemSize = DstTy.getStoreSize();
21122 int SSFI =
21123 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21124 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21125
21126 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21127
21128 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21129
21130 if (UnsignedFixup) {
21131 //
21132 // Conversion to unsigned i64 is implemented with a select,
21133 // depending on whether the source value fits in the range
21134 // of a signed i64. Let Thresh be the FP equivalent of
21135 // 0x8000000000000000ULL.
21136 //
21137 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21138 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
21139 // FistSrc = (Value - FltOfs);
21140 // Fist-to-mem64 FistSrc
21141 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21142 // to XOR'ing the high 32 bits with Adjust.
21143 //
21144 // Being a power of 2, Thresh is exactly representable in all FP formats.
21145 // For X87 we'd like to use the smallest FP type for this constant, but
21146 // for DAG type consistency we have to match the FP operand type.
21147
21148 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21149 [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
21150 bool LosesInfo = false;
21151 if (TheVT == MVT::f64)
21152 // The rounding mode is irrelevant as the conversion should be exact.
21153 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21154 &LosesInfo);
21155 else if (TheVT == MVT::f80)
21156 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21157 APFloat::rmNearestTiesToEven, &LosesInfo);
21158
21159 assert(Status == APFloat::opOK && !LosesInfo &&
21160 "FP conversion should have been exact");
21161
21162 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21163
21164 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21165 *DAG.getContext(), TheVT);
21166 SDValue Cmp;
21167 if (IsStrict) {
21168 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21169 /*IsSignaling*/ true);
21170 Chain = Cmp.getValue(1);
21171 } else {
21172 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21173 }
21174
21175 // Our preferred lowering of
21176 //
21177 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21178 //
21179 // is
21180 //
21181 // (Value >= Thresh) << 63
21182 //
21183 // but since we can get here after LegalOperations, DAGCombine might do the
21184 // wrong thing if we create a select. So, directly create the preferred
21185 // version.
21186 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21187 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21188 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21189
21190 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21191 DAG.getConstantFP(0.0, DL, TheVT));
21192
21193 if (IsStrict) {
21194 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21195 { Chain, Value, FltOfs });
21196 Chain = Value.getValue(1);
21197 } else
21198 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21199 }
21200
21201 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21202
21203 // FIXME This causes a redundant load/store if the SSE-class value is already
21204 // in memory, such as if it is on the callstack.
21205 if (isScalarFPTypeInSSEReg(TheVT)) {
21206 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21207 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21208 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21209 SDValue Ops[] = { Chain, StackSlot };
21210
21211 unsigned FLDSize = TheVT.getStoreSize();
21212 assert(FLDSize <= MemSize && "Stack slot not big enough");
21213 MachineMemOperand *MMO = MF.getMachineMemOperand(
21214 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21215 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21216 Chain = Value.getValue(1);
21217 }
21218
21219 // Build the FP_TO_INT*_IN_MEM
21220 MachineMemOperand *MMO = MF.getMachineMemOperand(
21221 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21222 SDValue Ops[] = { Chain, Value, StackSlot };
21224 DAG.getVTList(MVT::Other),
21225 Ops, DstTy, MMO);
21226
21227 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
21228 Chain = Res.getValue(1);
21229
21230 // If we need an unsigned fixup, XOR the result with adjust.
21231 if (UnsignedFixup)
21232 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21233
21234 return Res;
21235}
21236
21238 const X86Subtarget &Subtarget) {
21239 MVT VT = Op.getSimpleValueType();
21240 SDValue In = Op.getOperand(0);
21241 MVT InVT = In.getSimpleValueType();
21242 unsigned Opc = Op.getOpcode();
21243
21244 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21246 "Unexpected extension opcode");
21248 "Expected same number of elements");
21249 assert((VT.getVectorElementType() == MVT::i16 ||
21250 VT.getVectorElementType() == MVT::i32 ||
21251 VT.getVectorElementType() == MVT::i64) &&
21252 "Unexpected element type");
21253 assert((InVT.getVectorElementType() == MVT::i8 ||
21254 InVT.getVectorElementType() == MVT::i16 ||
21255 InVT.getVectorElementType() == MVT::i32) &&
21256 "Unexpected element type");
21257
21258 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
21259
21260 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21261 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21262 return splitVectorIntUnary(Op, DAG, dl);
21263 }
21264
21265 if (Subtarget.hasInt256())
21266 return Op;
21267
21268 // Optimize vectors in AVX mode:
21269 //
21270 // v8i16 -> v8i32
21271 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21272 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21273 // Concat upper and lower parts.
21274 //
21275 // v4i32 -> v4i64
21276 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21277 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21278 // Concat upper and lower parts.
21279 //
21280 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21281 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21282
21283 // Short-circuit if we can determine that each 128-bit half is the same value.
21284 // Otherwise, this is difficult to match and optimize.
21285 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21286 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21287 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21288
21289 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21290 SDValue Undef = DAG.getUNDEF(InVT);
21291 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21292 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21293 OpHi = DAG.getBitcast(HalfVT, OpHi);
21294
21295 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21296}
21297
21298// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21299static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21300 const SDLoc &dl, SelectionDAG &DAG) {
21301 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21302 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21303 DAG.getVectorIdxConstant(0, dl));
21304 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21305 DAG.getVectorIdxConstant(8, dl));
21306 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21307 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21308 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21309 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21310}
21311
21313 const X86Subtarget &Subtarget,
21314 SelectionDAG &DAG) {
21315 MVT VT = Op->getSimpleValueType(0);
21316 SDValue In = Op->getOperand(0);
21317 MVT InVT = In.getSimpleValueType();
21318 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21319 unsigned NumElts = VT.getVectorNumElements();
21320
21321 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
21322 // avoids a constant pool load.
21323 if (VT.getVectorElementType() != MVT::i8) {
21324 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21325 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21326 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21327 }
21328
21329 // Extend VT if BWI is not supported.
21330 MVT ExtVT = VT;
21331 if (!Subtarget.hasBWI()) {
21332 // If v16i32 is to be avoided, we'll need to split and concatenate.
21333 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21334 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21335
21336 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21337 }
21338
21339 // Widen to 512-bits if VLX is not supported.
21340 MVT WideVT = ExtVT;
21341 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21342 NumElts *= 512 / ExtVT.getSizeInBits();
21343 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21344 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21345 DAG.getVectorIdxConstant(0, DL));
21346 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21347 }
21348
21349 SDValue One = DAG.getConstant(1, DL, WideVT);
21350 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21351
21352 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21353
21354 // Truncate if we had to extend above.
21355 if (VT != ExtVT) {
21356 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21357 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21358 }
21359
21360 // Extract back to 128/256-bit if we widened.
21361 if (WideVT != VT)
21362 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21363 DAG.getVectorIdxConstant(0, DL));
21364
21365 return SelectedVal;
21366}
21367
21369 SelectionDAG &DAG) {
21370 SDValue In = Op.getOperand(0);
21371 MVT SVT = In.getSimpleValueType();
21372 SDLoc DL(Op);
21373
21374 if (SVT.getVectorElementType() == MVT::i1)
21375 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21376
21377 assert(Subtarget.hasAVX() && "Expected AVX support");
21378 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21379}
21380
21381/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21382/// It makes use of the fact that vectors with eno