LLVM 23.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1// I
2//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
17#include "X86.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
24#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
// Hidden tuning flag: when enabled (the default), the X86 lowering replaces
// narrow scalar shifts with wider ones (per the option's own description);
// set -x86-widen-shift=false to keep the original shift widths.
// NOTE(review): exact consumer of this flag is outside this view — confirm
// usage sites before relying on the described behavior.
static cl::opt<bool>
    WidenShift("x86-widen-shift", cl::init(true),
               cl::desc("Replace narrow shifts with wider shifts."),
               cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM, STI), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
305
306 if (!Subtarget.is64Bit()) {
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
352 if (Subtarget.is64Bit()) {
354 // Without SSE, i64->f64 goes through memory.
356 }
357 } else if (!Subtarget.is64Bit())
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
385 }
386 if (Subtarget.is64Bit())
391
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
537
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
549 }
550
551 if (!Subtarget.is64Bit())
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
569 }
570
573
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
581 else
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
593
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
640 };
641
642 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
643 // f16, f32 and f64 use SSE.
644 // Set up the FP register classes.
645 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
646 : &X86::FR16RegClass);
647 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
648 : &X86::FR32RegClass);
649 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
650 : &X86::FR64RegClass);
651
652 // Disable f32->f64 extload as we can only generate this in one instruction
653 // under optsize. So its easier to pattern match (fpext (load)) for that
654 // case instead of needing to emit 2 instructions for extload in the
655 // non-optsize case.
656 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
657
658 for (auto VT : { MVT::f32, MVT::f64 }) {
659 // Use ANDPD to simulate FABS.
661
662 // Use XORP to simulate FNEG.
664
665 // Use ANDPD and ORPD to simulate FCOPYSIGN.
667
668 // These might be better off as horizontal vector ops.
671
672 // We don't support sin/cos/fmod
676 }
677
678 // Half type will be promoted by default.
679 setF16Action(MVT::f16, Promote);
690
721
726
731
732 // Lower this to MOVMSK plus an AND.
735
736 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
737 (UseX87 || Is64Bit)) {
738 // Use SSE for f32, x87 for f64.
739 // Set up the FP register classes.
740 addRegisterClass(MVT::f32, &X86::FR32RegClass);
741 if (UseX87)
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743
744 // Use ANDPS to simulate FABS.
746
747 // Use XORP to simulate FNEG.
749
750 if (UseX87)
752
753 // Use ANDPS and ORPS to simulate FCOPYSIGN.
754 if (UseX87)
757
758 // We don't support sin/cos/fmod
762
763 if (UseX87) {
764 // Always expand sin/cos functions even though x87 has an instruction.
768 }
769 } else if (UseX87) {
770 // f32 and f64 in x87.
771 // Set up the FP register classes.
772 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
773 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
774
775 for (auto VT : { MVT::f32, MVT::f64 }) {
778
779 // Always expand sin/cos functions even though x87 has an instruction.
783 }
784 }
785
786 // Expand FP32 immediates into loads from the stack, save special cases.
787 if (isTypeLegal(MVT::f32)) {
788 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
789 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
790 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
791 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
792 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
793 } else // SSE immediates.
794 addLegalFPImmediate(APFloat(+0.0f)); // xorps
795 }
796 // Expand FP64 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f64)) {
798 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
799 addLegalFPImmediate(APFloat(+0.0)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0)); // xorpd
805 }
806 // Support fp16 0 immediate.
807 if (isTypeLegal(MVT::f16))
808 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
809
810 // Handle constrained floating-point operations of scalar.
823
824 // We don't support FMA.
827
828 // f80 always uses X87.
829 if (UseX87) {
830 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
833 {
835 addLegalFPImmediate(TmpFlt); // FLD0
836 TmpFlt.changeSign();
837 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
838
839 bool ignored;
840 APFloat TmpFlt2(+1.0);
842 &ignored);
843 addLegalFPImmediate(TmpFlt2); // FLD1
844 TmpFlt2.changeSign();
845 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
846 }
847
848 // Always expand sin/cos functions even though x87 has an instruction.
849 // clang-format off
861 // clang-format on
862
874
875 // Handle constrained floating-point operations of scalar.
882 if (isTypeLegal(MVT::f16)) {
885 } else {
887 }
888 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
889 // as Custom.
891 }
892
893 // f128 uses xmm registers, but most operations require libcalls.
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
895 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
896 : &X86::VR128RegClass);
897
898 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
899
910
914
915 // clang-format off
923 // clang-format on
924 // No STRICT_FSINCOS
927
930 // We need to custom handle any FP_ROUND with an f128 input, but
931 // LegalizeDAG uses the result type to know when to run a custom handler.
932 // So we have to list all legal floating point result types here.
933 if (isTypeLegal(MVT::f32)) {
936 }
937 if (isTypeLegal(MVT::f64)) {
940 }
941 if (isTypeLegal(MVT::f80)) {
945 }
946
948
949 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
951 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
952 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
953 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
954 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
955 }
956
957 // Always use a library call for pow.
958 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
959 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
960 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
961 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
962
971
972 // Some FP actions are always expanded for vector types.
973 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
976 // clang-format off
990 // clang-format on
991 }
992
993 // First set operation action for all vector types to either promote
994 // (for widening) or expand (for scalarization). Then we will selectively
995 // turn on ones that can be effectively codegen'd.
1035 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1036 setTruncStoreAction(InnerVT, VT, Expand);
1037
1038 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1039 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1040
1041 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1042 // types, we have to deal with them whether we ask for Expansion or not.
1043 // Setting Expand causes its own optimisation problems though, so leave
1044 // them legal.
1045 if (VT.getVectorElementType() == MVT::i1)
1046 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1047
1048 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1049 // split/scalarized right now.
1050 if (VT.getVectorElementType() == MVT::f16 ||
1051 VT.getVectorElementType() == MVT::bf16)
1052 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1053 }
1054 }
1055
1056 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1057 // with -msoft-float, disable use of MMX as well.
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1059 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1060 // No operations on x86mmx supported, everything uses intrinsics.
1061 }
1062
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1064 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1065 : &X86::VR128RegClass);
1066
1071
1072 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1073 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1081
1082 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1083 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1085
1091 }
1092
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1094 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096
1097 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1098 // registers cannot be used even for integer operations.
1099 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1100 : &X86::VR128RegClass);
1101 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1108 : &X86::VR128RegClass);
1109
1110 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1115 }
1116
1117 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1118 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1123 }
1124
1125 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1126 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1127 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1128
1129 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1130 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1131 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1132 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1133 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1134 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1135 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1136 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1137 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1138 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1141
1142 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1143 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1144 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1145
1146 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1148 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1150
1151 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1152 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1153
1154 setOperationAction(ISD::AND, MVT::i128, Custom);
1155 setOperationAction(ISD::OR, MVT::i128, Custom);
1156 setOperationAction(ISD::XOR, MVT::i128, Custom);
1157
1158 if (Subtarget.hasPCLMUL()) {
1159 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1162 }
1166 }
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1169 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1170 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1171 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1172 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1173 }
1174
1185
1190
1191 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1197
1198 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1199 // setcc all the way to isel and prefer SETGT in some isel patterns.
1202 }
1203
1204 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1205 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1210
1211 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1217 }
1218
1219 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1223
1224 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1225 continue;
1226
1229 }
1230 setF16Action(MVT::v8f16, Expand);
1231 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1232 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1233 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1234 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1235 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1236 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1238
1239 // Custom lower v2i64 and v2f64 selects.
1246
1253
1254 // Custom legalize these to avoid over promotion or custom promotion.
1255 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1260 }
1261
1266
1269
1272
1273 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1278
1283
1284 // We want to legalize this to an f64 load rather than an i64 load on
1285 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1286 // store.
1287 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1288 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1289 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1290 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1291 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1293
1294 // Add 32-bit vector stores to help vectorization opportunities.
1295 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1297
1301 if (!Subtarget.hasAVX512())
1303
1307
1309
1326
1327 // In the customized shift lowering, the legal v4i32/v2i64 cases
1328 // in AVX2 will be recognized.
1329 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1333 if (VT == MVT::v2i64) continue;
1338 }
1339
1345 }
1346
1347 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1352
1353 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1355 }
1356 }
1357
1358 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1359 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1360 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1361 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1362
1363 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1366 }
1367
1368 // These might be better off as horizontal vector ops.
1373 }
1374
1375 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1376 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1379 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1383 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1389
1391 }
1392
1393 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1394 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1395 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1396 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1397 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1398 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1399 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1400 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1401
1405
1406 // FIXME: Do we need to handle scalar-to-vector here?
1407 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1408 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1409
1410 // We directly match byte blends in the backend as they match the VSELECT
1411 // condition form.
1413
1414 // SSE41 brings specific instructions for doing vector sign extend even in
1415 // cases where we don't have SRA.
1416 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1419 }
1420
1421 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1422 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1423 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1424 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1425 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1426 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1427 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1428 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1429 }
1430
1431 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1432 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1433 // do the pre and post work in the vector domain.
1436 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1437 // so that DAG combine doesn't try to turn it into uint_to_fp.
1440 }
1441 }
1442
1443 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1445 }
1446
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1448 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1449 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1452 }
1453
1454 // XOP can efficiently perform BITREVERSE with VPPERM.
1455 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1457 }
1458
1459 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1460 bool HasInt256 = Subtarget.hasInt256();
1461
1462 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1463 : &X86::VR256RegClass);
1464 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1465 : &X86::VR256RegClass);
1466 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1467 : &X86::VR256RegClass);
1468 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1469 : &X86::VR256RegClass);
1470 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1471 : &X86::VR256RegClass);
1472 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1473 : &X86::VR256RegClass);
1474 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1475 : &X86::VR256RegClass);
1476
1477 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1490
1492
1496
1502 }
1503
1504 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1505 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1506
1507 setOperationAction(ISD::AND, MVT::i256, Custom);
1508 setOperationAction(ISD::OR, MVT::i256, Custom);
1509 setOperationAction(ISD::XOR, MVT::i256, Custom);
1510
1511 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1512 // even though v8i16 is a legal type.
1513 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1514 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1515 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1516 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1520
1527
1539
1540 if (!Subtarget.hasAVX512())
1542
1543 // In the customized shift lowering, the legal v8i32/v4i64 cases
1544 // in AVX2 will be recognized.
1545 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1551 if (VT == MVT::v4i64) continue;
1556 }
1557
1558 // These types need custom splitting if their input is a 128-bit vector.
1563
1567 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1568 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1571
1572 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1576 }
1577
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1588
1589 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1590 // setcc all the way to isel and prefer SETGT in some isel patterns.
1593 }
1594
1595 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1596 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1601
1602 if (Subtarget.hasAnyFMA()) {
1603 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1604 MVT::v2f64, MVT::v4f64 }) {
1607 }
1608 }
1609
1610 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1611 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1613 }
1614
1615 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1616 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1619
1620 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1621 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1622 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1623 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1624 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1625 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1626 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1628
1629 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1630 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1631
1632 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1633 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1634 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1635 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1636 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1637
1638 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1639 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1644 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1645 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1650
1651 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1652 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1653 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1654 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1655 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1656 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1657 }
1658
1659 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1662 }
1663
1664 if (HasInt256) {
1665 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1666 // when we have a 256bit-wide blend with immediate.
1669
1670 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1671 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1672 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1673 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1674 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1675 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1676 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1677 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1678 }
1679 }
1680
1681 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1682 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1683 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1685 }
1686
1687 // Extract subvector is special because the value type
1688 // (result) is 128-bit but the source is 256-bit wide.
1689 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1690 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1692 }
1693
1694 // Custom lower several nodes for 256-bit types.
1695 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1696 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1706 }
1707 setF16Action(MVT::v16f16, Expand);
1708 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1709 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1711 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1712 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1713 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1714 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1715
1716 // Only PCLMUL required as we always unroll clmul vectors.
1717 if (Subtarget.hasPCLMUL()) {
1718 for (auto VT : {MVT::v8i32, MVT::v4i64}) {
1721 }
1722 }
1723
1724 if (HasInt256) {
1726
1727 // Custom legalize 2x32 to get a little better code.
1730
1731 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1732 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1734 }
1735 }
1736
1737 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1738 Subtarget.hasF16C()) {
1739 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1742 }
1743 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1746 }
1747 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1748 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1749 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1750 }
1751 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1752 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1753 }
1754
1755 // This block controls legalization of the mask vector sizes that are
1756 // available with AVX512. 512-bit vectors are in a separate block controlled
1757 // by useAVX512Regs.
1758 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1759 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1760 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1761 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1762 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1763 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1764
1768
1769 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1770 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1771 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1772 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1773 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1774 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1775 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1776 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1784
1785 // There is no byte sized k-register load or store without AVX512DQ.
1786 if (!Subtarget.hasDQI()) {
1787 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1788 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1789 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1790 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1791
1796 }
1797
1798 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1799 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1803 }
1804
1805 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1807
1808 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1812
1819 }
1820
1821 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1823 }
1824 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1825 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1828 }
1829 }
1830
1831 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1832 // elements. 512-bits can be disabled based on prefer-vector-width and
1833 // required-vector-width function attributes.
1834 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1835 bool HasBWI = Subtarget.hasBWI();
1836
1837 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1838 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1839 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1840 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1841 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1842 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1843 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1844
1845 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1846 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1847 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1848 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1849 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1850 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1851 if (HasBWI)
1852 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1853 }
1854
1855 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1866 }
1867 setOperationAction(ISD::LRINT, MVT::v16f32,
1868 Subtarget.hasDQI() ? Legal : Custom);
1869 setOperationAction(ISD::LRINT, MVT::v8f64,
1870 Subtarget.hasDQI() ? Legal : Custom);
1871 if (Subtarget.hasDQI())
1872 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1873
1874 setOperationAction(ISD::AND, MVT::i512, Custom);
1875 setOperationAction(ISD::OR, MVT::i512, Custom);
1876 setOperationAction(ISD::XOR, MVT::i512, Custom);
1877
1878 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1883 }
1884
1885 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1890 }
1891
1898
1910
1911 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1912 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1913 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1914 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1915 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1916 if (HasBWI)
1917 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1918
1919 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1920 // to 512-bit rather than use the AVX2 instructions so that we can use
1921 // k-masks.
1922 if (!Subtarget.hasVLX()) {
1923 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1924 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1927 }
1928 }
1929
1931 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1932 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1942
1943 if (HasBWI) {
1944 // Extends from v64i1 masks to 512-bit vectors.
1948 }
1949
1950 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1963
1965 }
1966
1967 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1970 }
1971
1972 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1973 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1974 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1975 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1976
1977 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1978 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1979 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1980 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1981
1982 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1983 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1984 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1985 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1986 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1987 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1988 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1989 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1990
1991 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1992 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1993
1994 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2004
2005 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
2006 // setcc all the way to isel and prefer SETGT in some isel patterns.
2009 }
2010
2011 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
2012 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
2017
2018 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2025 }
2026
2027 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2028 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2029 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2031 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2032 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2033 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2034 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2039 }
2040
2041 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2042 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2043 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2044 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2045 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2046 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2047
2048 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2052 setOperationAction(Opc, MVT::v8i64, Custom);
2053
2054 if (Subtarget.hasDQI())
2055 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2056
2057 if (Subtarget.hasCDI()) {
2058 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2059 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2061 }
2062 } // Subtarget.hasCDI()
2063
2064 if (Subtarget.hasVPOPCNTDQ()) {
2065 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2067 }
2068
2069 // Extract subvector is special because the value type
2070 // (result) is 256-bit but the source is 512-bit wide.
2071 // 128-bit was made Legal under AVX1.
2072 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2073 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2075
2076 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2077 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2087 }
2088 setF16Action(MVT::v32f16, Expand);
2093 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2094 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2095 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2096
2097 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2102 }
2103 if (HasBWI) {
2104 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2107 }
2108 } else {
2109 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2110 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2111 }
2112
2113 if (Subtarget.hasVBMI2()) {
2114 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2117 }
2118
2119 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2120 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2121 }
2122
2123 // Only PCLMUL required as we always unroll clmul vectors.
2124 if (Subtarget.hasPCLMUL()) {
2125 for (auto VT : {MVT::v16i32, MVT::v8i64}) {
2128 }
2129 }
2130
2131 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2132 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2134 }// useAVX512Regs
2135
2136 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2137 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2138 MVT::v4i64}) {
2139 setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom);
2140 setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom);
2141 }
2142 }
2143
2144 // This block controls legalization for operations that don't have
2145 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2146 // narrower widths.
2147 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2148 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2149 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2150 MVT::v16f32, MVT::v8f64})
2152
2153 // These operations are handled on non-VLX by artificially widening in
2154 // isel patterns.
2158
2159 if (Subtarget.hasDQI()) {
2160 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2161 // v2f32 UINT_TO_FP is already custom under SSE2.
2164 "Unexpected operation action!");
2165 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2170 }
2171
2172 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2178 }
2179
2180 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2183 }
2184
2185 // Custom legalize 2x32 to get a little better code.
2188
2189 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2190 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2192
2193 if (Subtarget.hasDQI()) {
2197 setOperationAction(Opc, MVT::v2i64, Custom);
2198 setOperationAction(Opc, MVT::v4i64, Custom);
2199 }
2200 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2201 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2202 }
2203
2204 if (Subtarget.hasCDI()) {
2205 for (auto VT : {MVT::i256, MVT::i512}) {
2206 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2207 continue;
2212 }
2213 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2215 }
2216 } // Subtarget.hasCDI()
2217
2218 if (Subtarget.hasVPOPCNTDQ()) {
2219 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2221 }
2222
2223 // We can try to convert vectors to different sizes to leverage legal
2224 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2225 // then specialize to Legal below.
2226 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2227 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2228 MVT::v16i16, MVT::v8i8})
2230
2231 // Legal vpcompress depends on various AVX512 extensions.
2232 // Legal in AVX512F
2233 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2235
2236 // Legal in AVX512F + AVX512VL
2237 if (Subtarget.hasVLX())
2238 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2239 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2241
2242 // Legal in AVX512F + AVX512VBMI2
2243 if (Subtarget.hasVBMI2())
2244 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2246
2247 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2248 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2249 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2251 }
2252
2253 // This block controls legalization of v32i1/v64i1, which are available with
2254 // AVX512BW.
2255 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2256 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2257 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2258
2259 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2270 }
2271
2272 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2274
2275 // Extends from v32i1 masks to 256-bit vectors.
2279
2280 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2281 MVT::v16f16, MVT::v8f16}) {
2282 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2283 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2284 }
2285
2286 // These operations are handled on non-VLX by artificially widening in
2287 // isel patterns.
2288 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2289
2290 if (Subtarget.hasBITALG()) {
2291 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2293 }
2294 }
2295
2296 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2297 auto setGroup = [&] (MVT VT) {
2308
2321
2323
2326
2332
2338
2342 };
2343
2344 // AVX512_FP16 scalar operations
2345 setGroup(MVT::f16);
2363
2366
2367 if (Subtarget.useAVX512Regs()) {
2368 setGroup(MVT::v32f16);
2374 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2381
2386 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2388 MVT::v32i16);
2389 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2391 MVT::v32i16);
2392 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2394 MVT::v32i16);
2395 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2397 MVT::v32i16);
2398
2402
2403 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2404 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2405
2410 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2411 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2412 }
2413
2418
2419 if (Subtarget.hasVLX()) {
2420 setGroup(MVT::v8f16);
2421 setGroup(MVT::v16f16);
2422
2433
2440
2441 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2444
2448
2449 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2450 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2451 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2452 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2453
2454 // Need to custom widen these to prevent scalarization.
2455 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2456 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2457
2462
2467 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2468 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2469 }
2470 }
2471
2472 if (!Subtarget.useSoftFloat() &&
2473 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2474 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2475 : &X86::VR128RegClass);
2476 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2477 : &X86::VR256RegClass);
2478 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2479 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2480 // Set the operation action Custom to do the customization later.
2483 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2484 setF16Action(VT, Expand);
2485 if (!Subtarget.hasBF16())
2491 }
2492 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2493 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2494 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2495 }
2496 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2497 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2499 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2500 }
2501
2502 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2503 Subtarget.useAVX512Regs()) {
2504 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2505 setF16Action(MVT::v32bf16, Expand);
2506 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2507 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2508 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2510 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2514 }
2515
2516 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2517 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2518 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2519 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2520 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2521 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2522 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2523 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2524 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2525 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2528 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2540 }
2541 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2544 }
2545 }
2546
2547 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2548 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2549 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2550 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2551 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2552 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2553
2554 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2555 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2556 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2557 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2558 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2559
2560 if (Subtarget.hasBWI()) {
2561 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2562 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2563 }
2564
2565 if (Subtarget.hasFP16()) {
2566 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2575 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2584 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2589 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2594 }
2595 }
2596
2597 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2598 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2599 }
2600
2601 // We want to custom lower some of our intrinsics.
2605 if (!Subtarget.is64Bit()) {
2607 }
2608
2609 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2610 // handle type legalization for these operations here.
2611 //
2612 // FIXME: We really should do custom legalization for addition and
2613 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2614 // than generic legalization for 64-bit multiplication-with-overflow, though.
2615 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2616 if (VT == MVT::i64 && !Subtarget.is64Bit())
2617 continue;
2618 // Add/Sub/Mul with overflow operations are custom lowered.
2625
2626 // Support carry in as value rather than glue.
2632 }
2633
2634 // Combine sin / cos into _sincos_stret if it is available.
2637
2638 if (Subtarget.isTargetWin64()) {
2639 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2640 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2641 setOperationAction(ISD::SREM, MVT::i128, Custom);
2642 setOperationAction(ISD::UREM, MVT::i128, Custom);
2651 }
2652
2653 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2654 // is. We should promote the value to 64-bits to solve this.
2655 // This is what the CRT headers do - `fmodf` is an inline header
2656 // function casting to f64 and calling `fmod`.
2657 if (Subtarget.is32Bit() &&
2658 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2659 // clang-format off
2660 for (ISD::NodeType Op :
2678 // TODO: Add ISD:::STRICT_FMODF too once implemented.
2679 ISD::FMODF})
2680 if (isOperationExpandOrLibCall(Op, MVT::f32))
2681 setOperationAction(Op, MVT::f32, Promote);
2682 // clang-format on
2683
2684 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2685 // it, but it's just a wrapper around ldexp.
2686 if (Subtarget.isOSWindows()) {
2688 if (isOperationExpand(Op, MVT::f32))
2689 setOperationAction(Op, MVT::f32, Promote);
2690 }
2691
2692 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
2693 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
2694 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
2695
2696 // We have target-specific dag combine patterns for the following nodes:
2707 ISD::SHL,
2708 ISD::SRA,
2709 ISD::SRL,
2710 ISD::OR,
2711 ISD::AND,
2717 ISD::ADD,
2720 ISD::FADD,
2721 ISD::FSUB,
2722 ISD::FNEG,
2723 ISD::FMA,
2727 ISD::SUB,
2728 ISD::LOAD,
2729 ISD::LRINT,
2731 ISD::MLOAD,
2732 ISD::STORE,
2749 ISD::SETCC,
2750 ISD::MUL,
2751 ISD::XOR,
2759 ISD::FSHL,
2760 ISD::FSHR,
2764
2765 computeRegisterProperties(Subtarget.getRegisterInfo());
2766
2767 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2769 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2771 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2773
2774 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2775 // that needs to benchmarked and balanced with the potential use of vector
2776 // load/store types (PR33329, PR33914).
2779
2780 // Default loop alignment, which can be overridden by -align-loops.
2782
2783 // An out-of-order CPU can speculatively execute past a predictable branch,
2784 // but a conditional move could be stalled by an expensive earlier operation.
2785 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2786 EnableExtLdPromotion = true;
2788
2790
2791 // Default to having -disable-strictnode-mutation on
2792 IsStrictFPEnabled = true;
2793}
2794
2795// This has so far only been implemented for 64-bit MachO.
2797 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2798}
2799
2801 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2802 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2803}
2804
2806 const SDLoc &DL) const {
2807 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2808 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2809 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2810 return SDValue(Node, 0);
2811}
2812
2815 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2816 !Subtarget.hasBWI())
2817 return TypeSplitVector;
2818
2819 // Since v8f16 is legal, widen anything over v4f16.
2820 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2821 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2822 VT.getVectorElementType() == MVT::f16)
2823 return TypeSplitVector;
2824
2825 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2826 VT.getVectorElementType() != MVT::i1)
2827 return TypeWidenVector;
2828
2830}
2831
2833 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2834 const LibcallLoweringInfo *libcallLowering) const {
2835 return X86::createFastISel(funcInfo, libInfo, libcallLowering);
2836}
2837
2838//===----------------------------------------------------------------------===//
2839// Other Lowering Hooks
2840//===----------------------------------------------------------------------===//
2841
2843 bool AssumeSingleUse, bool IgnoreAlignment) {
2844 if (!AssumeSingleUse && !Op.hasOneUse())
2845 return false;
2846 if (!ISD::isNormalLoad(Op.getNode()))
2847 return false;
2848
2849 // If this is an unaligned vector, make sure the target supports folding it.
2850 auto *Ld = cast<LoadSDNode>(Op.getNode());
2851 if (!IgnoreAlignment && !Subtarget.hasAVX() &&
2852 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2853 Ld->getAlign() < Align(16))
2854 return false;
2855
2856 // TODO: If this is a non-temporal load and the target has an instruction
2857 // for it, it should not be folded. See "useNonTemporalLoad()".
2858
2859 return true;
2860}
2861
2863 const X86Subtarget &Subtarget,
2864 bool AssumeSingleUse) {
2865 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2866 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2867 return false;
2868
2869 // We can not replace a wide volatile load with a broadcast-from-memory,
2870 // because that would narrow the load, which isn't legal for volatiles.
2871 auto *Ld = cast<LoadSDNode>(Op.getNode());
2872 return !Ld->isVolatile() ||
2873 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2874}
2875
2877 if (!Op.hasOneUse())
2878 return false;
2879 // Peek through (oneuse) bitcast users
2880 SDNode *User = *Op->user_begin();
2881 while (User->getOpcode() == ISD::BITCAST) {
2882 if (!User->hasOneUse())
2883 return false;
2884 User = *User->user_begin();
2885 }
2886 return ISD::isNormalStore(User);
2887}
2888
2890 if (Op.hasOneUse()) {
2891 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2892 return (ISD::ZERO_EXTEND == Opcode);
2893 }
2894 return false;
2895}
2896
2897// Return true if its cheap to bitcast this to a vector type.
2898static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget,
2899 bool AssumeSingleUse = false) {
2900 if (peekThroughBitcasts(Op).getValueType().isVector())
2901 return true;
2903 return true;
2904 return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
2905 /*IgnoreAlignment=*/true);
2906}
2907
2908static bool isLogicOp(unsigned Opcode) {
2909 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2910 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2911}
2912
/// Return true if \p Opcode is one of the X86-specific DAG nodes that
/// represents a target shuffle operation (i.e. a node whose shuffle mask can
/// be recovered from the node itself). Generic ISD shuffles are not included.
static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::VALIGN:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::SHUF128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VZEXT_MOVL:
  case X86ISD::COMPRESS:
  case X86ISD::EXPAND:
    return true;
  }
}
2955
/// Return true if \p Opcode is a shuffle whose mask comes from a variable
/// (register) operand rather than an immediate, including the 'faux' shuffles
/// built from bitwise logic ops.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
2974
2977 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2979 int ReturnAddrIndex = FuncInfo->getRAIndex();
2980
2981 if (ReturnAddrIndex == 0) {
2982 // Set up a frame object for the return address.
2983 unsigned SlotSize = RegInfo->getSlotSize();
2984 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2985 -(int64_t)SlotSize,
2986 false);
2987 FuncInfo->setRAIndex(ReturnAddrIndex);
2988 }
2989
2990 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2991}
2992
2994 bool HasSymbolicDisplacement) {
2995 // Offset should fit into 32 bit immediate field.
2996 if (!isInt<32>(Offset))
2997 return false;
2998
2999 // If we don't have a symbolic displacement - we don't have any extra
3000 // restrictions.
3001 if (!HasSymbolicDisplacement)
3002 return true;
3003
3004 // We can fold large offsets in the large code model because we always use
3005 // 64-bit offsets.
3006 if (CM == CodeModel::Large)
3007 return true;
3008
  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets,
  // since they may be just off and we may accept pretty large positive ones.
3012 if (CM == CodeModel::Kernel)
3013 return Offset >= 0;
3014
3015 // For other non-large code models we assume that latest small object is 16MB
3016 // before end of 31 bits boundary. We may also accept pretty large negative
3017 // constants knowing that all objects are in the positive half of address
3018 // space.
3019 return Offset < 16 * 1024 * 1024;
3020}
3021
3022/// Return true if the condition is an signed comparison operation.
3023static bool isX86CCSigned(X86::CondCode X86CC) {
3024 switch (X86CC) {
3025 default:
3026 llvm_unreachable("Invalid integer condition!");
3027 case X86::COND_E:
3028 case X86::COND_NE:
3029 case X86::COND_B:
3030 case X86::COND_A:
3031 case X86::COND_BE:
3032 case X86::COND_AE:
3033 return false;
3034 case X86::COND_G:
3035 case X86::COND_GE:
3036 case X86::COND_L:
3037 case X86::COND_LE:
3038 return true;
3039 }
3040}
3041
3043 switch (SetCCOpcode) {
3044 // clang-format off
3045 default: llvm_unreachable("Invalid integer condition!");
3046 case ISD::SETEQ: return X86::COND_E;
3047 case ISD::SETGT: return X86::COND_G;
3048 case ISD::SETGE: return X86::COND_GE;
3049 case ISD::SETLT: return X86::COND_L;
3050 case ISD::SETLE: return X86::COND_LE;
3051 case ISD::SETNE: return X86::COND_NE;
3052 case ISD::SETULT: return X86::COND_B;
3053 case ISD::SETUGT: return X86::COND_A;
3054 case ISD::SETULE: return X86::COND_BE;
3055 case ISD::SETUGE: return X86::COND_AE;
3056 // clang-format on
3057 }
3058}
3059
3060/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3061/// condition code, returning the condition code and the LHS/RHS of the
3062/// comparison to make.
3064 bool isFP, SDValue &LHS, SDValue &RHS,
3065 SelectionDAG &DAG) {
3066 if (!isFP) {
3068 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
3069 // X > -1 -> X == 0, jump !sign.
3070 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3071 return X86::COND_NS;
3072 }
3073 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3074 // X < 0 -> X == 0, jump on sign.
3075 return X86::COND_S;
3076 }
3077 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3078 // X >= 0 -> X == 0, jump on !sign.
3079 return X86::COND_NS;
3080 }
3081 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3082 // X < 1 -> X <= 0
3083 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3084 return X86::COND_LE;
3085 }
3086 }
3087
3088 return TranslateIntegerX86CC(SetCCOpcode);
3089 }
3090
3091 // First determine if it is required or is profitable to flip the operands.
3092
3093 // If LHS is a foldable load, but RHS is not, flip the condition.
3094 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3095 !ISD::isNON_EXTLoad(RHS.getNode())) {
3096 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3097 std::swap(LHS, RHS);
3098 }
3099
3100 switch (SetCCOpcode) {
3101 default: break;
3102 case ISD::SETOLT:
3103 case ISD::SETOLE:
3104 case ISD::SETUGT:
3105 case ISD::SETUGE:
3106 std::swap(LHS, RHS);
3107 break;
3108 }
3109
3110 // On a floating point condition, the flags are set as follows:
3111 // ZF PF CF op
3112 // 0 | 0 | 0 | X > Y
3113 // 0 | 0 | 1 | X < Y
3114 // 1 | 0 | 0 | X == Y
3115 // 1 | 1 | 1 | unordered
3116 switch (SetCCOpcode) {
3117 // clang-format off
3118 default: llvm_unreachable("Condcode should be pre-legalized away");
3119 case ISD::SETUEQ:
3120 case ISD::SETEQ: return X86::COND_E;
3121 case ISD::SETOLT: // flipped
3122 case ISD::SETOGT:
3123 case ISD::SETGT: return X86::COND_A;
3124 case ISD::SETOLE: // flipped
3125 case ISD::SETOGE:
3126 case ISD::SETGE: return X86::COND_AE;
3127 case ISD::SETUGT: // flipped
3128 case ISD::SETULT:
3129 case ISD::SETLT: return X86::COND_B;
3130 case ISD::SETUGE: // flipped
3131 case ISD::SETULE:
3132 case ISD::SETLE: return X86::COND_BE;
3133 case ISD::SETONE:
3134 case ISD::SETNE: return X86::COND_NE;
3135 case ISD::SETUO: return X86::COND_P;
3136 case ISD::SETO: return X86::COND_NP;
3137 case ISD::SETOEQ:
3138 case ISD::SETUNE: return X86::COND_INVALID;
3139 // clang-format on
3140 }
3141}
3142
3143/// Is there a floating point cmov for the specific X86 condition code?
3144/// Current x86 isa includes the following FP cmov instructions:
3145/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3146static bool hasFPCMov(unsigned X86CC) {
3147 switch (X86CC) {
3148 default:
3149 return false;
3150 case X86::COND_B:
3151 case X86::COND_BE:
3152 case X86::COND_E:
3153 case X86::COND_P:
3154 case X86::COND_A:
3155 case X86::COND_AE:
3156 case X86::COND_NE:
3157 case X86::COND_NP:
3158 return true;
3159 }
3160}
3161
3162static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3163 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3164 VT.is512BitVector();
3165}
3166
3168 const CallBase &I,
3169 MachineFunction &MF,
3170 unsigned Intrinsic) const {
3171 Info.flags = MachineMemOperand::MONone;
3172 Info.offset = 0;
3173
3175 if (!IntrData) {
3176 switch (Intrinsic) {
3177 case Intrinsic::x86_aesenc128kl:
3178 case Intrinsic::x86_aesdec128kl:
3179 Info.opc = ISD::INTRINSIC_W_CHAIN;
3180 Info.ptrVal = I.getArgOperand(1);
3181 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3182 Info.align = Align(1);
3183 Info.flags |= MachineMemOperand::MOLoad;
3184 return true;
3185 case Intrinsic::x86_aesenc256kl:
3186 case Intrinsic::x86_aesdec256kl:
3187 Info.opc = ISD::INTRINSIC_W_CHAIN;
3188 Info.ptrVal = I.getArgOperand(1);
3189 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3190 Info.align = Align(1);
3191 Info.flags |= MachineMemOperand::MOLoad;
3192 return true;
3193 case Intrinsic::x86_aesencwide128kl:
3194 case Intrinsic::x86_aesdecwide128kl:
3195 Info.opc = ISD::INTRINSIC_W_CHAIN;
3196 Info.ptrVal = I.getArgOperand(0);
3197 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3198 Info.align = Align(1);
3199 Info.flags |= MachineMemOperand::MOLoad;
3200 return true;
3201 case Intrinsic::x86_aesencwide256kl:
3202 case Intrinsic::x86_aesdecwide256kl:
3203 Info.opc = ISD::INTRINSIC_W_CHAIN;
3204 Info.ptrVal = I.getArgOperand(0);
3205 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3206 Info.align = Align(1);
3207 Info.flags |= MachineMemOperand::MOLoad;
3208 return true;
3209 case Intrinsic::x86_cmpccxadd32:
3210 case Intrinsic::x86_cmpccxadd64:
3211 case Intrinsic::x86_atomic_bts:
3212 case Intrinsic::x86_atomic_btc:
3213 case Intrinsic::x86_atomic_btr: {
3214 Info.opc = ISD::INTRINSIC_W_CHAIN;
3215 Info.ptrVal = I.getArgOperand(0);
3216 unsigned Size = I.getType()->getScalarSizeInBits();
3217 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3218 Info.align = Align(Size);
3221 return true;
3222 }
3223 case Intrinsic::x86_atomic_bts_rm:
3224 case Intrinsic::x86_atomic_btc_rm:
3225 case Intrinsic::x86_atomic_btr_rm: {
3226 Info.opc = ISD::INTRINSIC_W_CHAIN;
3227 Info.ptrVal = I.getArgOperand(0);
3228 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3229 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3230 Info.align = Align(Size);
3233 return true;
3234 }
3235 case Intrinsic::x86_aadd32:
3236 case Intrinsic::x86_aadd64:
3237 case Intrinsic::x86_aand32:
3238 case Intrinsic::x86_aand64:
3239 case Intrinsic::x86_aor32:
3240 case Intrinsic::x86_aor64:
3241 case Intrinsic::x86_axor32:
3242 case Intrinsic::x86_axor64:
3243 case Intrinsic::x86_atomic_add_cc:
3244 case Intrinsic::x86_atomic_sub_cc:
3245 case Intrinsic::x86_atomic_or_cc:
3246 case Intrinsic::x86_atomic_and_cc:
3247 case Intrinsic::x86_atomic_xor_cc: {
3248 Info.opc = ISD::INTRINSIC_W_CHAIN;
3249 Info.ptrVal = I.getArgOperand(0);
3250 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3251 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3252 Info.align = Align(Size);
3255 return true;
3256 }
3257 }
3258 return false;
3259 }
3260
3261 switch (IntrData->Type) {
3264 case TRUNCATE_TO_MEM_VI32: {
3265 Info.opc = ISD::INTRINSIC_VOID;
3266 Info.ptrVal = I.getArgOperand(0);
3267 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3269 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3270 ScalarVT = MVT::i8;
3271 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3272 ScalarVT = MVT::i16;
3273 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3274 ScalarVT = MVT::i32;
3275
3276 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3277 Info.align = Align(1);
3278 Info.flags |= MachineMemOperand::MOStore;
3279 break;
3280 }
3281 case GATHER:
3282 case GATHER_AVX2: {
3283 Info.opc = ISD::INTRINSIC_W_CHAIN;
3284 Info.ptrVal = nullptr;
3285 MVT DataVT = MVT::getVT(I.getType());
3286 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3287 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3288 IndexVT.getVectorNumElements());
3289 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3290 Info.align = Align(1);
3291 Info.flags |= MachineMemOperand::MOLoad;
3292 break;
3293 }
3294 case SCATTER: {
3295 Info.opc = ISD::INTRINSIC_VOID;
3296 Info.ptrVal = nullptr;
3297 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3298 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3299 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3300 IndexVT.getVectorNumElements());
3301 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3302 Info.align = Align(1);
3303 Info.flags |= MachineMemOperand::MOStore;
3304 break;
3305 }
3306 default:
3307 return false;
3308 }
3309
3310 return true;
3311}
3312
3313/// Returns true if the target can instruction select the
3314/// specified FP immediate natively. If false, the legalizer will
3315/// materialize the FP immediate as a load from a constant pool.
3317 bool ForCodeSize) const {
3318 for (const APFloat &FPImm : LegalFPImmediates)
3319 if (Imm.bitwiseIsEqual(FPImm))
3320 return true;
3321 return false;
3322}
3323
3325 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3326 std::optional<unsigned> ByteOffset) const {
3327 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3328
3329 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3330 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3331 N = *N->user_begin();
3332 return N;
3333 };
3334
3335 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3336 // relocation target a movq or addq instruction: don't let the load shrink.
3337 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3338 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3339 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3340 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3341
3342 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3343 // those uses are extracted directly into a store, then the extract + store
3344 // can be store-folded, or (4) any use will be used by legal full width
3345 // instruction. Then, it's probably not worth splitting the load.
3346 EVT VT = Load->getValueType(0);
3347 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3348 !SDValue(Load, 0).hasOneUse()) {
3349 bool FullWidthUse = false;
3350 bool AllExtractStores = true;
3351 for (SDUse &Use : Load->uses()) {
3352 // Skip uses of the chain value. Result 0 of the node is the load value.
3353 if (Use.getResNo() != 0)
3354 continue;
3355
3356 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3357
3358 // If this use is an extract + store, it's probably not worth splitting.
3359 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3360 all_of(User->uses(), [&](const SDUse &U) {
3361 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3362 return Inner->getOpcode() == ISD::STORE;
3363 }))
3364 continue;
3365
3366 AllExtractStores = false;
3367
3368 // If any use is a full width legal/target bin op, then assume its legal
3369 // and won't split.
3370 if (isBinOp(User->getOpcode()) &&
3371 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3372 User->getOpcode() > ISD::BUILTIN_OP_END))
3373 FullWidthUse = true;
3374 }
3375
3376 if (AllExtractStores)
3377 return false;
3378
    // If we have a user that uses the full vector width, then this use is
    // only worth splitting if the offset isn't 0 (to avoid an
    // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3382 if (FullWidthUse)
3383 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3384 }
3385
3386 return true;
3387}
3388
3389/// Returns true if it is beneficial to convert a load of a constant
3390/// to just the constant itself.
3392 Type *Ty) const {
3393 assert(Ty->isIntegerTy());
3394
3395 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3396 if (BitSize == 0 || BitSize > 64)
3397 return false;
3398 return true;
3399}
3400
3402 // If we are using XMM registers in the ABI and the condition of the select is
3403 // a floating-point compare and we have blendv or conditional move, then it is
3404 // cheaper to select instead of doing a cross-register move and creating a
3405 // load that depends on the compare result.
3406 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3407 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3408}
3409
3411 // TODO: It might be a win to ease or lift this restriction, but the generic
3412 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3413 if (VT.isVector() && Subtarget.hasAVX512())
3414 return false;
3415
3416 return true;
3417}
3418
3420 SDValue C) const {
3421 // TODO: We handle scalars using custom code, but generic combining could make
3422 // that unnecessary.
3423 APInt MulC;
3424 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3425 return false;
3426
3427 // Find the type this will be legalized too. Otherwise we might prematurely
3428 // convert this to shl+add/sub and then still have to type legalize those ops.
3429 // Another choice would be to defer the decision for illegal types until
3430 // after type legalization. But constant splat vectors of i64 can't make it
3431 // through type legalization on 32-bit targets so we would need to special
3432 // case vXi64.
3433 while (getTypeAction(Context, VT) != TypeLegal)
3434 VT = getTypeToTransformTo(Context, VT);
3435
3436 // If vector multiply is legal, assume that's faster than shl + add/sub.
3437 // Multiply is a complex op with higher latency and lower throughput in
3438 // most implementations, sub-vXi32 vector multiplies are always fast,
3439 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3440 // is always going to be slow.
3441 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3442 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3443 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3444 return false;
3445
3446 // shl+add, shl+sub, shl+add+neg
3447 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3448 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3449}
3450
3452 unsigned Index) const {
3454 return false;
3455
3456 // Mask vectors support all subregister combinations and operations that
3457 // extract half of vector.
3458 if (ResVT.getVectorElementType() == MVT::i1)
3459 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3460 (Index == ResVT.getVectorNumElements()));
3461
3462 return (Index % ResVT.getVectorNumElements()) == 0;
3463}
3464
3466 unsigned Opc = VecOp.getOpcode();
3467
3468 // Assume target opcodes can't be scalarized.
3469 // TODO - do we have any exceptions?
3470 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3471 return false;
3472
3473 // If the vector op is not supported, try to convert to scalar.
3474 EVT VecVT = VecOp.getValueType();
3476 return true;
3477
3478 // If the vector op is supported, but the scalar op is not, the transform may
3479 // not be worthwhile.
3480 EVT ScalarVT = VecVT.getScalarType();
3481 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3482}
3483
3485 bool) const {
3486 // TODO: Allow vectors?
3487 if (VT.isVector())
3488 return false;
3489 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3490}
3491
3493 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3494 // i32/i64 or can rely on BSF passthrough value.
3495 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3496 Subtarget.hasBitScanPassThrough() ||
3497 (!Ty->isVectorTy() &&
3498 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3499}
3500
3502 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3503 // passthrough value.
3504 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3505 Subtarget.hasBitScanPassThrough();
3506}
3507
3509 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3510 // expensive than a straight movsd. On the other hand, it's important to
3511 // shrink long double fp constant since fldt is very slow.
3512 return !Subtarget.hasSSE2() || VT == MVT::f80;
3513}
3514
3516 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3517 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3518}
3519
3521 const SelectionDAG &DAG,
3522 const MachineMemOperand &MMO) const {
3523 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3524 BitcastVT.getVectorElementType() == MVT::i1)
3525 return false;
3526
3527 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3528 return false;
3529
3530 if (LoadVT.isVector() && BitcastVT.isVector()) {
3531 // If both types are legal vectors, it's always ok to convert them.
3532 // Don't convert to an illegal type.
3533 if (isTypeLegal(LoadVT))
3534 return isTypeLegal(BitcastVT);
3535 }
3536
3537 // If we have a large vector type (even if illegal), don't bitcast to large
3538 // (illegal) scalar types. Better to load fewer vectors and extract.
3539 if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
3540 BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
3541 return false;
3542
3543 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3544}
3545
3547 const MachineFunction &MF) const {
3548 // Do not merge to float value size (128 bytes) if no implicit
3549 // float attribute is set.
3550 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3551
3552 if (NoFloat) {
3553 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3554 return (MemVT.getSizeInBits() <= MaxIntSize);
3555 }
3556 // Make sure we don't merge greater than our preferred vector
3557 // width.
3558 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3559 return false;
3560
3561 return true;
3562}
3563
3565 return Subtarget.hasFastLZCNT();
3566}
3567
3569 const Instruction &AndI) const {
3570 return true;
3571}
3572
3574 EVT VT = Y.getValueType();
3575
3576 if (VT.isVector())
3577 return false;
3578
3579 if (!Subtarget.hasBMI())
3580 return false;
3581
3582 // There are only 32-bit and 64-bit forms for 'andn'.
3583 if (VT != MVT::i32 && VT != MVT::i64)
3584 return false;
3585
3586 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3587}
3588
3590 EVT VT = Y.getValueType();
3591
3592 if (!VT.isVector())
3593 return hasAndNotCompare(Y);
3594
3595 // Vector.
3596
3597 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3598 return false;
3599
3600 if (VT == MVT::v4i32)
3601 return true;
3602
3603 return Subtarget.hasSSE2();
3604}
3605
3607 return X.getValueType().isScalarInteger(); // 'bt'
3608}
3609
3613 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3614 SelectionDAG &DAG) const {
3615 // Does baseline recommend not to perform the fold by default?
3617 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3618 return false;
3619 // For scalars this transform is always beneficial.
3620 if (X.getValueType().isScalarInteger())
3621 return true;
3622 // If all the shift amounts are identical, then transform is beneficial even
3623 // with rudimentary SSE2 shifts.
3624 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3625 return true;
3626 // If we have AVX2 with it's powerful shift operations, then it's also good.
3627 if (Subtarget.hasAVX2())
3628 return true;
3629 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3630 return NewShiftOpcode == ISD::SHL;
3631}
3632
3634 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3635 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3636 if (!VT.isInteger())
3637 return ShiftOpc;
3638
3639 bool PreferRotate = false;
3640 if (VT.isVector()) {
    // For vectors, if we have rotate instruction support, then it is
    // definitely best. Otherwise it is not clear which form is best, so just
    // don't make changes.
3643 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3644 VT.getScalarType() == MVT::i64);
3645 } else {
3646 // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
3647 // rotate unless we have a zext mask+shr.
3648 PreferRotate = Subtarget.hasBMI2();
3649 if (!PreferRotate) {
3650 unsigned MaskBits =
3651 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3652 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3653 }
3654 }
3655
3656 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3657 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3658
3659 if (PreferRotate && MayTransformRotate)
3660 return ISD::ROTL;
3661
3662 // If vector we don't really get much benefit swapping around constants.
3663 // Maybe we could check if the DAG has the flipped node already in the
3664 // future.
3665 if (VT.isVector())
3666 return ShiftOpc;
3667
3668 // See if the beneficial to swap shift type.
3669 if (ShiftOpc == ISD::SHL) {
3670 // If the current setup has imm64 mask, then inverse will have
3671 // at least imm32 mask (or be zext i32 -> i64).
3672 if (VT == MVT::i64)
3673 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3674 : ShiftOpc;
3675
3676 // We can only benefit if req at least 7-bit for the mask. We
3677 // don't want to replace shl of 1,2,3 as they can be implemented
3678 // with lea/add.
3679 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3680 }
3681
3682 if (VT == MVT::i64)
3683 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3684 // extremely efficient.
3685 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3686
3687 // Keep small shifts as shl so we can generate add/lea.
3688 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3689 }
3690
3691 // We prefer rotate for vectors of if we won't get a zext mask with SRL
3692 // (PreferRotate will be set in the latter case).
3693 if (PreferRotate || !MayTransformRotate || VT.isVector())
3694 return ShiftOpc;
3695
3696 // Non-vector type and we have a zext mask with SRL.
3697 return ISD::SRL;
3698}
3699
3702 const Value *Lhs,
3703 const Value *Rhs) const {
3704 using namespace llvm::PatternMatch;
3705 int BaseCost = BrMergingBaseCostThresh.getValue();
3706 // With CCMP, branches can be merged in a more efficient way.
3707 if (BaseCost >= 0 && Subtarget.hasCCMP())
3708 BaseCost += BrMergingCcmpBias;
3709 // a == b && a == c is a fast pattern on x86.
3710 if (BaseCost >= 0 && Opc == Instruction::And &&
3713 BaseCost += 1;
3714
3715 // For OR conditions with EQ comparisons, prefer splitting into branches
3716 // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
3717 // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
3718 // comparisons (SLT, SGT) that can be optimized.
3719 if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
3722 return {-1, -1, -1};
3723
3724 return {BaseCost, BrMergingLikelyBias.getValue(),
3725 BrMergingUnlikelyBias.getValue()};
3726}
3727
3729 return N->getOpcode() != ISD::FP_EXTEND;
3730}
3731
3733 const SDNode *N) const {
3734 assert(((N->getOpcode() == ISD::SHL &&
3735 N->getOperand(0).getOpcode() == ISD::SRL) ||
3736 (N->getOpcode() == ISD::SRL &&
3737 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3738 "Expected shift-shift mask");
3739 // TODO: Should we always create i64 masks? Or only folded immediates?
3740 EVT VT = N->getValueType(0);
3741 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3742 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3743 // Only fold if the shift values are equal - so it folds to AND.
3744 // TODO - we should fold if either is a non-uniform vector but we don't do
3745 // the fold for non-splats yet.
3746 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3747 }
3749}
3750
3752 EVT VT = Y.getValueType();
3753
3754 // For vectors, we don't have a preference, but we probably want a mask.
3755 if (VT.isVector())
3756 return false;
3757
3758 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3759 return VT.getScalarSizeInBits() <= MaxWidth;
3760}
3761
3764 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3766 !Subtarget.isOSWindows())
3769 ExpansionFactor);
3770}
3771
3773 // Any legal vector type can be splatted more efficiently than
3774 // loading/spilling from memory.
3775 return isTypeLegal(VT);
3776}
3777
3779 MVT VT = MVT::getIntegerVT(NumBits);
3780 if (isTypeLegal(VT))
3781 return VT;
3782
3783 // PMOVMSKB can handle this.
3784 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3785 return MVT::v16i8;
3786
3787 // VPMOVMSKB can handle this.
3788 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3789 return MVT::v32i8;
3790
3791 // TODO: Allow 64-bit type for 32-bit target.
3792 // TODO: 512-bit types should be allowed, but make sure that those
3793 // cases are handled in combineVectorSizedSetCCEquality().
3794
3796}
3797
3798/// Val is the undef sentinel value or equal to the specified value.
3799static bool isUndefOrEqual(int Val, int CmpVal) {
3800 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3801}
3802
3803/// Return true if every element in Mask is the undef sentinel value or equal to
3804/// the specified value.
3805static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3806 return llvm::all_of(Mask, [CmpVal](int M) {
3807 return (M == SM_SentinelUndef) || (M == CmpVal);
3808 });
3809}
3810
3811/// Return true if every element in Mask, beginning from position Pos and ending
3812/// in Pos+Size is the undef sentinel value or equal to the specified value.
3813static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3814 unsigned Size) {
3815 return llvm::all_of(Mask.slice(Pos, Size),
3816 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3817}
3818
3819/// Val is either the undef or zero sentinel value.
3820static bool isUndefOrZero(int Val) {
3821 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3822}
3823
3824/// Return true if every element in Mask, beginning from position Pos and ending
3825/// in Pos+Size is the undef sentinel value.
3826static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3827 return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef));
3828}
3829
3830/// Return true if the mask creates a vector whose lower half is undefined.
3832 unsigned NumElts = Mask.size();
3833 return isUndefInRange(Mask, 0, NumElts / 2);
3834}
3835
3836/// Return true if the mask creates a vector whose upper half is undefined.
3838 unsigned NumElts = Mask.size();
3839 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3840}
3841
/// Return true if Val falls within the half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
3846
3847/// Return true if the value of any element in Mask falls within the specified
3848/// range (L, H].
3849static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3850 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3851}
3852
3853/// Return true if the value of any element in Mask is the zero sentinel value.
3854static bool isAnyZero(ArrayRef<int> Mask) {
3855 return llvm::any_of(Mask, equal_to(SM_SentinelZero));
3856}
3857
/// Return true if Val is undef or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
3863
/// Return true if every element in Mask is undef or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
3870
/// Return true if Val is undef, zero or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
3876
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
3883
3884/// Return true if every element in Mask, is an in-place blend/select mask or is
3885/// undef.
3886[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3887 unsigned NumElts = Mask.size();
3888 for (auto [I, M] : enumerate(Mask))
3889 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3890 return false;
3891 return true;
3892}
3893
3894/// Return true if every element in Mask, beginning
3895/// from position Pos and ending in Pos + Size, falls within the specified
3896/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3897static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3898 unsigned Size, int Low, int Step = 1) {
3899 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3900 if (!isUndefOrEqual(Mask[i], Low))
3901 return false;
3902 return true;
3903}
3904
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is
/// zero.
                                          unsigned Size, int Low,
                                          int Step = 1) {
  // Low is advanced by Step each iteration to track the expected value.
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}
3916
3917/// Return true if every element in Mask, beginning
3918/// from position Pos and ending in Pos+Size is undef or is zero.
3919static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3920 unsigned Size) {
3921 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3922}
3923
/// Return true if every element of a single input is referenced by the shuffle
/// mask. i.e. it just permutes them all.
  unsigned NumElts = Mask.size();
  APInt DemandedElts = APInt::getZero(NumElts);
  // Record every in-bounds (first input) element the mask references;
  // sentinels and out-of-range values are ignored.
  for (int M : Mask)
    if (isInRange(M, 0, NumElts))
      DemandedElts.setBit(M);
  // A complete permute must reference every input element at least once.
  return DemandedElts.isAllOnes();
}
3934
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
                                    SmallVectorImpl<int> &WidenedMask) {
  // Each adjacent pair of narrow mask elements must merge into a single
  // wide element for the widening to be legal.
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, its trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      // Zero paired with a real element can't be represented when widened.
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
3993
                                    const APInt &Zeroable,
                                    bool V2IsZero,
                                    SmallVectorImpl<int> &WidenedMask) {
  // Create an alternative mask with info about zeroable elements.
  // Here we do not set undef elements as zeroable.
  SmallVector<int, 64> ZeroableMask(Mask);
  if (V2IsZero) {
    assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
    // Rewrite zeroable lanes as explicit zero sentinels so the base
    // widening check can pair them up.
    for (int i = 0, Size = Mask.size(); i != Size; ++i)
      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
        ZeroableMask[i] = SM_SentinelZero;
  }
  return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
4009
  // Feasibility check only - the widened mask itself is discarded.
  SmallVector<int, 32> WidenedMask;
  return canWidenShuffleElements(Mask, WidenedMask);
}
4014
4015// Attempt to narrow/widen shuffle mask until it matches the target number of
4016// elements.
4017static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
4018 SmallVectorImpl<int> &ScaledMask) {
4019 unsigned NumSrcElts = Mask.size();
4020 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4021 "Illegal shuffle scale factor");
4022
4023 // Narrowing is guaranteed to work.
4024 if (NumDstElts >= NumSrcElts) {
4025 int Scale = NumDstElts / NumSrcElts;
4026 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
4027 return true;
4028 }
4029
4030 // We have to repeat the widening until we reach the target size, but we can
4031 // split out the first widening as it sets up ScaledMask for us.
4032 if (canWidenShuffleElements(Mask, ScaledMask)) {
4033 while (ScaledMask.size() > NumDstElts) {
4034 SmallVector<int, 16> WidenedMask;
4035 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
4036 return false;
4037 ScaledMask = std::move(WidenedMask);
4038 }
4039 return true;
4040 }
4041
4042 return false;
4043}
4044
4045static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
4046 SmallVector<int, 32> ScaledMask;
4047 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
4048}
4049
4050// Helper to grow the shuffle mask for a larger value type.
4051// NOTE: This is different to scaleShuffleElements which is a same size type.
4052static void growShuffleMask(ArrayRef<int> SrcMask,
4053 SmallVectorImpl<int> &DstMask,
4054 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
4055 assert(DstMask.empty() && "Expected an empty shuffle mas");
4056 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
4057 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4058 unsigned NumSrcElts = SrcMask.size();
4059 DstMask.assign(SrcMask.begin(), SrcMask.end());
4060 for (int &M : DstMask) {
4061 if (M < 0)
4062 continue;
4063 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4064 }
4065 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
4066}
4067
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
  // Handles both integer-zero and FP positive-zero constant nodes.
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}
4072
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
                              const SDLoc &dl, bool IsMask = false) {

  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // If i64 isn't legal (32-bit mode), build the vector from twice as many
  // i32 elements and bitcast back to VT at the end.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    // Negative values only denote undef when building a mask vector.
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    // In split mode, emit the (undef or zero) upper 32-bit half too.
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
4105
static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // Without native i64 (32-bit mode), each 64-bit constant becomes two i32s.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  MVT EltIntVT = EltVT.changeTypeToInteger();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      // An undef lane takes one (or two, when split) UNDEF elements.
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Emit low then high 32-bit halves (little-endian element order).
      Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
    } else {
      // Build as an integer constant then bitcast to the (possibly FP)
      // element type.
      Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
4141
                              SelectionDAG &DAG, const SDLoc &dl) {
  // Convenience overload: no undef lanes.
  APInt Undefs = APInt::getZero(Bits.size());
  return getConstVector(Bits, Undefs, VT, DAG, dl);
}
4147
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    // SSE1 only: no 128-bit integer vectors exist, so use v4f32 +0.0.
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.isFloatingPoint() &&
    Vec = DAG.getConstantFP(+0.0, dl, VT);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // AVX512 mask registers: a plain zero constant of the i1 vector type.
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    // Canonical integer-zero form: same width expressed as <N x i32>.
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
4175
4176// Helper to determine if the ops are all the extracted subvectors come from a
4177// single source. If we allow commute they don't have to be in order (Lo/Hi).
4178static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4179 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4180 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4181 LHS.getValueType() != RHS.getValueType() ||
4182 LHS.getOperand(0) != RHS.getOperand(0))
4183 return SDValue();
4184
4185 SDValue Src = LHS.getOperand(0);
4186 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4187 return SDValue();
4188
4189 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4190 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4191 RHS.getConstantOperandAPInt(1) == NumElts) ||
4192 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4193 LHS.getConstantOperandAPInt(1) == NumElts))
4194 return Src;
4195
4196 return SDValue();
4197}
4198
4199static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4200 const SDLoc &dl, unsigned vectorWidth) {
4201 EVT VT = Vec.getValueType();
4202 EVT ElVT = VT.getVectorElementType();
4203 unsigned ResultNumElts =
4204 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4205 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4206
4207 assert(ResultVT.getSizeInBits() == vectorWidth &&
4208 "Illegal subvector extraction");
4209
4210 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4211 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4212 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4213
4214 // This is the index of the first element of the vectorWidth-bit chunk
4215 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4216 IdxVal &= ~(ElemsPerChunk - 1);
4217
4218 // If the input is a buildvector just emit a smaller one.
4219 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4220 return DAG.getBuildVector(ResultVT, dl,
4221 Vec->ops().slice(IdxVal, ElemsPerChunk));
4222
4223 // Check if we're extracting the upper undef of a widening pattern.
4224 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4225 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4226 isNullConstant(Vec.getOperand(2)))
4227 return DAG.getUNDEF(ResultVT);
4228
4229 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4230}
4231
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
          Vec.getValueType().is512BitVector()) &&
         "Unexpected vector size!");
  // Thin wrapper: fixed 128-bit extraction width.
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
4245
4246/// Generate a DAG to grab 256-bits from a 512-bit vector.
4247static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4248 SelectionDAG &DAG, const SDLoc &dl) {
4249 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4250 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4251}
4252
4253static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4254 SelectionDAG &DAG, const SDLoc &dl,
4255 unsigned vectorWidth) {
4256 assert((vectorWidth == 128 || vectorWidth == 256) &&
4257 "Unsupported vector width");
4258 // Inserting UNDEF is Result
4259 if (Vec.isUndef())
4260 return Result;
4261
4262 // Insert the relevant vectorWidth bits.
4263 EVT VT = Vec.getValueType();
4264 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4265 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4266
4267 // This is the index of the first element of the vectorWidth-bit chunk
4268 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4269 IdxVal &= ~(ElemsPerChunk - 1);
4270 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4271}
4272
4273/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4274/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4275/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4276/// simple superregister reference. Idx is an index in the 128 bits
4277/// we want. It need not be aligned to a 128-bit boundary. That makes
4278/// lowering INSERT_VECTOR_ELT operations easier.
4279static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4280 SelectionDAG &DAG, const SDLoc &dl) {
4281 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4282 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4283}
4284
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  EVT VecVT = Vec.getValueType();
         VecVT.getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");
  // If the upper 128-bits of a build vector are already undef/zero, then try to
  // widen from the lower 128-bits.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
    unsigned NumSrcElts = VecVT.getVectorNumElements();
    ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
    if (all_of(Hi, [&](SDValue V) {
          return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
        }))
      Vec = extract128BitVector(Vec, 0, DAG, dl);
  }
  // Insert the (possibly shrunk) vector at index 0 of a zero or undef base
  // of the wide type.
  SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
                                : DAG.getUNDEF(VT);
  return DAG.getInsertSubvector(dl, Res, Vec, 0);
}
4308
4309/// Widen a vector to a larger size with the same scalar type, with the new
4310/// elements either zero or undef.
4311static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4312 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4313 const SDLoc &dl, unsigned WideSizeInBits) {
4314 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4315 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4316 "Unsupported vector widening type");
4317 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4318 MVT SVT = Vec.getSimpleValueType().getScalarType();
4319 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4320 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4321}
4322
4323/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4324/// and bitcast with integer types.
4325static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4326 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4327 unsigned NumElts = VT.getVectorNumElements();
4328 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4329 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4330 return VT;
4331}
4332
4333/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4334/// bitcast with integer types.
4335static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4336 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4337 const SDLoc &dl) {
4338 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4339 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4340}
4341
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
                             SelectionDAG &DAG) {
  assert(Ops.empty() && "Expected an empty ops vector");

  // Trivial case: the node is already an explicit concatenation.
  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  // Recognize 2-way concats expressed as insert_subvector chains.
  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    // Only handle the case where the subvector is exactly half the vector.
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
      // insert_subvector(undef, x, lo)
      if (Idx == 0 && Src.isUndef()) {
        Ops.push_back(Sub);
        Ops.push_back(DAG.getUNDEF(SubVT));
        return true;
      }
      if (Idx == (VT.getVectorNumElements() / 2)) {
        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
            Src.getOperand(1).getValueType() == SubVT &&
            isNullConstant(Src.getOperand(2))) {
          // Attempt to recurse into inner (matching) concats.
          SDValue Lo = Src.getOperand(1);
          SDValue Hi = Sub;
          SmallVector<SDValue, 2> LoOps, HiOps;
          if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
              collectConcatOps(Hi.getNode(), HiOps, DAG) &&
              LoOps.size() == HiOps.size()) {
            Ops.append(LoOps);
            Ops.append(HiOps);
            return true;
          }
          Ops.push_back(Lo);
          Ops.push_back(Hi);
          return true;
        }
        // insert_subvector(x, extract_subvector(x, lo), hi)
        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
          // Both halves are the same subvector.
          Ops.append(2, Sub);
          return true;
        }
        // insert_subvector(undef, x, hi)
        if (Src.isUndef()) {
          Ops.push_back(DAG.getUNDEF(SubVT));
          Ops.push_back(Sub);
          return true;
        }
      }
    }
  }

  // An extract_subvector of a concat can be answered by slicing the
  // collected subvector list, provided the extraction is aligned to the
  // subvector boundaries.
  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    EVT VT = N->getValueType(0);
    SDValue Src = N->getOperand(0);
    uint64_t Idx = N->getConstantOperandVal(1);

    // Collect all the subvectors from the source vector and slice off the
    // extraction.
    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
        VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
        (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
        (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
      unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
      unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
      Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
      return true;
    }
  }

  // On failure Ops must be left untouched for the caller.
  assert(Ops.empty() && "Expected an empty ops vector");
  return false;
}
4426
// Helper to check if \p V can be split into subvectors and the upper subvectors
// are all undef. In which case return the lower subvector.
                                 SelectionDAG &DAG) {
  SmallVector<SDValue> SubOps;
  if (!collectConcatOps(V.getNode(), SubOps, DAG))
    return SDValue();

  unsigned NumSubOps = SubOps.size();
  unsigned HalfNumSubOps = NumSubOps / 2;
  assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");

  // Fail if any subvector in the upper half is meaningful (non-undef).
  ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
  if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
    return SDValue();

  // Reassemble just the lower half at half the element count.
  EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
  ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
}
4447
// Helper to check if we can access all the constituent subvectors without any
// extract ops.
  // Succeeds iff V is already an explicit concatenation of subvectors.
  return collectConcatOps(V.getNode(), Ops, DAG);
}
4454
4455static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4456 const SDLoc &dl) {
4457 EVT VT = Op.getValueType();
4458 unsigned NumElems = VT.getVectorNumElements();
4459 unsigned SizeInBits = VT.getSizeInBits();
4460 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4461 "Can't split odd sized vector");
4462
4464 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4465 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4466 unsigned HalfOps = SubOps.size() / 2;
4467 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4468 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4469 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4470 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4471 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4472 return std::make_pair(Lo, Hi);
4473 }
4474
4475 // If this is a splat value (with no-undefs) then use the lower subvector,
4476 // which should be a free extraction.
4477 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4478 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4479 return std::make_pair(Lo, Lo);
4480
4481 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4482 return std::make_pair(Lo, Hi);
4483}
4484
/// Break an operation into 2 half sized ops and then concatenate the results.
  unsigned NumOps = Op.getNumOperands();
  EVT VT = Op.getValueType();

  // Extract the LHS Lo/Hi vectors
  for (unsigned I = 0; I != NumOps; ++I) {
    SDValue SrcOp = Op.getOperand(I);
    // Scalar operands (e.g. shift amounts) are shared by both halves.
    if (!SrcOp.getValueType().isVector()) {
      LoOps[I] = HiOps[I] = SrcOp;
      continue;
    }
    std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
  }

  // Re-emit the operation at half width on each operand set, then glue the
  // two results back together.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
}
4508
/// Break an unary integer operation into 2 half sized ops and then
/// concatenate the result back.
                                 const SDLoc &dl) {
  // Make sure we only try to split 256/512-bit types to avoid creating
  // narrow vectors.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
             VT.getVectorNumElements() &&
         "Unexpected VTs!");
  // All checks passed: defer to the generic split-and-concat helper.
  return splitVectorOp(Op, DAG, dl);
}
4524
/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
                                  const SDLoc &dl) {
  // Assert that all the types match.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  // Both operands and the result share VT; reuse the generic splitter.
  return splitVectorOp(Op, DAG, dl);
}
4536
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>
                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                          F Builder, bool CheckBWI = true,
                          bool AllowAVX512 = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  // Determine the widest legal vector width for this target and derive the
  // number of pieces VT must be split into.
  unsigned NumSubs = 1;
  if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
                      (!CheckBWI && Subtarget.useAVX512Regs()))) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
    }
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    }
  } else {
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
    }
  }

  // No split needed - apply the builder to the whole operands.
  if (NumSubs == 1)
    return Builder(DAG, DL, Ops);

  // Build each piece from the matching subvector of every operand, then
  // concatenate all piece results.
  for (unsigned i = 0; i != NumSubs; ++i) {
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    }
    Subs.push_back(Builder(DAG, DL, SubOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
4585
// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
// targets.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
                             const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX512() && "AVX512 target expected");
  MVT SVT = VT.getScalarType();

  // If we have a 32/64 splatted constant, splat it to DstTy to
  // encourage a foldable broadcast'd operand.
  auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
    unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
    // AVX512 broadcasts 32/64-bit operands.
    // TODO: Support float once getAVX512Node is used by fp-ops.
    if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
      return SDValue();
    // If we're not widening, don't bother if we're not bitcasting.
    if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
      return SDValue();
      APInt SplatValue, SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      // Only rebroadcast when the splat is a clean, full-element-width
      // constant with no undef bits.
      if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, OpEltSizeInBits) &&
          !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
        return DAG.getConstant(SplatValue, DL, DstVT);
    }
    return SDValue();
  };

  // Without VLX, sub-512-bit ops must be performed in 512-bit registers and
  // the result extracted back out afterwards.
  bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());

  MVT DstVT = VT;
  if (Widen)
    DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());

  // Canonicalize src operands.
  SmallVector<SDValue> SrcOps(Ops);
  for (SDValue &Op : SrcOps) {
    MVT OpVT = Op.getSimpleValueType();
    // Just pass through scalar operands.
    if (!OpVT.isVector())
      continue;
    assert(OpVT == VT && "Vector type mismatch");

    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
      Op = BroadcastOp;
      continue;
    }

    // Just widen the subvector by inserting into an undef wide vector.
    if (Widen)
      Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
  }

  SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);

  // Perform the 512-bit op then extract the bottom subvector.
  if (Widen)
    Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
  return Res;
}
4650
4651 /// Insert i1-subvector to i1-vector.
// Lowers INSERT_SUBVECTOR for vXi1 mask vectors: Op = insert_subvector(Vec,
// SubVec, Idx). Strategy: widen everything to a kshift-capable mask type
// (widenMaskVectorType), use KSHIFTL/KSHIFTR to position/zero bit ranges,
// OR the pieces together, then extract the original width.
// NOTE(review): this listing lost hyperlinked lines during extraction - the
// function signature (original line 4652) and an `if` guard (original line
// 4718) are missing; restore them from the upstream file.
4653 const X86Subtarget &Subtarget) {
4654
4655 SDLoc dl(Op);
4656 SDValue Vec = Op.getOperand(0);
4657 SDValue SubVec = Op.getOperand(1);
4658 SDValue Idx = Op.getOperand(2);
4659 unsigned IdxVal = Op.getConstantOperandVal(2);
4660
4661 // Inserting undef is a nop. We can just return the original vector.
4662 if (SubVec.isUndef())
4663 return Vec;
4664
4665 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4666 return Op;
4667
4668 MVT OpVT = Op.getSimpleValueType();
4669 unsigned NumElems = OpVT.getVectorNumElements();
4670 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4671
4672 // Extend to natively supported kshift.
4673 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4674
4675 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4676 // if necessary.
4677 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4678 // May need to promote to a legal type.
4679 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4680 DAG.getConstant(0, dl, WideOpVT),
4681 SubVec, Idx);
4682 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4683 }
4684
4685 MVT SubVecVT = SubVec.getSimpleValueType();
4686 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4687 assert(IdxVal + SubVecNumElems <= NumElems &&
4688 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4689 "Unexpected index value in INSERT_SUBVECTOR");
4690
4691 SDValue Undef = DAG.getUNDEF(WideOpVT);
4692
4693 if (IdxVal == 0) {
4694 // Zero lower bits of the Vec
// A KSHIFTR/KSHIFTL round trip by SubVecNumElems clears the low bits of Vec,
// making room for the zero-extended SubVec to be ORed in below.
4695 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4696 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4697 ZeroIdx);
4698 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4699 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4700 // Merge them together, SubVec should be zero extended.
4701 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4702 DAG.getConstant(0, dl, WideOpVT),
4703 SubVec, ZeroIdx);
4704 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4705 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4706 }
4707
// Widen SubVec into the low lanes of the wide mask type.
4708 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4709 Undef, SubVec, ZeroIdx);
4710
4711 if (Vec.isUndef()) {
4712 assert(IdxVal != 0 && "Unexpected index");
4713 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4714 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4715 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4716 }
4717
// NOTE(review): the `if` guard opening this braced region (original line
// 4718) was lost in extraction; judging by the closing brace after the
// return below, it presumably tests that Vec is an all-zeros build vector -
// confirm against the upstream source.
4719 assert(IdxVal != 0 && "Unexpected index");
4720 // If upper elements of Vec are known undef, then just shift into place.
4721 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4722 [](SDValue V) { return V.isUndef(); })) {
4723 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4724 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4725 } else {
4726 NumElems = WideOpVT.getVectorNumElements();
4727 unsigned ShiftLeft = NumElems - SubVecNumElems;
4728 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
// Shift fully left to discard bits above the subvector, then right so the
// subvector lands at bit IdxVal with zeros everywhere else.
4729 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4730 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4731 if (ShiftRight != 0)
4732 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4733 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4734 }
4735 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4736 }
4737
4738 // Simple case when we put subvector in the upper part
4739 if (IdxVal + SubVecNumElems == NumElems) {
4740 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4741 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4742 if (SubVecNumElems * 2 == NumElems) {
4743 // Special case, use legal zero extending insert_subvector. This allows
4744 // isel to optimize when bits are known zero.
4745 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4746 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4747 DAG.getConstant(0, dl, WideOpVT),
4748 Vec, ZeroIdx);
4749 } else {
4750 // Otherwise use explicit shifts to zero the bits.
4751 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4752 Undef, Vec, ZeroIdx);
4753 NumElems = WideOpVT.getVectorNumElements();
4754 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4755 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4756 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4757 }
4758 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4759 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4760 }
4761
4762 // Inserting into the middle is more complicated.
4763
4764 NumElems = WideOpVT.getVectorNumElements();
4765
4766 // Widen the vector if needed.
4767 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4768
4769 unsigned ShiftLeft = NumElems - SubVecNumElems;
4770 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4771
4772 // Do an optimization for the most frequently used types.
4773 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
// Punch a hole in Vec with an inverted bit-range mask constant, position
// SubVec with a KSHIFTL/KSHIFTR pair, and OR the two together.
4774 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4775 Mask0.flipAllBits();
4776 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4777 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4778 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4779 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4780 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4781 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4782 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4783 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4784
4785 // Reduce to original width if needed.
4786 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4787 }
4788
// v64i1 on a 32-bit target: avoid materializing the 64-bit integer mask
// constant above - build the result purely from kshift pairs instead.
4789 // Clear the upper bits of the subvector and move it to its insert position.
4790 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4791 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4792 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4793 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4794
4795 // Isolate the bits below the insertion point.
4796 unsigned LowShift = NumElems - IdxVal;
4797 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4798 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4799 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4800 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4801
4802 // Isolate the bits after the last inserted bit.
4803 unsigned HighShift = IdxVal + SubVecNumElems;
4804 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4805 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4806 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4807 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4808
4809 // Now OR all 3 pieces together.
4810 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4811 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4812
4813 // Reduce to original width if needed.
4814 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4815 }
4816
// Concatenates two identically-typed subvectors V1 and V2 into a single
// vector of twice the element count: result = <V1, V2>.
// NOTE(review): the opening signature line (original line 4817) was lost in
// extraction - restore it from the upstream file.
4818 const SDLoc &dl) {
4819 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4820 EVT SubVT = V1.getValueType();
4821 EVT SubSVT = SubVT.getScalarType();
4822 unsigned SubNumElts = SubVT.getVectorNumElements();
4823 unsigned SubVectorWidth = SubVT.getSizeInBits();
// Result type: same scalar type, double the element count.
4824 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
// Insert V1 into the low half of an undef vector, then V2 into the high half.
4825 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4826 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4827 }
4828
4829/// Returns a vector of specified type with all bits set.
4830/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4831/// Then bitcast to their original type, ensuring they get CSE'd.
4832static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4833 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4834 "Expected a 128/256/512-bit vector type");
4835 unsigned NumElts = VT.getSizeInBits() / 32;
4836 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4837 return DAG.getBitcast(VT, Vec);
4838}
4839
// Builds an any/sign/zero extension of In to VT, choosing between the plain
// ISD::*_EXTEND node and the *_EXTEND_VECTOR_INREG form depending on whether
// the element counts match, and trimming the input to the consumed low
// subvector first.
4840 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4841 SDValue In, SelectionDAG &DAG) {
4842 EVT InVT = In.getValueType();
4843 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4844
4845 // Canonicalize Opcode to general extension version.
// NOTE(review): each case below originally carried a second hyperlinked
// case label (original lines 4848/4852/4856, presumably the matching
// ISD::*_EXTEND_VECTOR_INREG opcodes) that was lost in extraction.
4846 switch (Opcode) {
4847 case ISD::ANY_EXTEND:
4849 Opcode = ISD::ANY_EXTEND;
4850 break;
4851 case ISD::SIGN_EXTEND:
4853 Opcode = ISD::SIGN_EXTEND;
4854 break;
4855 case ISD::ZERO_EXTEND:
4857 Opcode = ISD::ZERO_EXTEND;
4858 break;
4859 default:
4860 llvm_unreachable("Unknown extension opcode");
4861 }
4862
4863 // For 256-bit vectors, we only need the lower (128-bit) input half.
4864 // For 512-bit vectors, we only need the lower input half or quarter.
4865 if (InVT.getSizeInBits() > 128) {
4866 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4867 "Expected VTs to be the same size!");
// Scale = output/input element width ratio; only 1/Scale of the input
// elements are consumed, so extract just that low subvector (>= 128 bits).
4868 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4869 In = extractSubVector(In, 0, DAG, DL,
4870 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4871 InVT = In.getValueType();
4872 }
4873
// If element counts differ, the plain extend node would be malformed; use
// the *_EXTEND_VECTOR_INREG form which extends only the low input elements.
4874 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4875 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4876
4877 return DAG.getNode(Opcode, DL, VT, In);
4878 }
4879
4880 // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
// Selects bits from LHS where Mask is 1 and from RHS where Mask is 0.
// NOTE(review): the signature line (original line 4881) was lost in
// extraction - restore it from the upstream file.
4882 SDValue Mask, SelectionDAG &DAG) {
4883 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
// X86ISD::ANDNP computes ~Mask & RHS in a single node.
4884 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4885 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4886 }
4887
// Builds the shuffle-mask equivalent of an X86 unpcklo/unpckhi on VT:
// interleaves elements from the low (Lo=true) or high half of each 128-bit
// lane; Unary interleaves a vector with itself instead of with a second
// operand.
// NOTE(review): the signature line (original line 4888) was lost in
// extraction - restore it from the upstream file.
4889 bool Lo, bool Unary) {
4890 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4891 "Illegal vector type to unpack");
4892 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4893 int NumElts = VT.getVectorNumElements();
4894 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4895 for (int i = 0; i < NumElts; ++i) {
4896 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
// Step through the selected half of the current lane, alternating between
// the two sources (odd i picks the second operand when not Unary).
4897 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4898 Pos += (Unary ? 0 : NumElts * (i % 2));
4899 Pos += (Lo ? 0 : NumEltsInLane / 2);
4900 Mask.push_back(Pos);
4901 }
4902 }
4903
4904 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4905 /// imposed by AVX and specific to the unary pattern. Example:
4906 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4907 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
// NOTE(review): the signature line (original line 4908) was lost in
// extraction - restore it from the upstream file.
4909 bool Lo) {
4910 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4911 int NumElts = VT.getVectorNumElements();
4912 for (int i = 0; i < NumElts; ++i) {
// Each source element is duplicated; Hi starts from the vector's upper half.
4913 int Pos = i / 2;
4914 Pos += (Lo ? 0 : NumElts / 2);
4915 Mask.push_back(Pos);
4916 }
4917 }
4918
4919 // Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4920 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4921 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
// NOTE(review): the condition guarding this constant-folding block
// (original lines 4922-4923) was lost in extraction - it presumably checks
// that both inputs are constant build vectors; confirm against upstream.
4924 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4925 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4926 int M = Mask[I];
// Negative mask entries denote undef lanes - keep the UNDEF placeholder.
4927 if (M < 0)
4928 continue;
// Indices < NumElts select from V1, the rest from V2.
4929 SDValue V = (M < NumElts) ? V1 : V2;
4930 if (V.isUndef())
4931 continue;
4932 Ops[I] = V.getOperand(M % NumElts);
4933 }
4934 return DAG.getBuildVector(VT, dl, Ops);
4935 }
4936
4937 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4938 }
4939
4940 /// Returns a vector_shuffle node for an unpackl operation.
4941 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4942 SDValue V1, SDValue V2) {
// NOTE(review): the local mask declaration (original line 4943, a
// SmallVector of int) was lost in extraction.
4944 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4945 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4946 }
4947
4948 /// Returns a vector_shuffle node for an unpackh operation.
4949 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4950 SDValue V1, SDValue V2) {
// NOTE(review): the local mask declaration (original line 4951, a
// SmallVector of int) was lost in extraction.
4952 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4953 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4954 }
4955
4956 /// Returns a node that packs the LHS + RHS nodes together at half width.
4957 /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4958 /// TODO: Add subvector splitting if/when we have a need for it.
// LHS/RHS have elements twice as wide as VT's; the result keeps the low
// (PackHiHalf=false) or high (PackHiHalf=true) half of each source element.
// Saturation is avoided by pre-masking/shifting the inputs when needed.
4959 static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4960 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4961 bool PackHiHalf = false) {
4962 MVT OpVT = LHS.getSimpleValueType();
4963 unsigned EltSizeInBits = VT.getScalarSizeInBits();
// PACKUS is only available pre-SSE4.1 for the 16->8-bit case.
4964 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4965 assert(OpVT == RHS.getSimpleValueType() &&
4966 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4967 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4968 "Unexpected PACK operand types");
4969 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4970 "Unexpected PACK result type");
4971
4972 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4973 if (EltSizeInBits == 32) {
4974 SmallVector<int> PackMask;
4975 int Offset = PackHiHalf ? 1 : 0;
4976 int NumElts = VT.getVectorNumElements();
// Emulate 128-bit-lane PACK element ordering: per lane, the LHS halves then
// the RHS halves.
4977 for (int I = 0; I != NumElts; I += 4) {
4978 PackMask.push_back(I + Offset);
4979 PackMask.push_back(I + Offset + 2);
4980 PackMask.push_back(I + Offset + NumElts);
4981 PackMask.push_back(I + Offset + NumElts + 2);
4982 }
4983 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4984 DAG.getBitcast(VT, RHS), PackMask);
4985 }
4986
4987 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4988 if (!PackHiHalf) {
// PACKUS is exact if the upper half of every source element is known zero;
// PACKSS is exact if the sources are sign-extensions of their low halves.
4989 if (UsePackUS &&
4990 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4991 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4992 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4993
4994 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4995 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4996 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4997 }
4998
4999 // Fallback to sign/zero extending the requested half and pack.
5000 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
5001 if (UsePackUS) {
5002 if (PackHiHalf) {
// Logical shift right moves the high half down with zero fill.
5003 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
5004 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
5005 } else {
// Mask away the high half so PACKUS cannot saturate.
5006 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
5007 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
5008 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
5009 };
5010 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5011 };
5012
// No PACKUS available: sign-fill the requested half via shl+sra so PACKSS
// reproduces it exactly.
5013 if (!PackHiHalf) {
5014 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
5015 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
5016 }
5017 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
5018 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
5019 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5020 }
5021
5022 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5023 /// This produces a shuffle where the low element of V2 is swizzled into the
5024 /// zero/undef vector, landing at element Idx.
5025 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
// NOTE(review): the signature line (original line 5026) was lost in
// extraction - restore it from the upstream file.
5027 bool IsZero,
5028 const X86Subtarget &Subtarget,
5029 SelectionDAG &DAG) {
5030 MVT VT = V2.getSimpleValueType();
// First shuffle source: an all-zeros vector when IsZero, undef otherwise.
5031 SDValue V1 = IsZero
5032 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5033 int NumElems = VT.getVectorNumElements();
5034 SmallVector<int, 16> MaskVec(NumElems);
5035 for (int i = 0; i != NumElems; ++i)
5036 // If this is the insertion idx, put the low elt of V2 here.
5037 MaskVec[i] = (i == Idx) ? NumElems : i;
5038 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5039 }
5040
// Peeks through an X86ISD::Wrapper around a pointer and returns the
// underlying constant-pool node, or null if it is not one.
// NOTE(review): the signature line (original line 5041) and the second half
// of the wrapper check (original line 5043, presumably X86ISD::WrapperRIP)
// were lost in extraction.
5042 if (Ptr.getOpcode() == X86ISD::Wrapper ||
5044 Ptr = Ptr.getOperand(0);
5045 return dyn_cast<ConstantPoolSDNode>(Ptr);
5046 }
5047
5048 // TODO: Add support for non-zero offsets.
// Returns the IR Constant behind a constant-pool base pointer, rejecting
// machine constant-pool entries and entries with a non-zero offset.
// NOTE(review): the signature and the CNode initialization (original lines
// 5049-5050) were lost in extraction.
5051 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5052 return nullptr;
5053 return CNode->getConstVal();
5054 }
5055
// Returns the IR Constant read by a plain (non-extending, non-atomic) load
// from a constant-pool address, or null.
// NOTE(review): the signature line (original line 5056) was lost in
// extraction.
5057 if (!Load || !ISD::isNormalLoad(Load))
5058 return nullptr;
5059 return getTargetConstantFromBasePtr(Load->getBasePtr());
5060 }
5061
5066
// NOTE(review): extraction dropped original lines 5062-5065 (apparently an
// SDValue-taking overload of getTargetConstantFromNode) and line 5068 (the
// qualified member-function name of this X86TargetLowering method); restore
// both from the upstream file.
5067 const Constant *
5069 assert(LD && "Unexpected null LoadSDNode");
5070 return getTargetConstantFromNode(LD);
5071 }
5072
// Predicate: true when N is a VSELECT with a vXi1 condition and an all-zeros
// false operand on an AVX512 subtarget.
// NOTE(review): the signature line (original line 5073) was lost in
// extraction; parameter names N and Subtarget are inferred from the body -
// confirm the function name against the upstream file.
5074 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
5075 SDValue Cond = N->getOperand(0);
5076 SDValue RHS = N->getOperand(2);
5077 EVT CondVT = Cond.getValueType();
5078 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
5079 CondVT.getVectorElementType() == MVT::i1 &&
5080 ISD::isBuildVectorAllZeros(RHS.getNode());
5081 }
5082
5083 // Extract raw constant bits from constant pools.
// Attempts to view Op as a constant and split its bits into NumElts chunks
// of EltSizeInBits each, returning them in EltBits with UndefElts marking
// fully-undef chunks. Handles scalars, build vectors, constant-pool loads,
// broadcasts, insert/extract_subvector and shuffles of constants.
// AllowWholeUndefs permits fully-undef elements; AllowPartialUndefs permits
// elements where only some source bits were undef (those bits read as zero).
5084 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5085 APInt &UndefElts,
5086 SmallVectorImpl<APInt> &EltBits,
5087 bool AllowWholeUndefs = true,
5088 bool AllowPartialUndefs = false) {
5089 assert(EltBits.empty() && "Expected an empty EltBits vector");
5090
// NOTE(review): original line 5091 was lost in extraction - presumably a
// peekThroughBitcasts normalization of Op; confirm against upstream.
5092
5093 EVT VT = Op.getValueType();
5094 unsigned SizeInBits = VT.getSizeInBits();
5095 unsigned NumElts = SizeInBits / EltSizeInBits;
5096
5097 // Can't split constant.
5098 if ((SizeInBits % EltSizeInBits) != 0)
5099 return false;
5100
5101 // Bitcast a source array of element bits to the target size.
5102 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5103 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5104 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5105 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5106 "Constant bit sizes don't match");
5107
5108 // Don't split if we don't allow undef bits.
5109 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5110 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5111 return false;
5112
5113 // If we're already the right size, don't bother bitcasting.
5114 if (NumSrcElts == NumElts) {
5115 UndefElts = UndefSrcElts;
5116 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5117 return true;
5118 }
5119
5120 // Extract all the undef/constant element data and pack into single bitsets.
5121 APInt UndefBits(SizeInBits, 0);
5122 APInt MaskBits(SizeInBits, 0);
5123
5124 for (unsigned i = 0; i != NumSrcElts; ++i) {
5125 unsigned BitOffset = i * SrcEltSizeInBits;
5126 if (UndefSrcElts[i])
5127 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5128 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5129 }
5130
5131 // Split the undef/constant single bitset data into the target elements.
5132 UndefElts = APInt(NumElts, 0);
5133 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5134
5135 for (unsigned i = 0; i != NumElts; ++i) {
5136 unsigned BitOffset = i * EltSizeInBits;
5137 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5138
5139 // Only treat an element as UNDEF if all bits are UNDEF.
5140 if (UndefEltBits.isAllOnes()) {
5141 if (!AllowWholeUndefs)
5142 return false;
5143 UndefElts.setBit(i);
5144 continue;
5145 }
5146
5147 // If only some bits are UNDEF then treat them as zero (or bail if not
5148 // supported).
5149 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5150 return false;
5151
5152 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5153 }
5154 return true;
5155 };
5156
5157 // Collect constant bits and insert into mask/undef bit masks.
5158 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5159 unsigned UndefBitIndex) {
5160 if (!Cst)
5161 return false;
5162 if (isa<UndefValue>(Cst)) {
5163 Undefs.setBit(UndefBitIndex);
5164 return true;
5165 }
5166 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5167 Mask = CInt->getValue();
5168 return true;
5169 }
5170 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5171 Mask = CFP->getValueAPF().bitcastToAPInt();
5172 return true;
5173 }
5174 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5175 Type *Ty = CDS->getType();
5176 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5177 Type *EltTy = CDS->getElementType();
5178 bool IsInteger = EltTy->isIntegerTy();
5179 bool IsFP =
5180 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5181 if (!IsInteger && !IsFP)
5182 return false;
5183 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5184 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5185 if (IsInteger)
5186 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5187 else
5188 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5189 I * EltBits);
5190 return true;
5191 }
5192 return false;
5193 };
5194
5195 // Handle UNDEFs.
5196 if (Op.isUndef()) {
5197 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5198 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5199 return CastBitData(UndefSrcElts, SrcEltBits);
5200 }
5201
5202 // Extract scalar constant bits.
5203 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5204 APInt UndefSrcElts = APInt::getZero(1);
5205 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5206 return CastBitData(UndefSrcElts, SrcEltBits);
5207 }
5208 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5209 APInt UndefSrcElts = APInt::getZero(1);
5210 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5211 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5212 return CastBitData(UndefSrcElts, SrcEltBits);
5213 }
5214
5215 // Extract constant bits from build vector.
5216 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5217 BitVector Undefs;
5218 SmallVector<APInt> SrcEltBits;
5219 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5220 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5221 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5222 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5223 if (Undefs[I])
5224 UndefSrcElts.setBit(I);
5225 return CastBitData(UndefSrcElts, SrcEltBits);
5226 }
5227 }
5228
5229 // Extract constant bits from constant pool vector.
5230 if (auto *Cst = getTargetConstantFromNode(Op)) {
5231 Type *CstTy = Cst->getType();
5232 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5233 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5234 return false;
5235
5236 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5237 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5238 if ((SizeInBits % SrcEltSizeInBits) != 0)
5239 return false;
5240
5241 APInt UndefSrcElts(NumSrcElts, 0);
5242 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5243 for (unsigned i = 0; i != NumSrcElts; ++i)
5244 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5245 UndefSrcElts, i))
5246 return false;
5247
5248 return CastBitData(UndefSrcElts, SrcEltBits);
5249 }
5250
5251 // Extract constant bits from a broadcasted constant pool scalar.
5252 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5253 EltSizeInBits <= VT.getScalarSizeInBits()) {
5254 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5255 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5256 return false;
5257
5258 SDValue Ptr = MemIntr->getBasePtr();
5259 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5260 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5261 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5262
5263 APInt UndefSrcElts(NumSrcElts, 0);
5264 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5265 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
// Replicate the single collected scalar (and its undef-ness) to all lanes.
5266 if (UndefSrcElts[0])
5267 UndefSrcElts.setBits(0, NumSrcElts);
5268 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5269 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5270 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5271 return CastBitData(UndefSrcElts, SrcEltBits);
5272 }
5273 }
5274 }
5275
5276 // Extract constant bits from a subvector broadcast.
5277 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5278 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5279 SDValue Ptr = MemIntr->getBasePtr();
5280 // The source constant may be larger than the subvector broadcast,
5281 // ensure we extract the correct subvector constants.
5282 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5283 Type *CstTy = Cst->getType();
5284 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5285 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5286 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5287 (SizeInBits % SubVecSizeInBits) != 0)
5288 return false;
5289 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5290 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5291 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5292 APInt UndefSubElts(NumSubElts, 0);
5293 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5294 APInt(CstEltSizeInBits, 0));
// Collect one subvector's elements, then replicate them NumSubVecs times.
5295 for (unsigned i = 0; i != NumSubElts; ++i) {
5296 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5297 UndefSubElts, i))
5298 return false;
5299 for (unsigned j = 1; j != NumSubVecs; ++j)
5300 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5301 }
5302 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5303 UndefSubElts);
5304 return CastBitData(UndefSubElts, SubEltBits);
5305 }
5306 }
5307
5308 // Extract a rematerialized scalar constant insertion.
5309 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5310 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5311 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5312 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5313 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5314
// Lane 0 holds the scalar; VZEXT_MOVL zeros all the remaining lanes.
5315 APInt UndefSrcElts(NumSrcElts, 0);
5316 SmallVector<APInt, 64> SrcEltBits;
5317 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5318 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5319 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5320 return CastBitData(UndefSrcElts, SrcEltBits);
5321 }
5322
5323 // Insert constant bits from a base and sub vector sources.
5324 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5325 // If bitcasts to larger elements we might lose track of undefs - don't
5326 // allow any to be safe.
5327 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5328 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5329
5330 APInt UndefSrcElts, UndefSubElts;
5331 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5332 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5333 UndefSubElts, EltSubBits,
5334 AllowWholeUndefs && AllowUndefs,
5335 AllowPartialUndefs && AllowUndefs) &&
5336 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5337 UndefSrcElts, EltSrcBits,
5338 AllowWholeUndefs && AllowUndefs,
5339 AllowPartialUndefs && AllowUndefs)) {
5340 unsigned BaseIdx = Op.getConstantOperandVal(2);
5341 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5342 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5343 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5344 return CastBitData(UndefSrcElts, EltSrcBits);
5345 }
5346 }
5347
5348 // Extract constant bits from a subvector's source.
5349 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5350 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5351 EltBits, AllowWholeUndefs,
5352 AllowPartialUndefs)) {
5353 EVT SrcVT = Op.getOperand(0).getValueType();
5354 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5355 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5356 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5357 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5358 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5359 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5360 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5361
// Trim the source's element data down to the extracted window.
5362 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5363 if ((BaseIdx + NumSubElts) != NumSrcElts)
5364 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5365 if (BaseIdx != 0)
5366 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5367 return true;
5368 }
5369
5370 // Extract constant bits from shuffle node sources.
5371 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5372 // TODO - support shuffle through bitcasts.
5373 if (EltSizeInBits != VT.getScalarSizeInBits())
5374 return false;
5375
5376 ArrayRef<int> Mask = SVN->getMask();
5377 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5378 llvm::any_of(Mask, [](int M) { return M < 0; }))
5379 return false;
5380
// Only demand constant bits from an operand the mask actually references.
5381 APInt UndefElts0, UndefElts1;
5382 SmallVector<APInt, 32> EltBits0, EltBits1;
5383 if (isAnyInRange(Mask, 0, NumElts) &&
5384 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5385 UndefElts0, EltBits0, AllowWholeUndefs,
5386 AllowPartialUndefs))
5387 return false;
5388 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5389 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5390 UndefElts1, EltBits1, AllowWholeUndefs,
5391 AllowPartialUndefs))
5392 return false;
5393
5394 UndefElts = APInt::getZero(NumElts);
5395 for (int i = 0; i != (int)NumElts; ++i) {
5396 int M = Mask[i];
5397 if (M < 0) {
5398 UndefElts.setBit(i);
5399 EltBits.push_back(APInt::getZero(EltSizeInBits));
5400 } else if (M < (int)NumElts) {
5401 if (UndefElts0[M])
5402 UndefElts.setBit(i);
5403 EltBits.push_back(EltBits0[M]);
5404 } else {
5405 if (UndefElts1[M - NumElts])
5406 UndefElts.setBit(i);
5407 EltBits.push_back(EltBits1[M - NumElts]);
5408 }
5409 }
5410 return true;
5411 }
5412
5413 return false;
5414 }
5415
5416namespace llvm {
5417namespace X86 {
// Returns true (and sets SplatVal) when Op is a constant whose defined
// elements, at Op's scalar width, all share a single value; undef elements
// are ignored when searching for the splat value.
5418 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5419 APInt UndefElts;
5420 SmallVector<APInt, 16> EltBits;
// NOTE(review): the call opening this condition (original line 5421,
// presumably `if (getTargetConstantBitsFromNode(`) was lost in extraction.
5422 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5423 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5424 int SplatIndex = -1;
5425 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5426 if (UndefElts[i])
5427 continue;
// Two differing defined elements -> not a splat.
5428 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5429 SplatIndex = -1;
5430 break;
5431 }
5432 SplatIndex = i;
5433 }
// SplatIndex stays -1 when every element was undef - that is not a splat.
5434 if (0 <= SplatIndex) {
5435 SplatVal = EltBits[SplatIndex];
5436 return true;
5437 }
5438 }
5439
5440 return false;
5441 }
5442
5443int getRoundingModeX86(unsigned RM) {
5444 switch (static_cast<::llvm::RoundingMode>(RM)) {
5445 // clang-format off
5446 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5447 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5448 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5449 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5450 default: return X86::rmInvalid;
5451 // clang-format on
5452 }
5453}
5454
5455} // namespace X86
5456} // namespace llvm
5457
// Extracts a constant shuffle mask from MaskNode as raw integer indices, one
// per MaskEltSizeInBits chunk. Whole-undef elements are tolerated (their
// bits read as zero); partially-undef elements cause failure.
// NOTE(review): extraction dropped the first signature line (original line
// 5458) and the RawMask output-parameter line (original line 5460).
5459 unsigned MaskEltSizeInBits,
5461 APInt &UndefElts) {
5462 // Extract the raw target constant bits.
5463 SmallVector<APInt, 64> EltBits;
5464 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5465 EltBits, /* AllowWholeUndefs */ true,
5466 /* AllowPartialUndefs */ false))
5467 return false;
5468
5469 // Insert the extracted elements into the mask.
5470 for (const APInt &Elt : EltBits)
5471 RawMask.push_back(Elt.getZExtValue());
5472
5473 return true;
5474 }
5475
5476static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5477 bool AllowUndefs) {
5478 APInt UndefElts;
5479 SmallVector<APInt, 64> EltBits;
5480 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5481 /*AllowWholeUndefs*/ AllowUndefs,
5482 /*AllowPartialUndefs*/ false))
5483 return false;
5484
5485 bool IsPow2OrUndef = true;
5486 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5487 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5488 return IsPow2OrUndef;
5489}
5490
5491 // Helper to attempt to return a cheaper, bit-inverted version of \p V.
// Returns an SDValue X with NOT(X) == V (possibly needing a bitcast at the
// caller), or an empty SDValue if no cheaper inverted form is found.
// NOTE(review): the signature line (original line 5492) was lost in
// extraction - restore it from the upstream file.
5493 // TODO: don't always ignore oneuse constraints.
5494 V = peekThroughBitcasts(V);
5495 EVT VT = V.getValueType();
5496
5497 // Match not(xor X, -1) -> X.
5498 if (V.getOpcode() == ISD::XOR &&
5499 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5500 isAllOnesConstant(V.getOperand(1))))
5501 return V.getOperand(0);
5502
5503 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5504 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5505 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5506 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5507 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5508 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5509 V.getOperand(1));
5510 }
5511 }
5512
5513 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5514 if (V.getOpcode() == X86ISD::PCMPGT &&
5515 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5516 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5517 V.getOperand(0).hasOneUse()) {
5518 APInt UndefElts;
5519 SmallVector<APInt> EltBits;
5520 if (getTargetConstantBitsFromNode(V.getOperand(0),
5521 V.getScalarValueSizeInBits(), UndefElts,
5522 EltBits) &&
5523 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5524 // Don't fold min_signed_value -> (min_signed_value - 1)
// C - 1 would wrap for INT_MIN elements, so the fold is abandoned entirely
// if any element is the minimum signed value.
5525 bool MinSigned = false;
5526 for (APInt &Elt : EltBits) {
5527 MinSigned |= Elt.isMinSignedValue();
5528 Elt -= 1;
5529 }
5530 if (!MinSigned) {
5531 SDLoc DL(V);
5532 MVT VT = V.getSimpleValueType();
5533 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5534 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5535 }
5536 }
5537 }
5538
5539 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
// NOTE(review): the CatOps vector declaration (original line 5540, a
// SmallVector of SDValue) was lost in extraction.
5541 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
// Every concatenated operand must itself be invertible, else give up.
5542 for (SDValue &CatOp : CatOps) {
5543 SDValue NotCat = IsNOT(CatOp, DAG);
5544 if (!NotCat)
5545 return SDValue();
5546 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5547 }
5548 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5549 }
5550
5551 // Match not(or(not(X),not(Y))) -> and(X, Y).
5552 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5553 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5554 // TODO: Handle cases with single NOT operand -> ANDNP
5555 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5556 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5557 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5558 DAG.getBitcast(VT, Op1));
5559 }
5560
5561 return SDValue();
5562 }
5563
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
                                  bool Unary, unsigned NumStages = 1) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  // Packing operates per 128-bit lane; each stage halves the stride, so
  // NumStages stages select every (1 << NumStages)'th element of each lane.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
  // Offset == 0 for unary packs so both halves reference the same source.
  unsigned Offset = Unary ? 0 : NumElts;
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");

  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      // First the strided elements of the lane from the first source...
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane));
      // ...then the matching elements from the second source.
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
    }
  }
}
5587
5588// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5589static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5590 APInt &DemandedLHS, APInt &DemandedRHS) {
5591 int NumLanes = VT.getSizeInBits() / 128;
5592 int NumElts = DemandedElts.getBitWidth();
5593 int NumInnerElts = NumElts / 2;
5594 int NumEltsPerLane = NumElts / NumLanes;
5595 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5596
5597 DemandedLHS = APInt::getZero(NumInnerElts);
5598 DemandedRHS = APInt::getZero(NumInnerElts);
5599
5600 // Map DemandedElts to the packed operands.
5601 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5602 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5603 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5604 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5605 if (DemandedElts[OuterIdx])
5606 DemandedLHS.setBit(InnerIdx);
5607 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5608 DemandedRHS.setBit(InnerIdx);
5609 }
5610 }
5611}
5612
// Split the demanded elts of a HADD/HSUB node between its operands.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                 APInt &DemandedLHS, APInt &DemandedRHS) {
                      DemandedLHS, DemandedRHS);
  // Horizontal ops consume adjacent element pairs, so spread each demanded
  // bit to also cover the next-higher element of the pair.
  DemandedLHS |= DemandedLHS << 1;
  DemandedRHS |= DemandedRHS << 1;
}
5621
5622/// Calculates the shuffle mask corresponding to the target-specific opcode.
5623/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5624/// operands in \p Ops, and returns true.
5625/// Sets \p IsUnary to true if only one source is used. Note that this will set
5626/// IsUnary for shuffles which use a single input multiple times, and in those
5627/// cases it will adjust the mask to only have indices within that single input.
5628/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned MaskEltSize = VT.getScalarSizeInBits();
  APInt RawUndefs;
  uint64_t ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  // IsFakeUnary marks two-input shuffles whose inputs are the same node; the
  // decoded mask is remapped onto the first input after the switch below.
  IsUnary = false;
  bool IsFakeUnary = false;
  switch (N.getOpcode()) {
  case X86ISD::BLENDI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeBLENDMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::EXTRQI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    // Only decodable with constant bit-length/bit-index operands.
    if (isa<ConstantSDNode>(N.getOperand(1)) &&
        isa<ConstantSDNode>(N.getOperand(2))) {
      int BitLen = N.getConstantOperandVal(1);
      int BitIdx = N.getConstantOperandVal(2);
      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    // Only decodable with constant bit-length/bit-index operands.
    if (isa<ConstantSDNode>(N.getOperand(2)) &&
        isa<ConstantSDNode>(N.getOperand(3))) {
      int BitLen = N.getConstantOperandVal(2);
      int BitIdx = N.getConstantOperandVal(3);
      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::VALIGN:
    assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
           "Only 32-bit and 64-bit elements are supported!");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVALIGNMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Note: operands are pushed in reverse order to match the decoded mask.
    Ops.push_back(N.getOperand(1));
    Ops.push_back(N.getOperand(0));
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePALIGNRMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Note: operands are pushed in reverse order to match the decoded mask.
    Ops.push_back(N.getOperand(1));
    Ops.push_back(N.getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSLLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSRLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFHWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodePSHUFLWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeZeroMoveLowMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST:
    // We only decode broadcasts of same-sized vectors, peeking through to
    // extracted subvectors is likely to cause hasOneUse issues with
    // SimplifyDemandedBits etc.
    if (N.getOperand(0).getValueType() == VT) {
      DecodeVectorBroadcast(NumElems, Mask);
      IsUnary = true;
      break;
    }
    return false;
  case X86ISD::VPERMILPV: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Variable shuffle - only decodable if the mask operand is constant.
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Variable shuffle - only decodable if the mask operand is constant.
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVPERMMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::SHUF128:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSLDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSHDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVDDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMIL2: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Needs both a constant selection mask and a constant control immediate.
    SDValue MaskNode = N.getOperand(2);
    SDValue CtrlNode = N.getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                      RawUndefs)) {
        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
                            Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    // Variable shuffle - only decodable if the mask operand is constant.
    SDValue MaskNode = N.getOperand(2);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N.getOperand(1));
    SDValue MaskNode = N.getOperand(0);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N.getOperand(0));
    Ops.push_back(N.getOperand(2));
    SDValue MaskNode = N.getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::COMPRESS: {
    SDValue CmpVec = N.getOperand(0);
    SDValue PassThru = N.getOperand(1);
    SDValue CmpMask = N.getOperand(2);
    APInt UndefElts;
    SmallVector<APInt> EltBits;
    // Only decodable when the compression mask is a constant.
    if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
      return false;
    assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
           "Illegal compression mask");
    // Selected elements are packed towards element 0...
    for (unsigned I = 0; I != NumElems; ++I) {
      if (!EltBits[I].isZero())
        Mask.push_back(I);
    }
    // ...and the remainder is filled from the passthru operand.
    while (Mask.size() != NumElems) {
      Mask.push_back(NumElems + Mask.size());
    }
    Ops.push_back(CmpVec);
    Ops.push_back(PassThru);
    return true;
  }
  case X86ISD::EXPAND: {
    SDValue ExpVec = N.getOperand(0);
    SDValue PassThru = N.getOperand(1);
    SDValue ExpMask = N.getOperand(2);
    APInt UndefElts;
    SmallVector<APInt> EltBits;
    // Only decodable when the expansion mask is a constant.
    if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
      return false;
    assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
           "Illegal expansion mask");
    // Selected lanes take consecutive source elements; unselected lanes take
    // the matching passthru element.
    unsigned ExpIndex = 0;
    for (unsigned I = 0; I != NumElems; ++I) {
      if (EltBits[I].isZero())
        Mask.push_back(I + NumElems);
      else
        Mask.push_back(ExpIndex++);
    }
    Ops.push_back(ExpVec);
    Ops.push_back(PassThru);
    return true;
  }
  default:
    llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero && isAnyZero(Mask))
    return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N.getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N.getOperand(1));
  }

  return true;
}
5976
// Wrapper for getTargetShuffleMask that discards the IsUnary result.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
                                 SmallVectorImpl<int> &Mask) {
  bool IsUnary;
  return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
}
5984
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
/// The results are returned as per-element bitmasks in KnownUndef/KnownZero,
/// both sized to the shuffle mask length.
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0) {
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        // Extract the sub-element bits actually referenced and test for zero.
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}
6071
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        // Elements outside the inserted subvector come from the UNDEF base.
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}
6176
// Replace target shuffle mask elements with known undef/zero sentinels.
// If ResolveKnownZeros is false, only undef elements are replaced.
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros= true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");

  for (unsigned i = 0; i != NumElts; ++i) {
    // Undef takes precedence over zero when both bits are set.
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}
6193
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
// Inverse of resolveTargetShuffleFromZeroables.
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  unsigned NumElts = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(NumElts);

  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (SM_SentinelUndef == M)
      KnownUndef.setBit(i);
    if (SM_SentinelZero == M)
      KnownZero.setBit(i);
  }
}
6209
// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
// Only succeeds when the condition is a (whole-undef tolerant) constant.
                                         SDValue Cond, bool IsBLENDV = false) {
  EVT CondVT = Cond.getValueType();
  unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
  unsigned NumElts = CondVT.getVectorNumElements();

  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
                                     /*AllowWholeUndefs*/ true,
                                     /*AllowPartialUndefs*/ false))
    return false;

  Mask.resize(NumElts, SM_SentinelUndef);

  for (int i = 0; i != (int)NumElts; ++i) {
    Mask[i] = i;
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    // VSELECT picks the 2nd operand for a zero condition element; BLENDV only
    // tests the sign bit, so a non-negative element picks the 2nd operand.
    if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
        (IsBLENDV && EltBits[i].isNonNegative()))
      Mask[i] += NumElts;
  }

  return true;
}
6238
6239// Forward declaration (for getFauxShuffleMask recursive check).
6240static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6243 const SelectionDAG &DAG, unsigned Depth,
6244 bool ResolveKnownElts);
6245
6246// Attempt to decode ops that could be represented as a shuffle mask.
6247// The decoded shuffle mask may contain a different number of elements to the
6248// destination value type.
6249// TODO: Merge into getTargetShuffleInputs()
6250static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6253 const SelectionDAG &DAG, unsigned Depth,
6254 bool ResolveKnownElts) {
6255 Mask.clear();
6256 Ops.clear();
6257
6258 MVT VT = N.getSimpleValueType();
6259 unsigned NumElts = VT.getVectorNumElements();
6260 unsigned NumSizeInBits = VT.getSizeInBits();
6261 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6262 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6263 return false;
6264 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6265 unsigned NumSizeInBytes = NumSizeInBits / 8;
6266 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6267
6268 unsigned Opcode = N.getOpcode();
6269 switch (Opcode) {
6270 case ISD::VECTOR_SHUFFLE: {
6271 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6272 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6273 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6274 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6275 Ops.push_back(N.getOperand(0));
6276 Ops.push_back(N.getOperand(1));
6277 return true;
6278 }
6279 return false;
6280 }
6281 case ISD::AND:
6282 case X86ISD::ANDNP: {
6283 // Attempt to decode as a per-byte mask.
6284 APInt UndefElts;
6285 SmallVector<APInt, 32> EltBits;
6286 SDValue N0 = N.getOperand(0);
6287 SDValue N1 = N.getOperand(1);
6288 bool IsAndN = (X86ISD::ANDNP == Opcode);
6289 uint64_t ZeroMask = IsAndN ? 255 : 0;
6290 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6291 /*AllowWholeUndefs*/ false,
6292 /*AllowPartialUndefs*/ false))
6293 return false;
6294 // We can't assume an undef src element gives an undef dst - the other src
6295 // might be zero.
6296 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6297 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6298 const APInt &ByteBits = EltBits[i];
6299 if (ByteBits != 0 && ByteBits != 255)
6300 return false;
6301 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6302 }
6303 Ops.push_back(IsAndN ? N1 : N0);
6304 return true;
6305 }
6306 case ISD::OR: {
6307 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6308 // is a valid shuffle index.
6309 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6310 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6311 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6312 return false;
6313
6314 SmallVector<int, 64> SrcMask0, SrcMask1;
6315 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6318 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6319 Depth + 1, true) ||
6320 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6321 Depth + 1, true))
6322 return false;
6323
6324 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6325 SmallVector<int, 64> Mask0, Mask1;
6326 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6327 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6328 for (int i = 0; i != (int)MaskSize; ++i) {
6329 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6330 // loops converting between OR and BLEND shuffles due to
6331 // canWidenShuffleElements merging away undef elements, meaning we
6332 // fail to recognise the OR as the undef element isn't known zero.
6333 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6334 Mask.push_back(SM_SentinelZero);
6335 else if (Mask1[i] == SM_SentinelZero)
6336 Mask.push_back(i);
6337 else if (Mask0[i] == SM_SentinelZero)
6338 Mask.push_back(i + MaskSize);
6339 else
6340 return false;
6341 }
6342 Ops.push_back(N.getOperand(0));
6343 Ops.push_back(N.getOperand(1));
6344 return true;
6345 }
6346 case ISD::CONCAT_VECTORS: {
6347 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6348 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6349 if (NumBitsPerElt == 64) {
6350 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6351 for (unsigned M = 0; M != NumSubElts; ++M)
6352 Mask.push_back((I * NumElts) + M);
6353 Ops.push_back(N.getOperand(I));
6354 }
6355 return true;
6356 }
6357 return false;
6358 }
6359 case ISD::INSERT_SUBVECTOR: {
6360 SDValue Src = N.getOperand(0);
6361 SDValue Sub = N.getOperand(1);
6362 EVT SubVT = Sub.getValueType();
6363 unsigned NumSubElts = SubVT.getVectorNumElements();
6364 uint64_t InsertIdx = N.getConstantOperandVal(2);
6365 // Subvector isn't demanded - just return the base vector.
6366 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6367 Mask.resize(NumElts);
6368 std::iota(Mask.begin(), Mask.end(), 0);
6369 Ops.push_back(Src);
6370 return true;
6371 }
6372 // Handle CONCAT(SUB0, SUB1).
6373 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6374 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6375 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6376 Src.getOperand(0).isUndef() &&
6377 Src.getOperand(1).getValueType() == SubVT &&
6378 Src.getConstantOperandVal(2) == 0 &&
6379 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6380 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6381 Mask.resize(NumElts);
6382 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6383 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6384 Ops.push_back(Src.getOperand(1));
6385 Ops.push_back(Sub);
6386 return true;
6387 }
6388 if (!N->isOnlyUserOf(Sub.getNode()))
6389 return false;
6390
6391 SmallVector<int, 64> SubMask;
6392 SmallVector<SDValue, 2> SubInputs;
6394 EVT SubSrcVT = SubSrc.getValueType();
6395 if (!SubSrcVT.isVector())
6396 return false;
6397
6398 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6399 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6400 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6401 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6402 SDValue SubSrcSrc = SubSrc.getOperand(0);
6403 unsigned NumSubSrcSrcElts =
6404 SubSrcSrc.getValueType().getVectorNumElements();
6405 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6406 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6407 "Subvector valuetype mismatch");
6408 InsertIdx *= (MaxElts / NumElts);
6409 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6410 NumSubElts *= (MaxElts / NumElts);
6411 bool SrcIsUndef = Src.isUndef();
6412 for (int i = 0; i != (int)MaxElts; ++i)
6413 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6414 for (int i = 0; i != (int)NumSubElts; ++i)
6415 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6416 if (!SrcIsUndef)
6417 Ops.push_back(Src);
6418 Ops.push_back(SubSrcSrc);
6419 return true;
6420 }
6421
6422 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6423 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6424 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6425 Depth + 1, ResolveKnownElts))
6426 return false;
6427
6428 // Subvector shuffle inputs must not be larger than the subvector.
6429 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6430 return SubVT.getFixedSizeInBits() <
6431 SubInput.getValueSizeInBits().getFixedValue();
6432 }))
6433 return false;
6434
6435 if (SubMask.size() != NumSubElts) {
6436 assert(((SubMask.size() % NumSubElts) == 0 ||
6437 (NumSubElts % SubMask.size()) == 0) &&
6438 "Illegal submask scale");
6439 if ((NumSubElts % SubMask.size()) == 0) {
6440 int Scale = NumSubElts / SubMask.size();
6441 SmallVector<int, 64> ScaledSubMask;
6442 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6443 SubMask = ScaledSubMask;
6444 } else {
6445 int Scale = SubMask.size() / NumSubElts;
6446 NumSubElts = SubMask.size();
6447 NumElts *= Scale;
6448 InsertIdx *= Scale;
6449 }
6450 }
6451 Ops.push_back(Src);
6452 Ops.append(SubInputs.begin(), SubInputs.end());
6453 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6454 Mask.append(NumElts, SM_SentinelZero);
6455 else
6456 for (int i = 0; i != (int)NumElts; ++i)
6457 Mask.push_back(i);
6458 for (int i = 0; i != (int)NumSubElts; ++i) {
6459 int M = SubMask[i];
6460 if (0 <= M) {
6461 int InputIdx = M / NumSubElts;
6462 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6463 }
6464 Mask[i + InsertIdx] = M;
6465 }
6466 return true;
6467 }
6468 case X86ISD::PINSRB:
6469 case X86ISD::PINSRW:
6472 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6473 // vector, for matching src/dst vector types.
6474 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6475
6476 unsigned DstIdx = 0;
6477 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6478 // Check we have an in-range constant insertion index.
6479 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6480 N.getConstantOperandAPInt(2).uge(NumElts))
6481 return false;
6482 DstIdx = N.getConstantOperandVal(2);
6483
6484 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6485 if (X86::isZeroNode(Scl)) {
6486 Ops.push_back(N.getOperand(0));
6487 for (unsigned i = 0; i != NumElts; ++i)
6488 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6489 return true;
6490 }
6491 }
6492
6493 // Peek through trunc/aext/zext/bitcast.
6494 // TODO: aext shouldn't require SM_SentinelZero padding.
6495 // TODO: handle shift of scalars.
6496 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6497 while (Scl.getOpcode() == ISD::TRUNCATE ||
6498 Scl.getOpcode() == ISD::ANY_EXTEND ||
6499 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6500 (Scl.getOpcode() == ISD::BITCAST &&
6503 Scl = Scl.getOperand(0);
6504 MinBitsPerElt =
6505 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6506 }
6507 if ((MinBitsPerElt % 8) != 0)
6508 return false;
6509
6510 // Attempt to find the source vector the scalar was extracted from.
6511 SDValue SrcExtract;
6512 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6513 Scl.getOpcode() == X86ISD::PEXTRW ||
6514 Scl.getOpcode() == X86ISD::PEXTRB) &&
6515 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6516 SrcExtract = Scl;
6517 }
6518 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6519 return false;
6520
6521 SDValue SrcVec = SrcExtract.getOperand(0);
6522 EVT SrcVT = SrcVec.getValueType();
6523 if (!SrcVT.getScalarType().isByteSized())
6524 return false;
6525 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6526 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6527 unsigned DstByte = DstIdx * NumBytesPerElt;
6528 MinBitsPerElt =
6529 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6530
6531 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6532 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6533 Ops.push_back(SrcVec);
6534 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6535 } else {
6536 Ops.push_back(SrcVec);
6537 Ops.push_back(N.getOperand(0));
6538 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6539 Mask.push_back(NumSizeInBytes + i);
6540 }
6541
6542 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6543 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6544 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6545 Mask[DstByte + i] = SrcByte + i;
6546 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6547 Mask[DstByte + i] = SM_SentinelZero;
6548 return true;
6549 }
6550 case X86ISD::PACKSS:
6551 case X86ISD::PACKUS: {
6552 SDValue N0 = N.getOperand(0);
6553 SDValue N1 = N.getOperand(1);
6554 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6555 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6556 "Unexpected input value type");
6557
6558 APInt EltsLHS, EltsRHS;
6559 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6560
6561 // If we know input saturation won't happen (or we don't care for particular
6562 // lanes), we can treat this as a truncation shuffle.
6563 bool Offset0 = false, Offset1 = false;
6564 if (Opcode == X86ISD::PACKSS) {
6565 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6566 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6567 (!(N1.isUndef() || EltsRHS.isZero()) &&
6568 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6569 return false;
6570 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6571 // PACKSS then it was likely being used for sign-extension for a
6572 // truncation, so just peek through and adjust the mask accordingly.
6573 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6574 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6575 Offset0 = true;
6576 N0 = N0.getOperand(0);
6577 }
6578 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6579 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6580 Offset1 = true;
6581 N1 = N1.getOperand(0);
6582 }
6583 } else {
6584 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6585 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6586 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6587 (!(N1.isUndef() || EltsRHS.isZero()) &&
6588 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6589 return false;
6590 }
6591
6592 bool IsUnary = (N0 == N1);
6593
6594 Ops.push_back(N0);
6595 if (!IsUnary)
6596 Ops.push_back(N1);
6597
6598 createPackShuffleMask(VT, Mask, IsUnary);
6599
6600 if (Offset0 || Offset1) {
6601 for (int &M : Mask)
6602 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6603 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6604 ++M;
6605 }
6606 return true;
6607 }
6608 case ISD::VSELECT:
6609 case X86ISD::BLENDV: {
6610 SDValue Cond = N.getOperand(0);
6611 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6612 Ops.push_back(N.getOperand(1));
6613 Ops.push_back(N.getOperand(2));
6614 return true;
6615 }
6616 return false;
6617 }
6618 case X86ISD::VTRUNC: {
6619 SDValue Src = N.getOperand(0);
6620 EVT SrcVT = Src.getValueType();
6621 if (SrcVT.getSizeInBits() != NumSizeInBits)
6622 return false;
6623 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6624 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6625 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6626 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6627 for (unsigned i = 0; i != NumSrcElts; ++i)
6628 Mask.push_back(i * Scale);
6629 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6630 Ops.push_back(Src);
6631 return true;
6632 }
6633 case ISD::SHL:
6634 case ISD::SRL: {
6635 APInt UndefElts;
6636 SmallVector<APInt, 32> EltBits;
6637 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6638 UndefElts, EltBits,
6639 /*AllowWholeUndefs*/ true,
6640 /*AllowPartialUndefs*/ false))
6641 return false;
6642
6643 // We can only decode 'whole byte' bit shifts as shuffles.
6644 for (unsigned I = 0; I != NumElts; ++I)
6645 if (DemandedElts[I] && !UndefElts[I] &&
6646 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6647 return false;
6648
6649 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6650 Ops.push_back(N.getOperand(0));
6651
6652 for (unsigned I = 0; I != NumElts; ++I) {
6653 if (!DemandedElts[I] || UndefElts[I])
6654 continue;
6655 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6656 unsigned Lo = I * NumBytesPerElt;
6657 unsigned Hi = Lo + NumBytesPerElt;
6658 // Clear mask to all zeros and insert the shifted byte indices.
6659 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6660 if (ISD::SHL == Opcode)
6661 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6662 else
6663 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6664 Lo + ByteShift);
6665 }
6666 return true;
6667 }
6668 case X86ISD::VSHLI:
6669 case X86ISD::VSRLI: {
6670 uint64_t ShiftVal = N.getConstantOperandVal(1);
6671 // Out of range bit shifts are guaranteed to be zero.
6672 if (NumBitsPerElt <= ShiftVal) {
6673 Mask.append(NumElts, SM_SentinelZero);
6674 return true;
6675 }
6676
6677 // We can only decode 'whole byte' bit shifts as shuffles.
6678 if ((ShiftVal % 8) != 0)
6679 break;
6680
6681 uint64_t ByteShift = ShiftVal / 8;
6682 Ops.push_back(N.getOperand(0));
6683
6684 // Clear mask to all zeros and insert the shifted byte indices.
6685 Mask.append(NumSizeInBytes, SM_SentinelZero);
6686
6687 if (X86ISD::VSHLI == Opcode) {
6688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6690 Mask[i + j] = i + j - ByteShift;
6691 } else {
6692 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6693 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6694 Mask[i + j - ByteShift] = i + j;
6695 }
6696 return true;
6697 }
6698 case X86ISD::VROTLI:
6699 case X86ISD::VROTRI: {
6700 // We can only decode 'whole byte' bit rotates as shuffles.
6701 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6702 if ((RotateVal % 8) != 0)
6703 return false;
6704 Ops.push_back(N.getOperand(0));
6705 int Offset = RotateVal / 8;
6706 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6707 for (int i = 0; i != (int)NumElts; ++i) {
6708 int BaseIdx = i * NumBytesPerElt;
6709 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6710 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6711 }
6712 }
6713 return true;
6714 }
6715 case X86ISD::VBROADCAST: {
6716 SDValue Src = N.getOperand(0);
6717 if (!Src.getSimpleValueType().isVector()) {
6718 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6719 !isNullConstant(Src.getOperand(1)) ||
6720 Src.getOperand(0).getValueType().getScalarType() !=
6721 VT.getScalarType())
6722 return false;
6723 Src = Src.getOperand(0);
6724 }
6725 Ops.push_back(Src);
6726 Mask.append(NumElts, 0);
6727 return true;
6728 }
6730 SDValue Src = N.getOperand(0);
6731 EVT SrcVT = Src.getValueType();
6732 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6733
6734 // Extended source must be a simple vector.
6735 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6736 (NumBitsPerSrcElt % 8) != 0)
6737 return false;
6738
6739 // We can only handle all-signbits extensions.
6740 APInt DemandedSrcElts =
6741 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6742 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6743 return false;
6744
6745 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6746 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6747 for (unsigned I = 0; I != NumElts; ++I)
6748 Mask.append(Scale, I);
6749 Ops.push_back(Src);
6750 return true;
6751 }
6752 case ISD::ZERO_EXTEND:
6753 case ISD::ANY_EXTEND:
6756 SDValue Src = N.getOperand(0);
6757 EVT SrcVT = Src.getValueType();
6758
6759 // Extended source must be a simple vector.
6760 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6761 (SrcVT.getScalarSizeInBits() % 8) != 0)
6762 return false;
6763
6764 bool IsAnyExtend =
6765 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6766 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6767 IsAnyExtend, Mask);
6768 Ops.push_back(Src);
6769 return true;
6770 }
6771 }
6772
6773 return false;
6774}
6775
6776/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Mask entries use a flat index space in which input #i owns the index range
/// [i * MaskWidth, (i + 1) * MaskWidth); dropping or merging an input
/// therefore requires re-basing every mask entry that refers to a later input.
6778                                         SmallVectorImpl<int> &Mask) {
6779  int MaskWidth = Mask.size();
6780  SmallVector<SDValue, 16> UsedInputs;
6781  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // [lo, hi) is the mask index range this input will occupy if it is kept.
6782    int lo = UsedInputs.size() * MaskWidth;
6783    int hi = lo + MaskWidth;
6784
6785    // Strip UNDEF input usage.
6786    if (Inputs[i].isUndef())
6787      for (int &M : Mask)
6788        if ((lo <= M) && (M < hi))
6789          M = SM_SentinelUndef;
6790
6791    // Check for unused inputs.
6792    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop this input: shift every later mask entry down one input width.
6793      for (int &M : Mask)
6794        if (lo <= M)
6795          M -= MaskWidth;
6796      continue;
6797    }
6798
6799    // Check for repeated inputs.
6800    bool IsRepeat = false;
6801    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6802      if (UsedInputs[j] != Inputs[i])
6803        continue;
        // Redirect this input's entries at the earlier duplicate (input j)
        // and re-base entries that refer to later inputs.
6804      for (int &M : Mask)
6805        if (lo <= M)
6806          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6807      IsRepeat = true;
6808      break;
6809    }
6810    if (IsRepeat)
6811      continue;
6812
6813    UsedInputs.push_back(Inputs[i]);
6814  }
6815  Inputs = std::move(UsedInputs);
6816}
6817
6818/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6819/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6820/// Returns true if the target shuffle mask was decoded.
6821static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6824                                   APInt &KnownUndef, APInt &KnownZero,
6825                                   const SelectionDAG &DAG, unsigned Depth,
6826                                   bool ResolveKnownElts) {
6828    return false; // Limit search depth.
6829
  // Only simple fixed-width vectors can be decoded as shuffle masks.
6830  EVT VT = Op.getValueType();
6831  if (!VT.isSimple() || !VT.isVector())
6832    return false;
6833
  // Prefer a genuine target-shuffle decode; otherwise fall back to a "faux"
  // shuffle - a non-shuffle node that can still be modelled as one.
6834  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6835    if (ResolveKnownElts)
6836      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6837    return true;
6838  }
6839  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6840                         ResolveKnownElts)) {
6841    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6842    return true;
6843  }
6844  return false;
6845}
6846
// Convenience overload: decode Op's shuffle inputs/mask without reporting
// which elements are known undef/zero back to the caller.
6847static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6850                                   const SelectionDAG &DAG, unsigned Depth,
6851                                   bool ResolveKnownElts) {
  // KnownUndef/KnownZero are computed by the main overload but discarded here.
6852  APInt KnownUndef, KnownZero;
6853  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6854                                KnownZero, DAG, Depth, ResolveKnownElts);
6855}
6856
// Convenience overload: demand every element of Op's vector type, with
// default depth/resolution settings.
6859                                   const SelectionDAG &DAG, unsigned Depth = 0,
6860                                   bool ResolveKnownElts = true) {
6861  EVT VT = Op.getValueType();
6862  if (!VT.isSimple() || !VT.isVector())
6863    return false;
6864
6865  unsigned NumElts = Op.getValueType().getVectorNumElements();
6866  APInt DemandedElts = APInt::getAllOnes(NumElts);
6867  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6868                                ResolveKnownElts);
6869}
6870
6871// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Returns a new VBROADCAST_LOAD/SUBV_BROADCAST_LOAD node reading MemVT at
// (Mem base pointer + Offset), or SDValue() if the access cannot safely be
// re-expressed as a broadcast load.
6872static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6873                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6874                                 SelectionDAG &DAG) {
6875  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6876          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6877         "Unknown broadcast load type");
6878
6879  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6880  if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6881    return SDValue();
6882
6883  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6885  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6886  SDValue Ops[] = {Mem->getChain(), Ptr};
6887  SDValue BcstLd = DAG.getMemIntrinsicNode(
6888      Opcode, DL, Tys, Ops, MemVT,
6890          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Preserve the original load's memory-ordering constraints on the new node.
6891  DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6892  return BcstLd;
6893}
6894
6895/// Returns the scalar element that will make up the i'th
6896/// element of the result of the vector shuffle.
/// Recurses through shuffles, subvector insert/extract, concat, bitcast and
/// scalar insertion nodes; returns SDValue() when the element cannot be
/// determined (unknown node kind or recursion depth limit reached).
6897static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6898                                   SelectionDAG &DAG, unsigned Depth) {
6900    return SDValue(); // Limit search depth.
6901
6902  EVT VT = Op.getValueType();
6903  unsigned Opcode = Op.getOpcode();
6904  unsigned NumElems = VT.getVectorNumElements();
6905
6906  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6907  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6908    int Elt = SV->getMaskElt(Index);
6909
6910    if (Elt < 0)
6911      return DAG.getUNDEF(VT.getVectorElementType());
6912
    // Mask indices >= NumElems select from the second shuffle operand.
6913    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6914    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6915  }
6916
6917  // Recurse into target specific vector shuffles to find scalars.
6918  if (isTargetShuffle(Opcode)) {
6919    MVT ShufVT = VT.getSimpleVT();
6920    MVT ShufSVT = ShufVT.getVectorElementType();
6921    int NumElems = (int)ShufVT.getVectorNumElements();
6922    SmallVector<int, 16> ShuffleMask;
6924    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6925      return SDValue();
6926
    // Sentinels resolve directly to zero / undef scalars.
6927    int Elt = ShuffleMask[Index];
6928    if (Elt == SM_SentinelZero)
6929      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6930                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6931    if (Elt == SM_SentinelUndef)
6932      return DAG.getUNDEF(ShufSVT);
6933
6934    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6935    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6936    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6937  }
6938
6939  // Recurse into insert_subvector base/sub vector to find scalars.
6940  if (Opcode == ISD::INSERT_SUBVECTOR) {
6941    SDValue Vec = Op.getOperand(0);
6942    SDValue Sub = Op.getOperand(1);
6943    uint64_t SubIdx = Op.getConstantOperandVal(2);
6944    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6945
6946    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6947      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6948    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6949  }
6950
6951  // Recurse into concat_vectors sub vector to find scalars.
6952  if (Opcode == ISD::CONCAT_VECTORS) {
6953    EVT SubVT = Op.getOperand(0).getValueType();
6954    unsigned NumSubElts = SubVT.getVectorNumElements();
6955    uint64_t SubIdx = Index / NumSubElts;
6956    uint64_t SubElt = Index % NumSubElts;
6957    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6958  }
6959
6960  // Recurse into extract_subvector src vector to find scalars.
6961  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6962    SDValue Src = Op.getOperand(0);
6963    uint64_t SrcIdx = Op.getConstantOperandVal(1);
6964    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6965  }
6966
6967  // We only peek through bitcasts of the same vector width.
6968  if (Opcode == ISD::BITCAST) {
6969    SDValue Src = Op.getOperand(0);
6970    EVT SrcVT = Src.getValueType();
6971    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6972      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6973    return SDValue();
6974  }
6975
6976  // Actual nodes that may contain scalar elements
6977
6978  // For insert_vector_elt - either return the index matching scalar or recurse
6979  // into the base vector.
6980  if (Opcode == ISD::INSERT_VECTOR_ELT &&
6981      isa<ConstantSDNode>(Op.getOperand(2))) {
6982    if (Op.getConstantOperandAPInt(2) == Index)
6983      return Op.getOperand(1);
6984    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6985  }
6986
  // scalar_to_vector defines only element 0; all other elements are undef.
6987  if (Opcode == ISD::SCALAR_TO_VECTOR)
6988    return (Index == 0) ? Op.getOperand(0)
6989                        : DAG.getUNDEF(VT.getVectorElementType());
6990
6991  if (Opcode == ISD::BUILD_VECTOR)
6992    return Op.getOperand(Index);
6993
6994  return SDValue();
6995}
6996
6997// Use PINSRB/PINSRW/PINSRD to create a build vector.
// Inserts each non-zero element in turn; zero elements are provided by an
// initial zero vector when any are required, otherwise the first insertion
// uses SCALAR_TO_VECTOR to avoid a false register dependency.
6999                                        const APInt &NonZeroMask,
7000                                        unsigned NumNonZero, unsigned NumZero,
7001                                        SelectionDAG &DAG,
7002                                        const X86Subtarget &Subtarget) {
7003  MVT VT = Op.getSimpleValueType();
7004  unsigned NumElts = VT.getVectorNumElements();
7005  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7006          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7007         "Illegal vector insertion");
7008
7009  SDValue V;
7010  bool First = true;
7011
7012  for (unsigned i = 0; i < NumElts; ++i) {
7013    bool IsNonZero = NonZeroMask[i];
7014    if (!IsNonZero)
7015      continue;
7016
7017    // If the build vector contains zeros or our first insertion is not the
7018    // first index then insert into zero vector to break any register
7019    // dependency else use SCALAR_TO_VECTOR.
7020    if (First) {
7021      First = false;
7022      if (NumZero || 0 != i)
7023        V = getZeroVector(VT, Subtarget, DAG, DL);
7024      else {
7025        assert(0 == i && "Expected insertion into zero-index");
7026        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7027        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7028        V = DAG.getBitcast(VT, V);
7029        continue;
7030      }
7031    }
7032    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
7033                    DAG.getVectorIdxConstant(i, DL));
7034  }
7035
7036  return V;
7037}
7038
7039/// Custom lower build_vector of v16i8.
/// With SSE4.1 this defers to per-byte PINSRB insertion; pre-SSE4.1 it merges
/// adjacent byte pairs into 16-bit values and inserts them with PINSRW,
/// optionally seeding the low 32 bits via MOVD when both low pairs are
/// non-zero.
7041                                     const APInt &NonZeroMask,
7042                                     unsigned NumNonZero, unsigned NumZero,
7043                                     SelectionDAG &DAG,
7044                                     const X86Subtarget &Subtarget) {
7045  if (NumNonZero > 8 && !Subtarget.hasSSE41())
7046    return SDValue();
7047
7048  // SSE4.1 - use PINSRB to insert each byte directly.
7049  if (Subtarget.hasSSE41())
7050    return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
7051                                    DAG, Subtarget);
7052
7053  SDValue V;
7054
7055  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7056  // If both the lowest 16-bits are non-zero, then convert to MOVD.
7057  if (!NonZeroMask.extractBits(2, 0).isZero() &&
7058      !NonZeroMask.extractBits(2, 2).isZero()) {
7059    for (unsigned I = 0; I != 4; ++I) {
7060      if (!NonZeroMask[I])
7061        continue;
      // OR each of the low 4 bytes into a single i32 at its byte position.
7062      SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
7063      if (I != 0)
7064        Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
7065                          DAG.getConstant(I * 8, DL, MVT::i8));
7066      V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
7067    }
7068    assert(V && "Failed to fold v16i8 vector to zero");
7069    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7070    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
7071    V = DAG.getBitcast(MVT::v8i16, V);
7072  }
  // Insert the remaining bytes two at a time as 16-bit elements. If the MOVD
  // path above fired, the first 4 bytes (2 words) are already in place.
7073  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
7074    bool ThisIsNonZero = NonZeroMask[i];
7075    bool NextIsNonZero = NonZeroMask[i + 1];
7076    if (!ThisIsNonZero && !NextIsNonZero)
7077      continue;
7078
7079    SDValue Elt;
7080    if (ThisIsNonZero) {
7081      if (NumZero || NextIsNonZero)
7082        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7083      else
7084        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7085    }
7086
7087    if (NextIsNonZero) {
7088      SDValue NextElt = Op.getOperand(i + 1);
7089      if (i == 0 && NumZero)
7090        NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
7091      else
7092        NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
      // Place the odd byte in the high half of the 16-bit pair.
7093      NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
7094                            DAG.getConstant(8, DL, MVT::i8));
7095      if (ThisIsNonZero)
7096        Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
7097      else
7098        Elt = NextElt;
7099    }
7100
7101    // If our first insertion is not the first index or zeros are needed, then
7102    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
7103    // elements undefined).
7104    if (!V) {
7105      if (i != 0 || NumZero)
7106        V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
7107      else {
7108        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
7109        V = DAG.getBitcast(MVT::v8i16, V);
7110        continue;
7111      }
7112    }
7113    Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7114    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
7115                    DAG.getVectorIdxConstant(i / 2, DL));
7116  }
7117
7118  return DAG.getBitcast(MVT::v16i8, V);
7119}
7120
7121/// Custom lower build_vector of v8i16.
/// Gives up pre-SSE4.1 when more than 4 insertions would be required;
/// otherwise defers to the shared insert-based lowering (PINSRW).
7123                                     const APInt &NonZeroMask,
7124                                     unsigned NumNonZero, unsigned NumZero,
7125                                     SelectionDAG &DAG,
7126                                     const X86Subtarget &Subtarget) {
7127  if (NumNonZero > 4 && !Subtarget.hasSSE41())
7128    return SDValue();
7129
7130  // Use PINSRW to insert each byte directly.
7131  return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
7132                                  Subtarget);
7133}
7134
7135/// Custom lower build_vector of v4i32 or v4f32.
/// Tries, in order: MOVDDUP for a splat of an element pair, a blend-with-zero
/// shuffle when all non-zero elements are in-place extracts from one vector,
/// and finally a single INSERTPS (SSE4.1) combining two source vectors.
7137                                     SelectionDAG &DAG,
7138                                     const X86Subtarget &Subtarget) {
7139  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7140  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7141  // Because we're creating a less complicated build vector here, we may enable
7142  // further folding of the MOVDDUP via shuffle transforms.
7143  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7144      Op.getOperand(0) == Op.getOperand(2) &&
7145      Op.getOperand(1) == Op.getOperand(3) &&
7146      Op.getOperand(0) != Op.getOperand(1)) {
7147    MVT VT = Op.getSimpleValueType();
7148    MVT EltVT = VT.getVectorElementType();
7149    // Create a new build vector with the first 2 elements followed by undef
7150    // padding, bitcast to v2f64, duplicate, and bitcast back.
7151    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7152                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7153    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7154    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7155    return DAG.getBitcast(VT, Dup);
7156  }
7157
7158  // Find all zeroable elements.
7159  std::bitset<4> Zeroable, Undefs;
7160  for (int i = 0; i < 4; ++i) {
7161    SDValue Elt = Op.getOperand(i);
7162    Undefs[i] = Elt.isUndef();
7163    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7164  }
7165  assert(Zeroable.size() - Zeroable.count() > 1 &&
7166         "We expect at least two non-zero elements!");
7167
7168  // We only know how to deal with build_vector nodes where elements are either
7169  // zeroable or extract_vector_elt with constant index.
7170  SDValue FirstNonZero;
7171  unsigned FirstNonZeroIdx;
7172  for (unsigned i = 0; i < 4; ++i) {
7173    if (Zeroable[i])
7174      continue;
7175    SDValue Elt = Op.getOperand(i);
7176    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7178      return SDValue();
7179    // Make sure that this node is extracting from a 128-bit vector.
7180    MVT VT = Elt.getOperand(0).getSimpleValueType();
7181    if (!VT.is128BitVector())
7182      return SDValue();
7183    if (!FirstNonZero.getNode()) {
7184      FirstNonZero = Elt;
7185      FirstNonZeroIdx = i;
7186    }
7187  }
7188
7189  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7190  SDValue V1 = FirstNonZero.getOperand(0);
7191  MVT VT = V1.getSimpleValueType();
7192
7193  // See if this build_vector can be lowered as a blend with zero.
  // The loop succeeds (EltIdx == 4) only if every non-zero element is an
  // in-place extract (lane i from V1 into lane i).
7194  SDValue Elt;
7195  unsigned EltMaskIdx, EltIdx;
7196  int Mask[4];
7197  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7198    if (Zeroable[EltIdx]) {
7199      // The zero vector will be on the right hand side.
7200      Mask[EltIdx] = EltIdx+4;
7201      continue;
7202    }
7203
7204    Elt = Op->getOperand(EltIdx);
7205    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7206    EltMaskIdx = Elt.getConstantOperandVal(1);
7207    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7208      break;
7209    Mask[EltIdx] = EltIdx;
7210  }
7211
7212  if (EltIdx == 4) {
7213    // Let the shuffle legalizer deal with blend operations.
7214    SDValue VZeroOrUndef = (Zeroable == Undefs)
7215                               ? DAG.getUNDEF(VT)
7216                               : getZeroVector(VT, Subtarget, DAG, DL);
7217    if (V1.getSimpleValueType() != VT)
7218      V1 = DAG.getBitcast(VT, V1);
7219    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7220  }
7221
7222  // See if we can lower this build_vector to a INSERTPS.
7223  if (!Subtarget.hasSSE41())
7224    return SDValue();
7225
  // V2 supplies the single mismatching element found above; every other
  // non-zero element must come in-place from a single source V1.
7226  SDValue V2 = Elt.getOperand(0);
7227  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7228    V1 = SDValue();
7229
7230  bool CanFold = true;
7231  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7232    if (Zeroable[i])
7233      continue;
7234
7235    SDValue Current = Op->getOperand(i);
7236    SDValue SrcVector = Current->getOperand(0);
7237    if (!V1.getNode())
7238      V1 = SrcVector;
7239    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7240  }
7241
7242  if (!CanFold)
7243    return SDValue();
7244
7245  assert(V1.getNode() && "Expected at least two non-zero elements!");
7246  if (V1.getSimpleValueType() != MVT::v4f32)
7247    V1 = DAG.getBitcast(MVT::v4f32, V1);
7248  if (V2.getSimpleValueType() != MVT::v4f32)
7249    V2 = DAG.getBitcast(MVT::v4f32, V2);
7250
7251  // Ok, we can emit an INSERTPS instruction.
7252  unsigned ZMask = Zeroable.to_ulong();
7253
  // INSERTPS imm8: bits [7:6] = source lane, [5:4] = dest lane, [3:0] = zmask.
7254  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7255  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7256  SDValue Result =
7257      DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7258                  DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7259  return DAG.getBitcast(VT, Result);
7260}
7261
7262/// Return a vector logical shift node.
7263static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7264 SelectionDAG &DAG, const TargetLowering &TLI,
7265 const SDLoc &dl) {
7266 assert(VT.is128BitVector() && "Unknown type for VShift");
7267 MVT ShVT = MVT::v16i8;
7268 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7269 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7270 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7271 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7272 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7273}
7274
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
    // Only simple (non-volatile, non-atomic) normal loads of i32/f32
    // scalars are handled.
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    // Recognize either a direct frame-index pointer or "frame-index + const";
    // anything else cannot have its alignment adjusted below.
    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    Align RequiredAlign(VT.getSizeInBits() / 8);
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
    if (!InferredAlign || *InferredAlign < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign.value()) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    // Load the whole (aligned) vector slot and splat the lane that holds the
    // original scalar via a shuffle whose mask repeats EltNo everywhere.
    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
7348
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
  // Base case: a plain (non-extending), simple load is the source itself.
  if (ISD::isNON_EXTLoad(Elt.getNode())) {
    auto *BaseLd = cast<LoadSDNode>(Elt);
    if (!BaseLd->isSimple())
      return false;
    Ld = BaseLd;
    ByteOffset = 0;
    return true;
  }

  switch (Elt.getOpcode()) {
  case ISD::BITCAST:
  case ISD::TRUNCATE:
    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
  case ISD::SRL:
    // A right shift by a byte multiple just moves the value further into
    // the loaded memory - fold it into the byte offset.
    if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      uint64_t Amt = AmtC->getZExtValue();
      if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
        ByteOffset += Amt / 8;
        return true;
      }
    }
    break;
    // Extracting element Idx adds Idx * element-size bytes to the offset,
    // provided the extract doesn't change the element width.
    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      SDValue Src = Elt.getOperand(0);
      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
      if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
          findEltLoadSrc(Src, Ld, ByteOffset)) {
        uint64_t Idx = IdxC->getZExtValue();
        ByteOffset += Idx * (SrcSizeInBits / 8);
        return true;
      }
    }
    break;
  }

  return false;
}
7391
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool IsAfterLegalize,
                                        unsigned Depth = 0) {
    return SDValue(); // Limit search depth.
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  unsigned NumElems = Elts.size();

  // Classification of each input element: exactly one of these masks (or a
  // bail-out) is set per element by the loop below.
  int LastLoadedElt = -1;
  APInt LoadMask = APInt::getZero(NumElems);
  APInt ZeroMask = APInt::getZero(NumElems);
  APInt UndefMask = APInt::getZero(NumElems);

  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
      ZeroMask.setBit(i);
      continue;
    }

    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    unsigned EltSizeInBits = Elt.getValueSizeInBits();
    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
      return SDValue();

    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();

    LoadMask.setBit(i);
    LastLoadedElt = i;
  }
  assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
             NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.popcount() == NumElems)
    return DAG.getUNDEF(VT);
  if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  // Identify the base (first) load; every other load is validated relative
  // to it below.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.countr_zero();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  EVT EltBaseVT = EltBase.getValueType();
  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
         "Register/Memory size mismatch");
  LoadSDNode *LDBase = Loads[FirstLoadedElt];
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

  // TODO: Support offsetting the base load.
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();

  // Check to see if the element's load is consecutive to the base load
  // or offset from a previous (already checked) load.
  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
    LoadSDNode *Ld = Loads[EltIdx];
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    int Stride = EltIdx - FirstLoadedElt;
    if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
      return true;
    // Try again using the memory load size (we might have broken a large load
    // into smaller elements), ensure the stride is the full memory load size
    // apart and a whole number of elements fit in each memory load.
    unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
    if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
        (BaseMemSizeInBits % BaseSizeInBits) == 0) {
      unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
      return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
                                                Stride / Scale);
    }
    return false;
  };

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require a
  // an additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      if (!CheckConsecutiveLoad(LDBase, i)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  // Build a single wide load and forward the memory ordering of all the
  // original loads onto it so their chains stay correct.
  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
    for (auto *LD : Loads)
      if (LD)
        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // Check if the base load is entirely dereferenceable.
  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

  // LOAD - all consecutive load/undefs (must start/end with a load or be
  // entirely dereferenceable). If we have found an entire vector of loads and
  // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to shuffle those
  // elements.
  if (FirstLoadedElt == 0 &&
      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (NumElems == 1)
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

    if (!ZeroMask)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!IsAfterLegalize && VT.isVector()) {
      unsigned NumMaskElts = VT.getVectorNumElements();
      if ((NumMaskElts % NumElems) == 0) {
        unsigned Scale = NumMaskElts / NumElems;
        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
        for (unsigned i = 0; i < NumElems; ++i) {
          if (UndefMask[i])
            continue;
          int Offset = ZeroMask[i] ? NumMaskElts : 0;
          for (unsigned j = 0; j != Scale; ++j)
            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
        }
        SDValue V = CreateLoad(VT, LDBase);
        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                   : DAG.getConstantFP(0.0, DL, VT);
        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
      }
    }
  }

  // If the upper half of a ymm/zmm load is undef then just load the lower half.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    unsigned HalfNumElems = NumElems / 2;
    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
      EVT HalfVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
      SDValue HalfLD =
          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
                                   DAG, Subtarget, IsAfterLegalize, Depth + 1);
      if (HalfLD)
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           HalfLD, DAG.getVectorIdxConstant(0, DL));
    }
  }

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
       LoadSizeInBits == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                      : MVT::getIntegerVT(LoadSizeInBits);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    // Allow v4f32 on SSE1 only targets.
    // FIXME: Add more isel patterns so we can just use VT directly.
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
      VecVT = MVT::v4f32;
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode = DAG.getMemIntrinsicNode(
          X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
      for (auto *LD : Loads)
        if (LD)
          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  // BROADCAST - match the smallest possible repetition pattern, load that
  // scalar/subvector element and then broadcast to the entire vector.
  if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
      unsigned RepeatSize = SubElems * BaseSizeInBits;
      unsigned ScalarSize = std::min(RepeatSize, 64u);
      if (!Subtarget.hasAVX2() && ScalarSize < 32)
        continue;

      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else will cause infinite loops.
      if (RepeatSize > ScalarSize && SubElems == 1)
        continue;

      bool Match = true;
      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
      for (unsigned i = 0; i != NumElems && Match; ++i) {
        if (!LoadMask[i])
          continue;
        SDValue Elt = peekThroughBitcasts(Elts[i]);
        if (RepeatedLoads[i % SubElems].isUndef())
          RepeatedLoads[i % SubElems] = Elt;
        else
          Match &= (RepeatedLoads[i % SubElems] == Elt);
      }

      // We must have loads at both ends of the repetition.
      Match &= !RepeatedLoads.front().isUndef();
      Match &= !RepeatedLoads.back().isUndef();
      if (!Match)
        continue;

      EVT RepeatVT =
          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
              : EVT::getFloatingPointVT(ScalarSize);
      if (RepeatSize > ScalarSize)
        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
                                    RepeatSize / ScalarSize);
      EVT BroadcastVT =
          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
                           VT.getSizeInBits() / ScalarSize);
      if (TLI.isTypeLegal(BroadcastVT)) {
        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
                Depth + 1)) {
          SDValue Broadcast = RepeatLoad;
          if (RepeatSize > ScalarSize) {
            while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
              Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
          } else {
            if (!Subtarget.hasAVX2() &&
                    RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
                    Subtarget,
                    /*AssumeSingleUse=*/true))
              return SDValue();
            Broadcast =
                DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
          }
          return DAG.getBitcast(VT, Broadcast);
        }
      }
    }
  }

  // REVERSE - attempt to match the loads in reverse and then shuffle back.
  // TODO: Do this for any permute or mismatching element counts.
  if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
      TLI.isTypeLegal(VT) && VT.isVector() &&
      NumElems == VT.getVectorNumElements()) {
    SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
            VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
      SmallVector<int, 16> ReverseMask(NumElems);
      std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
      return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
    }
  }

  return SDValue();
}
7704
// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget,
                                         bool IsAfterLegalize) {
  // Resolve every lane of Op to its scalar source; bail out as soon as any
  // lane cannot be traced to a scalar.
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    return SDValue();
  }
  assert(Elts.size() == VT.getVectorNumElements());
  // Defer the actual merging of the per-lane scalars to the generic helper.
  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
                                  IsAfterLegalize);
}
7724
                                   const APInt &Undefs, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);

  // Wrap one element's raw bit pattern in a Constant of VT's scalar type
  // (half/float/double for FP element types, iN otherwise).
  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
    if (VT.isFloatingPoint()) {
      if (ScalarSize == 16)
        return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
      if (ScalarSize == 32)
        return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
      assert(ScalarSize == 64 && "Unsupported floating point scalar size");
      return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
    }
    return Constant::getIntegerValue(Ty, Val);
  };

  // Undef lanes become UndefValue; everything else is materialized from the
  // corresponding entry of Bits.
  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned I = 0, E = Bits.size(); I != E; ++I)
    ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
                                    : getConstantScalar(Bits[I]));

  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
7749
7750static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7751 unsigned SplatBitSize, LLVMContext &C) {
7752 unsigned ScalarSize = VT.getScalarSizeInBits();
7753
7754 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7755 if (VT.isFloatingPoint()) {
7756 if (ScalarSize == 16)
7757 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7758 if (ScalarSize == 32)
7759 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7760 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7761 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7762 }
7763 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7764 };
7765
7766 if (ScalarSize == SplatBitSize)
7767 return getConstantScalar(SplatValue);
7768
7769 unsigned NumElm = SplatBitSize / ScalarSize;
7770 SmallVector<Constant *, 32> ConstantVec;
7771 for (unsigned I = 0; I != NumElm; ++I) {
7772 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7773 ConstantVec.push_back(getConstantScalar(Val));
7774 }
7775 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7776}
7777
  // Walk the users of N looking for a target shuffle that could fold it.
  for (auto *U : N->users()) {
    unsigned Opc = U->getOpcode();
    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return false;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return false;
    if (isTargetShuffle(Opc))
      return true;
    if (Opc == ISD::BITCAST) // Ignore bitcasts
      return isFoldableUseOfShuffle(U);
    if (N->hasOneUse()) {
      // TODO, there may be some general way to know if a SDNode can
      // be folded. We now only know whether an MI is foldable.
      // VPDPBUSD only folds its third operand.
      if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
        return false;
      return true;
    }
  }
  return false;
}
7800
7801// If the node has a single use by a VSELECT then AVX512 targets may be able to
7802// fold as a predicated instruction.
7803static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7804 unsigned SizeInBits = V.getValueSizeInBits();
7805 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7806 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7807 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7808 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7809 return true;
7810 }
7811 }
7812 return false;
7813}
7814
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
                                           const SDLoc &dl,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // See if the build vector is a repeating sequence of scalars (inc. splat).
  SDValue Ld;
  BitVector UndefElements;
  SmallVector<SDValue, 16> Sequence;
  if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
    assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
    if (Sequence.size() == 1)
      Ld = Sequence[0];
  }

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (!Sequence.empty() && Subtarget.hasCDI()) {
    // If not a splat, are the upper sequence values zeroable?
    unsigned SeqLen = Sequence.size();
    bool UpperZeroOrUndef =
        SeqLen == 1 ||
        llvm::all_of(ArrayRef(Sequence).drop_front(),
                     [](SDValue V) { return !V || isNullConstantOrUndef(V); });
    SDValue Op0 = Sequence[0];
    if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
                             (Op0.getOpcode() == ISD::ZERO_EXTEND &&
                              Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
      SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
                             ? Op0.getOperand(0)
                             : Op0.getOperand(0).getOperand(0);
      MVT MaskVT = BOperand.getSimpleValueType();
      MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
        MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
        if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
          // Widen to 512 bits and extract back down when VLX is unavailable.
          unsigned Scale = 512 / VT.getSizeInBits();
          BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
        }
        SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
        if (BcstVT.getSizeInBits() != VT.getSizeInBits())
          Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
        return DAG.getBitcast(VT, Bcst);
      }
    }
  }

  unsigned NumUndefElts = UndefElements.count();
  if (!Ld || (NumElts - NumUndefElts) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // replace BUILD_VECTOR with broadcast of the repeated constants.
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (SplatBitSize == 32 || SplatBitSize == 64 ||
          (SplatBitSize < 32 && Subtarget.hasAVX2())) {
        // Load the constant scalar/subvector and broadcast it.
        MVT CVT = MVT::getIntegerVT(SplatBitSize);
        Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
        SDValue CP = DAG.getConstantPool(C, PVT);
        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

        Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
        SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
        SDValue Ops[] = {DAG.getEntryNode(), CP};
        MachinePointerInfo MPI =
        SDValue Brdcst =
                MPI, Alignment, MachineMemOperand::MOLoad);
        return DAG.getBitcast(VT, Brdcst);
      }
      if (SplatBitSize > 64) {
        // Load the vector of constants and broadcast it.
        Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
        SDValue VCP = DAG.getConstantPool(VecC, PVT);
        unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
        MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
        Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ops[] = {DAG.getEntryNode(), VCP};
        MachinePointerInfo MPI =
                                       Ops, VVT, MPI, Alignment,
      }
    }

    // If we are moving a scalar into a vector (Ld must be set and all elements
    // but 1 are undef) and that operation is not obviously supported by
    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
    // That's better than general shuffling and may eliminate a load to GPR and
    // move from scalar to vector register.
    if (!Ld || NumElts - NumUndefElts != 1)
      return SDValue();
    unsigned ScalarSize = Ld.getValueSizeInBits();
    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
      return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // TODO: Handle broadcasts of non-constant sequences.

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  // FIXME: Is the use count needed for non-constant, non-load case?
  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.shouldOptForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 ||
        (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
        (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
        C = CI->getConstantIntValue();
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      SDValue CP =
      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {DAG.getEntryNode(), CP};
      MachinePointerInfo MPI =
      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
                                     MPI, Alignment, MachineMemOperand::MOLoad);
    }
  }

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Make sure the non-chain result is only used by this build vector.
  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
                                LN->getMemoryVT(), LN->getMemOperand());
    // Fold the load into the broadcast: reroute the old load's chain users.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
                                LN->getMemoryVT(), LN->getMemOperand());
    // Fold the load into the broadcast: reroute the old load's chain users.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // Unsupported broadcast.
  return SDValue();
}
8054
8055/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8056/// underlying vector and index.
8057///
8058/// Modifies \p ExtractedFromVec to the real vector and returns the real
8059/// index.
8060static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8061 SDValue ExtIdx) {
8062 int Idx = ExtIdx->getAsZExtVal();
8063 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8064 return Idx;
8065
8066 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8067 // lowered this:
8068 // (extract_vector_elt (v8f32 %1), Constant<6>)
8069 // to:
8070 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8071 // (extract_subvector (v8f32 %0), Constant<4>),
8072 // undef)
8073 // Constant<0>)
8074 // In this case the vector is the extract_subvector expression and the index
8075 // is 2, as specified by the shuffle.
8076 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8077 SDValue ShuffleVec = SVOp->getOperand(0);
8078 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8079 assert(ShuffleVecVT.getVectorElementType() ==
8080 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8081
8082 int ShuffleIdx = SVOp->getMaskElt(Idx);
8083 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8084 ExtractedFromVec = ShuffleVec;
8085 return ShuffleIdx;
8086 }
8087 return Idx;
8088}
8089
8091 SelectionDAG &DAG) {
8092 MVT VT = Op.getSimpleValueType();
8093
8094 // Skip if insert_vec_elt is not supported.
8095 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8097 return SDValue();
8098
 // Walk the build_vector operands: elements that are extract_vector_elt from
 // (at most) two same-typed source vectors become a shuffle mask; any other
 // operand is remembered in InsertIndices and re-inserted afterwards.
8099 unsigned NumElems = Op.getNumOperands();
8100 SDValue VecIn1;
8101 SDValue VecIn2;
8102 SmallVector<unsigned, 4> InsertIndices;
8103 SmallVector<int, 8> Mask(NumElems, -1);
8104
8105 for (unsigned i = 0; i != NumElems; ++i) {
8106 unsigned Opc = Op.getOperand(i).getOpcode();
8107
8108 if (Opc == ISD::POISON || Opc == ISD::UNDEF)
8109 continue;
8110
8112 // Quit if more than 1 elements need inserting.
8113 if (InsertIndices.size() > 1)
8114 return SDValue();
8115
8116 InsertIndices.push_back(i);
8117 continue;
8118 }
8119
8120 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8121 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8122
8123 // Quit if non-constant index.
8124 if (!isa<ConstantSDNode>(ExtIdx))
8125 return SDValue();
 // Look through any shuffle introduced by earlier lowering to find the
 // real source vector and element index.
8126 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8127
8128 // Quit if extracted from vector of different type.
8129 if (ExtractedFromVec.getValueType() != VT)
8130 return SDValue();
8131
8132 if (!VecIn1.getNode())
8133 VecIn1 = ExtractedFromVec;
8134 else if (VecIn1 != ExtractedFromVec) {
8135 if (!VecIn2.getNode())
8136 VecIn2 = ExtractedFromVec;
8137 else if (VecIn2 != ExtractedFromVec)
8138 // Quit if more than 2 vectors to shuffle
8139 return SDValue();
8140 }
8141
 // Elements from the second source are offset by NumElems, matching the
 // two-input vector_shuffle mask convention.
8142 if (ExtractedFromVec == VecIn1)
8143 Mask[i] = Idx;
8144 else if (ExtractedFromVec == VecIn2)
8145 Mask[i] = Idx + NumElems;
8146 }
8147
8148 if (!VecIn1.getNode())
8149 return SDValue();
8150
 // Only one source was seen: shuffle against poison for the second input.
8151 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
8152 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8153
 // Re-insert the (at most one) non-extract element on top of the shuffle.
8154 for (unsigned Idx : InsertIndices)
8155 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8156 DAG.getVectorIdxConstant(Idx, DL));
8157
8158 return NV;
8159}
8160
8161// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
8163 const X86Subtarget &Subtarget) {
8164 MVT VT = Op.getSimpleValueType();
 // bf16 scalars are handled as f16 (when FP16 is available) or as raw i16
 // bits: bitcast each scalar, build the vector in that type, bitcast back.
8165 MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
8166 MVT IVT = VT.changeVectorElementType(SVT);
8168 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8169 NewOps.push_back(DAG.getBitcast(SVT, Op.getOperand(I)));
8170 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8171 return DAG.getBitcast(VT, Res);
8172}
8173
8174// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8176 SelectionDAG &DAG,
8177 const X86Subtarget &Subtarget) {
8178
8179 MVT VT = Op.getSimpleValueType();
8180 assert((VT.getVectorElementType() == MVT::i1) &&
8181 "Unexpected type in LowerBUILD_VECTORvXi1!");
 // All-zeros / all-ones mask vectors are handled elsewhere; keep them as-is.
8182 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8183 ISD::isBuildVectorAllOnes(Op.getNode()))
8184 return Op;
8185
 // Classify each lane: constant lanes accumulate into an immediate bitmask,
 // non-constant lanes are recorded for individual insertion. Also detect
 // whether all (defined) lanes splat the same value.
8186 uint64_t Immediate = 0;
8187 SmallVector<unsigned, 16> NonConstIdx;
8188 bool IsSplat = true;
8189 bool HasConstElts = false;
8190 int SplatIdx = -1;
8191 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8192 SDValue In = Op.getOperand(idx);
8193 if (In.isUndef())
8194 continue;
8195 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8196 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8197 HasConstElts = true;
8198 } else {
8199 NonConstIdx.push_back(idx);
8200 }
8201 if (SplatIdx < 0)
8202 SplatIdx = idx;
8203 else if (In != Op.getOperand(SplatIdx))
8204 IsSplat = false;
8205 }
8206
8207 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8208 if (IsSplat) {
8209 // The build_vector allows the scalar element to be larger than the vector
8210 // element type. We need to mask it to use as a condition unless we know
8211 // the upper bits are zero.
8212 // FIXME: Use computeKnownBits instead of checking specific opcode?
8213 SDValue Cond = Op.getOperand(SplatIdx);
8214 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8215 if (Cond.getOpcode() != ISD::SETCC)
8216 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8217 DAG.getConstant(1, dl, MVT::i8));
8218
8219 // Perform the select in the scalar domain so we can use cmov.
 // v64i1 on a 32-bit target cannot be selected as one i64; build a v32i1
 // half and concatenate it with itself.
8220 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8221 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8222 DAG.getAllOnesConstant(dl, MVT::i32),
8223 DAG.getConstant(0, dl, MVT::i32));
8224 Select = DAG.getBitcast(MVT::v32i1, Select);
8225 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8226 } else {
8227 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8228 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8229 DAG.getAllOnesConstant(dl, ImmVT),
8230 DAG.getConstant(0, dl, ImmVT));
8231 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8232 Select = DAG.getBitcast(VecVT, Select);
8233 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8234 DAG.getVectorIdxConstant(0, dl));
8235 }
8236 }
8237
8238 // insert elements one by one
 // Materialize the constant lanes (if any) as an immediate bitcast to the
 // mask type, then insert each non-constant lane on top.
8239 SDValue DstVec;
8240 if (HasConstElts) {
8241 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8242 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8243 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8244 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8245 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8246 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8247 } else {
8248 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8249 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8250 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8251 DstVec = DAG.getBitcast(VecVT, Imm);
8252 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8253 DAG.getVectorIdxConstant(0, dl));
8254 }
8255 } else
8256 DstVec = DAG.getUNDEF(VT);
8257
8258 for (unsigned InsertIdx : NonConstIdx) {
8259 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8260 Op.getOperand(InsertIdx),
8261 DAG.getVectorIdxConstant(InsertIdx, dl));
8262 }
8263 return DstVec;
8264}
8265
8266[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8267 switch (Opcode) {
8268 case X86ISD::PACKSS:
8269 case X86ISD::PACKUS:
8270 case X86ISD::FHADD:
8271 case X86ISD::FHSUB:
8272 case X86ISD::HADD:
8273 case X86ISD::HSUB:
8274 case X86ISD::HADDS:
8275 case X86ISD::HSUBS:
8276 return true;
8277 }
8278 return false;
8279}
8280
8281/// This is a helper function of LowerToHorizontalOp().
8282/// This function checks that the build_vector \p N in input implements a
8283/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8284/// may not match the layout of an x86 256-bit horizontal instruction.
8285/// In other words, if this returns true, then some extraction/insertion will
8286/// be required to produce a valid horizontal instruction.
8287///
8288/// Parameter \p Opcode defines the kind of horizontal operation to match.
8289/// For example, if \p Opcode is equal to ISD::ADD, then this function
8290/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8291/// is equal to ISD::SUB, then this function checks if this is a horizontal
8292/// arithmetic sub.
8293///
8294/// This function only analyzes elements of \p N whose indices are
8295/// in range [BaseIdx, LastIdx).
8296///
8297/// TODO: This function was originally used to match both real and fake partial
8298/// horizontal operations, but the index-matching logic is incorrect for that.
8299/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8300/// code because it is only used for partial h-op matching now?
8301static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8302 const SDLoc &DL, SelectionDAG &DAG,
8303 unsigned BaseIdx, unsigned LastIdx,
8304 SDValue &V0, SDValue &V1) {
8305 EVT VT = N->getValueType(0);
8306 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8307 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8308 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8309 "Invalid Vector in input!");
8310
 // ADD/FADD tolerate swapped extract operands; SUB/FSUB do not.
8311 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8312 bool CanFold = true;
8313 unsigned ExpectedVExtractIdx = BaseIdx;
8314 unsigned NumElts = LastIdx - BaseIdx;
 // V0 supplies the first half of the matched range, V1 the second half
 // (see the i * 2 < NumElts checks below).
8315 V0 = DAG.getUNDEF(VT);
8316 V1 = DAG.getUNDEF(VT);
8317
8318 // Check if N implements a horizontal binop.
8319 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8320 SDValue Op = N->getOperand(i + BaseIdx);
8321
8322 // Skip UNDEFs.
8323 if (Op->isUndef()) {
8324 // Update the expected vector extract index.
8325 if (i * 2 == NumElts)
8326 ExpectedVExtractIdx = BaseIdx;
8327 ExpectedVExtractIdx += 2;
8328 continue;
8329 }
8330
8331 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8332
8333 if (!CanFold)
8334 break;
8335
8336 SDValue Op0 = Op.getOperand(0);
8337 SDValue Op1 = Op.getOperand(1);
8338
8339 // Try to match the following pattern:
8340 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8341 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8343 Op0.getOperand(0) == Op1.getOperand(0) &&
8346 if (!CanFold)
8347 break;
8348
8349 unsigned I0 = Op0.getConstantOperandVal(1);
8350 unsigned I1 = Op1.getConstantOperandVal(1);
8351
 // Bind the source vector for this half of the range on first use, and
 // reject sources of the wrong type.
8352 if (i * 2 < NumElts) {
8353 if (V0.isUndef()) {
8354 V0 = Op0.getOperand(0);
8355 if (V0.getValueType() != VT)
8356 return false;
8357 }
8358 } else {
8359 if (V1.isUndef()) {
8360 V1 = Op0.getOperand(0);
8361 if (V1.getValueType() != VT)
8362 return false;
8363 }
 // Crossing into the second half restarts the expected extract index.
8364 if (i * 2 == NumElts)
8365 ExpectedVExtractIdx = BaseIdx;
8366 }
8367
8368 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8369 if (I0 == ExpectedVExtractIdx)
8370 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8371 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8372 // Try to match the following dag sequence:
8373 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8374 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8375 } else
8376 CanFold = false;
8377
8378 ExpectedVExtractIdx += 2;
8379 }
8380
8381 return CanFold;
8382}
8383
8384/// Emit a sequence of two 128-bit horizontal add/sub followed by
8385/// a concat_vector.
8386///
8387/// This is a helper function of LowerToHorizontalOp().
8388/// This function expects two 256-bit vectors called V0 and V1.
8389/// At first, each vector is split into two separate 128-bit vectors.
8390/// Then, the resulting 128-bit vectors are used to implement two
8391/// horizontal binary operations.
8392///
8393/// The kind of horizontal binary operation is defined by \p X86Opcode.
8394///
8395/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8396/// the two new horizontal binop.
8397/// When Mode is set, the first horizontal binop dag node would take as input
8398/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8399/// horizontal binop dag node would take as input the lower 128-bit of V1
8400/// and the upper 128-bit of V1.
8401/// Example:
8402/// HADD V0_LO, V0_HI
8403/// HADD V1_LO, V1_HI
8404///
8405/// Otherwise, the first horizontal binop dag node takes as input the lower
8406/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8407/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8408/// Example:
8409/// HADD V0_LO, V1_LO
8410/// HADD V0_HI, V1_HI
8411///
8412/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8413/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8414/// the upper 128-bits of the result.
8415static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8416 const SDLoc &DL, SelectionDAG &DAG,
8417 unsigned X86Opcode, bool Mode,
8418 bool isUndefLO, bool isUndefHI) {
8419 MVT VT = V0.getSimpleValueType();
8420 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8421 "Invalid nodes in input!");
8422
8423 unsigned NumElts = VT.getVectorNumElements();
8424 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8425 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8426 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8427 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8428 MVT NewVT = V0_LO.getSimpleValueType();
8429
8430 SDValue LO = DAG.getUNDEF(NewVT);
8431 SDValue HI = DAG.getUNDEF(NewVT);
8432
8433 if (Mode) {
8434 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8435 if (!isUndefLO && !V0->isUndef())
8436 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8437 if (!isUndefHI && !V1->isUndef())
8438 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8439 } else {
8440 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8441 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8442 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8443
8444 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8445 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8446 }
8447
8448 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8449}
8450
8451/// Returns true iff \p BV builds a vector with the result equivalent to
8452/// the result of ADDSUB/SUBADD operation.
8453/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8454/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8455/// \p Opnd0 and \p Opnd1.
8457 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8458 SDValue &Opnd0, SDValue &Opnd1,
8459 unsigned &NumExtracts, bool &IsSubAdd,
8460 bool &HasAllowContract) {
8461 using namespace SDPatternMatch;
8462
8463 MVT VT = BV->getSimpleValueType(0);
8464 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8465 return false;
8466
8467 unsigned NumElts = VT.getVectorNumElements();
8468 SDValue InVec0 = DAG.getUNDEF(VT);
8469 SDValue InVec1 = DAG.getUNDEF(VT);
8470
8471 NumExtracts = 0;
 // Start optimistically (for non-empty vectors); any matched element that
 // lacks the contract fast-math flag clears this below.
8472 HasAllowContract = NumElts != 0;
8473
8474 // Odd-numbered elements in the input build vector are obtained from
8475 // adding/subtracting two integer/float elements.
8476 // Even-numbered elements in the input build vector are obtained from
8477 // subtracting/adding two integer/float elements.
8478 unsigned Opc[2] = {0, 0};
8479 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8480 SDValue Op = BV->getOperand(i);
8481
8482 // Skip 'undef' values.
8483 unsigned Opcode = Op.getOpcode();
8484 if (Opcode == ISD::UNDEF)
8485 continue;
8486
8487 // Early exit if we found an unexpected opcode.
8488 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8489 return false;
8490
8491 SDValue Op0 = Op.getOperand(0);
8492 SDValue Op1 = Op.getOperand(1);
8493
8494 // Try to match the following pattern:
8495 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8496 // Early exit if we cannot match that sequence.
8497 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8498 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8499 return false;
8500
8501 // We found a valid add/sub node, make sure its the same opcode as previous
8502 // elements for this parity.
8503 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8504 return false;
8505 Opc[i % 2] = Opcode;
8506
8507 // Update InVec0 and InVec1.
8508 if (InVec0.isUndef())
8509 InVec0 = Op0.getOperand(0);
8510 if (InVec1.isUndef())
8511 InVec1 = Op1.getOperand(0);
8512
8513 // Make sure that operands in input to each add/sub node always
8514 // come from a same pair of vectors.
8515 if (InVec0 != Op0.getOperand(0)) {
8516 if (Opcode == ISD::FSUB)
8517 return false;
8518
8519 // FADD is commutable. Try to commute the operands
8520 // and then test again.
8521 std::swap(Op0, Op1);
8522 if (InVec0 != Op0.getOperand(0))
8523 return false;
8524 }
8525
8526 if (InVec1 != Op1.getOperand(0))
8527 return false;
8528
8529 // Increment the number of extractions done.
8530 ++NumExtracts;
8531 HasAllowContract &= Op->getFlags().hasAllowContract();
8532 }
8533
8534 // Ensure we have found an opcode for both parities and that they are
8535 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8536 // inputs are undef.
8537 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8538 InVec0.isUndef() || InVec1.isUndef())
8539 return false;
8540
 // Opc[0] covers the even lanes; FADD there means the SUBADD variant.
8541 IsSubAdd = Opc[0] == ISD::FADD;
8542
8543 Opnd0 = InVec0;
8544 Opnd1 = InVec1;
8545 return true;
8546}
8547
8548/// Returns true if is possible to fold MUL and an idiom that has already been
8549/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8550/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8551/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8552///
8553/// Prior to calling this function it should be known that there is some
8554/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8555/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8556/// before replacement of such SDNode with ADDSUB operation. Thus the number
8557/// of \p Opnd0 uses is expected to be equal to 2.
8558/// For example, this function may be called for the following IR:
8559/// %AB = fmul fast <2 x double> %A, %B
8560/// %Sub = fsub fast <2 x double> %AB, %C
8561/// %Add = fadd fast <2 x double> %AB, %C
8562/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8563/// <2 x i32> <i32 0, i32 3>
8564/// There is a def for %Addsub here, which potentially can be replaced by
8565/// X86ISD::ADDSUB operation:
8566/// %Addsub = X86ISD::ADDSUB %AB, %C
8567/// and such ADDSUB can further be replaced with FMADDSUB:
8568/// %Addsub = FMADDSUB %A, %B, %C.
8569///
8570/// The main reason why this method is called before the replacement of the
8571/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8572/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8573/// FMADDSUB is.
8574static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8575 SelectionDAG &DAG, SDValue &Opnd0,
8576 SDValue &Opnd1, SDValue &Opnd2,
8577 unsigned ExpectedUses,
8578 bool AllowSubAddOrAddSubContract) {
8579 if (Opnd0.getOpcode() != ISD::FMUL ||
8580 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8581 return false;
8582
8583 // FIXME: These checks must match the similar ones in
8584 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8585 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8586 // or MUL + ADDSUB to FMADDSUB.
8587 bool AllowFusion =
8588 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8589 if (!AllowFusion)
8590 return false;
8591
8592 Opnd2 = Opnd1;
8593 Opnd1 = Opnd0.getOperand(1);
8594 Opnd0 = Opnd0.getOperand(0);
8595
8596 return true;
8597}
8598
8599/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8600/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8601/// X86ISD::FMSUBADD node.
8603 const SDLoc &DL,
8604 const X86Subtarget &Subtarget,
8605 SelectionDAG &DAG) {
8606 SDValue Opnd0, Opnd1;
8607 unsigned NumExtracts;
8608 bool IsSubAdd;
8609 bool HasAllowContract;
 // Recognize the alternating fadd/fsub-of-extracts idiom first.
8610 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8611 HasAllowContract))
8612 return SDValue();
8613
8614 MVT VT = BV->getSimpleValueType(0);
8615
8616 // Try to generate X86ISD::FMADDSUB node here.
8617 SDValue Opnd2;
8618 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8619 HasAllowContract)) {
8620 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8621 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8622 }
8623
8624 // We only support ADDSUB.
8625 if (IsSubAdd)
8626 return SDValue();
8627
8628 // There are no known X86 targets with 512-bit ADDSUB instructions!
8629 // Convert to blend(fsub,fadd).
 // Even result lanes take the FSUB, odd lanes the FADD (second shuffle
 // input, hence the I + E + 1 offset).
8630 if (VT.is512BitVector()) {
8631 SmallVector<int> Mask;
8632 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8633 Mask.push_back(I);
8634 Mask.push_back(I + E + 1);
8635 }
8636 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8637 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8638 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8639 }
8640
8641 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8642}
8643
8645 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
 // Returns true when the build_vector can be implemented as a single
 // horizontal op; on success HOpcode/V0/V1 carry the matched node.
8646 // Initialize outputs to known values.
8647 MVT VT = BV->getSimpleValueType(0);
8648 HOpcode = ISD::DELETED_NODE;
8649 V0 = DAG.getUNDEF(VT);
8650 V1 = DAG.getUNDEF(VT);
8651
8652 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8653 // half of the result is calculated independently from the 128-bit halves of
8654 // the inputs, so that makes the index-checking logic below more complicated.
8655 unsigned NumElts = VT.getVectorNumElements();
8656 unsigned GenericOpcode = ISD::DELETED_NODE;
8657 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8658 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8659 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8660 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8661 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8662 // Ignore undef elements.
8663 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8664 if (Op.isUndef())
8665 continue;
8666
8667 // If there's an opcode mismatch, we're done.
8668 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8669 return false;
8670
8671 // Initialize horizontal opcode.
8672 if (HOpcode == ISD::DELETED_NODE) {
8673 GenericOpcode = Op.getOpcode();
8674 switch (GenericOpcode) {
8675 // clang-format off
8676 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8677 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8678 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8679 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8680 default: return false;
8681 // clang-format on
8682 }
8683 }
8684
8685 SDValue Op0 = Op.getOperand(0);
8686 SDValue Op1 = Op.getOperand(1);
8687 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8689 Op0.getOperand(0) != Op1.getOperand(0) ||
8691 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8692 return false;
8693
8694 // The source vector is chosen based on which 64-bit half of the
8695 // destination vector is being calculated.
8696 if (j < NumEltsIn64Bits) {
8697 if (V0.isUndef())
8698 V0 = Op0.getOperand(0);
8699 } else {
8700 if (V1.isUndef())
8701 V1 = Op0.getOperand(0);
8702 }
8703
8704 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8705 if (SourceVec != Op0.getOperand(0))
8706 return false;
8707
8708 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8709 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8710 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8711 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8712 (j % NumEltsIn64Bits) * 2;
8713 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8714 continue;
8715
8716 // If this is not a commutative op, this does not match.
8717 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8718 return false;
8719
8720 // Addition is commutative, so try swapping the extract indexes.
8721 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8722 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8723 continue;
8724
8725 // Extract indexes do not match horizontal requirement.
8726 return false;
8727 }
8728 }
8729 // We matched. Opcode and operands are returned by reference as arguments.
8730 return true;
8731}
8732
8734 const SDLoc &DL, SelectionDAG &DAG,
8735 unsigned HOpcode, SDValue V0, SDValue V1) {
8736 // If either input vector is not the same size as the build vector,
8737 // extract/insert the low bits to the correct size.
8738 // This is free (examples: zmm --> xmm, xmm --> ymm).
8739 MVT VT = BV->getSimpleValueType(0);
8740 unsigned Width = VT.getSizeInBits();
8741 if (V0.getValueSizeInBits() > Width)
8742 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8743 else if (V0.getValueSizeInBits() < Width)
8744 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8745
8746 if (V1.getValueSizeInBits() > Width)
8747 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8748 else if (V1.getValueSizeInBits() < Width)
8749 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8750
 // Track which result elements the build_vector actually defines.
8751 unsigned NumElts = VT.getVectorNumElements();
8752 APInt DemandedElts = APInt::getAllOnes(NumElts);
8753 for (unsigned i = 0; i != NumElts; ++i)
8754 if (BV->getOperand(i).isUndef())
8755 DemandedElts.clearBit(i);
8756
8757 // If we don't need the upper xmm, then perform as a xmm hop.
8758 unsigned HalfNumElts = NumElts / 2;
8759 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8760 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8761 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8762 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8763 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8764 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8765 }
8766
8767 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8768}
8769
8770/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8772 const X86Subtarget &Subtarget,
8773 SelectionDAG &DAG) {
8774 // We need at least 2 non-undef elements to make this worthwhile by default.
8775 unsigned NumNonUndefs =
8776 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8777 if (NumNonUndefs < 2)
8778 return SDValue();
8779
8780 // There are 4 sets of horizontal math operations distinguished by type:
8781 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8782 // subtarget feature. Try to match those "native" patterns first.
8783 MVT VT = BV->getSimpleValueType(0);
8784 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8785 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8786 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8787 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8788 unsigned HOpcode;
8789 SDValue V0, V1;
8790 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8791 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8792 }
8793
8794 // Try harder to match 256-bit ops by using extract/concat.
8795 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8796 return SDValue();
8797
8798 // Count the number of UNDEF operands in the build_vector in input.
8799 unsigned NumElts = VT.getVectorNumElements();
8800 unsigned Half = NumElts / 2;
8801 unsigned NumUndefsLO = 0;
8802 unsigned NumUndefsHI = 0;
8803 for (unsigned i = 0, e = Half; i != e; ++i)
8804 if (BV->getOperand(i)->isUndef())
8805 NumUndefsLO++;
8806
8807 for (unsigned i = Half, e = NumElts; i != e; ++i)
8808 if (BV->getOperand(i)->isUndef())
8809 NumUndefsHI++;
8810
 // Integer 256-bit case: match the two 128-bit halves as separate partial
 // hops and recombine with two 128-bit hops plus a concat.
8811 SDValue InVec0, InVec1;
8812 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8813 SDValue InVec2, InVec3;
8814 unsigned X86Opcode;
8815 bool CanFold = true;
8816
8817 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8818 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8819 InVec3) &&
8820 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8821 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8822 X86Opcode = X86ISD::HADD;
8823 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8824 InVec1) &&
8825 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8826 InVec3) &&
8827 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8828 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8829 X86Opcode = X86ISD::HSUB;
8830 else
8831 CanFold = false;
8832
8833 if (CanFold) {
8834 // Do not try to expand this build_vector into a pair of horizontal
8835 // add/sub if we can emit a pair of scalar add/sub.
8836 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8837 return SDValue();
8838
8839 // Convert this build_vector into a pair of horizontal binops followed by
8840 // a concat vector. We must adjust the outputs from the partial horizontal
8841 // matching calls above to account for undefined vector halves.
8842 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8843 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8844 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8845 bool isUndefLO = NumUndefsLO == Half;
8846 bool isUndefHI = NumUndefsHI == Half;
8847 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8848 isUndefHI);
8849 }
8850 }
8851
 // Remaining 256-bit case: one partial match across the whole vector,
 // expanded with Mode=true (each hop combines halves of one source).
8852 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8853 VT == MVT::v16i16) {
8854 unsigned X86Opcode;
8855 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8856 InVec1))
8857 X86Opcode = X86ISD::HADD;
8858 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8859 InVec1))
8860 X86Opcode = X86ISD::HSUB;
8861 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8862 InVec1))
8863 X86Opcode = X86ISD::FHADD;
8864 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8865 InVec1))
8866 X86Opcode = X86ISD::FHSUB;
8867 else
8868 return SDValue();
8869
8870 // Don't try to expand this build_vector into a pair of horizontal add/sub
8871 // if we can simply emit a pair of scalar add/sub.
8872 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8873 return SDValue();
8874
8875 // Convert this build_vector into two horizontal add/sub followed by
8876 // a concat vector.
8877 bool isUndefLO = NumUndefsLO == Half;
8878 bool isUndefHI = NumUndefsHI == Half;
8879 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8880 isUndefLO, isUndefHI);
8881 }
8882
8883 return SDValue();
8884}
8885
8886static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8887 SelectionDAG &DAG);
8888
8889/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8890/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8891/// just apply the bit to the vectors.
8892/// NOTE: Its not in our interest to start make a general purpose vectorizer
8893/// from this, but enough scalar bit operations are created from the later
8894/// legalization + scalarization stages to need basic support.
8896 const X86Subtarget &Subtarget,
8897 SelectionDAG &DAG) {
8898 MVT VT = Op->getSimpleValueType(0);
8899 unsigned NumElems = VT.getVectorNumElements();
8900 unsigned ElemSize = VT.getScalarSizeInBits();
8901 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8902
8903 // Check that all elements have the same opcode.
8904 // TODO: Should we allow UNDEFS and if so how many?
8905 unsigned Opcode = Op->getOperand(0).getOpcode();
8906 for (unsigned i = 1; i < NumElems; ++i)
8907 if (Opcode != Op->getOperand(i).getOpcode())
8908 return SDValue();
8909
8910 // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
8911 bool IsShift = false;
8912 switch (Opcode) {
8913 default:
8914 return SDValue();
8915 case ISD::SHL:
8916 case ISD::SRL:
8917 case ISD::SRA:
8918 IsShift = true;
8919 break;
8920 case ISD::AND:
8921 case ISD::XOR:
8922 case ISD::OR:
8923 // Don't do this if the buildvector is a splat - we'd replace one
8924 // constant with an entire vector.
8925 if (Op->getSplatValue())
8926 return SDValue();
8927 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8928 return SDValue();
8929 break;
8930 }
8931
8932 // Collect elements.
 // Split every scalar bit-op into its two operand streams so they can be
 // rebuilt as two vectors and combined with a single vector bit-op.
8933 bool RHSAllConst = true;
8934 SmallVector<SDValue, 4> LHSElts, RHSElts;
8935 for (SDValue Elt : Op->ops()) {
8936 SDValue LHS = Elt.getOperand(0);
8937 SDValue RHS = Elt.getOperand(1);
8938 RHSAllConst &= isa<ConstantSDNode>(RHS);
8939 LHSElts.push_back(LHS);
8940 RHSElts.push_back(RHS);
8941 }
8942
8943 // Canonicalize shift amounts.
8944 if (IsShift) {
8945 // We expect the canonicalized RHS operand to be the constant.
8946 // TODO: Permit non-constant XOP/AVX2 cases?
8947 if (!RHSAllConst)
8948 return SDValue();
8949
8950 // Extend shift amounts.
8951 for (SDValue &Op1 : RHSElts)
8952 if (Op1.getValueSizeInBits() != ElemSize)
8953 Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType());
8954
8955 // Limit to shifts by uniform immediates.
8956 // TODO: Only accept vXi8/vXi64 special cases?
8957 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8958 if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8959 return SDValue();
8960 }
8961 assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
8962 [ElemSize](SDValue V) {
8963 return V.getValueSizeInBits() == ElemSize;
8964 }) &&
8965 "Element size mismatch");
8966
8967 // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
8968 // a load or RHS must be constant.
8969 SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
8970 /*IsAfterLegalize=*/true);
8971 SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
8972 /*IsAfterLegalize=*/true);
8973 if (!LHS && !RHS && !RHSAllConst)
8974 return SDValue();
8975
8976 if (!LHS)
8977 LHS = DAG.getBuildVector(VT, DL, LHSElts);
8978 if (!RHS)
8979 RHS = DAG.getBuildVector(VT, DL, RHSElts);
8980 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8981
8982 if (!IsShift)
8983 return Res;
8984
8985 // Immediately lower the shift to ensure the constant build vector doesn't
8986 // get converted to a constant pool before the shift is lowered.
8987 return LowerShift(Res, Subtarget, DAG);
8988}
8989
8990static bool isShuffleFoldableLoad(SDValue);
8991
8992/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8993/// representing a blend.
 /// Only handles v4f64 with exactly two distinct non-constant, non-undef
 /// operands; each operand is splatted and the two splats are blended with a
 /// shuffle mask. Returns SDValue() when the pattern doesn't apply.
8995 X86Subtarget const &Subtarget,
8996 SelectionDAG &DAG) {
8997 MVT VT = BVOp->getSimpleValueType(0u);
8998
8999 if (VT != MVT::v4f64)
9000 return SDValue();
9001
9002 // Collect unique operands.
 // Constants and undefs are rejected outright - they are handled better by
 // other BUILD_VECTOR lowerings.
9003 auto UniqueOps = SmallSet<SDValue, 16u>();
9004 for (SDValue Op : BVOp->ops()) {
9005 if (isIntOrFPConstant(Op) || Op.isUndef())
9006 return SDValue();
9007 UniqueOps.insert(Op);
9008 }
9009
9010 // Candidate BUILD_VECTOR must have 2 unique operands.
9011 if (UniqueOps.size() != 2u)
9012 return SDValue();
9013
 // Op0 is whatever appears in lane 0; Op1 is the remaining unique value.
9014 SDValue Op0 = BVOp->getOperand(0u);
9015 UniqueOps.erase(Op0);
9016 SDValue Op1 = *UniqueOps.begin();
9017
 // Profitable with AVX2 (cheap splats) or when either operand is a load that
 // can be folded into the splat/shuffle.
9018 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
9019 isShuffleFoldableLoad(Op1)) {
9020 // Create shuffle mask.
 // Lane I selects from the first splat (index I) when the original element
 // was Op0, otherwise from the second splat (index I + NumElems).
9021 auto const NumElems = VT.getVectorNumElements();
9022 SmallVector<int, 16u> Mask(NumElems);
9023 for (auto I = 0u; I < NumElems; ++I) {
9024 SDValue Op = BVOp->getOperand(I);
9025 Mask[I] = Op == Op0 ? I : I + NumElems;
9026 }
9027 // Create shuffle of splats.
9028 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
9029 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
9030 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
9031 }
9032
9033 return SDValue();
9034}
9035
9036/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
 /// Pairs of adjacent i8/i16/i32 elements that were split (trunc / trunc-of-
 /// srl) from the same wider value are re-merged, producing a BUILD_VECTOR of
 /// double-width elements that is bitcast back to VT. Returns SDValue() if any
 /// pair can't be merged.
9038 X86Subtarget const &Subtarget,
9039 SelectionDAG &DAG) {
9040 using namespace SDPatternMatch;
9041 MVT VT = BVOp->getSimpleValueType(0);
9042 MVT SVT = VT.getScalarType();
9043 unsigned NumElts = VT.getVectorNumElements();
9044 unsigned EltBits = SVT.getSizeInBits();
9045
9046 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9047 return SDValue();
9048
 // The merged scalar type must be legal (e.g. i16/i32/i64 depending on SVT).
9049 unsigned WideBits = 2 * EltBits;
9050 MVT WideSVT = MVT::getIntegerVT(WideBits);
9051 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
9052 if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
9053 return SDValue();
9054
 // Walk elements two at a time, building the wide-element operand list.
9056 for (unsigned I = 0; I != NumElts; I += 2) {
9057 SDValue Op0 = BVOp->getOperand(I + 0);
9058 SDValue Op1 = BVOp->getOperand(I + 1);
9059
9060 if (Op0.isUndef() && Op1.isUndef()) {
9061 WideOps.push_back(DAG.getUNDEF(WideSVT));
9062 continue;
9063 }
9064
9065 // TODO: Constant repacking?
9066
9067 // Merge scalars that have been split from the same source.
 // Op0 must be trunc(X) and Op1 trunc(srl(Y, EltBits)); the elided
 // condition on the next line presumably also requires X and Y to be the
 // same source value - confirm against upstream.
9068 SDValue X, Y;
9069 if (sd_match(Op0, m_Trunc(m_Value(X))) &&
9070 sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
9072 X.getValueType().bitsGE(WideSVT)) {
 // Narrow an over-wide source down to exactly WideSVT before using it.
9073 if (X.getValueType().bitsGT(WideSVT))
9074 X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
9075 WideOps.push_back(X);
9076 continue;
9077 }
9078
9079 return SDValue();
9080 }
9081
9082 assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
9083 return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
9084}
9085
9086/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9087/// functionality to do this, so it's all zeros, all ones, or some derivation
9088/// that is cheap to calculate.
9090 SelectionDAG &DAG,
9091 const X86Subtarget &Subtarget) {
9092 MVT VT = Op.getSimpleValueType();
9093
9094 // Vectors containing all zeros can be matched by pxor and xorps.
9095 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9096 return Op;
9097
9098 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9099 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9100 // vpcmpeqd on 256-bit vectors.
9101 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9102 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9103 return Op;
9104
9105 return getOnesVector(VT, DAG, DL);
9106 }
9107
9108 return SDValue();
9109}
9110
9111/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9112/// from a vector of source values and a vector of extraction indices.
9113/// The vectors might be manipulated to match the type of the permute op.
 /// \param VT result type of the permute.
 /// \param SrcVec vector of source elements (may be wider/narrower than VT).
 /// \param IndicesVec per-element variable indices into SrcVec.
 /// \returns the lowered permute, or SDValue() when no suitable instruction
 /// exists for this type/subtarget combination.
9114static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9115 const SDLoc &DL, SelectionDAG &DAG,
9116 const X86Subtarget &Subtarget) {
9117 MVT ShuffleVT = VT;
9118 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9119 unsigned NumElts = VT.getVectorNumElements();
9120 unsigned SizeInBits = VT.getSizeInBits();
9121
9122 // Adjust IndicesVec to match VT size.
9123 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9124 "Illegal variable permute mask size");
9125 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
9126 // Narrow/widen the indices vector to the correct size.
9127 if (IndicesVec.getValueSizeInBits() > SizeInBits)
9128 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9129 NumElts * VT.getScalarSizeInBits());
9130 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
9131 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
9132 SDLoc(IndicesVec), SizeInBits);
9133 // Zero-extend the index elements within the vector.
9134 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9135 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
9136 IndicesVT, IndicesVec);
9137 }
 // Normalize the index element type to VT's integer equivalent.
9138 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9139
9140 // Handle SrcVec that don't match VT type.
9141 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9142 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9143 // Handle larger SrcVec by treating it as a larger permute.
 // Recurse at the wider type, then extract the low VT-sized subvector.
9144 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9145 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9146 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9147 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9148 Subtarget, DAG, SDLoc(IndicesVec));
9149 SDValue NewSrcVec =
9150 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9151 if (NewSrcVec)
9152 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9153 return SDValue();
9154 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9155 // Widen smaller SrcVec to match VT.
9156 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9157 } else
9158 return SDValue();
9159 }
9160
 // Rescale indices when the permute executes at a narrower element type
 // than VT (e.g. performing a v4i32 permute with v16i8 PSHUFB).
9161 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9162 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9163 EVT SrcVT = Idx.getValueType();
9164 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9165 uint64_t IndexScale = 0;
9166 uint64_t IndexOffset = 0;
9167
9168 // If we're scaling a smaller permute op, then we need to repeat the
9169 // indices, scaling and offsetting them as well.
9170 // e.g. v4i32 -> v16i8 (Scale = 4)
9171 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9172 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9173 for (uint64_t i = 0; i != Scale; ++i) {
9174 IndexScale |= Scale << (i * NumDstBits);
9175 IndexOffset |= i << (i * NumDstBits);
9176 }
9177
9178 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9179 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9180 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9181 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9182 return Idx;
9183 };
9184
 // Select the best permute instruction (and the type to run it at) for each
 // vector type, based on subtarget features. Opcode == 0 means "no match".
9185 unsigned Opcode = 0;
9186 switch (VT.SimpleTy) {
9187 default:
9188 break;
9189 case MVT::v16i8:
9190 if (Subtarget.hasSSSE3())
9191 Opcode = X86ISD::PSHUFB;
9192 break;
9193 case MVT::v8i16:
9194 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9195 Opcode = X86ISD::VPERMV;
9196 else if (Subtarget.hasSSSE3()) {
9197 Opcode = X86ISD::PSHUFB;
9198 ShuffleVT = MVT::v16i8;
9199 }
9200 break;
9201 case MVT::v4f32:
9202 case MVT::v4i32:
9203 if (Subtarget.hasAVX()) {
9204 Opcode = X86ISD::VPERMILPV;
9205 ShuffleVT = MVT::v4f32;
9206 } else if (Subtarget.hasSSSE3()) {
9207 Opcode = X86ISD::PSHUFB;
9208 ShuffleVT = MVT::v16i8;
9209 }
9210 break;
9211 case MVT::v2f64:
9212 case MVT::v2i64:
9213 if (Subtarget.hasAVX()) {
9214 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9215 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9216 Opcode = X86ISD::VPERMILPV;
9217 ShuffleVT = MVT::v2f64;
9218 } else if (Subtarget.hasSSE41()) {
9219 // SSE41 can compare v2i64 - select between indices 0 and 1.
9220 return DAG.getSelectCC(
9221 DL, IndicesVec,
9222 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9223 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9224 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9226 }
9227 break;
9228 case MVT::v32i8:
9229 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9230 Opcode = X86ISD::VPERMV;
9231 else if (Subtarget.hasXOP()) {
 // XOP VPPERM indexes across both 128-bit sources, so build each half
 // from both halves of SrcVec with its own half of the indices.
9232 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9233 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9234 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9235 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9236 return DAG.getNode(
9238 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9239 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9240 } else if (Subtarget.hasAVX()) {
9241 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9242 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9243 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9244 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9245 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9247 // Permute Lo and Hi and then select based on index range.
9248 // This works as SHUFB uses bits[3:0] to permute elements and we don't
9249 // care about the bit[7] as its just an index vector.
9250 SDValue Idx = Ops[2];
9251 EVT VT = Idx.getValueType();
9252 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9253 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9254 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9256 };
9257 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9258 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9259 PSHUFBBuilder);
9260 }
9261 break;
9262 case MVT::v16i16:
9263 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9264 Opcode = X86ISD::VPERMV;
9265 else if (Subtarget.hasAVX()) {
9266 // Scale to v32i8 and perform as v32i8.
9267 IndicesVec = ScaleIndices(IndicesVec, 2);
9268 return DAG.getBitcast(
9270 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9271 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9272 }
9273 break;
9274 case MVT::v8f32:
9275 case MVT::v8i32:
9276 if (Subtarget.hasAVX2())
9277 Opcode = X86ISD::VPERMV;
9278 else if (Subtarget.hasAVX()) {
9279 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9280 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9281 {0, 1, 2, 3, 0, 1, 2, 3});
9282 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9283 {4, 5, 6, 7, 4, 5, 6, 7});
9284 if (Subtarget.hasXOP())
9285 return DAG.getBitcast(
9286 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9287 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9288 // Permute Lo and Hi and then select based on index range.
9289 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9290 SDValue Res = DAG.getSelectCC(
9291 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9292 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9293 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9295 return DAG.getBitcast(VT, Res);
9296 }
9297 break;
9298 case MVT::v4i64:
9299 case MVT::v4f64:
9300 if (Subtarget.hasAVX512()) {
9301 if (!Subtarget.hasVLX()) {
 // Without VLX the 256-bit VPERMV form is unavailable; widen to 512
 // bits, permute there, and take the low 256 bits of the result.
9302 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9303 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9304 SDLoc(SrcVec));
9305 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9306 DAG, SDLoc(IndicesVec));
9307 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9308 DAG, Subtarget);
9309 return extract256BitVector(Res, 0, DAG, DL);
9310 }
9311 Opcode = X86ISD::VPERMV;
9312 } else if (Subtarget.hasAVX()) {
9313 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9314 SDValue LoLo =
9315 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9316 SDValue HiHi =
9317 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9318 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9319 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9320 if (Subtarget.hasXOP())
9321 return DAG.getBitcast(
9322 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9323 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9324 // Permute Lo and Hi and then select based on index range.
9325 // This works as VPERMILPD only uses index bit[1] to permute elements.
9326 SDValue Res = DAG.getSelectCC(
9327 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9328 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9329 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9331 return DAG.getBitcast(VT, Res);
9332 }
9333 break;
9334 case MVT::v64i8:
9335 if (Subtarget.hasVBMI())
9336 Opcode = X86ISD::VPERMV;
9337 break;
9338 case MVT::v32i16:
9339 if (Subtarget.hasBWI())
9340 Opcode = X86ISD::VPERMV;
9341 break;
9342 case MVT::v16f32:
9343 case MVT::v16i32:
9344 case MVT::v8f64:
9345 case MVT::v8i64:
9346 if (Subtarget.hasAVX512())
9347 Opcode = X86ISD::VPERMV;
9348 break;
9349 }
9350 if (!Opcode)
9351 return SDValue();
9352
9353 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9354 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9355 "Element size mismatch");
9356
 // If the chosen instruction works on narrower elements than VT, repeat and
 // rescale the indices to address the narrower lanes.
9357 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9358 if (Scale > 1)
9359 IndicesVec = ScaleIndices(IndicesVec, Scale);
9360
9361 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9362 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9363
 // VPERMV takes (indices, src); PSHUFB/VPERMILPV take (src, indices).
9364 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9365 SDValue Res = Opcode == X86ISD::VPERMV
9366 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9367 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9368 return DAG.getBitcast(VT, Res);
9369}
9370
9371// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9372// reasoned to be a permutation of a vector by indices in a non-constant vector.
9373// (build_vector (extract_elt V, (extract_elt I, 0)),
9374// (extract_elt V, (extract_elt I, 1)),
9375// ...
9376// ->
9377// (vpermv I, V)
9378//
9379// TODO: Handle undefs
9380// TODO: Utilize pshufb and zero mask blending to support more efficient
9381// construction of vectors with constant-0 elements.
9382static SDValue
9384 SelectionDAG &DAG,
9385 const X86Subtarget &Subtarget) {
 // Filled in lazily by the loop below: the single vector all elements are
 // extracted from, and the single vector the indices are extracted from.
9386 SDValue SrcVec, IndicesVec;
9387
 // A one-use FREEZE wrapper around an element is transparent for matching.
9388 auto PeekThroughFreeze = [](SDValue N) {
9389 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9390 return N->getOperand(0);
9391 return N;
9392 };
9393 // Check for a match of the permute source vector and permute index elements.
9394 // This is done by checking that the i-th build_vector operand is of the form:
9395 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9396 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9397 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9398 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9399 return SDValue();
9400
9401 // If this is the first extract encountered in V, set the source vector,
9402 // otherwise verify the extract is from the previously defined source
9403 // vector.
9404 if (!SrcVec)
9405 SrcVec = Op.getOperand(0);
9406 else if (SrcVec != Op.getOperand(0))
9407 return SDValue();
9408 SDValue ExtractedIndex = Op->getOperand(1);
9409 // Peek through extends.
 // Both zext and sext of an in-range index leave the value unchanged.
9410 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9411 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9412 ExtractedIndex = ExtractedIndex.getOperand(0);
9413 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9414 return SDValue();
9415
9416 // If this is the first extract from the index vector candidate, set the
9417 // indices vector, otherwise verify the extract is from the previously
9418 // defined indices vector.
9419 if (!IndicesVec)
9420 IndicesVec = ExtractedIndex.getOperand(0)
9421 else if (IndicesVec != ExtractedIndex.getOperand(0))
9422 return SDValue();
9423
 // The i-th element must read lane i of the indices vector, so that the
 // whole BUILD_VECTOR is exactly a variable permute of SrcVec.
9424 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9425 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9426 return SDValue();
9427 }
9428
9429 MVT VT = V.getSimpleValueType();
9430 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9431}
9432
9433SDValue
9434X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9435 SDLoc dl(Op);
9436
9437 MVT VT = Op.getSimpleValueType();
9438 MVT EltVT = VT.getVectorElementType();
9439 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9440 unsigned NumElems = Op.getNumOperands();
9441
9442 // Generate vectors for predicate vectors.
9443 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9444 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9445
9446 if (VT.getVectorElementType() == MVT::bf16 &&
9447 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9448 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9449
9450 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9451 return VectorCst;
9452
9453 unsigned EVTBits = EltVT.getSizeInBits();
9454 APInt UndefMask = APInt::getZero(NumElems);
9455 APInt FrozenUndefMask = APInt::getZero(NumElems);
9456 APInt ZeroMask = APInt::getZero(NumElems);
9457 APInt NonZeroMask = APInt::getZero(NumElems);
9458 bool IsAllConstants = true;
9459 bool OneUseFrozenUndefs = true;
9460 SmallSet<SDValue, 8> Values;
9461 unsigned NumConstants = NumElems;
9462 for (unsigned i = 0; i < NumElems; ++i) {
9463 SDValue Elt = Op.getOperand(i);
9464 if (Elt.isUndef()) {
9465 UndefMask.setBit(i);
9466 continue;
9467 }
9468 if (ISD::isFreezeUndef(Elt.getNode())) {
9469 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9470 FrozenUndefMask.setBit(i);
9471 continue;
9472 }
9473 Values.insert(Elt);
9474 if (!isIntOrFPConstant(Elt)) {
9475 IsAllConstants = false;
9476 NumConstants--;
9477 }
9478 if (X86::isZeroNode(Elt)) {
9479 ZeroMask.setBit(i);
9480 } else {
9481 NonZeroMask.setBit(i);
9482 }
9483 }
9484
9485 // All undef vector. Return an UNDEF.
9486 if (UndefMask.isAllOnes())
9487 return DAG.getUNDEF(VT);
9488
9489 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9490 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9491 return DAG.getFreeze(DAG.getUNDEF(VT));
9492
9493 // All undef/freeze(undef)/zero vector. Return a zero vector.
9494 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9495 return getZeroVector(VT, Subtarget, DAG, dl);
9496
9497 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9498 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9499 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9500 // and blend the FREEZE-UNDEF operands back in.
9501 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9502 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9503 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9504 SmallVector<int, 16> BlendMask(NumElems, -1);
9505 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9506 for (unsigned i = 0; i < NumElems; ++i) {
9507 if (UndefMask[i]) {
9508 BlendMask[i] = -1;
9509 continue;
9510 }
9511 BlendMask[i] = i;
9512 if (!FrozenUndefMask[i])
9513 Elts[i] = Op.getOperand(i);
9514 else
9515 BlendMask[i] += NumElems;
9516 }
9517 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9518 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9519 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9520 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9521 }
9522
9523 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9524
9525 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9526 // be better off lowering to a smaller build vector and padding with
9527 // undef/zero.
9528 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9530 unsigned UpperElems = NumElems / 2;
9531 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9532 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9533 if (NumUpperUndefsOrZeros >= UpperElems) {
9534 if (VT.is512BitVector() &&
9535 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9536 UpperElems = NumElems - (NumElems / 4);
9537 // If freeze(undef) is in any upper elements, force to zero.
9538 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9539 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9540 SDValue NewBV =
9541 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9542 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9543 }
9544 }
9545
9546 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9547 return AddSub;
9548 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9549 return HorizontalOp;
9550 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9551 return Broadcast;
9552 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9553 return BitOp;
9554 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9555 return Blend;
9556 if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
9557 return WideBV;
9558
9559 unsigned NumZero = ZeroMask.popcount();
9560 unsigned NumNonZero = NonZeroMask.popcount();
9561
9562 // If we are inserting one variable into a vector of non-zero constants, try
9563 // to avoid loading each constant element as a scalar. Load the constants as a
9564 // vector and then insert the variable scalar element. If insertion is not
9565 // supported, fall back to a shuffle to get the scalar blended with the
9566 // constants. Insertion into a zero vector is handled as a special-case
9567 // somewhere below here.
9568 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9569 FrozenUndefMask.isZero() &&
9572 // Create an all-constant vector. The variable element in the old
9573 // build vector is replaced by undef in the constant vector. Save the
9574 // variable scalar element and its index for use in the insertelement.
9575 LLVMContext &Context = *DAG.getContext();
9576 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9577 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9578 SDValue VarElt;
9579 SDValue InsIndex;
9580 for (unsigned i = 0; i != NumElems; ++i) {
9581 SDValue Elt = Op.getOperand(i);
9582 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9583 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9584 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9585 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9586 else if (!Elt.isUndef()) {
9587 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9588 "Expected one variable element in this vector");
9589 VarElt = Elt;
9590 InsIndex = DAG.getVectorIdxConstant(i, dl);
9591 }
9592 }
9593 Constant *CV = ConstantVector::get(ConstVecOps);
9594 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9595
9596 // The constants we just created may not be legal (eg, floating point). We
9597 // must lower the vector right here because we can not guarantee that we'll
9598 // legalize it before loading it. This is also why we could not just create
9599 // a new build vector here. If the build vector contains illegal constants,
9600 // it could get split back up into a series of insert elements.
9601 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9602 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9603 MachineFunction &MF = DAG.getMachineFunction();
9604 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9605 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9606 unsigned InsertC = InsIndex->getAsZExtVal();
9607 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9608 if (InsertC < NumEltsInLow128Bits)
9609 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9610
9611 // There's no good way to insert into the high elements of a >128-bit
9612 // vector, so use shuffles to avoid an extract/insert sequence.
9613 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9614 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9615 SmallVector<int, 8> ShuffleMask;
9616 unsigned NumElts = VT.getVectorNumElements();
9617 for (unsigned i = 0; i != NumElts; ++i)
9618 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9619 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9620 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9621 }
9622
9623 // Special case for single non-zero, non-undef, element.
9624 if (NumNonZero == 1) {
9625 unsigned Idx = NonZeroMask.countr_zero();
9626 SDValue Item = Op.getOperand(Idx);
9627
9628 // If we have a constant or non-constant insertion into the low element of
9629 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9630 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9631 // depending on what the source datatype is.
9632 if (Idx == 0) {
9633 if (NumZero == 0)
9634 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9635
9636 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9637 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9638 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9639 assert((VT.is128BitVector() || VT.is256BitVector() ||
9640 VT.is512BitVector()) &&
9641 "Expected an SSE value type!");
9642 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9643 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9644 // zero vector.
9645 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9646 }
9647
9648 // We can't directly insert an i8 or i16 into a vector, so zero extend
9649 // it to i32 first.
9650 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9651 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9652 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9653 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9654 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9655 return DAG.getBitcast(VT, Item);
9656 }
9657 }
9658
9659 // Is it a vector logical left shift?
9660 if (NumElems == 2 && Idx == 1 &&
9661 X86::isZeroNode(Op.getOperand(0)) &&
9662 !X86::isZeroNode(Op.getOperand(1))) {
9663 unsigned NumBits = VT.getSizeInBits();
9664 return getVShift(true, VT,
9666 VT, Op.getOperand(1)),
9667 NumBits/2, DAG, *this, dl);
9668 }
9669
9670 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9671 return SDValue();
9672
9673 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9674 // is a non-constant being inserted into an element other than the low one,
9675 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9676 // movd/movss) to move this into the low element, then shuffle it into
9677 // place.
9678 if (EVTBits == 32) {
9679 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9680 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9681 }
9682 }
9683
9684 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9685 if (Values.size() == 1) {
9686 if (EVTBits == 32) {
9687 // Instead of a shuffle like this:
9688 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9689 // Check if it's possible to issue this instead.
9690 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9691 unsigned Idx = NonZeroMask.countr_zero();
9692 SDValue Item = Op.getOperand(Idx);
9693 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9694 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9695 }
9696 return SDValue();
9697 }
9698
9699 // A vector full of immediates; various special cases are already
9700 // handled, so this is best done with a single constant-pool load.
9701 if (IsAllConstants)
9702 return SDValue();
9703
9704 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9705 return V;
9706
9707 // See if we can use a vector load to get all of the elements.
9708 {
9709 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9710 if (SDValue LD =
9711 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9712 return LD;
9713 }
9714
9715 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9716 // build_vector and broadcast it.
9717 // TODO: We could probably generalize this more.
9718 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9719 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9720 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9721 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9722 // Make sure all the even/odd operands match.
9723 for (unsigned i = 2; i != NumElems; ++i)
9724 if (Ops[i % 2] != Op.getOperand(i))
9725 return false;
9726 return true;
9727 };
9728 if (CanSplat(Op, NumElems, Ops)) {
9729 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9730 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9731 // Create a new build vector and cast to v2i64/v2f64.
9732 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9733 DAG.getBuildVector(NarrowVT, dl, Ops));
9734 // Broadcast from v2i64/v2f64 and cast to final VT.
9735 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9736 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9737 NewBV));
9738 }
9739 }
9740
9741 // For AVX-length vectors, build the individual 128-bit pieces and use
9742 // shuffles to put them in place.
9743 if (VT.getSizeInBits() > 128) {
9744 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9745
9746 // Build both the lower and upper subvector.
9747 SDValue Lower =
9748 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9750 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9751
9752 // Recreate the wider vector with the lower and upper part.
9753 return concatSubVectors(Lower, Upper, DAG, dl);
9754 }
9755
9756 // Let legalizer expand 2-wide build_vectors.
9757 if (EVTBits == 64) {
9758 if (NumNonZero == 1) {
9759 // One half is zero or undef.
9760 unsigned Idx = NonZeroMask.countr_zero();
9761 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9762 Op.getOperand(Idx));
9763 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9764 }
9765 return SDValue();
9766 }
9767
9768 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9769 if (EVTBits == 8 && NumElems == 16)
9770 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9771 NumZero, DAG, Subtarget))
9772 return V;
9773
9774 if (EltVT == MVT::i16 && NumElems == 8)
9775 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9776 NumZero, DAG, Subtarget))
9777 return V;
9778
9779 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9780 if (EVTBits == 32 && NumElems == 4)
9781 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9782 return V;
9783
9784 // If element VT is == 32 bits, turn it into a number of shuffles.
9785 if (NumElems == 4 && NumZero > 0) {
9786 SmallVector<SDValue, 8> Ops(NumElems);
9787 for (unsigned i = 0; i < 4; ++i) {
9788 bool isZero = !NonZeroMask[i];
9789 if (isZero)
9790 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9791 else
9792 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9793 }
9794
9795 for (unsigned i = 0; i < 2; ++i) {
9796 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9797 default: llvm_unreachable("Unexpected NonZero count");
9798 case 0:
9799 Ops[i] = Ops[i*2]; // Must be a zero vector.
9800 break;
9801 case 1:
9802 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9803 break;
9804 case 2:
9805 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9806 break;
9807 case 3:
9808 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9809 break;
9810 }
9811 }
9812
9813 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9814 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9815 int MaskVec[] = {
9816 Reverse1 ? 1 : 0,
9817 Reverse1 ? 0 : 1,
9818 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9819 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9820 };
9821 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9822 }
9823
9824 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9825
9826 // Check for a build vector from mostly shuffle plus few inserting.
9827 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9828 return Sh;
9829
9830 // For SSE 4.1, use insertps to put the high elements into the low element.
9831 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9833 if (!Op.getOperand(0).isUndef())
9834 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9835 else
9836 Result = DAG.getUNDEF(VT);
9837
9838 for (unsigned i = 1; i < NumElems; ++i) {
9839 if (Op.getOperand(i).isUndef()) continue;
9840 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9841 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9842 }
9843 return Result;
9844 }
9845
9846 // Otherwise, expand into a number of unpckl*, start by extending each of
9847 // our (non-undef) elements to the full vector width with the element in the
9848 // bottom slot of the vector (which generates no code for SSE).
9849 SmallVector<SDValue, 8> Ops(NumElems);
9850 for (unsigned i = 0; i < NumElems; ++i) {
9851 if (!Op.getOperand(i).isUndef())
9852 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9853 else
9854 Ops[i] = DAG.getUNDEF(VT);
9855 }
9856
9857 // Next, we iteratively mix elements, e.g. for v4f32:
9858 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9859 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9860 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9861 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9862 // Generate scaled UNPCKL shuffle mask.
9863 SmallVector<int, 16> Mask;
9864 for(unsigned i = 0; i != Scale; ++i)
9865 Mask.push_back(i);
9866 for (unsigned i = 0; i != Scale; ++i)
9867 Mask.push_back(NumElems+i);
9868 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9869
9870 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9871 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9872 }
9873 return Ops[0];
9874}
9875
9876// 256-bit AVX can use the vinsertf128 instruction
9877// to create 256-bit vectors from two other 128-bit ones.
9878// TODO: Detect subvector broadcast here instead of DAG combine?
9880 SelectionDAG &DAG,
9881 const X86Subtarget &Subtarget) {
9882 MVT ResVT = Op.getSimpleValueType();
9883 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9884 "Value type must be 256-/512-bit wide");
9885
9886 unsigned NumOperands = Op.getNumOperands();
9887 unsigned NumFreezeUndef = 0;
9888 unsigned NumZero = 0;
9889 unsigned NumNonZero = 0;
9890 unsigned NonZeros = 0;
9891 SmallSet<SDValue, 4> Undefs;
9892 for (unsigned i = 0; i != NumOperands; ++i) {
9893 SDValue SubVec = Op.getOperand(i);
9894 if (SubVec.isUndef())
9895 continue;
9896 if (ISD::isFreezeUndef(SubVec.getNode())) {
9897 // If the freeze(undef) has multiple uses then we must fold to zero.
9898 if (SubVec.hasOneUse()) {
9899 ++NumFreezeUndef;
9900 } else {
9901 ++NumZero;
9902 Undefs.insert(SubVec);
9903 }
9904 }
9905 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9906 ++NumZero;
9907 else {
9908 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9909 NonZeros |= 1 << i;
9910 ++NumNonZero;
9911 }
9912 }
9913
9914 // If we have more than 2 non-zeros, build each half separately.
9915 if (NumNonZero > 2) {
9916 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9917 ArrayRef<SDUse> Ops = Op->ops();
9918 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9919 Ops.slice(0, NumOperands/2));
9920 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9921 Ops.slice(NumOperands/2));
9922 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9923 }
9924
9925 // Otherwise, build it up through insert_subvectors.
9926 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9927 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9928 : DAG.getUNDEF(ResVT));
9929
9930 // Replace Undef operands with ZeroVector.
9931 for (SDValue U : Undefs)
9933 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9934
9935 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9936 unsigned NumSubElems = SubVT.getVectorNumElements();
9937 for (unsigned i = 0; i != NumOperands; ++i) {
9938 if ((NonZeros & (1 << i)) == 0)
9939 continue;
9940
9941 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9942 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9943 }
9944
9945 return Vec;
9946}
9947
9948// Returns true if the given node is a type promotion (by concatenating i1
9949// zeros) of the result of a node that already zeros all upper bits of
9950// k-register.
9951// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9953 const X86Subtarget &Subtarget,
9954 SelectionDAG & DAG) {
9955 MVT ResVT = Op.getSimpleValueType();
9956 unsigned NumOperands = Op.getNumOperands();
9957 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9958 "Unexpected number of operands in CONCAT_VECTORS");
9959
9960 uint64_t Zeros = 0;
9961 uint64_t NonZeros = 0;
9962 for (unsigned i = 0; i != NumOperands; ++i) {
9963 SDValue SubVec = Op.getOperand(i);
9964 if (SubVec.isUndef())
9965 continue;
9966 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9967 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9968 Zeros |= (uint64_t)1 << i;
9969 else
9970 NonZeros |= (uint64_t)1 << i;
9971 }
9972
9973 unsigned NumElems = ResVT.getVectorNumElements();
9974
9975 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9976 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9977 // insert_subvector will give us two kshifts.
9978 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9979 Log2_64(NonZeros) != NumOperands - 1) {
9980 unsigned Idx = Log2_64(NonZeros);
9981 SDValue SubVec = Op.getOperand(Idx);
9982 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9983 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9984 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9985 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9986 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9987 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9988 DAG.getVectorIdxConstant(0, dl));
9989 }
9990
9991 // If there are zero or one non-zeros we can handle this very simply.
9992 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9993 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9994 if (!NonZeros)
9995 return Vec;
9996 unsigned Idx = Log2_64(NonZeros);
9997 SDValue SubVec = Op.getOperand(Idx);
9998 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9999 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10000 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
10001 }
10002
10003 if (NumOperands > 2) {
10004 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10005 ArrayRef<SDUse> Ops = Op->ops();
10006 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10007 Ops.slice(0, NumOperands / 2));
10008 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10009 Ops.slice(NumOperands / 2));
10010 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10011 }
10012
10013 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
10014
10015 if (ResVT.getVectorNumElements() >= 16)
10016 return Op; // The operation is legal with KUNPCK
10017
10018 SDValue Vec =
10019 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
10020 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
10021 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10022 DAG.getVectorIdxConstant(NumElems / 2, dl));
10023}
10024
10026 const X86Subtarget &Subtarget,
10027 SelectionDAG &DAG) {
10028 SDLoc DL(Op);
10029 MVT VT = Op.getSimpleValueType();
10030 if (VT.getVectorElementType() == MVT::i1)
10031 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
10032
10033 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10034 // from two other 128-bit ones.
10035 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10036 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10037 (VT.is512BitVector() &&
10038 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
10039 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
10040}
10041
10042//===----------------------------------------------------------------------===//
10043// Vector shuffle lowering
10044//
10045// This is an experimental code path for lowering vector shuffles on x86. It is
10046// designed to handle arbitrary vector shuffles and blends, gracefully
10047// degrading performance as necessary. It works hard to recognize idiomatic
10048// shuffles and lower them to optimal instruction patterns without leaving
10049// a framework that allows reasonably efficient handling of all vector shuffle
10050// patterns.
10051//===----------------------------------------------------------------------===//
10052
10053/// Checks whether the vector elements referenced by two shuffle masks are
10054/// equivalent.
10055static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10056 int Idx, int ExpectedIdx) {
10057 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10058 ExpectedIdx < MaskSize && "Out of range element index");
10059 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10060 return false;
10061
10062 EVT VT = Op.getValueType();
10063 EVT ExpectedVT = ExpectedOp.getValueType();
10064
10065 // Sources must be vectors and match the mask's element count.
10066 if (!VT.isVector() || !ExpectedVT.isVector() ||
10067 (int)VT.getVectorNumElements() != MaskSize ||
10068 (int)ExpectedVT.getVectorNumElements() != MaskSize)
10069 return false;
10070
10071 // Exact match.
10072 if (Idx == ExpectedIdx && Op == ExpectedOp)
10073 return true;
10074
10075 switch (Op.getOpcode()) {
10076 case ISD::BUILD_VECTOR:
10077 // If the values are build vectors, we can look through them to find
10078 // equivalent inputs that make the shuffles equivalent.
10079 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10080 case ISD::BITCAST: {
10082 EVT SrcVT = Src.getValueType();
10083 if (Op == ExpectedOp && SrcVT.isVector()) {
10084 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
10085 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
10086 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10087 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10088 Idx / Scale, ExpectedIdx / Scale);
10089 }
10090 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
10091 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
10092 for (unsigned I = 0; I != Scale; ++I)
10093 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10094 (Idx * Scale) + I,
10095 (ExpectedIdx * Scale) + I))
10096 return false;
10097 return true;
10098 }
10099 }
10100 break;
10101 }
10102 case ISD::VECTOR_SHUFFLE: {
10103 auto *SVN = cast<ShuffleVectorSDNode>(Op);
10104 return Op == ExpectedOp &&
10105 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10106 }
10107 case X86ISD::VBROADCAST:
10109 return Op == ExpectedOp;
10111 if (Op == ExpectedOp) {
10112 auto *MemOp = cast<MemSDNode>(Op);
10113 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
10114 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10115 }
10116 break;
10117 case X86ISD::VPERMI: {
10118 if (Op == ExpectedOp) {
10120 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
10121 SDValue Src = Op.getOperand(0);
10122 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
10123 Mask[ExpectedIdx]);
10124 }
10125 break;
10126 }
10127 case X86ISD::HADD:
10128 case X86ISD::HSUB:
10129 case X86ISD::FHADD:
10130 case X86ISD::FHSUB:
10131 case X86ISD::PACKSS:
10132 case X86ISD::PACKUS:
10133 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10134 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10135 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10136 int NumElts = VT.getVectorNumElements();
10137 int NumLanes = VT.getSizeInBits() / 128;
10138 int NumEltsPerLane = NumElts / NumLanes;
10139 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10140 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10141 bool SameElt =
10142 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10143 return SameLane && SameElt;
10144 }
10145 break;
10146 }
10147
10148 return false;
10149}
10150
10151/// Tiny helper function to identify a no-op mask.
10152///
10153/// This is a somewhat boring predicate function. It checks whether the mask
10154/// array input, which is assumed to be a single-input shuffle mask of the kind
10155/// used by the X86 shuffle instructions (not a fully general
10156/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10157/// in-place shuffle are 'no-op's.
10159 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10160 assert(Mask[i] >= -1 && "Out of bound mask element!");
10161 if (Mask[i] >= 0 && Mask[i] != i)
10162 return false;
10163 }
10164 return true;
10165}
10166
10167/// Test whether there are elements crossing LaneSizeInBits lanes in this
10168/// shuffle mask.
10169///
10170/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10171/// and we routinely test for these.
10172static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10173 unsigned ScalarSizeInBits,
10174 ArrayRef<int> Mask) {
10175 assert(LaneSizeInBits && ScalarSizeInBits &&
10176 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10177 "Illegal shuffle lane size");
10178 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10179 int Size = Mask.size();
10180 for (int i = 0; i < Size; ++i)
10181 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10182 return true;
10183 return false;
10184}
10185
10186/// Test whether there are elements crossing 128-bit lanes in this
10187/// shuffle mask.
10189 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10190}
10191
10192/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10193/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10194/// better support 'repeated mask + lane permute' style shuffles.
10195static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10196 unsigned ScalarSizeInBits,
10197 ArrayRef<int> Mask) {
10198 assert(LaneSizeInBits && ScalarSizeInBits &&
10199 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10200 "Illegal shuffle lane size");
10201 int NumElts = Mask.size();
10202 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10203 int NumLanes = NumElts / NumEltsPerLane;
10204 if (NumLanes > 1) {
10205 for (int i = 0; i != NumLanes; ++i) {
10206 int SrcLane = -1;
10207 for (int j = 0; j != NumEltsPerLane; ++j) {
10208 int M = Mask[(i * NumEltsPerLane) + j];
10209 if (M < 0)
10210 continue;
10211 int Lane = (M % NumElts) / NumEltsPerLane;
10212 if (SrcLane >= 0 && SrcLane != Lane)
10213 return true;
10214 SrcLane = Lane;
10215 }
10216 }
10217 }
10218 return false;
10219}
10220
10221/// Test whether a shuffle mask is equivalent within each sub-lane.
10222///
10223/// This checks a shuffle mask to see if it is performing the same
10224/// lane-relative shuffle in each sub-lane. This trivially implies
10225/// that it is also not lane-crossing. It may however involve a blend from the
10226/// same lane of a second vector.
10227///
10228/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10229/// non-trivial to compute in the face of undef lanes. The representation is
10230/// suitable for use with existing 128-bit shuffles as entries from the second
10231/// vector have been remapped to [LaneSize, 2*LaneSize).
10232static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10233 ArrayRef<int> Mask,
10234 SmallVectorImpl<int> &RepeatedMask) {
10235 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10236 RepeatedMask.assign(LaneSize, -1);
10237 int Size = Mask.size();
10238 for (int i = 0; i < Size; ++i) {
10239 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10240 if (Mask[i] < 0)
10241 continue;
10242 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10243 // This entry crosses lanes, so there is no way to model this shuffle.
10244 return false;
10245
10246 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10247 // Adjust second vector indices to start at LaneSize instead of Size.
10248 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10249 : Mask[i] % LaneSize + LaneSize;
10250 if (RepeatedMask[i % LaneSize] < 0)
10251 // This is the first non-undef entry in this slot of a 128-bit lane.
10252 RepeatedMask[i % LaneSize] = LocalM;
10253 else if (RepeatedMask[i % LaneSize] != LocalM)
10254 // Found a mismatch with the repeated mask.
10255 return false;
10256 }
10257 return true;
10258}
10259
10260/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10261static bool
10263 SmallVectorImpl<int> &RepeatedMask) {
10264 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10265}
10266
10267static bool
10269 SmallVector<int, 32> RepeatedMask;
10270 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10271}
10272
10273/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10274static bool
10276 SmallVectorImpl<int> &RepeatedMask) {
10277 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10278}
10279
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
                                        unsigned EltSizeInBits,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / EltSizeInBits;
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    // Undef entries impose no constraint on the repeated mask.
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      // A zero entry is only compatible with undef or zero in the same slot
      // of every other repetition of the lane.
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
    // later vector indices to start at multiples of LaneSize instead of Size.
    int LaneM = Mask[i] / Size;
    int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
10316
10317/// Test whether a target shuffle mask is equivalent within each sub-lane.
10318/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10319static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10320 ArrayRef<int> Mask,
10321 SmallVectorImpl<int> &RepeatedMask) {
10322 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10323 Mask, RepeatedMask);
10324}
10325
10326/// Checks whether a shuffle mask is equivalent to an explicit list of
10327/// arguments.
10328///
10329/// This is a fast way to test a shuffle mask against a fixed pattern:
10330///
10331/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10332///
10333/// It returns true if the mask is exactly as wide as the argument list, and
10334/// each element of the mask is either -1 (signifying undef) or the value given
10335/// in the argument.
10336static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10337 SDValue V1 = SDValue(),
10338 SDValue V2 = SDValue()) {
10339 int Size = Mask.size();
10340 if (Size != (int)ExpectedMask.size())
10341 return false;
10342
10343 for (int i = 0; i < Size; ++i) {
10344 assert(Mask[i] >= -1 && "Out of bound mask element!");
10345 int MaskIdx = Mask[i];
10346 int ExpectedIdx = ExpectedMask[i];
10347 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10348 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10349 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10350 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10351 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10352 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10353 return false;
10354 }
10355 }
10356 return true;
10357}
10358
10359/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10360///
10361/// The masks must be exactly the same width.
10362///
10363/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10364/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10365///
10366/// SM_SentinelZero is accepted as a valid negative index but must match in
10367/// both, or via a known bits test.
10369 ArrayRef<int> ExpectedMask,
10370 const SelectionDAG &DAG,
10371 SDValue V1 = SDValue(),
10372 SDValue V2 = SDValue()) {
10373 int Size = Mask.size();
10374 if (Size != (int)ExpectedMask.size())
10375 return false;
10376 assert(llvm::all_of(ExpectedMask,
10377 [Size](int M) {
10378 return M == SM_SentinelZero ||
10379 isInRange(M, 0, 2 * Size);
10380 }) &&
10381 "Illegal target shuffle mask");
10382
10383 // Check for out-of-range target shuffle mask indices.
10384 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10385 return false;
10386
10387 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10388 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10389 !V1.getValueType().isVector()))
10390 V1 = SDValue();
10391 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10392 !V2.getValueType().isVector()))
10393 V2 = SDValue();
10394
10395 APInt ZeroV1 = APInt::getZero(Size);
10396 APInt ZeroV2 = APInt::getZero(Size);
10397
10398 for (int i = 0; i < Size; ++i) {
10399 int MaskIdx = Mask[i];
10400 int ExpectedIdx = ExpectedMask[i];
10401 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10402 continue;
10403 // If we failed to match an expected SM_SentinelZero then early out.
10404 if (ExpectedIdx < 0)
10405 return false;
10406 if (MaskIdx == SM_SentinelZero) {
10407 // If we need this expected index to be a zero element, then update the
10408 // relevant zero mask and perform the known bits at the end to minimize
10409 // repeated computes.
10410 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10411 if (ExpectedV &&
10412 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10413 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10414 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10415 ZeroMask.setBit(BitIdx);
10416 continue;
10417 }
10418 }
10419 if (MaskIdx >= 0) {
10420 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10421 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10422 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10423 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10424 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10425 continue;
10426 }
10427 return false;
10428 }
10429 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10430 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10431}
10432
10433// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10434// instructions.
10436 const SelectionDAG &DAG) {
10437 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10438 return false;
10439
10440 SmallVector<int, 8> Unpcklwd;
10441 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10442 /* Unary = */ false);
10443 SmallVector<int, 8> Unpckhwd;
10444 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10445 /* Unary = */ false);
10446 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10447 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10448 return IsUnpackwdMask;
10449}
10450
10452 const SelectionDAG &DAG) {
10453 // Create 128-bit vector type based on mask size.
10454 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10455 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10456
10457 // We can't assume a canonical shuffle mask, so try the commuted version too.
10458 SmallVector<int, 4> CommutedMask(Mask);
10460
10461 // Match any of unary/binary or low/high.
10462 for (unsigned i = 0; i != 4; ++i) {
10463 SmallVector<int, 16> UnpackMask;
10464 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10465 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10466 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10467 return true;
10468 }
10469 return false;
10470}
10471
10472/// Return true if a shuffle mask chooses elements identically in its top and
10473/// bottom halves. For example, any splat mask has the same top and bottom
10474/// halves. If an element is undefined in only one half of the mask, the halves
10475/// are not considered identical.
10477 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10478 unsigned HalfSize = Mask.size() / 2;
10479 for (unsigned i = 0; i != HalfSize; ++i) {
10480 if (Mask[i] != Mask[i + HalfSize])
10481 return false;
10482 }
10483 return true;
10484}
10485
10486/// Get a 4-lane 8-bit shuffle immediate for a mask.
10487///
10488/// This helper function produces an 8-bit shuffle immediate corresponding to
10489/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10490/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10491/// example.
10492///
10493/// NB: We rely heavily on "undef" masks preserving the input lane.
10494static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10495 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10496 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10497 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10498 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10499 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10500
10501 // If the mask only uses one non-undef element, then fully 'splat' it to
10502 // improve later broadcast matching.
10503 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10504 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10505
10506 int FirstElt = Mask[FirstIndex];
10507 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10508 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10509
10510 unsigned Imm = 0;
10511 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10512 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10513 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10514 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10515 return Imm;
10516}
10517
10519 SelectionDAG &DAG) {
10520 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10521}
10522
10523// Canonicalize SHUFPD mask to improve chances of further folding.
10524// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10525static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10526 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10527 "Unexpected SHUFPD mask size");
10528 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10529 "Unexpected SHUFPD mask elements");
10530
10531 // If the mask only uses one non-undef element, then fully 'splat' it to
10532 // improve later broadcast matching.
10533 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10534 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10535 "All undef shuffle mask");
10536
10537 int FirstElt = Mask[FirstIndex];
10538 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10539 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10540 unsigned Imm = 0;
10541 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10542 Imm |= FirstElt << I;
10543 return Imm;
10544 }
10545
10546 // Attempt to keep any undef elements in place to improve chances of the
10547 // shuffle becoming a (commutative) blend.
10548 unsigned Imm = 0;
10549 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10550 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10551
10552 return Imm;
10553}
10554
10556 SelectionDAG &DAG) {
10557 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10558}
10559
10560// The Shuffle result is as follow:
10561// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10562// Each Zeroable's element correspond to a particular Mask's element.
10563// As described in computeZeroableShuffleElements function.
10564//
10565// The function looks for a sub-mask that the nonzero elements are in
10566// increasing order. If such sub-mask exist. The function returns true.
10567static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10568 ArrayRef<int> Mask, const EVT &VectorType,
10569 bool &IsZeroSideLeft) {
10570 int NextElement = -1;
10571 // Check if the Mask's nonzero elements are in increasing order.
10572 for (int i = 0, e = Mask.size(); i < e; i++) {
10573 // Checks if the mask's zeros elements are built from only zeros.
10574 assert(Mask[i] >= -1 && "Out of bound mask element!");
10575 if (Mask[i] < 0)
10576 return false;
10577 if (Zeroable[i])
10578 continue;
10579 // Find the lowest non zero element
10580 if (NextElement < 0) {
10581 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10582 IsZeroSideLeft = NextElement != 0;
10583 }
10584 // Exit if the mask's non zero elements are not in increasing order.
10585 if (NextElement != Mask[i])
10586 return false;
10587 NextElement++;
10588 }
10589 return true;
10590}
10591
10592static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10594 const X86Subtarget &Subtarget,
10595 unsigned Depth = 0);
10596
10597/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10599 ArrayRef<int> Mask, SDValue V1,
10600 SDValue V2, const APInt &Zeroable,
10601 const X86Subtarget &Subtarget,
10602 SelectionDAG &DAG) {
10603 int Size = Mask.size();
10604 int LaneSize = 128 / VT.getScalarSizeInBits();
10605 const int NumBytes = VT.getSizeInBits() / 8;
10606 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10607
10608 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10609 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10610 (Subtarget.hasBWI() && VT.is512BitVector()));
10611
10612 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10613 // Sign bit set in i8 mask means zero element.
10614 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10615
10616 SDValue V;
10617 for (int i = 0; i < NumBytes; ++i) {
10618 int M = Mask[i / NumEltBytes];
10619 if (M < 0) {
10620 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10621 continue;
10622 }
10623 if (Zeroable[i / NumEltBytes]) {
10624 PSHUFBMask[i] = ZeroMask;
10625 continue;
10626 }
10627
10628 // We can only use a single input of V1 or V2.
10629 SDValue SrcV = (M >= Size ? V2 : V1);
10630 if (V && V != SrcV)
10631 return SDValue();
10632 V = SrcV;
10633 M %= Size;
10634
10635 // PSHUFB can't cross lanes, ensure this doesn't happen.
10636 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10637 return SDValue();
10638
10639 M = M % LaneSize;
10640 M = M * NumEltBytes + (i % NumEltBytes);
10641 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10642 }
10643 assert(V && "Failed to find a source input");
10644
10645 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10646 return DAG.getBitcast(
10647 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10648 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10649}
10650
10651static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10652 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10653 const SDLoc &dl);
10654
10655// X86 has dedicated shuffle that can be lowered to VEXPAND
10657 SDValue V2, ArrayRef<int> Mask,
10658 const APInt &Zeroable,
10659 const X86Subtarget &Subtarget,
10660 SelectionDAG &DAG) {
10661 bool IsLeftZeroSide = true;
10662 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10663 IsLeftZeroSide))
10664 return SDValue();
10665 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10667 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10668 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10669 unsigned NumElts = VT.getVectorNumElements();
10670 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10671 "Unexpected number of vector elements");
10672 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10673 Subtarget, DAG, DL);
10674 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10675 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10676 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10677}
10678
10679static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10680 unsigned &UnpackOpcode, bool IsUnary,
10681 ArrayRef<int> TargetMask, const SDLoc &DL,
10682 SelectionDAG &DAG,
10683 const X86Subtarget &Subtarget) {
10684 int NumElts = VT.getVectorNumElements();
10685
10686 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10687 for (int i = 0; i != NumElts; i += 2) {
10688 int M1 = TargetMask[i + 0];
10689 int M2 = TargetMask[i + 1];
10690 Undef1 &= (SM_SentinelUndef == M1);
10691 Undef2 &= (SM_SentinelUndef == M2);
10692 Zero1 &= isUndefOrZero(M1);
10693 Zero2 &= isUndefOrZero(M2);
10694 }
10695 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10696 "Zeroable shuffle detected");
10697
10698 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10699 SmallVector<int, 64> Unpckl, Unpckh;
10700 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10701 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10702 (IsUnary ? V1 : V2))) {
10703 UnpackOpcode = X86ISD::UNPCKL;
10704 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10705 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10706 return true;
10707 }
10708
10709 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10710 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10711 (IsUnary ? V1 : V2))) {
10712 UnpackOpcode = X86ISD::UNPCKH;
10713 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10714 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10715 return true;
10716 }
10717
10718 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10719 if (IsUnary && (Zero1 || Zero2)) {
10720 // Don't bother if we can blend instead.
10721 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10722 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10723 return false;
10724
10725 bool MatchLo = true, MatchHi = true;
10726 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10727 int M = TargetMask[i];
10728
10729 // Ignore if the input is known to be zero or the index is undef.
10730 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10731 (M == SM_SentinelUndef))
10732 continue;
10733
10734 MatchLo &= (M == Unpckl[i]);
10735 MatchHi &= (M == Unpckh[i]);
10736 }
10737
10738 if (MatchLo || MatchHi) {
10739 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10740 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10741 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10742 return true;
10743 }
10744 }
10745
10746 // If a binary shuffle, commute and try again.
10747 if (!IsUnary) {
10749 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10750 UnpackOpcode = X86ISD::UNPCKL;
10751 std::swap(V1, V2);
10752 return true;
10753 }
10754
10756 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10757 UnpackOpcode = X86ISD::UNPCKH;
10758 std::swap(V1, V2);
10759 return true;
10760 }
10761 }
10762
10763 return false;
10764}
10765
10766// X86 has dedicated unpack instructions that can handle specific blend
10767// operations: UNPCKH and UNPCKL.
10769 SDValue V2, ArrayRef<int> Mask,
10770 SelectionDAG &DAG) {
10771 SmallVector<int, 8> Unpckl;
10772 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10773 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10774 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10775
10776 SmallVector<int, 8> Unpckh;
10777 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10778 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10779 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10780
10781 // Commute and try again.
10783 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10784 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10785
10787 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10788 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10789
10790 return SDValue();
10791}
10792
10793/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10794/// followed by unpack 256-bit.
10796 SDValue V2, ArrayRef<int> Mask,
10797 SelectionDAG &DAG) {
10798 SmallVector<int, 32> Unpckl, Unpckh;
10799 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10800 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10801
10802 unsigned UnpackOpcode;
10803 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10804 UnpackOpcode = X86ISD::UNPCKL;
10805 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10806 UnpackOpcode = X86ISD::UNPCKH;
10807 else
10808 return SDValue();
10809
10810 // This is a "natural" unpack operation (rather than the 128-bit sectored
10811 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10812 // input in order to use the x86 instruction.
10813 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10814 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10815 V1 = DAG.getBitcast(VT, V1);
10816 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10817}
10818
10819// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10820// source into the lower elements and zeroing the upper elements.
10821static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10822 ArrayRef<int> Mask, const APInt &Zeroable,
10823 const X86Subtarget &Subtarget) {
10824 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10825 return false;
10826
10827 unsigned NumElts = Mask.size();
10828 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10829 unsigned MaxScale = 64 / EltSizeInBits;
10830
10831 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10832 unsigned SrcEltBits = EltSizeInBits * Scale;
10833 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10834 continue;
10835 unsigned NumSrcElts = NumElts / Scale;
10836 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10837 continue;
10838 unsigned UpperElts = NumElts - NumSrcElts;
10839 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10840 continue;
10841 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10842 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10843 DstVT = MVT::getIntegerVT(EltSizeInBits);
10844 if ((NumSrcElts * EltSizeInBits) >= 128) {
10845 // ISD::TRUNCATE
10846 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10847 } else {
10848 // X86ISD::VTRUNC
10849 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10850 }
10851 return true;
10852 }
10853
10854 return false;
10855}
10856
10857// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10858// element padding to the final DstVT.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG, bool ZeroUppers) {
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstSVT = DstVT.getScalarType();
  unsigned NumDstElts = DstVT.getVectorNumElements();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();
  unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();

  // Bail if the source type can't be handled directly by the target.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
    return SDValue();

  // Perform a direct ISD::TRUNCATE if possible.
  if (NumSrcElts == NumDstElts)
    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);

  // More source elements than destination elements: truncate all of them and
  // then extract the low subvector of the requested size.
  if (NumSrcElts > NumDstElts) {
    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
    return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
  }

  // Fewer source elements, but the truncated result is still at least
  // 128 bits: truncate and widen (with zeros or undef per ZeroUppers) to the
  // destination size.
  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
    return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
                          DstVT.getSizeInBits());
  }

  // Non-VLX targets must truncate from a 512-bit type, so we need to
  // widen, truncate and then possibly extract the original subvector.
  // (Recurses with the widened 512-bit source.)
  if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
    SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
    return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
  }

  // Fallback to an X86ISD::VTRUNC (which always yields a 128-bit result),
  // padding up to DstVT if necessary.
  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
  if (DstVT != TruncVT)
    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
                           DstVT.getSizeInBits());
  return Trunc;
}
10903
10904// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10905//
10906// An example is the following:
10907//
10908// t0: ch = EntryToken
10909// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10910// t25: v4i32 = truncate t2
10911// t41: v8i16 = bitcast t25
10912// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10913// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10914// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10915// t18: v2i64 = bitcast t51
10916//
10917// One can just use a single vpmovdw instruction, without avx512vl we need to
10918// use the zmm variant and extract the lower subvector, padding with zeroes.
10919// TODO: Merge with lowerShuffleAsVTRUNC.
10921 SDValue V2, ArrayRef<int> Mask,
10922 const APInt &Zeroable,
10923 const X86Subtarget &Subtarget,
10924 SelectionDAG &DAG) {
10925 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10926 if (!Subtarget.hasAVX512())
10927 return SDValue();
10928
10929 unsigned NumElts = VT.getVectorNumElements();
10930 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10931 unsigned MaxScale = 64 / EltSizeInBits;
10932 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10933 unsigned SrcEltBits = EltSizeInBits * Scale;
10934 unsigned NumSrcElts = NumElts / Scale;
10935 unsigned UpperElts = NumElts - NumSrcElts;
10936 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10937 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10938 continue;
10939
10940 // Attempt to find a matching source truncation, but as a fall back VLX
10941 // cases can use the VPMOV directly.
10942 SDValue Src = peekThroughBitcasts(V1);
10943 if (Src.getOpcode() == ISD::TRUNCATE &&
10944 Src.getScalarValueSizeInBits() == SrcEltBits) {
10945 Src = Src.getOperand(0);
10946 } else if (Subtarget.hasVLX()) {
10947 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10948 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10949 Src = DAG.getBitcast(SrcVT, Src);
10950 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10951 if (Scale == 2 &&
10952 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10953 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10954 return SDValue();
10955 } else
10956 return SDValue();
10957
10958 // VPMOVWB is only available with avx512bw.
10959 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10960 return SDValue();
10961
10962 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10963 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10964 }
10965
10966 return SDValue();
10967}
10968
10969// Attempt to match binary shuffle patterns as a truncate.
10971 SDValue V2, ArrayRef<int> Mask,
10972 const APInt &Zeroable,
10973 const X86Subtarget &Subtarget,
10974 SelectionDAG &DAG) {
10975 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10976 "Unexpected VTRUNC type");
10977 if (!Subtarget.hasAVX512() ||
10978 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10979 return SDValue();
10980
10981 unsigned NumElts = VT.getVectorNumElements();
10982 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10983 unsigned MaxScale = 64 / EltSizeInBits;
10984 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10985 // TODO: Support non-BWI VPMOVWB truncations?
10986 unsigned SrcEltBits = EltSizeInBits * Scale;
10987 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10988 continue;
10989
10990 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10991 // Bail if the V2 elements are undef.
10992 unsigned NumHalfSrcElts = NumElts / Scale;
10993 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10994 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10995 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10996 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10997 continue;
10998
10999 // The elements beyond the truncation must be undef/zero.
11000 unsigned UpperElts = NumElts - NumSrcElts;
11001 if (UpperElts > 0 &&
11002 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11003 continue;
11004 bool UndefUppers =
11005 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11006
11007 // As we're using both sources then we need to concat them together
11008 // and truncate from the double-sized src.
11009 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
11010
11011 // For offset truncations, ensure that the concat is cheap.
11012 SDValue Src =
11013 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
11014 if (!Src) {
11015 if (Offset)
11016 continue;
11017 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11018 }
11019
11020 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11021 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11022 Src = DAG.getBitcast(SrcVT, Src);
11023
11024 // Shift the offset'd elements into place for the truncation.
11025 // TODO: Use getTargetVShiftByConstNode.
11026 if (Offset)
11027 Src = DAG.getNode(
11028 X86ISD::VSRLI, DL, SrcVT, Src,
11029 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
11030
11031 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11032 }
11033 }
11034
11035 return SDValue();
11036}
11037
11038/// Check whether a compaction lowering can be done by dropping even/odd
11039/// elements and compute how many times even/odd elements must be dropped.
11040///
11041/// This handles shuffles which take every Nth element where N is a power of
11042/// two. Example shuffle masks:
11043///
11044/// (even)
11045/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11046/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11047/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11048/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11049/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11050/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11051///
11052/// (odd)
11053/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
11054/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
11055///
11056/// Any of these lanes can of course be undef.
11057///
11058/// This routine only supports N <= 3.
11059/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11060/// for larger N.
11061///
11062/// \returns N above, or the number of times even/odd elements must be dropped
11063/// if there is such a number. Otherwise returns zero.
11064static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
11065 bool IsSingleInput) {
11066 // The modulus for the shuffle vector entries is based on whether this is
11067 // a single input or not.
11068 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11069 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11070 "We should only be called with masks with a power-of-2 size!");
11071
11072 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11073 int Offset = MatchEven ? 0 : 1;
11074
11075 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11076 // and 2^3 simultaneously. This is because we may have ambiguity with
11077 // partially undef inputs.
11078 bool ViableForN[3] = {true, true, true};
11079
11080 for (int i = 0, e = Mask.size(); i < e; ++i) {
11081 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11082 // want.
11083 if (Mask[i] < 0)
11084 continue;
11085
11086 bool IsAnyViable = false;
11087 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11088 if (ViableForN[j]) {
11089 uint64_t N = j + 1;
11090
11091 // The shuffle mask must be equal to (i * 2^N) % M.
11092 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
11093 IsAnyViable = true;
11094 else
11095 ViableForN[j] = false;
11096 }
11097 // Early exit if we exhaust the possible powers of two.
11098 if (!IsAnyViable)
11099 break;
11100 }
11101
11102 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11103 if (ViableForN[j])
11104 return j + 1;
11105
11106 // Return 0 as there is no viable power of two.
11107 return 0;
11108}
11109
11110// X86 has dedicated pack instructions that can handle specific truncation
11111// operations: PACKSS and PACKUS.
11112// Checks for compaction shuffle masks if MaxStages > 1.
11113// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 const SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 unsigned MaxStages = 1) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
         "Illegal maximum compaction");

  // Check whether (N1, N2) can be packed down to PackVT's element size,
  // setting V1/V2/SrcVT/PackOpcode on success.
  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
    unsigned NumPackedBits = NumSrcBits - BitSize;
    N1 = peekThroughBitcasts(N1);
    N2 = peekThroughBitcasts(N2);
    unsigned NumBits1 = N1.getScalarValueSizeInBits();
    unsigned NumBits2 = N2.getScalarValueSizeInBits();
    bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
    bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
    // Each input must already be at the pack's source element width (unless
    // it's undef or all-zero, which matches at any width).
    if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
        (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
      return false;
    // Prefer PACKUS: valid when the bits being packed away are known zero.
    // Only attempted with SSE41 or for 8-bit results - presumably because the
    // 32->16 PACKUSDW form needs SSE4.1; TODO confirm.
    if (Subtarget.hasSSE41() || BitSize == 8) {
      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
      if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
          (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
        V1 = N1;
        V2 = N2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    // Otherwise try PACKSS: valid when the packed-away bits are all copies of
    // the sign bit (i.e. enough known sign bits in each input).
    bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
    bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
    if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
         DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
        (N2.isUndef() || IsZero2 || IsAllOnes2 ||
         DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
      V1 = N1;
      V2 = N2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };

  // Attempt to match against wider and wider compaction patterns.
  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
    // Source type for NumStages of halving: elements 2^NumStages times wider.
    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

    // Try binary shuffle.
    SmallVector<int, 32> BinaryMask;
    createPackShuffleMask(VT, BinaryMask, false, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
      if (MatchPACK(V1, V2, PackVT))
        return true;

    // Try unary shuffle.
    SmallVector<int, 32> UnaryMask;
    createPackShuffleMask(VT, UnaryMask, true, NumStages);
    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
      if (MatchPACK(V1, V1, PackVT))
        return true;
  }

  return false;
}
11184
11186 SDValue V2, ArrayRef<int> Mask,
11187 const X86Subtarget &Subtarget,
11188 SelectionDAG &DAG) {
11189 MVT PackVT;
11190 unsigned PackOpcode;
11191 unsigned SizeBits = VT.getSizeInBits();
11192 unsigned EltBits = VT.getScalarSizeInBits();
11193 unsigned MaxStages = Log2_32(64 / EltBits);
11194 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11195 Subtarget, MaxStages))
11196 return SDValue();
11197
11198 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11199 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11200
11201 // Don't lower multi-stage packs on AVX512, truncation is better.
11202 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11203 return SDValue();
11204
11205 // Pack to the largest type possible:
11206 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11207 unsigned MaxPackBits = 16;
11208 if (CurrentEltBits > 16 &&
11209 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11210 MaxPackBits = 32;
11211
11212 // Repeatedly pack down to the target size.
11213 SDValue Res;
11214 for (unsigned i = 0; i != NumStages; ++i) {
11215 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11216 unsigned NumSrcElts = SizeBits / SrcEltBits;
11217 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11218 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11219 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11220 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11221 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11222 DAG.getBitcast(SrcVT, V2));
11223 V1 = V2 = Res;
11224 CurrentEltBits /= 2;
11225 }
11226 assert(Res && Res.getValueType() == VT &&
11227 "Failed to lower compaction shuffle");
11228 return Res;
11229}
11230
11231/// Try to emit a bitmask instruction for a shuffle.
11232///
11233/// This handles cases where we can model a blend exactly as a bitmask due to
11234/// one of the inputs being zeroable.
11236 SDValue V2, ArrayRef<int> Mask,
11237 const APInt &Zeroable,
11238 const X86Subtarget &Subtarget,
11239 SelectionDAG &DAG) {
11240 MVT MaskVT = VT;
11241 MVT EltVT = VT.getVectorElementType();
11242 SDValue Zero, AllOnes;
11243 // Use f64 if i64 isn't legal.
11244 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11245 EltVT = MVT::f64;
11246 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11247 }
11248
11249 MVT LogicVT = VT;
11250 if (EltVT.isFloatingPoint()) {
11251 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11252 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11253 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11254 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11255 } else {
11256 Zero = DAG.getConstant(0, DL, EltVT);
11257 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11258 }
11259
11260 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11261 SDValue V;
11262 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11263 if (Zeroable[i])
11264 continue;
11265 if (Mask[i] % Size != i)
11266 return SDValue(); // Not a blend.
11267 if (!V)
11268 V = Mask[i] < Size ? V1 : V2;
11269 else if (V != (Mask[i] < Size ? V1 : V2))
11270 return SDValue(); // Can only let one input through the mask.
11271
11272 VMaskOps[i] = AllOnes;
11273 }
11274 if (!V)
11275 return SDValue(); // No non-zeroable elements!
11276
11277 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11278 VMask = DAG.getBitcast(LogicVT, VMask);
11279 V = DAG.getBitcast(LogicVT, V);
11280 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11281 return DAG.getBitcast(VT, And);
11282}
11283
11284/// Try to emit a blend instruction for a shuffle using bit math.
11285///
11286/// This is used as a fallback approach when first class blend instructions are
11287/// unavailable. Currently it is only suitable for integer vectors, but could
11288/// be generalized for floating point vectors if desirable.
11290 SDValue V2, ArrayRef<int> Mask,
11291 SelectionDAG &DAG) {
11292 assert(VT.isInteger() && "Only supports integer vector types!");
11293 MVT EltVT = VT.getVectorElementType();
11294 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11295 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11297 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11298 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11299 return SDValue(); // Shuffled input!
11300 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11301 }
11302
11303 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11304 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11305}
11306
11308 SDValue PreservedSrc,
11309 const X86Subtarget &Subtarget,
11310 SelectionDAG &DAG);
11311
11314 const APInt &Zeroable, bool &ForceV1Zero,
11315 bool &ForceV2Zero, uint64_t &BlendMask) {
11316 bool V1IsZeroOrUndef =
11318 bool V2IsZeroOrUndef =
11320
11321 BlendMask = 0;
11322 ForceV1Zero = false, ForceV2Zero = false;
11323 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11324
11325 int NumElts = Mask.size();
11326 int NumLanes = VT.getSizeInBits() / 128;
11327 int NumEltsPerLane = NumElts / NumLanes;
11328 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11329
11330 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11331 // then ensure the blend mask part for that lane just references that input.
11332 bool ForceWholeLaneMasks =
11333 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11334
11335 // Attempt to generate the binary blend mask. If an input is zero then
11336 // we can use any lane.
11337 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11338 // Keep track of the inputs used per lane.
11339 bool LaneV1InUse = false;
11340 bool LaneV2InUse = false;
11341 uint64_t LaneBlendMask = 0;
11342 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11343 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11344 int M = Mask[Elt];
11345 if (M == SM_SentinelUndef)
11346 continue;
11347 if (M == Elt || (0 <= M && M < NumElts &&
11348 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11349 Mask[Elt] = Elt;
11350 LaneV1InUse = true;
11351 continue;
11352 }
11353 if (M == (Elt + NumElts) ||
11354 (NumElts <= M &&
11355 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11356 LaneBlendMask |= 1ull << LaneElt;
11357 Mask[Elt] = Elt + NumElts;
11358 LaneV2InUse = true;
11359 continue;
11360 }
11361 if (Zeroable[Elt]) {
11362 if (V1IsZeroOrUndef) {
11363 ForceV1Zero = true;
11364 Mask[Elt] = Elt;
11365 LaneV1InUse = true;
11366 continue;
11367 }
11368 if (V2IsZeroOrUndef) {
11369 ForceV2Zero = true;
11370 LaneBlendMask |= 1ull << LaneElt;
11371 Mask[Elt] = Elt + NumElts;
11372 LaneV2InUse = true;
11373 continue;
11374 }
11375 }
11376 return false;
11377 }
11378
11379 // If we only used V2 then splat the lane blend mask to avoid any demanded
11380 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11381 // blend mask bit).
11382 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11383 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11384
11385 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11386 }
11387 return true;
11388}
11389
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
/// Returns an empty SDValue when no blend-style lowering applies.
                                   SDValue V2, ArrayRef<int> Original,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // Immediate-style blend mask: bit i set selects element i from V2,
  // clear selects it from V1.
  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  // Work on a mutable copy - matchShuffleAsBlend may canonicalize elements.
  SmallVector<int, 64> Mask(Original);
  if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
                           BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  unsigned NumElts = VT.getVectorNumElements();

  // Wider types assert their extra feature requirements and then fall
  // through to share the narrower types' lowering.
  switch (VT.SimpleTy) {
  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    [[fallthrough]];
  case MVT::v4f64:
  case MVT::v8f32:
    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
    [[fallthrough]];
  case MVT::v2f64:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      // Rebuild the immediate from the per-lane repeated mask.
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
    }
    // Use PBLENDW for lower/upper lanes and then blend lanes.
    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
    // merge to VSELECT where useful.
    uint64_t LoMask = BlendMask & 0xFF;
    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    // Only worthwhile when one lane is a no-op blend (all-V1 or all-V2), so a
    // single PBLENDW plus a lane-crossing shuffle suffices.
    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
      return DAG.getVectorShuffle(
          MVT::v16i16, DL, Lo, Hi,
          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
    }
    [[fallthrough]];
  }
  case MVT::v32i8:
    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
    [[fallthrough]];
  case MVT::v16i8: {
    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                               Subtarget, DAG))
      return Masked;

    // With AVX512BW+VL, prefer a predicated (k-register masked) move.
    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      // Mask registers are at least 8 bits wide.
      MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // If we have VPTERNLOG, we can use that as a bit blend.
    if (Subtarget.hasVLX())
      if (SDValue BitBlend =
              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return BitBlend;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // x86 allows load folding with blendvb from the 2nd source operand. But
    // we are still using LLVM select here (see comment below), so that's V1.
    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
    // allow that load-folding possibility.
    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
      // NOTE(review): the listing appears to be missing a line here - upstream
      // also commutes the shuffle mask alongside the operands; verify against
      // the original source.
      std::swap(V1, V2);
    }

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0
                ? DAG.getUNDEF(MVT::i8)
                : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
    bool OptForSize = DAG.shouldOptForSize();
    if (!OptForSize) {
      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                                 Subtarget, DAG))
        return Masked;
    }

    // Otherwise load an immediate into a GPR, cast to k-register, and use a
    // masked move.
    MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
11551
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
/// If \p ImmBlends is set, only blends realizable as an immediate-form blend
/// are considered.
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG,
                                             bool ImmBlends = false) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  // BlendMask[i]: the source element that must occupy slot i after blending.
  // PermuteMask[i]: which blended slot feeds output element i.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    // Each blend slot can carry only one source element; conflicting demands
    // mean a blend cannot produce the required intermediate vector.
    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  // If only immediate blends, then bail if the blend mask can't be widened to
  // i16.
  unsigned EltSize = VT.getScalarSizeInBits();
  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
    return SDValue();

  // Blend first, then permute the blended result as a single-input shuffle.
  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
11590
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
                                            SDValue V1, SDValue V2,
                                            ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;

  // MatchLo/MatchHi track whether every element can come from UNPCKL/UNPCKH.
  bool MatchLo = true, MatchHi = true;
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;

    // Normalize the mask value depending on whether it's V1 or V2.
    int NormM = M;
    // Unpack interleaves: even output slots come from Ops[0], odd from Ops[1].
    SDValue &Op = Ops[Elt & 1];
    if (M < NumElts && (Op.isUndef() || Op == V1))
      Op = V1;
    else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
      Op = V2;
      NormM -= NumElts;
    } else
      return SDValue();

    // The element must come from the low or high half of *some* 128-bit lane.
    bool MatchLoAnyLane = false, MatchHiAnyLane = false;
    for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
      MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
      if (MatchLoAnyLane || MatchHiAnyLane) {
        assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
               "Failed to match UNPCKLO/UNPCKHI");
        break;
      }
    }
    MatchLo &= MatchLoAnyLane;
    MatchHi &= MatchHiAnyLane;
    if (!MatchLo && !MatchHi)
      return SDValue();
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Element indices have changed after unpacking. Calculate permute mask
  // so that they will be put back to the position as dictated by the
  // original shuffle mask indices.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;
    int NormM = M;
    if (NumElts <= M)
      NormM -= NumElts;
    bool IsFirstOp = M < NumElts;
    // After the unpack, source element NormM lands at an even slot of its
    // interleaved pair; +1 selects the second operand's copy.
    int BaseMaskElt =
        NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
    if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
      PermuteMask[Elt] = BaseMaskElt;
    else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
      PermuteMask[Elt] = BaseMaskElt + 1;
    assert(PermuteMask[Elt] != -1 &&
           "Input mask element is defined but failed to assign permute mask");
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
11669
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  // This routine only supports 128-bit integer dual input vectors.
  if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
    return SDValue();

  // Count how many mask elements reference the low vs high half of a source,
  // to choose between UNPCKL and UNPCKH.
  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  // Try to build pre-shuffles of V1/V2 so that a single unpack (at the given
  // scalar width) produces the requested result.
  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If we're shuffling with a zero vector then we're better off not doing
  // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
  // NOTE(review): the listing appears to be missing the guarding condition
  // here (upstream tests whether either input is an all-zeros build vector);
  // verify against the original source.
    return SDValue();

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    // Unpack first, then permute the interleaved result back into place.
    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL,
        DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
                    V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
11787
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // PALIGNR needs SSSE3 (128-bit), AVX2 (256-bit) or BWI (512-bit).
  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
      (VT.is512BitVector() && !Subtarget.hasBWI()))
    return SDValue();

  // We don't currently support lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  int Scale = VT.getScalarSizeInBits() / 8;
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = VT.getVectorNumElements();
  int NumEltsPerLane = NumElts / NumLanes;

  // Determine range of mask elts.
  // RangeN = [min,max] in-lane element offsets demanded from input N;
  // BlendN = true while input N is only ever used in its identity position.
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;
      if (M < NumElts) {
        Blend1 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range1.first = std::min(Range1.first, M);
        Range1.second = std::max(Range1.second, M);
      } else {
        M -= NumElts;
        Blend2 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range2.first = std::min(Range2.first, M);
        Range2.second = std::max(Range2.second, M);
      }
    }
  }

  // Bail if we don't need both elements.
  // TODO - it might be worth doing this for unary shuffles if the permute
  // can be widened.
  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
    return SDValue();

  // For 256/512-bit vectors a plain blend is cheaper when one input is
  // already in place, so leave those to other lowerings.
  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
    return SDValue();

  // Rotate the 2 ops so we can access both ranges, then permute the result.
  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue Rotate = DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
                        DAG.getBitcast(ByteVT, Lo),
                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        int M = Mask[Lane + Elt];
        if (M < 0)
          continue;
        // Map each source element to its post-rotation slot in this lane.
        if (M < NumElts)
          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
        else
          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
      }
    }
    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
  };

  // Check if the ranges are small enough to rotate from either direction.
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  return SDValue();
}
11873
  // A broadcast mask demands only element 0 (undef elements allowed).
  return isUndefOrEqual(Mask, 0);
}
11877
  // True if the mask is an identity shuffle or a splat of element 0.
  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
}
11881
/// Check if the Mask consists of the same element repeated multiple times.
/// Undef entries are tolerated as long as they make up no more than half the
/// mask.
  size_t NumUndefs = 0;
  std::optional<int> UniqueElt;
  for (int Elt : Mask) {
    if (Elt == SM_SentinelUndef) {
      NumUndefs++;
      continue;
    }
    // Two distinct defined elements disqualify the mask.
    if (UniqueElt.has_value() && UniqueElt.value() != Elt)
      return false;
    UniqueElt = Elt;
  }
  // Make sure the element is repeated enough times by checking the number of
  // undefs is small.
  return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
}
11899
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;

  // Shuffle the input elements into the desired positions in V1 and V2 and
  // unpack/blend them together.
  bool IsAlternating = true;
  bool V1Zero = true, V2Zero = true;
  SmallVector<int, 32> V1Mask(NumElts, -1);
  SmallVector<int, 32> V2Mask(NumElts, -1);
  SmallVector<int, 32> FinalMask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      // Element from V1: pre-shuffle it into slot i, then take it from V1.
      V1Mask[i] = M;
      FinalMask[i] = i;
      V1Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 0;
    } else if (M >= NumElts) {
      // Element from V2.
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
      V2Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 1;
    }
  }

  // If we effectively only demand the 0'th element of \p Input, and not only
  // as 0'th element, then broadcast said input,
  // and change \p InputMask to be a no-op (identity) mask.
  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
                                         &DAG](SDValue &Input,
                                               MutableArrayRef<int> InputMask) {
    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
    // Without AVX2, broadcasts are only profitable for >=32-bit foldable
    // loads on AVX.
    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
                                 !X86::mayFoldLoad(Input, Subtarget)))
      return;
    if (isNoopShuffleMask(InputMask))
      return;
    assert(isBroadcastShuffleMask(InputMask) &&
           "Expected to demand only the 0'th element.");
    // NOTE(review): the listing appears to be missing a line here - upstream
    // replaces Input with an X86ISD::VBROADCAST node at this point; verify
    // against the original source.
    for (auto I : enumerate(InputMask)) {
      int &InputMaskElt = I.value();
      if (InputMaskElt >= 0)
        InputMaskElt = I.index();
    }
  };

  // Currently, we may need to produce one shuffle per input, and blend results.
  // It is possible that the shuffle for one of the inputs is already a no-op.
  // See if we can simplify non-no-op shuffles into broadcasts,
  // which we consider to be strictly better than an arbitrary shuffle.
  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
      // NOTE(review): the listing appears to be missing the matching check of
      // V2Mask here - verify against the original source.
    canonicalizeBroadcastableInput(V1, V1Mask);
    canonicalizeBroadcastableInput(V2, V2Mask);
  }

  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
  // the shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
  // pre-shuffle first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
    // If we don't have blends, see if we can create a cheap unpack.
    if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
        (is128BitUnpackShuffleMask(V1Mask, DAG) ||
         is128BitUnpackShuffleMask(V2Mask, DAG)))
      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
              DL, VT, V1, V2, Mask, Subtarget, DAG))
        return PermUnpack;

    // Only prefer immediate blends to unpack/rotate.
    if (SDValue BlendPerm =
            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
      return BlendPerm;

    // If either input vector provides only a single element which is repeated
    // multiple times, unpacking from both input vectors would generate worse
    // code. e.g. for
    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
    // it is better to process t4 first to create a vector of t4[0], then unpack
    // that vector with t2.
    if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
        // NOTE(review): the listing appears to be missing the matching
        // V2Mask condition here - verify against the original source.
      if (SDValue UnpackPerm =
              lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
        return UnpackPerm;

    // NOTE(review): the listing appears to be missing the start of the
    // byte-rotate-and-permute attempt here - verify against the original.
            DL, VT, V1, V2, Mask, Subtarget, DAG))
      return RotatePerm;

    // Unpack/rotate failed - try again with variable blends.
    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
                                                          DAG))
      return BlendPerm;

    if (VT.getScalarSizeInBits() >= 32)
      if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
              DL, VT, V1, V2, Mask, Subtarget, DAG))
        return PermUnpack;
  }

  // If the final mask is an alternating blend of vXi8/vXi16, convert to an
  // UNPCKL(SHUFFLE, SHUFFLE) pattern.
  // TODO: It doesn't have to be alternating - but each lane mustn't have more
  // than half the elements coming from each source.
  if (IsAlternating && VT.getScalarSizeInBits() < 32) {
    // Rebuild the masks: pack each source's demanded elements into the low
    // half-lane slots so a single UNPCKL interleaves them.
    V1Mask.assign(NumElts, -1);
    V2Mask.assign(NumElts, -1);
    FinalMask.assign(NumElts, -1);
    for (int i = 0; i != NumElts; i += NumEltsPerLane)
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[i + j];
        if (M >= 0 && M < NumElts) {
          V1Mask[i + (j / 2)] = M;
          FinalMask[i + j] = i + (j / 2);
        } else if (M >= NumElts) {
          V2Mask[i + (j / 2)] = M - NumElts;
          FinalMask[i + j] = i + (j / 2) + NumElts;
        }
      }
  }

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}
12039
12040static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12041 const X86Subtarget &Subtarget,
12042 ArrayRef<int> Mask) {
12043 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12044 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12045
12046 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12047 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12048 int MaxSubElts = 64 / EltSizeInBits;
12049 unsigned RotateAmt, NumSubElts;
12050 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
12051 MaxSubElts, NumSubElts, RotateAmt))
12052 return -1;
12053 unsigned NumElts = Mask.size();
12054 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12055 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12056 return RotateAmt;
12057}
12058
/// Lower shuffle using X86ISD::VROTLI rotations.
                                       ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Only XOP + AVX512 targets have bit rotation instructions.
  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
  bool IsLegal =
      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
  if (!IsLegal && Subtarget.hasSSE3())
    return SDValue();

  MVT RotateVT;
  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
                                          Subtarget, Mask);
  if (RotateAmt < 0)
    return SDValue();

  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
  // expanded to OR(SRL,SHL), will be more efficient, but if they can
  // widen to vXi16 or more then existing lowering should will be better.
  if (!IsLegal) {
    // Multiples of 16 bits are better served by byte shifts / other lowerings.
    if ((RotateAmt % 16) == 0)
      return SDValue();
    // TODO: Use getTargetVShiftByConstNode.
    // Emulate the rotate as OR(SHL, SRL) on the widened type.
    unsigned ShlAmt = RotateAmt;
    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
    V1 = DAG.getBitcast(RotateVT, V1);
    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
    return DAG.getBitcast(VT, Rot);
  }

  SDValue Rot =
      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, Rot);
}
12100
/// Try to match a vector shuffle as an element rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
/// On success returns the rotation amount in elements and updates V1/V2 to
/// the (possibly reordered) low/high inputs of the rotation; returns -1 on
/// failure.
                                       ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15, 0, 1, 2]
  //   [-1, 12, 13, 14, -1, -1, 1, -1]
  //   [-1, -1, -1, -1, -1, -1, 1, 2]
  //   [ 3, 4, 5, 6, 7, 8, 9, 10]
  //   [-1, 4, 5, 6, -1, -1, 9, -1]
  //   [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    // Every defined element must agree on a single rotation amount.
    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // A single-input rotation: duplicate it into both slots.
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  // Hand the normalized inputs back to the caller.
  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
12172
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
/// Returns the rotation amount in bytes, or -1 if no rotation matches.
                                    ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (isAnyZero(Mask))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
12210
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  // Lowering for matchShuffleAsByteRotate: emit PALIGNR where available,
  // otherwise a PSLLDQ/PSRLDQ/POR sequence on SSE2.
  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  // Emulate the rotate: OR together the shifted halves.
  SDValue LoShift =
      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                  DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift =
      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                  DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
12257
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; This routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
  if (0 < Rotation)
    return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                       DAG.getTargetConstant(Rotation, DL, MVT::i8));

  // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
  // TODO: Pull this out as a matchShuffleAsElementShift helper?
  // TODO: We can probably make this more aggressive and use shift-pairs like
  // lowerShuffleAsByteShiftMask.
  unsigned NumElts = Mask.size();
  unsigned ZeroLo = Zeroable.countr_one();
  unsigned ZeroHi = Zeroable.countl_one();
  assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  // Zeros at the low end: VALIGN the source against a zero vector.
  if (ZeroLo) {
    SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
    int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
    if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
      return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
                         getZeroVector(VT, Subtarget, DAG, DL),
                         DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
  }

  // Zeros at the high end: VALIGN a zero vector against the source.
  if (ZeroHi) {
    SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
    int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
    if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
      return DAG.getNode(X86ISD::VALIGN, DL, VT,
                         getZeroVector(VT, Subtarget, DAG, DL), Src,
                         DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
  }

  return SDValue();
}
12317
/// Try to lower a vector shuffle as a byte shift sequence.
///
/// Matches shuffles where a single sequential run of elements from one source
/// is bounded by zeroable elements at one or both ends, and materializes the
/// zeros with VSHLDQ/VSRLDQ byte shifts rather than a masking operation.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  unsigned ZeroLo = Zeroable.countr_one();
  unsigned ZeroHi = Zeroable.countl_one();
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  unsigned Scale = VT.getScalarSizeInBits() / 8;
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  // The inner run must come entirely from a single source vector.
  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
    return SDValue();

  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
  Res = DAG.getBitcast(MVT::v16i8, Res);

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
  } else if (ZeroHi == 0) {
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
    // by performing 3 byte shifts. Shuffle combining can kick in above that.
    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else
    return SDValue();  // Zeros at both ends with PSHUFB available: defer.

  return DAG.getBitcast(VT, Res);
}
12382
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz,  0, zz,  2 ]
/// [ -1,  4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5,  6,  7, zz, zz, zz, zz, zz]
/// [ -1,  5,  6,  7, zz, zz, zz, zz]
/// [  1,  2, -1, -1, -1, -1, zz, zz]
///
/// On success sets \p ShiftVT / \p Opcode and returns the shift amount
/// (in bits for VSHLI/VSRLI, in bytes for VSHLDQ/VSRLDQ); returns -1 on
/// failure.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  // Check that all elements that would be shifted in are zeroable.
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  // Check that the kept elements line up with a shift by Shift sub-elements
  // within each group of Scale elements; returns the shift amount or -1.
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    // Shift elements wider than 64 bits are handled as whole-lane byte shifts.
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
12467
/// Try to lower a vector shuffle as a bit/byte shift that shifts in zeros.
///
/// Attempts a shift matching V1 first, then V2. If \p BitwiseOnly is set,
/// whole-lane byte shifts (VSHLDQ/VSRLDQ) are rejected.
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, bool BitwiseOnly) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                     Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                   Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  // Round-trip through the shift's element type.
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
12504
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
// On success writes the bit length/index immediates and updates V1 to the
// matched source.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                ArrayRef<int> Mask, uint64_t &BitLen,
                                uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    // Record source/base index on first defined element; all later elements
    // must agree with the same source and stride.
    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  // The bit length/index immediates are 6-bit fields.
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
12559
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
// On success writes the bit length/index immediates and updates V1/V2 to the
// matched base/insert sources.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                  ArrayRef<int> Mask, uint64_t &BitLen,
                                  uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Try every candidate insertion position in the lower half.
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      // The bit length/index immediates are 6-bit fields.
      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
12627
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable, SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  // EXTRQI: extract BitLen bits starting at BitIdx from the matched source.
  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  // INSERTQI: insert BitLen bits of the second source into the first at
  // BitIdx. The matcher may leave either source unset; use UNDEF then.
  if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getTargetConstant(BitLen, DL, MVT::i8),
                       DAG.getTargetConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
12646
/// Lower a vector shuffle as an any/signed/zero extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce an extension based on the available features of the
/// subtarget. The extended elements are consecutive and can start from an
/// offsetted element index in the input; to avoid excess shuffling the
/// offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must be from the same lane.
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
                                               int Scale, int Offset,
                                               unsigned ExtOpc, SDValue InputV,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid a/zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = DAG.getBitcast(VT, InputV);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
  InputV = DAG.getBitcast(VT, InputV);
  bool AnyExt = ExtOpc == ISD::ANY_EXTEND;

  // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
  if (ExtOpc == ISD::SIGN_EXTEND)
    return SDValue();

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      // Base elements map to their source byte; extended bytes become undef
      // (any-extend) or get zeroed via the 0x80 PSHUFB sentinel.
      if ((i % Scale == 0 && SafeOffset(Idx))) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    // Interleave with undef (any-extend) or zeros (zero-extend), doubling
    // the element width each step.
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
12815
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
    return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
                                           InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
12944
/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
/// Returns an empty SDValue if no same-width scalar is available.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
12971
/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  return V.hasOneUse() &&
         ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
12980
12981template<typename T>
12982static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12983 T EltVT = VT.getScalarType();
12984 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12985 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12986}
12987
12988/// Try to lower insertion of a single element into a zero vector.
12989///
12990/// This is a common pattern that we have especially efficient patterns to lower
12991/// across all subtarget feature sets.
12993 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12994 const APInt &Zeroable, const X86Subtarget &Subtarget,
12995 SelectionDAG &DAG) {
12996 MVT ExtVT = VT;
12997 MVT EltVT = VT.getVectorElementType();
12998 unsigned NumElts = VT.getVectorNumElements();
12999 unsigned EltBits = VT.getScalarSizeInBits();
13000
// f16 elements without native FP16 support are lowered elsewhere.
13001 if (isSoftF16(EltVT, Subtarget))
13002 return SDValue();
13003
// Locate the single lane sourced from V2 (a mask index >= Mask.size()).
13004 int V2Index =
13005 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13006 Mask.begin();
13007 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
// V1 counts as "zeroable" if every lane other than the inserted one is
// known to be zero (or undef) per the Zeroable mask.
13008 bool IsV1Zeroable = true;
13009 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13010 if (i != V2Index && !Zeroable[i]) {
13011 IsV1Zeroable = false;
13012 break;
13013 }
13014
13015 // Bail if a non-zero V1 isn't used in place.
13016 if (!IsV1Zeroable) {
13017 SmallVector<int, 8> V1Mask(Mask);
13018 V1Mask[V2Index] = -1;
13019 if (!isNoopShuffleMask(V1Mask))
13020 return SDValue();
13021 }
13022
13023 // Check for a single input from a SCALAR_TO_VECTOR node.
13024 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13025 // all the smarts here sunk into that routine. However, the current
13026 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13027 // vector shuffle lowering is dead.
13028 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13029 DAG);
13030 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13031 // We need to zext the scalar if it is smaller than an i32.
13032 V2S = DAG.getBitcast(EltVT, V2S);
13033 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13034 // Using zext to expand a narrow element won't work for non-zero
13035 // insertions. But we can use a masked constant vector if we're
13036 // inserting V2 into the bottom of V1.
13037 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
13038 return SDValue();
13039
13040 // Zero-extend directly to i32.
13041 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13042 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13043
13044 // If we're inserting into a constant, mask off the inserted index
13045 // and OR with the zero-extended scalar.
13046 if (!IsV1Zeroable) {
13047 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
13048 Bits[V2Index] = APInt::getZero(EltBits);
13049 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
13050 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
13051 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13052 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
13053 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
13054 }
13055 }
13056 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13057 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13058 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
13059 // Either not inserting from the low element of the input or the input
13060 // element size is too small to use VZEXT_MOVL to clear the high bits.
13061 return SDValue();
13062 }
13063
13064 if (!IsV1Zeroable) {
13065 // If V1 can't be treated as a zero vector we have fewer options to lower
13066 // this. We can't support integer vectors or non-zero targets cheaply.
13067 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13068 if (!VT.isFloatingPoint() || V2Index != 0)
13069 return SDValue();
13070 if (!VT.is128BitVector())
13071 return SDValue();
13072
13073 // Otherwise, use MOVSD, MOVSS or MOVSH.
13074 unsigned MovOpc = 0;
13075 if (EltVT == MVT::f16)
13076 MovOpc = X86ISD::MOVSH;
13077 else if (EltVT == MVT::f32)
13078 MovOpc = X86ISD::MOVSS;
13079 else if (EltVT == MVT::f64)
13080 MovOpc = X86ISD::MOVSD;
13081 else
13082 llvm_unreachable("Unsupported floating point element type to handle!");
13083 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13084 }
13085
13086 // This lowering only works for the low element with floating point vectors.
13087 if (VT.isFloatingPoint() && V2Index != 0)
13088 return SDValue();
13089
// Move the scalar into lane 0 of ExtVT, zeroing the remaining lanes.
13090 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13091 if (ExtVT != VT)
13092 V2 = DAG.getBitcast(VT, V2);
13093
13094 if (V2Index != 0) {
13095 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13096 // the desired position. Otherwise it is more efficient to do a vector
13097 // shift left. We know that we can do a vector shift left because all
13098 // the inputs are zero.
13099 if (VT.isFloatingPoint() || NumElts <= 4) {
13100 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13101 V2Shuffle[V2Index] = 0;
13102 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13103 } else {
13104 V2 = DAG.getBitcast(MVT::v16i8, V2);
13105 V2 = DAG.getNode(
13106 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13107 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
13108 V2 = DAG.getBitcast(VT, V2);
13109 }
13110 }
13111 return V2;
13112}
13113
13114/// Try to lower broadcast of a single - truncated - integer element,
13115/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13116///
13117/// This assumes we have AVX2.
13119 int BroadcastIdx,
13120 const X86Subtarget &Subtarget,
13121 SelectionDAG &DAG) {
13122 assert(Subtarget.hasAVX2() &&
13123 "We can only lower integer broadcasts with AVX2!");
13124
13125 MVT EltVT = VT.getVectorElementType();
13126 MVT V0VT = V0.getSimpleValueType();
13127
13128 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13129 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13130
13131 MVT V0EltVT = V0VT.getVectorElementType();
13132 if (!V0EltVT.isInteger())
13133 return SDValue();
13134
13135 const unsigned EltSize = EltVT.getSizeInBits();
13136 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13137
13138 // This is only a truncation if the original element type is larger.
13139 if (V0EltSize <= EltSize)
13140 return SDValue();
13141
13142 assert(((V0EltSize % EltSize) == 0) &&
13143 "Scalar type sizes must all be powers of 2 on x86!");
13144
// Scale = number of narrow result elements packed into one wide source
// element; V0BroadcastIdx selects the wide element that holds the bits we
// want to broadcast.
13145 const unsigned V0Opc = V0.getOpcode();
13146 const unsigned Scale = V0EltSize / EltSize;
13147 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13148
// SCALAR_TO_VECTOR only defines element 0, so the source index must be 0;
// BUILD_VECTOR provides an operand for every element.
13149 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13150 V0Opc != ISD::BUILD_VECTOR)
13151 return SDValue();
13152
13153 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13154
13155 // If we're extracting non-least-significant bits, shift so we can truncate.
13156 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13157 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13158 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13159 if (const int OffsetIdx = BroadcastIdx % Scale)
13160 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13161 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13162
13163 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13164 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13165}
13166
13167/// Test whether this can be lowered with a single SHUFPS instruction.
13168///
13169/// This is used to disable more specialized lowerings when the shufps lowering
13170/// will happen to be efficient.
13172 // This routine only handles 128-bit shufps.
13173 assert(Mask.size() == 4 && "Unsupported mask size!");
13174 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13175 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13176 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13177 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13178
13179 // To lower with a single SHUFPS we need to have the low half and high half
13180 // each requiring a single input.
13181 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13182 return false;
13183 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13184 return false;
13185
// Each half draws from at most one source, so a single SHUFPS immediate
// can realize the mask.
13186 return true;
13187}
13188
13189/// Test whether the specified input (0 or 1) is in-place blended by the
13190/// given mask.
13191///
13192/// This returns true if the elements from a particular input are already in the
13193/// slot required by the given mask and require no permutation.
13195 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13196 int Size = Mask.size();
13197 for (int i = 0; i < Size; ++i)
// Any lane taken from this input must already sit in its own slot
// (i.e. reference element i of that input).
13198 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13199 return false;
13200
13201 return true;
13202}
13203
13204/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
13205/// the given mask.
13206///
/// Returns true if every in-range lane taken from \p Input reads element
/// \p BroadcastableElement, so that input could be supplied as a broadcast
/// of that single element.
13208 int BroadcastableElement = 0) {
13209 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13210 int Size = Mask.size();
13211 for (int i = 0; i < Size; ++i)
13212 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
13213 Mask[i] % Size != BroadcastableElement)
13214 return false;
13215 return true;
13216}
13217
13218/// If we are extracting two 128-bit halves of a vector and shuffling the
13219/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13220/// multi-shuffle lowering.
13222 SDValue N1, ArrayRef<int> Mask,
13223 SelectionDAG &DAG) {
13224 MVT VT = N0.getSimpleValueType();
13225 assert((VT.is128BitVector() &&
13226 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13227 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13228
13229 // Check that both sources are extracts of the same source vector.
13230 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13232 N0.getOperand(0) != N1.getOperand(0) ||
13233 !N0.hasOneUse() || !N1.hasOneUse())
13234 return SDValue();
13235
13236 SDValue WideVec = N0.getOperand(0);
13237 MVT WideVT = WideVec.getSimpleValueType();
13238 if (!WideVT.is256BitVector())
13239 return SDValue();
13240
13241 // Match extracts of each half of the wide source vector. Commute the shuffle
13242 // if the extract of the low half is N1.
// One operand must extract at element 0 (the low 128 bits) and the other
// at element NumElts (the high 128 bits).
13243 unsigned NumElts = VT.getVectorNumElements();
13244 SmallVector<int, 4> NewMask(Mask);
13245 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13246 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13247 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13249 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13250 return SDValue();
13251
13252 // Final bailout: if the mask is simple, we are better off using an extract
13253 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13254 // because that avoids a constant load from memory.
13255 if (NumElts == 4 &&
13256 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13257 return SDValue();
13258
13259 // Extend the shuffle mask with undef elements.
13260 NewMask.append(NumElts, -1);
13261
13262 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13263 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13264 NewMask);
13265 // This is free: ymm -> xmm.
13266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13267 DAG.getVectorIdxConstant(0, DL));
13268}
13269
13270/// Try to lower broadcast of a single element.
13271///
13272/// For convenience, this code also bundles all of the subtarget feature set
13273/// filtering. While a little annoying to re-dispatch on type here, there isn't
13274/// a convenient way to factor it out.
13276 SDValue V2, ArrayRef<int> Mask,
13277 const X86Subtarget &Subtarget,
13278 SelectionDAG &DAG) {
// Subtarget gating: MOVDDUP (SSE3) handles v2f64; AVX adds f32/f64
// broadcasts; AVX2 adds integer and f16 broadcasts.
13279 MVT EltVT = VT.getVectorElementType();
13280 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13281 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13282 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13283 return SDValue();
13284
13285 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13286 // we can only broadcast from a register with AVX2.
13287 unsigned NumEltBits = VT.getScalarSizeInBits();
13288 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13291 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13292
13293 // Check that the mask is a broadcast.
13294 int BroadcastIdx = getSplatIndex(Mask);
13295 if (BroadcastIdx < 0) {
13296 // Check for hidden broadcast.
13297 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13298 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13299 return SDValue();
13300 BroadcastIdx = 0;
13301 }
13302 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13303 "a sorted mask where the broadcast "
13304 "comes from V1.");
// Number of defined (non-undef) lanes the broadcast must populate.
13305 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13306
13307 // Go up the chain of (vector) values to find a scalar load that we can
13308 // combine with the broadcast.
13309 // TODO: Combine this logic with findEltLoadSrc() used by
13310 // EltsFromConsecutiveLoads().
// BitOffset tracks the position of the broadcast element in bits while we
// peel bitcasts/concats/inserts, since element widths may change.
13311 int BitOffset = BroadcastIdx * NumEltBits;
13312 SDValue V = V1;
13313 for (;;) {
13314 switch (V.getOpcode()) {
13315 case ISD::BITCAST: {
13316 V = V.getOperand(0);
13317 continue;
13318 }
13319 case ISD::CONCAT_VECTORS: {
13320 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13321 int OpIdx = BitOffset / OpBitWidth;
13322 V = V.getOperand(OpIdx);
13323 BitOffset %= OpBitWidth;
13324 continue;
13325 }
13327 // The extraction index adds to the existing offset.
13328 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13329 unsigned Idx = V.getConstantOperandVal(1);
13330 unsigned BeginOffset = Idx * EltBitWidth;
13331 BitOffset += BeginOffset;
13332 V = V.getOperand(0);
13333 continue;
13334 }
13335 case ISD::INSERT_SUBVECTOR: {
// Step into the inserted subvector only if it covers the broadcast bit
// range; otherwise continue through the outer vector.
13336 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13337 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13338 int Idx = (int)V.getConstantOperandVal(2);
13339 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13340 int BeginOffset = Idx * EltBitWidth;
13341 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13342 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13343 BitOffset -= BeginOffset;
13344 V = VInner;
13345 } else {
13346 V = VOuter;
13347 }
13348 continue;
13349 }
13350 }
13351 break;
13352 }
13353 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13354 BroadcastIdx = BitOffset / NumEltBits;
13355
13356 // Do we need to bitcast the source to retrieve the original broadcast index?
13357 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13358
13359 // Check if this is a broadcast of a scalar. We special case lowering
13360 // for scalars so that we can more effectively fold with loads.
13361 // If the original value has a larger element type than the shuffle, the
13362 // broadcast element is in essence truncated. Make that explicit to ease
13363 // folding.
13364 if (BitCastSrc && VT.isInteger())
13365 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13366 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13367 return TruncBroadcast;
13368
13369 // Also check the simpler case, where we can directly reuse the scalar.
13370 if (!BitCastSrc &&
13371 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13372 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13373 V = V.getOperand(BroadcastIdx);
13374
13375 // If we can't broadcast from a register, check that the input is a load.
13376 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13377 return SDValue();
13378 } else if (ISD::isNormalLoad(V.getNode()) &&
13379 cast<LoadSDNode>(V)->isSimple()) {
13380 // We do not check for one-use of the vector load because a broadcast load
13381 // is expected to be a win for code size, register pressure, and possibly
13382 // uops even if the original vector load is not eliminated.
13383
13384 // Reduce the vector load and shuffle to a broadcasted scalar load.
13385 auto *Ld = cast<LoadSDNode>(V);
13386 SDValue BaseAddr = Ld->getBasePtr();
13387 MVT SVT = VT.getScalarType();
13388 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13389 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13390 SDValue NewAddr =
13392
13393 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13394 // than MOVDDUP.
13395 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13396 if (Opcode == X86ISD::VBROADCAST) {
13397 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13398 SDValue Ops[] = {Ld->getChain(), NewAddr};
13399 V = DAG.getMemIntrinsicNode(
13400 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13402 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13404 return DAG.getBitcast(VT, V);
13405 }
13406 assert(SVT == MVT::f64 && "Unexpected VT!");
13407 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13409 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13411 } else if (!BroadcastFromReg) {
13412 // We can't broadcast from a vector register.
13413 return SDValue();
13414 } else if (BitOffset != 0) {
13415 // We can only broadcast from the zero-element of a vector register,
13416 // but it can be advantageous to broadcast from the zero-element of a
13417 // subvector.
13418 if (!VT.is256BitVector() && !VT.is512BitVector())
13419 return SDValue();
13420
13421 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13422 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13423 return SDValue();
13424
13425 // If we are broadcasting an element from the lowest 128-bit subvector, try
13426 // to move the element in position.
13427 if (BitOffset < 128 && NumActiveElts > 1 &&
13428 V.getScalarValueSizeInBits() == NumEltBits) {
13429 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13430 "Unexpected bit-offset");
13431 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13432 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13433 V = extractSubVector(V, 0, DAG, DL, 128);
13434 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13435 } else {
13436 // Only broadcast the zero-element of a 128-bit subvector.
13437 if ((BitOffset % 128) != 0)
13438 return SDValue();
13439
13440 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13441 "Unexpected bit-offset");
13442 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13443 "Unexpected vector size");
13444 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13445 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13446 }
13447 }
13448
13449 // On AVX we can use VBROADCAST directly for scalar sources.
13450 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13451 V = DAG.getBitcast(MVT::f64, V);
13452 if (Subtarget.hasAVX()) {
13453 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13454 return DAG.getBitcast(VT, V);
13455 }
13456 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13457 }
13458
13459 // If this is a scalar, do the broadcast on this type and bitcast.
13460 if (!V.getValueType().isVector()) {
13461 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13462 "Unexpected scalar size");
13463 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13465 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13466 }
13467
13468 // We only support broadcasting from 128-bit vectors to minimize the
13469 // number of patterns we need to deal with in isel. So extract down to
13470 // 128-bits, removing as many bitcasts as possible.
13471 if (V.getValueSizeInBits() > 128)
13473
13474 // Otherwise cast V to a vector with the same element type as VT, but
13475 // possibly narrower than VT. Then perform the broadcast.
13476 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13477 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13478 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13479}
13480
13481// Check for whether we can use INSERTPS to perform the shuffle. We only use
13482// INSERTPS when the V1 elements are already in the correct locations
13483// because otherwise we can just always use two SHUFPS instructions which
13484// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13485// perform INSERTPS if a single V1 element is out of place and all V2
13486// elements are zeroable.
13488 unsigned &InsertPSMask,
13489 const APInt &Zeroable,
13490 ArrayRef<int> Mask, SelectionDAG &DAG) {
13491 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13492 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13493 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13494
13495 // Attempt to match INSERTPS with one element from VA or VB being
13496 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13497 // are updated.
13498 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13499 ArrayRef<int> CandidateMask) {
13500 unsigned ZMask = 0;
13501 int VADstIndex = -1;
13502 int VBDstIndex = -1;
13503 bool VAUsedInPlace = false;
13504
13505 for (int i = 0; i < 4; ++i) {
13506 // Synthesize a zero mask from the zeroable elements (includes undefs).
13507 if (Zeroable[i]) {
13508 ZMask |= 1 << i;
13509 continue;
13510 }
13511
13512 // Flag if we use any VA inputs in place.
13513 if (i == CandidateMask[i]) {
13514 VAUsedInPlace = true;
13515 continue;
13516 }
13517
13518 // We can only insert a single non-zeroable element.
13519 if (VADstIndex >= 0 || VBDstIndex >= 0)
13520 return false;
13521
13522 if (CandidateMask[i] < 4) {
13523 // VA input out of place for insertion.
13524 VADstIndex = i;
13525 } else {
13526 // VB input for insertion.
13527 VBDstIndex = i;
13528 }
13529 }
13530
13531 // Don't bother if we have no (non-zeroable) element for insertion.
13532 if (VADstIndex < 0 && VBDstIndex < 0)
13533 return false;
13534
13535 // Determine element insertion src/dst indices. The src index is from the
13536 // start of the inserted vector, not the start of the concatenated vector.
13537 unsigned VBSrcIndex = 0;
13538 if (VADstIndex >= 0) {
13539 // If we have a VA input out of place, we use VA as the V2 element
13540 // insertion and don't use the original V2 at all.
13541 VBSrcIndex = CandidateMask[VADstIndex];
13542 VBDstIndex = VADstIndex;
13543 VB = VA;
13544 } else {
13545 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13546 }
13547
13548 // If no V1 inputs are used in place, then the result is created only from
13549 // the zero mask and the V2 insertion - so remove V1 dependency.
13550 if (!VAUsedInPlace)
13551 VA = DAG.getUNDEF(MVT::v4f32);
13552
13553 // Update V1, V2 and InsertPSMask accordingly.
13554 V1 = VA;
13555 V2 = VB;
13556
13557 // Insert the V2 element into the desired position.
// INSERTPS immediate encoding: bits [7:6] = source element, bits [5:4] =
// destination element, bits [3:0] = zero mask.
13558 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13559 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13560 return true;
13561 };
13562
13563 if (matchAsInsertPS(V1, V2, Mask))
13564 return true;
13565
13566 // Commute and try again.
13567 SmallVector<int, 4> CommutedMask(Mask);
13569 if (matchAsInsertPS(V2, V1, CommutedMask))
13570 return true;
13571
13572 return false;
13573}
13574
// Lower a v4f32 shuffle via a single INSERTPS if the mask matches; returns
// an empty SDValue otherwise.
13576 ArrayRef<int> Mask, const APInt &Zeroable,
13577 SelectionDAG &DAG) {
13578 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13579 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13580
13581 // Attempt to match the insertps pattern.
// matchShuffleAsInsertPS may commute/rewrite V1 and V2 and fills in the
// 8-bit INSERTPS immediate.
13582 unsigned InsertPSMask = 0;
13583 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13584 return SDValue();
13585
13586 // Insert the V2 element into the desired position.
13587 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13588 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13589}
13590
13591/// Handle lowering of 2-lane 64-bit floating point shuffles.
13592///
13593/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13594/// support for floating point shuffles but not integer shuffles. These
13595/// instructions will incur a domain crossing penalty on some chips though so
13596/// it is better to avoid lowering through this for integer vectors where
13597/// possible.
13599 const APInt &Zeroable, SDValue V1, SDValue V2,
13600 const X86Subtarget &Subtarget,
13601 SelectionDAG &DAG) {
13602 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13603 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13604 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13605
13606 if (V2.isUndef()) {
13607 // Check for being able to broadcast a single element.
13608 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13609 Mask, Subtarget, DAG))
13610 return Broadcast;
13611
13612 // Straight shuffle of a single input vector. Simulate this by using the
13613 // single input as both of the "inputs" to this instruction..
// Immediate bit i selects the high (1) or low (0) source element for
// result lane i.
13614 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13615
13616 if (Subtarget.hasAVX()) {
13617 // If we have AVX, we can use VPERMILPS which will allow folding a load
13618 // into the shuffle.
13619 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13620 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13621 }
13622
13623 return DAG.getNode(
13624 X86ISD::SHUFP, DL, MVT::v2f64,
13625 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13626 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13627 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13628 }
13629 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13630 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13631 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13632 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13633
13634 if (Subtarget.hasAVX2())
13635 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13636 return Extract;
13637
13638 // When loading a scalar and then shuffling it into a vector we can often do
13639 // the insertion cheaply.
13641 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13642 return Insertion;
13643 // Try inverting the insertion since for v2 masks it is easy to do and we
13644 // can't reliably sort the mask one way or the other.
// XOR with 2 flips which source (V1/V2) each in-range mask element refers
// to, matching the swapped operand order below.
13645 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13646 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13648 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13649 return Insertion;
13650
13651 // Try to use one of the special instruction patterns to handle two common
13652 // blend patterns if a zero-blend above didn't work.
13653 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13654 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13655 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13656 // We can either use a special instruction to load over the low double or
13657 // to move just the low double.
13658 return DAG.getNode(
13659 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13660 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13661
13662 if (Subtarget.hasSSE41())
13663 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13664 Zeroable, Subtarget, DAG))
13665 return Blend;
13666
13667 // Use dedicated unpack instructions for masks that match their pattern.
13668 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13669 return V;
13670
// Fallback: a two-input SHUFPD; lane 0 comes from V1, lane 1 from V2.
13671 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13672 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13673 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13674}
13675
13676/// Handle lowering of 2-lane 64-bit integer shuffles.
13677///
13678/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13679/// the integer unit to minimize domain crossing penalties. However, for blends
13680/// it falls back to the floating point shuffle operation with appropriate bit
13681/// casting.
13683 const APInt &Zeroable, SDValue V1, SDValue V2,
13684 const X86Subtarget &Subtarget,
13685 SelectionDAG &DAG) {
13686 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13687 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13688 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13689
13690 if (V2.isUndef()) {
13691 // Check for being able to broadcast a single element.
13692 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13693 Mask, Subtarget, DAG))
13694 return Broadcast;
13695
13696 // Straight shuffle of a single input vector. For everything from SSE2
13697 // onward this has a single fast instruction with no scary immediates.
13698 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13699 V1 = DAG.getBitcast(MVT::v4i32, V1);
// Each v2i64 lane expands to a pair of adjacent v4i32 lanes (2*M, 2*M+1).
13700 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13701 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13702 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13703 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13704 return DAG.getBitcast(
13705 MVT::v2i64,
13706 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13707 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13708 }
13709 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13710 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13711 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13712 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13713
13714 if (Subtarget.hasAVX2())
13715 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13716 return Extract;
13717
13718 // Try to use shift instructions.
13719 if (SDValue Shift =
13720 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13721 DAG, /*BitwiseOnly*/ false))
13722 return Shift;
13723
13724 // When loading a scalar and then shuffling it into a vector we can often do
13725 // the insertion cheaply.
13727 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13728 return Insertion;
13729 // Try inverting the insertion since for v2 masks it is easy to do and we
13730 // can't reliably sort the mask one way or the other.
// XOR with 2 flips which source each mask element refers to, matching the
// swapped V2/V1 operand order below.
13731 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13733 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13734 return Insertion;
13735
13736 // We have different paths for blend lowering, but they all must use the
13737 // *exact* same predicate.
13738 bool IsBlendSupported = Subtarget.hasSSE41();
13739 if (IsBlendSupported)
13740 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13741 Zeroable, Subtarget, DAG))
13742 return Blend;
13743
13744 // Use dedicated unpack instructions for masks that match their pattern.
13745 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13746 return V;
13747
13748 // Try to use byte rotation instructions.
13749 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
13750 if (Subtarget.hasSSSE3()) {
13751 if (Subtarget.hasVLX())
13752 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13753 Zeroable, Subtarget, DAG))
13754 return Rotate;
13755
13756 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13757 Subtarget, DAG))
13758 return Rotate;
13759 }
13760
13761 // If we have direct support for blends, we should lower by decomposing into
13762 // a permute. That will be faster than the domain cross.
13763 if (IsBlendSupported)
13764 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13765 Zeroable, Subtarget, DAG);
13766
13767 // We implement this with SHUFPD which is pretty lame because it will likely
13768 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13769 // However, all the alternatives are still more cycles and newer chips don't
13770 // have this problem. It would be really nice if x86 had better shuffles here.
13771 V1 = DAG.getBitcast(MVT::v2f64, V1);
13772 V2 = DAG.getBitcast(MVT::v2f64, V2);
13773 return DAG.getBitcast(MVT::v2i64,
13774 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13775}
13776
13777/// Lower a vector shuffle using the SHUFPS instruction.
13778///
13779/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13780/// It makes no assumptions about whether this is the *best* lowering, it simply
13781/// uses it.
13783 ArrayRef<int> Mask, SDValue V1,
13784 SDValue V2, SelectionDAG &DAG) {
// In the final SHUFPS, LowV supplies result lanes 0-1 and HighV supplies
// result lanes 2-3 (per the SHUFPS operand semantics).
13785 SDValue LowV = V1, HighV = V2;
13786 SmallVector<int, 4> NewMask(Mask);
13787 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13788
13789 if (NumV2Elements == 1) {
13790 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13791
13792 // Compute the index adjacent to V2Index and in the same half by toggling
13793 // the low bit.
13794 int V2AdjIndex = V2Index ^ 1;
13795
13796 if (Mask[V2AdjIndex] < 0) {
13797 // Handles all the cases where we have a single V2 element and an undef.
13798 // This will only ever happen in the high lanes because we commute the
13799 // vector otherwise.
13800 if (V2Index < 2)
13801 std::swap(LowV, HighV);
13802 NewMask[V2Index] -= 4;
13803 } else {
13804 // Handle the case where the V2 element ends up adjacent to a V1 element.
13805 // To make this work, blend them together as the first step.
13806 int V1Index = V2AdjIndex;
13807 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13808 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13809 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13810
13811 // Now proceed to reconstruct the final blend as we have the necessary
13812 // high or low half formed.
13813 if (V2Index < 2) {
13814 LowV = V2;
13815 HighV = V1;
13816 } else {
13817 HighV = V2;
13818 }
// After the blend, the V1 element sits in V2[2] and the V2 element in
// V2[0]; point the final mask at those slots.
13819 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13820 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13821 }
13822 } else if (NumV2Elements == 2) {
13823 if (Mask[0] < 4 && Mask[1] < 4) {
13824 // Handle the easy case where we have V1 in the low lanes and V2 in the
13825 // high lanes.
13826 NewMask[2] -= 4;
13827 NewMask[3] -= 4;
13828 } else if (Mask[2] < 4 && Mask[3] < 4) {
13829 // We also handle the reversed case because this utility may get called
13830 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13831 // arrange things in the right direction.
13832 NewMask[0] -= 4;
13833 NewMask[1] -= 4;
13834 HighV = V1;
13835 LowV = V2;
13836 } else {
13837 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13838 // trying to place elements directly, just blend them and set up the final
13839 // shuffle to place them.
13840
13841 // The first two blend mask elements are for V1, the second two are for
13842 // V2.
13843 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13844 Mask[2] < 4 ? Mask[2] : Mask[3],
13845 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13846 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13847 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13848 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13849
13850 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13851 // a blend.
13852 LowV = HighV = V1;
13853 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13854 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13855 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13856 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13857 }
13858 } else if (NumV2Elements == 3) {
13859 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13860 // we can get here due to other paths (e.g repeated mask matching) that we
13861 // don't want to do another round of lowerVECTOR_SHUFFLE.
13863 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13864 }
13865 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13866 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13867}
13868
13869/// Lower 4-lane 32-bit floating point shuffles.
13870///
13871/// Uses instructions exclusively from the floating point unit to minimize
13872/// domain crossing penalties, as these are sufficient to implement all v4f32
13873/// shuffles.
13875                                 const APInt &Zeroable, SDValue V1, SDValue V2,
13876                                 const X86Subtarget &Subtarget,
13877                                 SelectionDAG &DAG) {
13878  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13879  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13880  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13881
  // With SSE4.1, try a BLENDPS-style blend first.
13882  if (Subtarget.hasSSE41())
13883    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13884                                            Zeroable, Subtarget, DAG))
13885      return Blend;
13886
  // Mask entries >= 4 select from V2; zero means this is a unary shuffle.
13887  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13888
13889  if (NumV2Elements == 0) {
13890    // Check for being able to broadcast a single element.
13891    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13892                                                    Mask, Subtarget, DAG))
13893      return Broadcast;
13894
13895    // Use even/odd duplicate instructions for masks that match their pattern.
13896    if (Subtarget.hasSSE3()) {
13897      if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13898        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13899      if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13900        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13901    }
13902
13903    if (Subtarget.hasAVX()) {
13904      // If we have AVX, we can use VPERMILPS which will allow folding a load
13905      // into the shuffle.
13906      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13907                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13908    }
13909
13910    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13911    // in SSE1 because otherwise they are widened to v2f64 and never get here.
13912    if (!Subtarget.hasSSE2()) {
13913      if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13914        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13915      if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13916        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13917    }
13918
13919    // Otherwise, use a straight shuffle of a single input vector. We pass the
13920    // input vector to both operands to simulate this with a SHUFPS.
13921    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13922                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13923  }
13924
  // With SSE2, try lowering as a zero/any-extend in the v4i32 integer domain
  // and bitcast the result back to v4f32.
13925  if (Subtarget.hasSSE2())
13927            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13928      ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13929      return ZExt;
13930    }
13931
13932  if (Subtarget.hasAVX2())
13933    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13934      return Extract;
13935
13936  // There are special ways we can lower some single-element blends. However, we
13937  // have custom ways we can lower more complex single-element blends below that
13938  // we defer to if both this and BLENDPS fail to match, so restrict this to
13939  // when the V2 input is targeting element 0 of the mask -- that is the fast
13940  // case here.
13941  if (NumV2Elements == 1 && Mask[0] >= 4)
13943            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13944      return V;
13945
13946  if (Subtarget.hasSSE41()) {
13947    // Use INSERTPS if we can complete the shuffle efficiently.
13948    if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13949      return V;
13950
13951    if (!isSingleSHUFPSMask(Mask))
13952      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13953                                                            V2, Mask, DAG))
13954        return BlendPerm;
13955  }
13956
13957  // Use low/high mov instructions. These are only valid in SSE1 because
13958  // otherwise they are widened to v2f64 and never get here.
13959  if (!Subtarget.hasSSE2()) {
13960    if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13961      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13962    if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13963      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13964  }
13965
13966  // Use dedicated unpack instructions for masks that match their pattern.
13967  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13968    return V;
13969
13970  // Otherwise fall back to a SHUFPS lowering strategy.
13971  return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13972}
13973
13974/// Lower 4-lane i32 vector shuffles.
13975///
13976/// We try to handle these with integer-domain shuffles where we can, but for
13977/// blends we use the floating point domain blend instructions.
13979                                 const APInt &Zeroable, SDValue V1, SDValue V2,
13980                                 const X86Subtarget &Subtarget,
13981                                 SelectionDAG &DAG) {
13982  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13983  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13984  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13985
13986  // Whenever we can lower this as a zext, that instruction is strictly faster
13987  // than any alternative. It also allows us to fold memory operands into the
13988  // shuffle in many cases.
13989  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13990                                                   Zeroable, Subtarget, DAG))
13991    return ZExt;
13992
  // Mask entries >= 4 select from V2; zero means this is a unary shuffle.
13993  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13994
13995  // Try to use shift instructions if fast.
13996  if (Subtarget.preferLowerShuffleAsShift()) {
13997    if (SDValue Shift =
13998            lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13999                                Subtarget, DAG, /*BitwiseOnly*/ true))
14000      return Shift;
14001    if (NumV2Elements == 0)
14002      if (SDValue Rotate =
14003              lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
14004        return Rotate;
14005  }
14006
14007  if (NumV2Elements == 0) {
14008    // Try to use broadcast unless the mask only has one non-undef element.
14009    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14010      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14011                                                      Mask, Subtarget, DAG))
14012        return Broadcast;
14013    }
14014
14015    // Straight shuffle of a single input vector. For everything from SSE2
14016    // onward this has a single fast instruction with no scary immediates.
14017    // We coerce the shuffle pattern to be compatible with UNPCK instructions
14018    // but we aren't actually going to use the UNPCK instruction because doing
14019    // so prevents folding a load into this instruction or making a copy.
14020    const int UnpackLoMask[] = {0, 0, 1, 1};
14021    const int UnpackHiMask[] = {2, 2, 3, 3};
14022    if (!isSingleElementRepeatedMask(Mask)) {
      // Rebind the Mask ArrayRef to the canonical unpack pattern; the PSHUFD
      // below encodes whichever mask is active at this point.
14023      if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14024        Mask = UnpackLoMask;
14025      else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14026        Mask = UnpackHiMask;
14027    }
14028
14029    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14030                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14031  }
14032
14033  if (Subtarget.hasAVX2())
14034    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14035      return Extract;
14036
14037  // Try to use shift instructions.
14038  if (SDValue Shift =
14039          lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
14040                              DAG, /*BitwiseOnly*/ false))
14041    return Shift;
14042
14043  // There are special ways we can lower some single-element blends.
14044  if (NumV2Elements == 1)
14046            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14047      return V;
14048
14049  // We have different paths for blend lowering, but they all must use the
14050  // *exact* same predicate.
14051  bool IsBlendSupported = Subtarget.hasSSE41();
14052  if (IsBlendSupported)
14053    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14054                                            Zeroable, Subtarget, DAG))
14055      return Blend;
14056
14057  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14058                                             Zeroable, Subtarget, DAG))
14059    return Masked;
14060
14061  // Use dedicated unpack instructions for masks that match their pattern.
14062  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
14063    return V;
14064
14065  // Try to use byte rotation instructions.
14066  // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14067  if (Subtarget.hasSSSE3()) {
    // Prefer VALIGN (AVX-512VL) over PALIGNR-style byte rotates when legal.
14068    if (Subtarget.hasVLX())
14069      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14070                                                Zeroable, Subtarget, DAG))
14071        return Rotate;
14072
14073    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14074                                                  Subtarget, DAG))
14075      return Rotate;
14076  }
14077
14078  // Assume that a single SHUFPS is faster than an alternative sequence of
14079  // multiple instructions (even if the CPU has a domain penalty).
14080  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14081  if (!isSingleSHUFPSMask(Mask)) {
14082    // If we have direct support for blends, we should lower by decomposing into
14083    // a permute. That will be faster than the domain cross.
14084    if (IsBlendSupported)
14085      return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14086                                                  Zeroable, Subtarget, DAG);
14087
14088    // Try to lower by permuting the inputs into an unpack instruction.
14089    if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14090                                                        Mask, Subtarget, DAG))
14091      return Unpack;
14092  }
14093
14094  // We implement this with SHUFPS because it can blend from two vectors.
14095  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14096  // up the inputs, bypassing domain shift penalties that we would incur if we
14097  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14098  // relevant.
14099  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14100  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14101  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14102  return DAG.getBitcast(MVT::v4i32, ShufPS);
14103}
14104
14105/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14106/// shuffle lowering, and the most complex part.
14107///
14108/// The lowering strategy is to try to form pairs of input lanes which are
14109/// targeted at the same half of the final vector, and then use a dword shuffle
14110/// to place them onto the right half, and finally unpack the paired lanes into
14111/// their final position.
14112///
14113/// The exact breakdown of how to form these dword pairs and align them on the
14114/// correct sides is really tricky. See the comments within the function for
14115/// more of the details.
14116///
14117/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14118/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14119/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14120/// vector, form the analogous 128-bit 8-element Mask.
14122 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14123 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14124 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14125 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14126
14127 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14128 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14129 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14130
14131 // Attempt to directly match PSHUFLW or PSHUFHW.
14132 if (isUndefOrInRange(LoMask, 0, 4) &&
14133 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14134 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14135 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14136 }
14137 if (isUndefOrInRange(HiMask, 4, 8) &&
14138 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14139 for (int i = 0; i != 4; ++i)
14140 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14141 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14142 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14143 }
14144
14145 SmallVector<int, 4> LoInputs;
14146 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14147 array_pod_sort(LoInputs.begin(), LoInputs.end());
14148 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14149 SmallVector<int, 4> HiInputs;
14150 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14151 array_pod_sort(HiInputs.begin(), HiInputs.end());
14152 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14153 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14154 int NumHToL = LoInputs.size() - NumLToL;
14155 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14156 int NumHToH = HiInputs.size() - NumLToH;
14157 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14158 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14159 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14160 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14161
14162 // If we are shuffling values from one half - check how many different DWORD
14163 // pairs we need to create. If only 1 or 2 then we can perform this as a
14164 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14165 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14166 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14167 V = DAG.getNode(ShufWOp, DL, VT, V,
14168 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14169 V = DAG.getBitcast(PSHUFDVT, V);
14170 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14171 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14172 return DAG.getBitcast(VT, V);
14173 };
14174
14175 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14176 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14177 SmallVector<std::pair<int, int>, 4> DWordPairs;
14178 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14179
14180 // Collect the different DWORD pairs.
14181 for (int DWord = 0; DWord != 4; ++DWord) {
14182 int M0 = Mask[2 * DWord + 0];
14183 int M1 = Mask[2 * DWord + 1];
14184 M0 = (M0 >= 0 ? M0 % 4 : M0);
14185 M1 = (M1 >= 0 ? M1 % 4 : M1);
14186 if (M0 < 0 && M1 < 0)
14187 continue;
14188
14189 bool Match = false;
14190 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14191 auto &DWordPair = DWordPairs[j];
14192 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14193 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14194 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14195 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14196 PSHUFDMask[DWord] = DOffset + j;
14197 Match = true;
14198 break;
14199 }
14200 }
14201 if (!Match) {
14202 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14203 DWordPairs.push_back(std::make_pair(M0, M1));
14204 }
14205 }
14206
14207 if (DWordPairs.size() <= 2) {
14208 DWordPairs.resize(2, std::make_pair(-1, -1));
14209 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14210 DWordPairs[1].first, DWordPairs[1].second};
14211 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
14212 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
14213 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
14214 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
14215 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14216 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14217 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14218 }
14219 if ((NumHToL + NumHToH) == 0)
14220 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14221 if ((NumLToL + NumLToH) == 0)
14222 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14223 }
14224 }
14225
14226 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14227 // such inputs we can swap two of the dwords across the half mark and end up
14228 // with <=2 inputs to each half in each half. Once there, we can fall through
14229 // to the generic code below. For example:
14230 //
14231 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14232 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14233 //
14234 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14235 // and an existing 2-into-2 on the other half. In this case we may have to
14236 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14237 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14238 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14239 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14240 // half than the one we target for fixing) will be fixed when we re-enter this
14241 // path. We will also combine away any sequence of PSHUFD instructions that
14242 // result into a single instruction. Here is an example of the tricky case:
14243 //
14244 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14245 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14246 //
14247 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14248 //
14249 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14250 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14251 //
14252 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14253 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14254 //
14255 // The result is fine to be handled by the generic logic.
14256 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14257 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14258 int AOffset, int BOffset) {
14259 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14260 "Must call this with A having 3 or 1 inputs from the A half.");
14261 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14262 "Must call this with B having 1 or 3 inputs from the B half.");
14263 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14264 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14265
14266 bool ThreeAInputs = AToAInputs.size() == 3;
14267
14268 // Compute the index of dword with only one word among the three inputs in
14269 // a half by taking the sum of the half with three inputs and subtracting
14270 // the sum of the actual three inputs. The difference is the remaining
14271 // slot.
14272 int ADWord = 0, BDWord = 0;
14273 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14274 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14275 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14276 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14277 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14278 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14279 int TripleNonInputIdx =
14280 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14281 TripleDWord = TripleNonInputIdx / 2;
14282
14283 // We use xor with one to compute the adjacent DWord to whichever one the
14284 // OneInput is in.
14285 OneInputDWord = (OneInput / 2) ^ 1;
14286
14287 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14288 // and BToA inputs. If there is also such a problem with the BToB and AToB
14289 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14290 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14291 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14292 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14293 // Compute how many inputs will be flipped by swapping these DWords. We
14294 // need
14295 // to balance this to ensure we don't form a 3-1 shuffle in the other
14296 // half.
14297 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14298 llvm::count(AToBInputs, 2 * ADWord + 1);
14299 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14300 llvm::count(BToBInputs, 2 * BDWord + 1);
14301 if ((NumFlippedAToBInputs == 1 &&
14302 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14303 (NumFlippedBToBInputs == 1 &&
14304 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14305 // We choose whether to fix the A half or B half based on whether that
14306 // half has zero flipped inputs. At zero, we may not be able to fix it
14307 // with that half. We also bias towards fixing the B half because that
14308 // will more commonly be the high half, and we have to bias one way.
14309 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14310 ArrayRef<int> Inputs) {
14311 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14312 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14313 // Determine whether the free index is in the flipped dword or the
14314 // unflipped dword based on where the pinned index is. We use this bit
14315 // in an xor to conditionally select the adjacent dword.
14316 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14317 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14318 if (IsFixIdxInput == IsFixFreeIdxInput)
14319 FixFreeIdx += 1;
14320 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14321 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14322 "We need to be changing the number of flipped inputs!");
14323 int PSHUFHalfMask[] = {0, 1, 2, 3};
14324 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14325 V = DAG.getNode(
14326 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14327 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14328 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14329
14330 for (int &M : Mask)
14331 if (M >= 0 && M == FixIdx)
14332 M = FixFreeIdx;
14333 else if (M >= 0 && M == FixFreeIdx)
14334 M = FixIdx;
14335 };
14336 if (NumFlippedBToBInputs != 0) {
14337 int BPinnedIdx =
14338 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14339 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14340 } else {
14341 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14342 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14343 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14344 }
14345 }
14346 }
14347
14348 int PSHUFDMask[] = {0, 1, 2, 3};
14349 PSHUFDMask[ADWord] = BDWord;
14350 PSHUFDMask[BDWord] = ADWord;
14351 V = DAG.getBitcast(
14352 VT,
14353 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14354 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14355
14356 // Adjust the mask to match the new locations of A and B.
14357 for (int &M : Mask)
14358 if (M >= 0 && M/2 == ADWord)
14359 M = 2 * BDWord + M % 2;
14360 else if (M >= 0 && M/2 == BDWord)
14361 M = 2 * ADWord + M % 2;
14362
14363 // Recurse back into this routine to re-compute state now that this isn't
14364 // a 3 and 1 problem.
14365 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14366 };
14367 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14368 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14369 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14370 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14371
14372 // At this point there are at most two inputs to the low and high halves from
14373 // each half. That means the inputs can always be grouped into dwords and
14374 // those dwords can then be moved to the correct half with a dword shuffle.
14375 // We use at most one low and one high word shuffle to collect these paired
14376 // inputs into dwords, and finally a dword shuffle to place them.
14377 int PSHUFLMask[4] = {-1, -1, -1, -1};
14378 int PSHUFHMask[4] = {-1, -1, -1, -1};
14379 int PSHUFDMask[4] = {-1, -1, -1, -1};
14380
14381 // First fix the masks for all the inputs that are staying in their
14382 // original halves. This will then dictate the targets of the cross-half
14383 // shuffles.
14384 auto fixInPlaceInputs =
14385 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14386 MutableArrayRef<int> SourceHalfMask,
14387 MutableArrayRef<int> HalfMask, int HalfOffset) {
14388 if (InPlaceInputs.empty())
14389 return;
14390 if (InPlaceInputs.size() == 1) {
14391 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14392 InPlaceInputs[0] - HalfOffset;
14393 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14394 return;
14395 }
14396 if (IncomingInputs.empty()) {
14397 // Just fix all of the in place inputs.
14398 for (int Input : InPlaceInputs) {
14399 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14400 PSHUFDMask[Input / 2] = Input / 2;
14401 }
14402 return;
14403 }
14404
14405 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14406 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14407 InPlaceInputs[0] - HalfOffset;
14408 // Put the second input next to the first so that they are packed into
14409 // a dword. We find the adjacent index by toggling the low bit.
14410 int AdjIndex = InPlaceInputs[0] ^ 1;
14411 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14412 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14413 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14414 };
14415 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14416 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14417
14418 // Now gather the cross-half inputs and place them into a free dword of
14419 // their target half.
14420 // FIXME: This operation could almost certainly be simplified dramatically to
14421 // look more like the 3-1 fixing operation.
14422 auto moveInputsToRightHalf = [&PSHUFDMask](
14423 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14424 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14425 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14426 int DestOffset) {
14427 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14428 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14429 };
14430 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14431 int Word) {
14432 int LowWord = Word & ~1;
14433 int HighWord = Word | 1;
14434 return isWordClobbered(SourceHalfMask, LowWord) ||
14435 isWordClobbered(SourceHalfMask, HighWord);
14436 };
14437
14438 if (IncomingInputs.empty())
14439 return;
14440
14441 if (ExistingInputs.empty()) {
14442 // Map any dwords with inputs from them into the right half.
14443 for (int Input : IncomingInputs) {
14444 // If the source half mask maps over the inputs, turn those into
14445 // swaps and use the swapped lane.
14446 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14447 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14448 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14449 Input - SourceOffset;
14450 // We have to swap the uses in our half mask in one sweep.
14451 for (int &M : HalfMask)
14452 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14453 M = Input;
14454 else if (M == Input)
14455 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14456 } else {
14457 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14458 Input - SourceOffset &&
14459 "Previous placement doesn't match!");
14460 }
14461 // Note that this correctly re-maps both when we do a swap and when
14462 // we observe the other side of the swap above. We rely on that to
14463 // avoid swapping the members of the input list directly.
14464 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14465 }
14466
14467 // Map the input's dword into the correct half.
14468 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14469 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14470 else
14471 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14472 Input / 2 &&
14473 "Previous placement doesn't match!");
14474 }
14475
14476 // And just directly shift any other-half mask elements to be same-half
14477 // as we will have mirrored the dword containing the element into the
14478 // same position within that half.
14479 for (int &M : HalfMask)
14480 if (M >= SourceOffset && M < SourceOffset + 4) {
14481 M = M - SourceOffset + DestOffset;
14482 assert(M >= 0 && "This should never wrap below zero!");
14483 }
14484 return;
14485 }
14486
14487 // Ensure we have the input in a viable dword of its current half. This
14488 // is particularly tricky because the original position may be clobbered
14489 // by inputs being moved and *staying* in that half.
14490 if (IncomingInputs.size() == 1) {
14491 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14492 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14493 SourceOffset;
14494 SourceHalfMask[InputFixed - SourceOffset] =
14495 IncomingInputs[0] - SourceOffset;
14496 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14497 IncomingInputs[0] = InputFixed;
14498 }
14499 } else if (IncomingInputs.size() == 2) {
14500 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14501 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14502 // We have two non-adjacent or clobbered inputs we need to extract from
14503 // the source half. To do this, we need to map them into some adjacent
14504 // dword slot in the source mask.
14505 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14506 IncomingInputs[1] - SourceOffset};
14507
14508 // If there is a free slot in the source half mask adjacent to one of
14509 // the inputs, place the other input in it. We use (Index XOR 1) to
14510 // compute an adjacent index.
14511 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14512 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14513 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14514 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14515 InputsFixed[1] = InputsFixed[0] ^ 1;
14516 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14517 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14518 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14519 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14520 InputsFixed[0] = InputsFixed[1] ^ 1;
14521 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14522 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14523 // The two inputs are in the same DWord but it is clobbered and the
14524 // adjacent DWord isn't used at all. Move both inputs to the free
14525 // slot.
14526 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14527 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14528 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14529 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14530 } else {
14531 // The only way we hit this point is if there is no clobbering
14532 // (because there are no off-half inputs to this half) and there is no
14533 // free slot adjacent to one of the inputs. In this case, we have to
14534 // swap an input with a non-input.
14535 for (int i = 0; i < 4; ++i)
14536 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14537 "We can't handle any clobbers here!");
14538 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14539 "Cannot have adjacent inputs here!");
14540
14541 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14542 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14543
14544 // We also have to update the final source mask in this case because
14545 // it may need to undo the above swap.
14546 for (int &M : FinalSourceHalfMask)
14547 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14548 M = InputsFixed[1] + SourceOffset;
14549 else if (M == InputsFixed[1] + SourceOffset)
14550 M = (InputsFixed[0] ^ 1) + SourceOffset;
14551
14552 InputsFixed[1] = InputsFixed[0] ^ 1;
14553 }
14554
14555 // Point everything at the fixed inputs.
14556 for (int &M : HalfMask)
14557 if (M == IncomingInputs[0])
14558 M = InputsFixed[0] + SourceOffset;
14559 else if (M == IncomingInputs[1])
14560 M = InputsFixed[1] + SourceOffset;
14561
14562 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14563 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14564 }
14565 } else {
14566 llvm_unreachable("Unhandled input size!");
14567 }
14568
14569 // Now hoist the DWord down to the right half.
14570 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14571 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14572 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14573 for (int &M : HalfMask)
14574 for (int Input : IncomingInputs)
14575 if (M == Input)
14576 M = FreeDWord * 2 + Input % 2;
14577 };
14578 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14579 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14580 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14581 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14582
14583 // Now enact all the shuffles we've computed to move the inputs into their
14584 // target half.
14585 if (!isNoopShuffleMask(PSHUFLMask))
14586 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14587 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14588 if (!isNoopShuffleMask(PSHUFHMask))
14589 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14590 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14591 if (!isNoopShuffleMask(PSHUFDMask))
14592 V = DAG.getBitcast(
14593 VT,
14594 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14595 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14596
14597 // At this point, each half should contain all its inputs, and we can then
14598 // just shuffle them into their final position.
14599 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14600 "Failed to lift all the high half inputs to the low mask!");
14601 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14602 "Failed to lift all the low half inputs to the high mask!");
14603
14604 // Do a half shuffle for the low mask.
14605 if (!isNoopShuffleMask(LoMask))
14606 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14607 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14608
14609 // Do a half shuffle with the high mask after shifting its values down.
14610 for (int &M : HiMask)
14611 if (M >= 0)
14612 M -= 4;
14613 if (!isNoopShuffleMask(HiMask))
14614 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14615 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14616
14617 return V;
14618}
14619
14620/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14621/// blend if only one input is used.
///
/// Populates one per-byte PSHUFB control vector for each source and reports,
/// via V1InUse / V2InUse, which sources actually supply bytes, so the final
/// OR-blend is emitted only when both inputs are referenced.
/// NOTE(review): this doxygen dump drops hyperlinked lines; the declaration
/// line (14622, naming lowerShuffleAsBlendOfPSHUFBs) and the assert call line
/// (14625) are missing here -- confirm against the upstream file.
14623 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14624 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14626 "Lane crossing shuffle masks not supported");
14627
 // Scale maps the Size shuffle-mask elements onto NumBytes control bytes:
 // each mask element covers Scale adjacent bytes of the byte-wide shuffle.
14628 int NumBytes = VT.getSizeInBits() / 8;
14629 int Size = Mask.size();
14630 int Scale = NumBytes / Size;
14631
14632 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14633 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14634 V1InUse = false;
14635 V2InUse = false;
14636
14637 for (int i = 0; i < NumBytes; ++i) {
14638 int M = Mask[i / Scale];
14639 if (M < 0)
14640 continue;
14641
 // 0x80 sets bit 7 of a PSHUFB control byte; per the ISA definition this
 // writes zero into the destination byte ("take from neither source").
14642 const int ZeroMask = 0x80;
14643 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14644 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14645 if (Zeroable[i / Scale])
14646 V1Idx = V2Idx = ZeroMask;
14647
14648 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14649 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14650 V1InUse |= (ZeroMask != V1Idx);
14651 V2InUse |= (ZeroMask != V2Idx);
14652 }
14653
14654 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14655 if (V1InUse)
14656 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14657 DAG.getBuildVector(ShufVT, DL, V1Mask))
14658 DAG.getBuildVector(ShufVT, DL, V1Mask));
14658 if (V2InUse)
14659 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14660 DAG.getBuildVector(ShufVT, DL, V2Mask));
14661
14662 // If we need shuffled inputs from both, blend the two.
 // A plain OR suffices: each lane used from one source was zeroed (via the
 // 0x80 control byte) in the other source's shuffled result.
14663 SDValue V;
14664 if (V1InUse && V2InUse)
14665 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14666 else
14667 V = V1InUse ? V1 : V2;
14668
14669 // Cast the result back to the correct type.
14670 return DAG.getBitcast(VT, V);
14671}
14672
14673/// Generic lowering of 8-lane i16 shuffles.
14674///
14675/// This handles both single-input shuffles and combined shuffle/blends with
14676/// two inputs. The single input shuffles are immediately delegated to
14677/// a dedicated lowering routine.
14678///
14679/// The blends are lowered in one of three fundamental ways. If there are few
14680/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14681/// of the input is significantly cheaper when lowered as an interleaving of
14682/// the two inputs, try to interleave them. Otherwise, blend the low and high
14683/// halves of the inputs separately (making them have relatively few inputs)
14684/// and then concatenate them.
///
/// NOTE(review): this doxygen dump omits hyperlinked source lines -- the
/// signature (14685) and call lines 14761 and 14812-14813 -- consult the
/// upstream X86ISelLowering.cpp for the elided text.
14686 const APInt &Zeroable, SDValue V1, SDValue V2,
14687 const X86Subtarget &Subtarget,
14688 SelectionDAG &DAG) {
14689 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14690 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14691 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14692
14693 // Whenever we can lower this as a zext, that instruction is strictly faster
14694 // than any alternative.
14695 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14696 Zeroable, Subtarget, DAG))
14697 return ZExt;
14698
14699 // Try to use lower using a truncation.
14700 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14701 Subtarget, DAG))
14702 return V;
14703
 // Count mask elements taken from V2 (indices 8..15 select V2 lanes).
14704 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14705
14706 if (NumV2Inputs == 0) {
14707 // Try to use shift instructions.
14708 if (SDValue Shift =
14709 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14710 Subtarget, DAG, /*BitwiseOnly*/ false))
14711 return Shift;
14712
14713 // Check for being able to broadcast a single element.
14714 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14715 Mask, Subtarget, DAG))
14716 return Broadcast;
14717
14718 // Try to use bit rotation instructions.
14719 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14720 Subtarget, DAG))
14721 return Rotate;
14722
14723 // Use dedicated unpack instructions for masks that match their pattern.
14724 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14725 return V;
14726
14727 // Use dedicated pack instructions for masks that match their pattern.
14728 if (SDValue V =
14729 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14730 return V;
14731
14732 // Try to use byte rotation instructions.
14733 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14734 Subtarget, DAG))
14735 return Rotate;
14736
14737 // Make a copy of the mask so it can be modified.
14738 SmallVector<int, 8> MutableMask(Mask);
14739 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14740 Subtarget, DAG);
14741 }
14742
14743 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14744 "All single-input shuffles should be canonicalized to be V1-input "
14745 "shuffles.");
14746
14747 // Try to use shift instructions.
14748 if (SDValue Shift =
14749 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14750 DAG, /*BitwiseOnly*/ false))
14751 return Shift;
14752
14753 // See if we can use SSE4A Extraction / Insertion.
14754 if (Subtarget.hasSSE4A())
14755 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14756 Zeroable, DAG))
14757 return V;
14758
14759 // There are special ways we can lower some single-element blends.
14760 if (NumV2Inputs == 1)
 // (elided line 14761 -- presumably the lowerShuffleAsElementInsertion
 // call; confirm upstream.)
14762 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14763 return V;
14764
14765 // We have different paths for blend lowering, but they all must use the
14766 // *exact* same predicate.
14767 bool IsBlendSupported = Subtarget.hasSSE41();
14768 if (IsBlendSupported)
14769 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14770 Zeroable, Subtarget, DAG))
14771 return Blend;
14772
14773 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14774 Zeroable, Subtarget, DAG))
14775 return Masked;
14776
14777 // Use dedicated unpack instructions for masks that match their pattern.
14778 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14779 return V;
14780
14781 // Use dedicated pack instructions for masks that match their pattern.
14782 if (SDValue V =
14783 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14784 return V;
14785
14786 // Try to use lower using a truncation.
14787 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14788 Subtarget, DAG))
14789 return V;
14790
14791 // Try to use byte rotation instructions.
14792 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14793 Subtarget, DAG))
14794 return Rotate;
14795
14796 if (SDValue BitBlend =
14797 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14798 return BitBlend;
14799
14800 // Try to use byte shift instructions to mask.
14801 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14802 Zeroable, Subtarget, DAG))
14803 return V;
14804
14805 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
14806 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14807 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14808 !Subtarget.hasVLX()) {
14809 // Check if this is part of a 256-bit vector truncation.
14810 unsigned PackOpc = 0;
14811 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
 // (elided lines 14812-14813 -- remainder of this condition; confirm
 // against the upstream file.)
14814 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14815 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14816 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14817 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14818 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14819 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14820 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14821 PackOpc = X86ISD::PACKUS;
14822 } else if (Subtarget.hasSSE41()) {
14823 SmallVector<SDValue, 4> DWordClearOps(4,
14824 DAG.getConstant(0, DL, MVT::i32));
14825 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14826 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14827 SDValue DWordClearMask =
14828 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14829 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14830 DWordClearMask);
14831 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14832 DWordClearMask);
14833 PackOpc = X86ISD::PACKUS;
14834 } else if (!Subtarget.hasSSSE3()) {
 // Pre-SSSE3: sign-fill the high words (shl+sra by 16) so PACKSS keeps
 // the low words intact.
14835 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14836 V1 = DAG.getBitcast(MVT::v4i32, V1);
14837 V2 = DAG.getBitcast(MVT::v4i32, V2);
14838 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14839 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14840 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14841 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14842 PackOpc = X86ISD::PACKSS;
14843 }
14844 if (PackOpc) {
14845 // Now pack things back together.
14846 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14847 if (NumEvenDrops == 2) {
14848 Result = DAG.getBitcast(MVT::v4i32, Result);
14849 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14850 }
14851 return Result;
14852 }
14853 }
14854
14855 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14856 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14857 if (NumOddDrops == 1) {
14858 bool HasSSE41 = Subtarget.hasSSE41();
14859 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14860 DAG.getBitcast(MVT::v4i32, V1),
14861 DAG.getTargetConstant(16, DL, MVT::i8));
14862 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14863 DAG.getBitcast(MVT::v4i32, V2),
14864 DAG.getTargetConstant(16, DL, MVT::i8));
14865 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14866 MVT::v8i16, V1, V2);
14867 }
14868
14869 // Try to lower by permuting the inputs into an unpack instruction.
14870 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14871 Mask, Subtarget, DAG))
14872 return Unpack;
14873
14874 // If we can't directly blend but can use PSHUFB, that will be better as it
14875 // can both shuffle and set up the inefficient blend.
14876 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14877 bool V1InUse, V2InUse;
14878 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14879 Zeroable, DAG, V1InUse, V2InUse);
14880 }
14881
14882 // We can always bit-blend if we have to so the fallback strategy is to
14883 // decompose into single-input permutes and blends/unpacks.
14884 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14885 Zeroable, Subtarget, DAG);
14886}
14887
14888/// Lower 8-lane 16-bit floating point shuffles.
///
/// With FP16 available, only broadcast and (per the elided call at 14906,
/// presumably single-element insertion) tricks are attempted directly;
/// otherwise the shuffle is performed in the v8i16 integer domain and the
/// result is bitcast back to v8f16.
/// NOTE(review): the dump omits hyperlinked lines 14889 (signature) and
/// 14906 (a call line) -- confirm against upstream.
14890 const APInt &Zeroable, SDValue V1, SDValue V2,
14891 const X86Subtarget &Subtarget,
14892 SelectionDAG &DAG) {
14893 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14894 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14895 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14896 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14897
14898 if (Subtarget.hasFP16()) {
14899 if (NumV2Elements == 0) {
14900 // Check for being able to broadcast a single element.
14901 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14902 Mask, Subtarget, DAG))
14903 return Broadcast;
14904 }
14905 if (NumV2Elements == 1 && Mask[0] >= 8)
14907 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14908 return V;
14909 }
14910
 // Fallback: lower as an integer shuffle -- bitcast both operands to v8i16,
 // shuffle there, and bitcast the result back.
14911 V1 = DAG.getBitcast(MVT::v8i16, V1);
14912 V2 = DAG.getBitcast(MVT::v8i16, V2);
14913 return DAG.getBitcast(MVT::v8f16,
14914 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14915}
14916
14917// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14918// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14919// the active subvector is extracted.
//
// NOTE(review): hyperlinked lines 14920 (the signature) and 14928 (the call
// that presumably commutes the mask when the operands are swapped) are
// missing from this dump -- confirm against upstream.
14921 ArrayRef<int> OriginalMask, SDValue V1,
14922 SDValue V2, const X86Subtarget &Subtarget,
14923 SelectionDAG &DAG) {
14924 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14925 SmallVector<int, 32> Mask(OriginalMask);
14926 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14927 !isShuffleFoldableLoad(V2)) {
14929 std::swap(V1, V2);
14930 }
14931
 // The VPERMV/VPERMV3 mask operand is an integer vector with the same
 // element count/width as the shuffled type.
14932 MVT MaskVT = VT.changeTypeToInteger();
14933 SDValue MaskNode;
14934 MVT ShuffleVT = VT;
14935 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14936 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14937 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14938 ShuffleVT = V1.getSimpleValueType();
14939
14940 // Adjust mask to correct indices for the second input.
 // After widening, V2's lanes start at Scale * NumElts instead of NumElts,
 // so rebase any V2-relative index (M >= NumElts) accordingly.
14941 int NumElts = VT.getVectorNumElements();
14942 unsigned Scale = 512 / VT.getSizeInBits();
14943 SmallVector<int, 32> AdjustedMask(Mask);
14944 for (int &M : AdjustedMask)
14945 if (NumElts <= M)
14946 M += (Scale - 1) * NumElts;
14947 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14948 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14949 } else {
14950 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14951 }
14952
14953 SDValue Result;
14954 if (V2.isUndef())
14955 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14956 else
14957 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14958
14959 if (VT != ShuffleVT)
14960 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14961
14962 return Result;
14963}
14964
14965/// Generic lowering of v16i8 shuffles.
14966///
14967/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14968/// detect any complexity reducing interleaving. If that doesn't help, it uses
14969/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14970/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14971/// back together.
///
/// NOTE(review): this doxygen dump omits hyperlinked lines (14972, the
/// signature, plus call/declaration lines 15066, 15166, 15186, 15203 and
/// 15213) -- consult the upstream X86ISelLowering.cpp for the elided text.
14973 const APInt &Zeroable, SDValue V1, SDValue V2,
14974 const X86Subtarget &Subtarget,
14975 SelectionDAG &DAG) {
14976 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14977 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14978 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14979
14980 // Try to use shift instructions.
14981 if (SDValue Shift =
14982 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14983 DAG, /*BitwiseOnly*/ false))
14984 return Shift;
14985
14986 // Try to use byte rotation instructions.
14987 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14988 Subtarget, DAG))
14989 return Rotate;
14990
14991 // Use dedicated pack instructions for masks that match their pattern.
14992 if (SDValue V =
14993 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14994 return V;
14995
14996 // Try to use a zext lowering.
14997 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14998 Zeroable, Subtarget, DAG))
14999 return ZExt;
15000
15001 // Try to use lower using a truncation.
15002 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15003 Subtarget, DAG))
15004 return V;
15005
15006 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15007 Subtarget, DAG))
15008 return V;
15009
15010 // See if we can use SSE4A Extraction / Insertion.
15011 if (Subtarget.hasSSE4A())
15012 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15013 Zeroable, DAG))
15014 return V;
15015
 // Count mask elements taken from V2 (indices 16..31 select V2 lanes).
15016 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15017
15018 // For single-input shuffles, there are some nicer lowering tricks we can use.
15019 if (NumV2Elements == 0) {
15020 // Check for being able to broadcast a single element.
15021 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15022 Mask, Subtarget, DAG))
15023 return Broadcast;
15024
15025 // Try to use bit rotation instructions.
15026 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15027 Subtarget, DAG))
15028 return Rotate;
15029
15030 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15031 return V;
15032
15033 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15034 // Notably, this handles splat and partial-splat shuffles more efficiently.
15035 // However, it only makes sense if the pre-duplication shuffle simplifies
15036 // things significantly. Currently, this means we need to be able to
15037 // express the pre-duplication shuffle as an i16 shuffle.
15038 //
15039 // FIXME: We should check for other patterns which can be widened into an
15040 // i16 shuffle as well.
 // Widening is only viable when every used byte pair agrees, i.e. each i16
 // lane draws both bytes from the same source byte.
15041 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15042 for (int i = 0; i < 16; i += 2)
15043 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15044 return false;
15045
15046 return true;
15047 };
15048 auto tryToWidenViaDuplication = [&]() -> SDValue {
15049 if (!canWidenViaDuplication(Mask))
15050 return SDValue();
15051 SmallVector<int, 4> LoInputs;
15052 copy_if(Mask, std::back_inserter(LoInputs),
15053 [](int M) { return M >= 0 && M < 8; });
15054 array_pod_sort(LoInputs.begin(), LoInputs.end());
15055 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
15056 SmallVector<int, 4> HiInputs;
15057 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15058 array_pod_sort(HiInputs.begin(), HiInputs.end());
15059 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
15060
 // Target the half already holding the majority of the inputs and move
 // the remaining inputs into it with a pre-duplication i16 shuffle.
15061 bool TargetLo = LoInputs.size() >= HiInputs.size();
15062 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15063 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15064
15065 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
 // (elided line 15066 -- presumably the LaneMap declaration; confirm
 // against upstream.)
15067 for (int I : InPlaceInputs) {
15068 PreDupI16Shuffle[I/2] = I/2;
15069 LaneMap[I] = I;
15070 }
15071 int j = TargetLo ? 0 : 4, je = j + 4;
15072 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15073 // Check if j is already a shuffle of this input. This happens when
15074 // there are two adjacent bytes after we move the low one.
15075 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15076 // If we haven't yet mapped the input, search for a slot into which
15077 // we can map it.
15078 while (j < je && PreDupI16Shuffle[j] >= 0)
15079 ++j;
15080
15081 if (j == je)
15082 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15083 return SDValue();
15084
15085 // Map this input with the i16 shuffle.
15086 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15087 }
15088
15089 // Update the lane map based on the mapping we ended up with.
15090 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15091 }
15092 V1 = DAG.getBitcast(
15093 MVT::v16i8,
15094 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15095 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15096
15097 // Unpack the bytes to form the i16s that will be shuffled into place.
15098 bool EvenInUse = false, OddInUse = false;
15099 for (int i = 0; i < 16; i += 2) {
15100 EvenInUse |= (Mask[i + 0] >= 0);
15101 OddInUse |= (Mask[i + 1] >= 0);
15102 if (EvenInUse && OddInUse)
15103 break;
15104 }
15105 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15106 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15107 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15108
15109 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15110 for (int i = 0; i < 16; ++i)
15111 if (Mask[i] >= 0) {
15112 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15113 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15114 if (PostDupI16Shuffle[i / 2] < 0)
15115 PostDupI16Shuffle[i / 2] = MappedMask;
15116 else
15117 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15118 "Conflicting entries in the original shuffle!");
15119 }
15120 return DAG.getBitcast(
15121 MVT::v16i8,
15122 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15123 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15124 };
15125 if (SDValue V = tryToWidenViaDuplication())
15126 return V;
15127 }
15128
15129 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15130 Zeroable, Subtarget, DAG))
15131 return Masked;
15132
15133 // Use dedicated unpack instructions for masks that match their pattern.
15134 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15135 return V;
15136
15137 // Try to use byte shift instructions to mask.
15138 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15139 Zeroable, Subtarget, DAG))
15140 return V;
15141
15142 // Check for compaction patterns.
15143 bool IsSingleInput = V2.isUndef();
15144 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
15145
15146 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15147 // with PSHUFB. It is important to do this before we attempt to generate any
15148 // blends but after all of the single-input lowerings. If the single input
15149 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15150 // want to preserve that and we can DAG combine any longer sequences into
15151 // a PSHUFB in the end. But once we start blending from multiple inputs,
15152 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15153 // and there are *very* few patterns that would actually be faster than the
15154 // PSHUFB approach because of its ability to zero lanes.
15155 //
15156 // If the mask is a binary compaction, we can more efficiently perform this
15157 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15158 //
15159 // FIXME: The only exceptions to the above are blends which are exact
15160 // interleavings with direct instructions supporting them. We currently don't
15161 // handle those well here.
15162 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15163 bool V1InUse = false;
15164 bool V2InUse = false;
15165
 // (elided line 15166 -- presumably the lowerShuffleAsBlendOfPSHUFBs call
 // binding PSHUFB; confirm against upstream.)
15167 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15168
15169 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15170 // do so. This avoids using them to handle blends-with-zero which is
15171 // important as a single pshufb is significantly faster for that.
15172 if (V1InUse && V2InUse) {
15173 if (Subtarget.hasSSE41())
15174 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15175 Zeroable, Subtarget, DAG))
15176 return Blend;
15177
15178 // We can use an unpack to do the blending rather than an or in some
15179 // cases. Even though the or may be (very minorly) more efficient, we
15180 // preference this lowering because there are common cases where part of
15181 // the complexity of the shuffles goes away when we do the final blend as
15182 // an unpack.
15183 // FIXME: It might be worth trying to detect if the unpack-feeding
15184 // shuffles will both be pshufb, in which case we shouldn't bother with
15185 // this.
15187 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15188 return Unpack;
15189
15190 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15191 if (Subtarget.hasVBMI())
15192 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15193 DAG);
15194
15195 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15196 if (Subtarget.hasXOP()) {
15197 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15198 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15199 }
15200
15201 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15202 // PALIGNR will be cheaper than the second PSHUFB+OR.
15204 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15205 return V;
15206 }
15207
15208 return PSHUFB;
15209 }
15210
15211 // There are special ways we can lower some single-element blends.
15212 if (NumV2Elements == 1)
 // (elided line 15213 -- presumably the lowerShuffleAsElementInsertion
 // call; confirm against upstream.)
15214 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15215 return V;
15216
15217 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15218 return Blend;
15219
15220 // Check whether a compaction lowering can be done. This handles shuffles
15221 // which take every Nth element for some even N. See the helper function for
15222 // details.
15223 //
15224 // We special case these as they can be particularly efficiently handled with
15225 // the PACKUSB instruction on x86 and they show up in common patterns of
15226 // rearranging bytes to truncate wide elements.
15227 if (NumEvenDrops) {
15228 // NumEvenDrops is the power of two stride of the elements. Another way of
15229 // thinking about it is that we need to drop the even elements this many
15230 // times to get the original input.
15231
15232 // First we need to zero all the dropped bytes.
15233 assert(NumEvenDrops <= 3 &&
15234 "No support for dropping even elements more than 3 times.");
15235 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15236 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15237 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15238 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15239 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15240 WordClearMask);
15241 if (!IsSingleInput)
15242 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15243 WordClearMask);
15244
15245 // Now pack things back together.
15246 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15247 IsSingleInput ? V1 : V2);
15248 for (int i = 1; i < NumEvenDrops; ++i) {
15249 Result = DAG.getBitcast(MVT::v8i16, Result);
15250 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15251 }
15252 return Result;
15253 }
15254
15255 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15256 if (NumOddDrops == 1) {
 // Shift the odd bytes down into the even positions so PACKUS keeps them.
15257 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15258 DAG.getBitcast(MVT::v8i16, V1),
15259 DAG.getTargetConstant(8, DL, MVT::i8));
15260 if (!IsSingleInput)
15261 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15262 DAG.getBitcast(MVT::v8i16, V2),
15263 DAG.getTargetConstant(8, DL, MVT::i8));
15264 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15265 IsSingleInput ? V1 : V2);
15266 }
15267
15268 // Handle multi-input cases by blending/unpacking single-input shuffles.
15269 if (NumV2Elements > 0)
15270 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15271 Zeroable, Subtarget, DAG);
15272
15273 // The fallback path for single-input shuffles widens this into two v8i16
15274 // vectors with unpacks, shuffles those, and then pulls them back together
15275 // with a pack.
15276 SDValue V = V1;
15277
15278 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15279 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15280 for (int i = 0; i < 16; ++i)
15281 if (Mask[i] >= 0)
15282 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15283
15284 SDValue VLoHalf, VHiHalf;
15285 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15286 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15287 // i16s.
15288 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15289 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15290 // Use a mask to drop the high bytes.
15291 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15292 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15293 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15294
15295 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15296 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15297
15298 // Squash the masks to point directly into VLoHalf.
15299 for (int &M : LoBlendMask)
15300 if (M >= 0)
15301 M /= 2;
15302 for (int &M : HiBlendMask)
15303 if (M >= 0)
15304 M /= 2;
15305 } else {
15306 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15307 // VHiHalf so that we can blend them as i16s.
15308 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15309
15310 VLoHalf = DAG.getBitcast(
15311 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15312 VHiHalf = DAG.getBitcast(
15313 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15314 }
15315
15316 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15317 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15318
15319 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15320}
15321
15322/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15323///
15324/// This routine breaks down the specific type of 128-bit shuffle and
15325/// dispatches to the lowering routines accordingly.
15327 MVT VT, SDValue V1, SDValue V2,
15328 const APInt &Zeroable,
15329 const X86Subtarget &Subtarget,
15330 SelectionDAG &DAG) {
15331 if (VT == MVT::v8bf16) {
15332 V1 = DAG.getBitcast(MVT::v8i16, V1);
15333 V2 = DAG.getBitcast(MVT::v8i16, V2);
15334 return DAG.getBitcast(VT,
15335 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15336 }
15337
15338 switch (VT.SimpleTy) {
15339 case MVT::v2i64:
15340 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15341 case MVT::v2f64:
15342 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15343 case MVT::v4i32:
15344 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15345 case MVT::v4f32:
15346 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15347 case MVT::v8i16:
15348 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15349 case MVT::v8f16:
15350 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15351 case MVT::v16i8:
15352 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15353
15354 default:
15355 llvm_unreachable("Unimplemented!");
15356 }
15357}
15358
15359/// Generic routine to split vector shuffle into half-sized shuffles.
15360///
15361/// This routine just extracts two subvectors, shuffles them independently, and
15362/// then concatenates them back together. This should work effectively with all
15363/// AVX vector shuffle types.
15365 SDValue V2, ArrayRef<int> Mask,
15366 SelectionDAG &DAG, bool SimpleOnly) {
15367 assert(VT.getSizeInBits() >= 256 &&
15368 "Only for 256-bit or wider vector shuffles!");
15369 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15370 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15371
15372 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15373 if (VT == MVT::v8f32) {
15374 SDValue BC1 = peekThroughBitcasts(V1);
15375 SDValue BC2 = peekThroughBitcasts(V2);
15376 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15377 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15378 DAG, SimpleOnly))
15379 return DAG.getBitcast(VT, Split);
15380 }
15381 }
15382
15383 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15384 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15385
15386 int NumElements = VT.getVectorNumElements();
15387 int SplitNumElements = NumElements / 2;
15388 MVT ScalarVT = VT.getVectorElementType();
15389 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15390
15391 // Use splitVector/extractSubVector so that split build-vectors just build two
15392 // narrower build vectors. This helps shuffling with splats and zeros.
15393 auto SplitVector = [&](SDValue V) {
15394 SDValue LoV, HiV;
15395 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15396 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15397 DAG.getBitcast(SplitVT, HiV));
15398 };
15399
15400 SDValue LoV1, HiV1, LoV2, HiV2;
15401 std::tie(LoV1, HiV1) = SplitVector(V1);
15402 std::tie(LoV2, HiV2) = SplitVector(V2);
15403
15404 // Now create two 4-way blends of these half-width vectors.
15405 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15406 bool &UseHiV1, bool &UseLoV2,
15407 bool &UseHiV2) {
15408 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15409 for (int i = 0; i < SplitNumElements; ++i) {
15410 int M = HalfMask[i];
15411 if (M >= NumElements) {
15412 if (M >= NumElements + SplitNumElements)
15413 UseHiV2 = true;
15414 else
15415 UseLoV2 = true;
15416 } else if (M >= 0) {
15417 if (M >= SplitNumElements)
15418 UseHiV1 = true;
15419 else
15420 UseLoV1 = true;
15421 }
15422 }
15423 };
15424
15425 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15426 if (!SimpleOnly)
15427 return true;
15428
15429 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15430 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15431
15432 return !(UseHiV1 || UseHiV2);
15433 };
15434
15435 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15436 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15437 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15438 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15439 for (int i = 0; i < SplitNumElements; ++i) {
15440 int M = HalfMask[i];
15441 if (M >= NumElements) {
15442 V2BlendMask[i] = M - NumElements;
15443 BlendMask[i] = SplitNumElements + i;
15444 } else if (M >= 0) {
15445 V1BlendMask[i] = M;
15446 BlendMask[i] = i;
15447 }
15448 }
15449
15450 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15451 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15452
15453 // Because the lowering happens after all combining takes place, we need to
15454 // manually combine these blend masks as much as possible so that we create
15455 // a minimal number of high-level vector shuffle nodes.
15456 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15457
15458 // First try just blending the halves of V1 or V2.
15459 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15460 return DAG.getUNDEF(SplitVT);
15461 if (!UseLoV2 && !UseHiV2)
15462 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15463 if (!UseLoV1 && !UseHiV1)
15464 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15465
15466 SDValue V1Blend, V2Blend;
15467 if (UseLoV1 && UseHiV1) {
15468 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15469 } else {
15470 // We only use half of V1 so map the usage down into the final blend mask.
15471 V1Blend = UseLoV1 ? LoV1 : HiV1;
15472 for (int i = 0; i < SplitNumElements; ++i)
15473 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15474 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15475 }
15476 if (UseLoV2 && UseHiV2) {
15477 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15478 } else {
15479 // We only use half of V2 so map the usage down into the final blend mask.
15480 V2Blend = UseLoV2 ? LoV2 : HiV2;
15481 for (int i = 0; i < SplitNumElements; ++i)
15482 if (BlendMask[i] >= SplitNumElements)
15483 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15484 }
15485 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15486 };
15487
15488 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15489 return SDValue();
15490
15491 SDValue Lo = HalfBlend(LoMask);
15492 SDValue Hi = HalfBlend(HiMask);
15493 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15494}
15495
15496/// Either split a vector in halves or decompose the shuffles and the
15497/// blend/unpack.
15498///
15499/// This is provided as a good fallback for many lowerings of non-single-input
15500/// shuffles with more than one 128-bit lane. In those cases, we want to select
15501/// between splitting the shuffle into 128-bit components and stitching those
15502/// back together vs. extracting the single-input shuffles and blending those
15503/// results.
15505 SDValue V2, ArrayRef<int> Mask,
15506 const APInt &Zeroable,
15507 const X86Subtarget &Subtarget,
15508 SelectionDAG &DAG) {
15509 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15510 "shuffles as it could then recurse on itself.");
15511 int Size = Mask.size();
15512
15513 // If this can be modeled as a broadcast of two elements followed by a blend,
15514 // prefer that lowering. This is especially important because broadcasts can
15515 // often fold with memory operands.
15516 auto DoBothBroadcast = [&] {
15517 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15518 for (int M : Mask)
15519 if (M >= Size) {
15520 if (V2BroadcastIdx < 0)
15521 V2BroadcastIdx = M - Size;
15522 else if ((M - Size) != V2BroadcastIdx &&
15523 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15524 return false;
15525 } else if (M >= 0) {
15526 if (V1BroadcastIdx < 0)
15527 V1BroadcastIdx = M;
15528 else if (M != V1BroadcastIdx &&
15529 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15530 return false;
15531 }
15532 return true;
15533 };
15534 if (DoBothBroadcast())
15535 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15536 Subtarget, DAG);
15537
15538 // If the inputs all stem from a single 128-bit lane of each input, then we
15539 // split them rather than blending because the split will decompose to
15540 // unusually few instructions.
15541 int LaneCount = VT.getSizeInBits() / 128;
15542 int LaneSize = Size / LaneCount;
15543 SmallBitVector LaneInputs[2];
15544 LaneInputs[0].resize(LaneCount, false);
15545 LaneInputs[1].resize(LaneCount, false);
15546 for (int i = 0; i < Size; ++i)
15547 if (Mask[i] >= 0)
15548 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15549 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15550 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15551 /*SimpleOnly*/ false);
15552
15553 // Without AVX2, if we can freely split the subvectors then we're better off
15554 // performing half width shuffles.
15555 if (!Subtarget.hasAVX2()) {
15556 SDValue BC1 = peekThroughBitcasts(V1);
15557 SDValue BC2 = peekThroughBitcasts(V2);
15558 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15559 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15560 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15561 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15562 if (SplatOrSplitV1 && SplatOrSplitV2)
15563 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15564 /*SimpleOnly*/ false);
15565 }
15566
15567 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15568 // requires that the decomposed single-input shuffles don't end up here.
15569 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15570 Subtarget, DAG);
15571}
15572
15573// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15574// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15576 SDValue V1, SDValue V2,
15577 ArrayRef<int> Mask,
15578 SelectionDAG &DAG) {
15579 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15580
15581 int LHSMask[4] = {-1, -1, -1, -1};
15582 int RHSMask[4] = {-1, -1, -1, -1};
15583 int SHUFPDMask[4] = {-1, -1, -1, -1};
15584
15585 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15586 // perform the shuffle once the lanes have been shuffled in place.
15587 for (int i = 0; i != 4; ++i) {
15588 int M = Mask[i];
15589 if (M < 0)
15590 continue;
15591 int LaneBase = i & ~1;
15592 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15593 LaneMask[LaneBase + (M & 1)] = M;
15594 SHUFPDMask[i] = M & 1;
15595 }
15596
15597 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15598 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15599 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15600 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15601}
15602
15603/// Lower a vector shuffle crossing multiple 128-bit lanes as
15604/// a lane permutation followed by a per-lane permutation.
15605///
15606/// This is mainly for cases where we can have non-repeating permutes
15607/// in each lane.
15608///
15609/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15610/// we should investigate merging them.
15612 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15613 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15614 int NumElts = VT.getVectorNumElements();
15615 int NumLanes = VT.getSizeInBits() / 128;
15616 int NumEltsPerLane = NumElts / NumLanes;
15617 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15618
15619 /// Attempts to find a sublane permute with the given size
15620 /// that gets all elements into their target lanes.
15621 ///
15622 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15623 /// If unsuccessful, returns false and may overwrite InLaneMask.
15624 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15625 int NumSublanesPerLane = NumSublanes / NumLanes;
15626 int NumEltsPerSublane = NumElts / NumSublanes;
15627
15628 SmallVector<int, 16> CrossLaneMask;
15629 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15630 // CrossLaneMask but one entry == one sublane.
15631 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15632 APInt DemandedCrossLane = APInt::getZero(NumElts);
15633
15634 for (int i = 0; i != NumElts; ++i) {
15635 int M = Mask[i];
15636 if (M < 0)
15637 continue;
15638
15639 int SrcSublane = M / NumEltsPerSublane;
15640 int DstLane = i / NumEltsPerLane;
15641
15642 // We only need to get the elements into the right lane, not sublane.
15643 // So search all sublanes that make up the destination lane.
15644 bool Found = false;
15645 int DstSubStart = DstLane * NumSublanesPerLane;
15646 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15647 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15648 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15649 continue;
15650
15651 Found = true;
15652 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15653 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15654 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15655 DemandedCrossLane.setBit(InLaneMask[i]);
15656 break;
15657 }
15658 if (!Found)
15659 return SDValue();
15660 }
15661
15662 // Fill CrossLaneMask using CrossLaneMaskLarge.
15663 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15664
15665 if (!CanUseSublanes) {
15666 // If we're only shuffling a single lowest lane and the rest are identity
15667 // then don't bother.
15668 // TODO - isShuffleMaskInputInPlace could be extended to something like
15669 // this.
15670 int NumIdentityLanes = 0;
15671 bool OnlyShuffleLowestLane = true;
15672 for (int i = 0; i != NumLanes; ++i) {
15673 int LaneOffset = i * NumEltsPerLane;
15674 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15675 i * NumEltsPerLane))
15676 NumIdentityLanes++;
15677 else if (CrossLaneMask[LaneOffset] != 0)
15678 OnlyShuffleLowestLane = false;
15679 }
15680 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15681 return SDValue();
15682 }
15683
15684 // Simplify CrossLaneMask based on the actual demanded elements.
15685 if (V1.hasOneUse())
15686 for (int i = 0; i != NumElts; ++i)
15687 if (!DemandedCrossLane[i])
15688 CrossLaneMask[i] = SM_SentinelUndef;
15689
15690 // Avoid returning the same shuffle operation. For example,
15691 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15692 // undef:v16i16
15693 if (CrossLaneMask == Mask || InLaneMask == Mask)
15694 return SDValue();
15695
15696 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15697 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15698 InLaneMask);
15699 };
15700
15701 // First attempt a solution with full lanes.
15702 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15703 return V;
15704
15705 // The rest of the solutions use sublanes.
15706 if (!CanUseSublanes)
15707 return SDValue();
15708
15709 // Then attempt a solution with 64-bit sublanes (vpermq).
15710 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15711 return V;
15712
15713 // If that doesn't work and we have fast variable cross-lane shuffle,
15714 // attempt 32-bit sublanes (vpermd).
15715 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15716 return SDValue();
15717
15718 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15719}
15720
15721/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
15722static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15723 SmallVector<int> &InLaneMask) {
15724 int Size = Mask.size();
15725 InLaneMask.assign(Mask.begin(), Mask.end());
15726 for (int i = 0; i < Size; ++i) {
15727 int &M = InLaneMask[i];
15728 if (M < 0)
15729 continue;
15730 if (((M % Size) / LaneSize) != (i / LaneSize))
15731 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15732 }
15733}
15734
15735/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15736/// source with a lane permutation.
15737///
15738/// This lowering strategy results in four instructions in the worst case for a
15739/// single-input cross lane shuffle which is lower than any other fully general
15740/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15741/// shuffle pattern should be handled prior to trying this lowering.
15743 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15744 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15745 // FIXME: This should probably be generalized for 512-bit vectors as well.
15746 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15747 int Size = Mask.size();
15748 int LaneSize = Size / 2;
15749
15750 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15751 // Only do this if the elements aren't all from the lower lane,
15752 // otherwise we're (probably) better off doing a split.
15753 if (VT == MVT::v4f64 &&
15754 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15755 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15756
15757 // If there are only inputs from one 128-bit lane, splitting will in fact be
15758 // less expensive. The flags track whether the given lane contains an element
15759 // that crosses to another lane.
15760 bool AllLanes;
15761 if (!Subtarget.hasAVX2()) {
15762 bool LaneCrossing[2] = {false, false};
15763 for (int i = 0; i < Size; ++i)
15764 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15765 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15766 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15767 } else {
15768 bool LaneUsed[2] = {false, false};
15769 for (int i = 0; i < Size; ++i)
15770 if (Mask[i] >= 0)
15771 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15772 AllLanes = LaneUsed[0] && LaneUsed[1];
15773 }
15774
15775 // TODO - we could support shuffling V2 in the Flipped input.
15776 assert(V2.isUndef() &&
15777 "This last part of this routine only works on single input shuffles");
15778
15779 SmallVector<int> InLaneMask;
15780 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15781
15782 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15783 "In-lane shuffle mask expected");
15784
15785 // If we're not using both lanes in each lane and the inlane mask is not
15786 // repeating, then we're better off splitting.
15787 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15788 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15789 /*SimpleOnly*/ false);
15790
15791 // Flip the lanes, and shuffle the results which should now be in-lane.
15792 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15793 SDValue Flipped = DAG.getBitcast(PVT, V1);
15794 Flipped =
15795 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15796 Flipped = DAG.getBitcast(VT, Flipped);
15797 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15798}
15799
15800/// Handle lowering 2-lane 128-bit shuffles.
15802 SDValue V2, ArrayRef<int> Mask,
15803 const APInt &Zeroable,
15804 const X86Subtarget &Subtarget,
15805 SelectionDAG &DAG) {
15806 if (V2.isUndef()) {
15807 // Attempt to match VBROADCAST*128 subvector broadcast load.
15808 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15809 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15810 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15812 MVT MemVT = VT.getHalfNumVectorElementsVT();
15813 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15816 VT, MemVT, Ld, Ofs, DAG))
15817 return BcstLd;
15818 }
15819
15820 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15821 if (Subtarget.hasAVX2())
15822 return SDValue();
15823 }
15824
15825 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15826
15827 SmallVector<int, 4> WidenedMask;
15828 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15829 return SDValue();
15830
15831 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15832 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15833
15834 // Try to use an insert into a zero vector.
15835 if (WidenedMask[0] == 0 && IsHighZero) {
15836 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15837 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15838 DAG.getVectorIdxConstant(0, DL));
15839 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15840 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15841 DAG.getVectorIdxConstant(0, DL));
15842 }
15843
15844 // TODO: If minimizing size and one of the inputs is a zero vector and the
15845 // the zero vector has only one use, we could use a VPERM2X128 to save the
15846 // instruction bytes needed to explicitly generate the zero vector.
15847
15848 // Blends are faster and handle all the non-lane-crossing cases.
15849 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15850 Subtarget, DAG))
15851 return Blend;
15852
15853 // If either input operand is a zero vector, use VPERM2X128 because its mask
15854 // allows us to replace the zero input with an implicit zero.
15855 if (!IsLowZero && !IsHighZero) {
15856 // Check for patterns which can be matched with a single insert of a 128-bit
15857 // subvector.
15858 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15859 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15860
15861 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15862 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15864 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15865 SDValue SubVec =
15866 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15867 DAG.getVectorIdxConstant(0, DL));
15868 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15869 DAG.getVectorIdxConstant(2, DL));
15870 }
15871 }
15872
15873 // Try to use SHUF128 if possible.
15874 if (Subtarget.hasVLX()) {
15875 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15876 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15877 ((WidenedMask[1] % 2) << 1);
15878 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15879 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15880 }
15881 }
15882 }
15883
15884 // Otherwise form a 128-bit permutation. After accounting for undefs,
15885 // convert the 64-bit shuffle mask selection values into 128-bit
15886 // selection bits by dividing the indexes by 2 and shifting into positions
15887 // defined by a vperm2*128 instruction's immediate control byte.
15888
15889 // The immediate permute control byte looks like this:
15890 // [1:0] - select 128 bits from sources for low half of destination
15891 // [2] - ignore
15892 // [3] - zero low half of destination
15893 // [5:4] - select 128 bits from sources for high half of destination
15894 // [6] - ignore
15895 // [7] - zero high half of destination
15896
15897 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15898 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15899
15900 unsigned PermMask = 0;
15901 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15902 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15903
15904 // Check the immediate mask and replace unused sources with undef.
15905 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15906 V1 = DAG.getUNDEF(VT);
15907 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15908 V2 = DAG.getUNDEF(VT);
15909
15910 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15911 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15912}
15913
15914/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15915/// shuffling each lane.
15916///
15917/// This attempts to create a repeated lane shuffle where each lane uses one
15918/// or two of the lanes of the inputs. The lanes of the input vectors are
15919/// shuffled in one or two independent shuffles to get the lanes into the
15920/// position needed by the final shuffle.
15922 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15923 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15924 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15925
15926 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15927 return SDValue();
15928
15929 int NumElts = Mask.size();
15930 int NumLanes = VT.getSizeInBits() / 128;
15931 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15932 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15933 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15934
15935 // First pass will try to fill in the RepeatMask from lanes that need two
15936 // sources.
15937 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15938 int Srcs[2] = {-1, -1};
15939 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15940 for (int i = 0; i != NumLaneElts; ++i) {
15941 int M = Mask[(Lane * NumLaneElts) + i];
15942 if (M < 0)
15943 continue;
15944 // Determine which of the possible input lanes (NumLanes from each source)
15945 // this element comes from. Assign that as one of the sources for this
15946 // lane. We can assign up to 2 sources for this lane. If we run out
15947 // sources we can't do anything.
15948 int LaneSrc = M / NumLaneElts;
15949 int Src;
15950 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15951 Src = 0;
15952 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15953 Src = 1;
15954 else
15955 return SDValue();
15956
15957 Srcs[Src] = LaneSrc;
15958 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15959 }
15960
15961 // If this lane has two sources, see if it fits with the repeat mask so far.
15962 if (Srcs[1] < 0)
15963 continue;
15964
15965 LaneSrcs[Lane][0] = Srcs[0];
15966 LaneSrcs[Lane][1] = Srcs[1];
15967
15968 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15969 assert(M1.size() == M2.size() && "Unexpected mask size");
15970 for (int i = 0, e = M1.size(); i != e; ++i)
15971 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15972 return false;
15973 return true;
15974 };
15975
15976 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15977 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15978 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15979 int M = Mask[i];
15980 if (M < 0)
15981 continue;
15982 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15983 "Unexpected mask element");
15984 MergedMask[i] = M;
15985 }
15986 };
15987
15988 if (MatchMasks(InLaneMask, RepeatMask)) {
15989 // Merge this lane mask into the final repeat mask.
15990 MergeMasks(InLaneMask, RepeatMask);
15991 continue;
15992 }
15993
15994 // Didn't find a match. Swap the operands and try again.
15995 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15997
15998 if (MatchMasks(InLaneMask, RepeatMask)) {
15999 // Merge this lane mask into the final repeat mask.
16000 MergeMasks(InLaneMask, RepeatMask);
16001 continue;
16002 }
16003
16004 // Couldn't find a match with the operands in either order.
16005 return SDValue();
16006 }
16007
16008 // Now handle any lanes with only one source.
16009 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16010 // If this lane has already been processed, skip it.
16011 if (LaneSrcs[Lane][0] >= 0)
16012 continue;
16013
16014 for (int i = 0; i != NumLaneElts; ++i) {
16015 int M = Mask[(Lane * NumLaneElts) + i];
16016 if (M < 0)
16017 continue;
16018
16019 // If RepeatMask isn't defined yet we can define it ourself.
16020 if (RepeatMask[i] < 0)
16021 RepeatMask[i] = M % NumLaneElts;
16022
16023 if (RepeatMask[i] < NumElts) {
16024 if (RepeatMask[i] != M % NumLaneElts)
16025 return SDValue();
16026 LaneSrcs[Lane][0] = M / NumLaneElts;
16027 } else {
16028 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16029 return SDValue();
16030 LaneSrcs[Lane][1] = M / NumLaneElts;
16031 }
16032 }
16033
16034 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16035 return SDValue();
16036 }
16037
16038 SmallVector<int, 16> NewMask(NumElts, -1);
16039 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16040 int Src = LaneSrcs[Lane][0];
16041 for (int i = 0; i != NumLaneElts; ++i) {
16042 int M = -1;
16043 if (Src >= 0)
16044 M = Src * NumLaneElts + i;
16045 NewMask[Lane * NumLaneElts + i] = M;
16046 }
16047 }
16048 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16049 // Ensure we didn't get back the shuffle we started with.
16050 // FIXME: This is a hack to make up for some splat handling code in
16051 // getVectorShuffle.
16052 if (isa<ShuffleVectorSDNode>(NewV1) &&
16053 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16054 return SDValue();
16055
16056 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16057 int Src = LaneSrcs[Lane][1];
16058 for (int i = 0; i != NumLaneElts; ++i) {
16059 int M = -1;
16060 if (Src >= 0)
16061 M = Src * NumLaneElts + i;
16062 NewMask[Lane * NumLaneElts + i] = M;
16063 }
16064 }
16065 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16066 // Ensure we didn't get back the shuffle we started with.
16067 // FIXME: This is a hack to make up for some splat handling code in
16068 // getVectorShuffle.
16069 if (isa<ShuffleVectorSDNode>(NewV2) &&
16070 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16071 return SDValue();
16072
16073 for (int i = 0; i != NumElts; ++i) {
16074 if (Mask[i] < 0) {
16075 NewMask[i] = -1;
16076 continue;
16077 }
16078 NewMask[i] = RepeatMask[i % NumLaneElts];
16079 if (NewMask[i] < 0)
16080 continue;
16081
16082 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16083 }
16084 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16085}
16086
16087/// If the input shuffle mask results in a vector that is undefined in all upper
16088/// or lower half elements and that mask accesses only 2 halves of the
16089/// shuffle's operands, return true. A mask of half the width with mask indexes
16090/// adjusted to access the extracted halves of the original shuffle operands is
16091/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16092/// lower half of each input operand is accessed.
16093static bool
16095 int &HalfIdx1, int &HalfIdx2) {
16096 assert((Mask.size() == HalfMask.size() * 2) &&
16097 "Expected input mask to be twice as long as output");
16098
16099 // Exactly one half of the result must be undef to allow narrowing.
16100 bool UndefLower = isUndefLowerHalf(Mask);
16101 bool UndefUpper = isUndefUpperHalf(Mask);
16102 if (UndefLower == UndefUpper)
16103 return false;
16104
16105 unsigned HalfNumElts = HalfMask.size();
16106 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16107 HalfIdx1 = -1;
16108 HalfIdx2 = -1;
16109 for (unsigned i = 0; i != HalfNumElts; ++i) {
16110 int M = Mask[i + MaskIndexOffset];
16111 if (M < 0) {
16112 HalfMask[i] = M;
16113 continue;
16114 }
16115
16116 // Determine which of the 4 half vectors this element is from.
16117 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16118 int HalfIdx = M / HalfNumElts;
16119
16120 // Determine the element index into its half vector source.
16121 int HalfElt = M % HalfNumElts;
16122
16123 // We can shuffle with up to 2 half vectors, set the new 'half'
16124 // shuffle mask accordingly.
16125 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16126 HalfMask[i] = HalfElt;
16127 HalfIdx1 = HalfIdx;
16128 continue;
16129 }
16130 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16131 HalfMask[i] = HalfElt + HalfNumElts;
16132 HalfIdx2 = HalfIdx;
16133 continue;
16134 }
16135
16136 // Too many half vectors referenced.
16137 return false;
16138 }
16139
16140 return true;
16141}
16142
16143/// Given the output values from getHalfShuffleMask(), create a half width
16144/// shuffle of extracted vectors followed by an insert back to full width.
16146 ArrayRef<int> HalfMask, int HalfIdx1,
16147 int HalfIdx2, bool UndefLower,
16148 SelectionDAG &DAG, bool UseConcat = false) {
16149 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16150 assert(V1.getValueType().isSimple() && "Expecting only simple types");
16151
16152 MVT VT = V1.getSimpleValueType();
16153 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16154 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16155
16156 auto getHalfVector = [&](int HalfIdx) {
16157 if (HalfIdx < 0)
16158 return DAG.getUNDEF(HalfVT);
16159 SDValue V = (HalfIdx < 2 ? V1 : V2);
16160 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16161 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16162 DAG.getVectorIdxConstant(HalfIdx, DL));
16163 };
16164
16165 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16166 SDValue Half1 = getHalfVector(HalfIdx1);
16167 SDValue Half2 = getHalfVector(HalfIdx2);
16168 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16169 if (UseConcat) {
16170 SDValue Op0 = V;
16171 SDValue Op1 = DAG.getUNDEF(HalfVT);
16172 if (UndefLower)
16173 std::swap(Op0, Op1);
16174 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16175 }
16176
16177 unsigned Offset = UndefLower ? HalfNumElts : 0;
16178 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16180}
16181
16182/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16183/// This allows for fast cases such as subvector extraction/insertion
16184/// or shuffling smaller vector types which can lower more efficiently.
16186 SDValue V2, ArrayRef<int> Mask,
16187 const X86Subtarget &Subtarget,
16188 SelectionDAG &DAG) {
16189 assert((VT.is256BitVector() || VT.is512BitVector()) &&
16190 "Expected 256-bit or 512-bit vector");
16191
16192 bool UndefLower = isUndefLowerHalf(Mask);
16193 if (!UndefLower && !isUndefUpperHalf(Mask))
16194 return SDValue();
16195
16196 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16197 "Completely undef shuffle mask should have been simplified already");
16198
16199 // Upper half is undef and lower half is whole upper subvector.
16200 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16201 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16202 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16203 if (!UndefLower &&
16204 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16205 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16206 DAG.getVectorIdxConstant(HalfNumElts, DL));
16207 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16208 DAG.getVectorIdxConstant(0, DL));
16209 }
16210
16211 // Lower half is undef and upper half is whole lower subvector.
16212 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16213 if (UndefLower &&
16214 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16215 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16216 DAG.getVectorIdxConstant(0, DL));
16217 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16218 DAG.getVectorIdxConstant(HalfNumElts, DL));
16219 }
16220
16221 int HalfIdx1, HalfIdx2;
16222 SmallVector<int, 8> HalfMask(HalfNumElts);
16223 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16224 return SDValue();
16225
16226 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16227
16228 // Only shuffle the halves of the inputs when useful.
16229 unsigned NumLowerHalves =
16230 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16231 unsigned NumUpperHalves =
16232 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16233 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16234
16235 // Determine the larger pattern of undef/halves, then decide if it's worth
16236 // splitting the shuffle based on subtarget capabilities and types.
16237 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16238 if (!UndefLower) {
16239 // XXXXuuuu: no insert is needed.
16240 // Always extract lowers when setting lower - these are all free subreg ops.
16241 if (NumUpperHalves == 0)
16242 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16243 UndefLower, DAG);
16244
16245 if (NumUpperHalves == 1) {
16246 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16247 if (Subtarget.hasAVX2()) {
16248 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
16249 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16250 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16251 (!isSingleSHUFPSMask(HalfMask) ||
16252 Subtarget.hasFastVariableCrossLaneShuffle()))
16253 return SDValue();
16254 // If this is an unary shuffle (assume that the 2nd operand is
16255 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16256 // are better off extracting the upper half of 1 operand and using a
16257 // narrow shuffle.
16258 if (EltWidth == 64 && V2.isUndef())
16259 return SDValue();
16260 // If this is an unary vXi8 shuffle with inplace halves, then perform as
16261 // full width pshufb, and then merge.
16262 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16263 return SDValue();
16264 }
16265 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16266 if (Subtarget.hasAVX512() && VT.is512BitVector())
16267 return SDValue();
16268 // Extract + narrow shuffle is better than the wide alternative.
16269 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16270 UndefLower, DAG);
16271 }
16272
16273 // Don't extract both uppers, instead shuffle and then extract.
16274 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16275 return SDValue();
16276 }
16277
16278 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16279 if (NumUpperHalves == 0) {
16280 // AVX2 has efficient 64-bit element cross-lane shuffles.
16281 // TODO: Refine to account for unary shuffle, splat, and other masks?
16282 if (Subtarget.hasAVX2() && EltWidth == 64)
16283 return SDValue();
16284 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16285 if (Subtarget.hasAVX512() && VT.is512BitVector())
16286 return SDValue();
16287 // Narrow shuffle + insert is better than the wide alternative.
16288 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16289 UndefLower, DAG);
16290 }
16291
16292 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16293 return SDValue();
16294}
16295
16296/// Handle case where shuffle sources are coming from the same 128-bit lane and
16297/// every lane can be represented as the same repeating mask - allowing us to
16298/// shuffle the sources with the repeating shuffle and then permute the result
16299/// to the destination lanes.
16301 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16302 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16303 int NumElts = VT.getVectorNumElements();
16304 int NumLanes = VT.getSizeInBits() / 128;
16305 int NumLaneElts = NumElts / NumLanes;
16306
16307 // On AVX2 we may be able to just shuffle the lowest elements and then
16308 // broadcast the result.
16309 if (Subtarget.hasAVX2()) {
16310 for (unsigned BroadcastSize : {16, 32, 64}) {
16311 if (BroadcastSize <= VT.getScalarSizeInBits())
16312 continue;
16313 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16314
16315 // Attempt to match a repeating pattern every NumBroadcastElts,
16316 // accounting for UNDEFs but only references the lowest 128-bit
16317 // lane of the inputs.
16318 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16319 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16320 for (int j = 0; j != NumBroadcastElts; ++j) {
16321 int M = Mask[i + j];
16322 if (M < 0)
16323 continue;
16324 int &R = RepeatMask[j];
16325 if (0 != ((M % NumElts) / NumLaneElts))
16326 return false;
16327 if (0 <= R && R != M)
16328 return false;
16329 R = M;
16330 }
16331 return true;
16332 };
16333
16334 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16335 if (!FindRepeatingBroadcastMask(RepeatMask))
16336 continue;
16337
16338 // Shuffle the (lowest) repeated elements in place for broadcast.
16339 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16340
16341 // Shuffle the actual broadcast.
16342 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16343 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16344 for (int j = 0; j != NumBroadcastElts; ++j)
16345 BroadcastMask[i + j] = j;
16346
16347 // Avoid returning the same shuffle operation. For example,
16348 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16349 if (BroadcastMask == Mask)
16350 return SDValue();
16351
16352 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16353 BroadcastMask);
16354 }
16355 }
16356
16357 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16358 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16359 return SDValue();
16360
16361 // Bail if we already have a repeated lane shuffle mask.
16362 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16363 return SDValue();
16364
16365 // Helper to look for repeated mask in each split sublane, and that those
16366 // sublanes can then be permuted into place.
16367 auto ShuffleSubLanes = [&](int SubLaneScale) {
16368 int NumSubLanes = NumLanes * SubLaneScale;
16369 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16370
16371 // Check that all the sources are coming from the same lane and see if we
16372 // can form a repeating shuffle mask (local to each sub-lane). At the same
16373 // time, determine the source sub-lane for each destination sub-lane.
16374 int TopSrcSubLane = -1;
16375 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16376 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16377 SubLaneScale,
16378 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16379
16380 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16381 // Extract the sub-lane mask, check that it all comes from the same lane
16382 // and normalize the mask entries to come from the first lane.
16383 int SrcLane = -1;
16384 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16385 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16386 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16387 if (M < 0)
16388 continue;
16389 int Lane = (M % NumElts) / NumLaneElts;
16390 if ((0 <= SrcLane) && (SrcLane != Lane))
16391 return SDValue();
16392 SrcLane = Lane;
16393 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16394 SubLaneMask[Elt] = LocalM;
16395 }
16396
16397 // Whole sub-lane is UNDEF.
16398 if (SrcLane < 0)
16399 continue;
16400
16401 // Attempt to match against the candidate repeated sub-lane masks.
16402 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16403 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16404 for (int i = 0; i != NumSubLaneElts; ++i) {
16405 if (M1[i] < 0 || M2[i] < 0)
16406 continue;
16407 if (M1[i] != M2[i])
16408 return false;
16409 }
16410 return true;
16411 };
16412
16413 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16414 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16415 continue;
16416
16417 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16418 for (int i = 0; i != NumSubLaneElts; ++i) {
16419 int M = SubLaneMask[i];
16420 if (M < 0)
16421 continue;
16422 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16423 "Unexpected mask element");
16424 RepeatedSubLaneMask[i] = M;
16425 }
16426
16427 // Track the top most source sub-lane - by setting the remaining to
16428 // UNDEF we can greatly simplify shuffle matching.
16429 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16430 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16431 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16432 break;
16433 }
16434
16435 // Bail if we failed to find a matching repeated sub-lane mask.
16436 if (Dst2SrcSubLanes[DstSubLane] < 0)
16437 return SDValue();
16438 }
16439 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16440 "Unexpected source lane");
16441
16442 // Create a repeating shuffle mask for the entire vector.
16443 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16444 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16445 int Lane = SubLane / SubLaneScale;
16446 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16447 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16448 int M = RepeatedSubLaneMask[Elt];
16449 if (M < 0)
16450 continue;
16451 int Idx = (SubLane * NumSubLaneElts) + Elt;
16452 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16453 }
16454 }
16455
16456 // Shuffle each source sub-lane to its destination.
16457 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16458 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16459 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16460 if (SrcSubLane < 0)
16461 continue;
16462 for (int j = 0; j != NumSubLaneElts; ++j)
16463 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16464 }
16465
16466 // Avoid returning the same shuffle operation.
16467 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16468 if (RepeatedMask == Mask || SubLaneMask == Mask)
16469 return SDValue();
16470
16471 SDValue RepeatedShuffle =
16472 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16473
16474 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16475 SubLaneMask);
16476 };
16477
16478 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16479 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16480 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16481 // Otherwise we can only permute whole 128-bit lanes.
16482 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16483 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16484 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16485 MinSubLaneScale = 2;
16486 MaxSubLaneScale =
16487 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16488 }
16489 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16490 MinSubLaneScale = MaxSubLaneScale = 4;
16491
16492 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16493 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16494 return Shuffle;
16495
16496 return SDValue();
16497}
16498
16500 bool &ForceV1Zero, bool &ForceV2Zero,
16501 unsigned &ShuffleImm, ArrayRef<int> Mask,
16502 const APInt &Zeroable) {
16503 int NumElts = VT.getVectorNumElements();
16504 assert(VT.getScalarSizeInBits() == 64 &&
16505 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16506 "Unexpected data type for VSHUFPD");
16507 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16508 "Illegal shuffle mask");
16509
16510 bool ZeroLane[2] = { true, true };
16511 for (int i = 0; i < NumElts; ++i)
16512 ZeroLane[i & 1] &= Zeroable[i];
16513
16514 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16515 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
16516 bool IsSHUFPD = true;
16517 bool IsCommutable = true;
16518 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16519 for (int i = 0; i < NumElts; ++i) {
16520 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16521 continue;
16522 if (Mask[i] < 0)
16523 return false;
16524 int Val = (i & 6) + NumElts * (i & 1);
16525 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16526 if (Mask[i] < Val || Mask[i] > Val + 1)
16527 IsSHUFPD = false;
16528 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16529 IsCommutable = false;
16530 SHUFPDMask[i] = Mask[i] % 2;
16531 }
16532
16533 if (!IsSHUFPD && !IsCommutable)
16534 return false;
16535
16536 if (!IsSHUFPD && IsCommutable)
16537 std::swap(V1, V2);
16538
16539 ForceV1Zero = ZeroLane[0];
16540 ForceV2Zero = ZeroLane[1];
16541 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16542 return true;
16543}
16544
16546 SDValue V2, ArrayRef<int> Mask,
16547 const APInt &Zeroable,
16548 const X86Subtarget &Subtarget,
16549 SelectionDAG &DAG) {
16550 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16551 "Unexpected data type for VSHUFPD");
16552
16553 unsigned Immediate = 0;
16554 bool ForceV1Zero = false, ForceV2Zero = false;
16555 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16556 Mask, Zeroable))
16557 return SDValue();
16558
16559 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16560 if (ForceV1Zero)
16561 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16562 if (ForceV2Zero)
16563 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16564
16565 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16566 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16567}
16568
16569// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16570// by zeroable elements in the remaining 24 elements. Turn this into two
16571// vmovqb instructions shuffled together.
16573 SDValue V1, SDValue V2,
16574 ArrayRef<int> Mask,
16575 const APInt &Zeroable,
16576 SelectionDAG &DAG) {
16577 assert(VT == MVT::v32i8 && "Unexpected type!");
16578
16579 // The first 8 indices should be every 8th element.
16580 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16581 return SDValue();
16582
16583 // Remaining elements need to be zeroable.
16584 if (Zeroable.countl_one() < (Mask.size() - 8))
16585 return SDValue();
16586
16587 V1 = DAG.getBitcast(MVT::v4i64, V1);
16588 V2 = DAG.getBitcast(MVT::v4i64, V2);
16589
16590 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16591 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16592
16593 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16594 // the upper bits of the result using an unpckldq.
16595 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16596 { 0, 1, 2, 3, 16, 17, 18, 19,
16597 4, 5, 6, 7, 20, 21, 22, 23 });
16598 // Insert the unpckldq into a zero vector to widen to v32i8.
16599 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16600 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16601 DAG.getVectorIdxConstant(0, DL));
16602}
16603
16604// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16605// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16606// =>
16607// ul = unpckl v1, v2
16608// uh = unpckh v1, v2
16609// a = vperm ul, uh
16610// b = vperm ul, uh
16611//
16612// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16613// and permute. We cannot directly match v3 because it is split into two
16614// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16615// pair of 256-bit shuffles and makes sure the masks are consecutive.
16616//
16617// Once unpck and permute nodes are created, the permute corresponding to this
16618// shuffle is returned, while the other permute replaces the other half of the
16619// shuffle in the selection dag.
16621 SDValue V1, SDValue V2,
16622 ArrayRef<int> Mask,
16623 SelectionDAG &DAG) {
16624 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16625 VT != MVT::v32i8)
16626 return SDValue();
16627 // <B0, B1, B0+1, B1+1, ..., >
16628 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16629 unsigned Begin1) {
16630 size_t Size = Mask.size();
16631 assert(Size % 2 == 0 && "Expected even mask size");
16632 for (unsigned I = 0; I < Size; I += 2) {
16633 if (Mask[I] != (int)(Begin0 + I / 2) ||
16634 Mask[I + 1] != (int)(Begin1 + I / 2))
16635 return false;
16636 }
16637 return true;
16638 };
16639 // Check which half is this shuffle node
16640 int NumElts = VT.getVectorNumElements();
16641 size_t FirstQtr = NumElts / 2;
16642 size_t ThirdQtr = NumElts + NumElts / 2;
16643 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16644 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16645 if (!IsFirstHalf && !IsSecondHalf)
16646 return SDValue();
16647
16648 // Find the intersection between shuffle users of V1 and V2.
16649 SmallVector<SDNode *, 2> Shuffles;
16650 for (SDNode *User : V1->users())
16651 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16652 User->getOperand(1) == V2)
16653 Shuffles.push_back(User);
16654 // Limit user size to two for now.
16655 if (Shuffles.size() != 2)
16656 return SDValue();
16657 // Find out which half of the 512-bit shuffles is each smaller shuffle
16658 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16659 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16660 SDNode *FirstHalf;
16661 SDNode *SecondHalf;
16662 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16663 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16664 FirstHalf = Shuffles[0];
16665 SecondHalf = Shuffles[1];
16666 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16667 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16668 FirstHalf = Shuffles[1];
16669 SecondHalf = Shuffles[0];
16670 } else {
16671 return SDValue();
16672 }
16673 // Lower into unpck and perm. Return the perm of this shuffle and replace
16674 // the other.
16675 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16676 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16677 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16678 DAG.getTargetConstant(0x20, DL, MVT::i8));
16679 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16680 DAG.getTargetConstant(0x31, DL, MVT::i8));
16681 if (IsFirstHalf) {
16682 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16683 return Perm1;
16684 }
16685 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16686 return Perm2;
16687}
16688
16689/// Handle lowering of 4-lane 64-bit floating point shuffles.
16690///
16691/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16692/// isn't available.
16694 const APInt &Zeroable, SDValue V1, SDValue V2,
16695 const X86Subtarget &Subtarget,
16696 SelectionDAG &DAG) {
16697 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16698 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16699 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16700
16701 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16702 Subtarget, DAG))
16703 return V;
16704
16705 if (V2.isUndef()) {
16706 // Check for being able to broadcast a single element.
16707 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16708 Mask, Subtarget, DAG))
16709 return Broadcast;
16710
16711 // Use low duplicate instructions for masks that match their pattern.
16712 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16713 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16714
16715 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16716 // Non-half-crossing single input shuffles can be lowered with an
16717 // interleaved permutation.
16718 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16719 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16720 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16721 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16722 }
16723
16724 // With AVX2 we have direct support for this permutation.
16725 if (Subtarget.hasAVX2())
16726 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16727 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16728
16729 // Try to create an in-lane repeating shuffle mask and then shuffle the
16730 // results into the target lanes.
16732 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16733 return V;
16734
16735 // Try to permute the lanes and then use a per-lane permute.
16736 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16737 Mask, DAG, Subtarget))
16738 return V;
16739
16740 // Otherwise, fall back.
16741 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16742 DAG, Subtarget);
16743 }
16744
16745 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16746 Zeroable, Subtarget, DAG))
16747 return Blend;
16748
16749 // Use dedicated unpack instructions for masks that match their pattern.
16750 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16751 return V;
16752
16753 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16754 Zeroable, Subtarget, DAG))
16755 return Op;
16756
16757 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16758 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16759 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16760 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16761
16762 // If we have lane crossing shuffles AND they don't all come from the lower
16763 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16764 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16765 // canonicalize to a blend of splat which isn't necessary for this combine.
16766 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16767 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16768 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16769 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16770 (!Subtarget.hasAVX2() ||
16771 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16772 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16773
16774 // If we have one input in place, then we can permute the other input and
16775 // blend the result.
16776 if (V1IsInPlace || V2IsInPlace)
16777 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG);
16779
16780 // Try to create an in-lane repeating shuffle mask and then shuffle the
16781 // results into the target lanes.
16783 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16784 return V;
16785
16786 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16787 // shuffle. However, if we have AVX2 and either inputs are already in place,
16788 // we will be able to shuffle even across lanes the other input in a single
16789 // instruction so skip this pattern.
16790 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16792 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16793 return V;
16794
16795 // If we have VLX support, we can use VEXPAND.
16796 if (Subtarget.hasVLX())
16797 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16798 Zeroable, Subtarget, DAG))
16799 return V;
16800
16801 // If we have AVX2 then we always want to lower with a blend because an v4 we
16802 // can fully permute the elements.
16803 if (Subtarget.hasAVX2())
16804 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16805 Zeroable, Subtarget, DAG);
16806
16807 // Otherwise fall back on generic lowering.
16808 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16809 Subtarget, DAG);
16810}
16811
16812/// Handle lowering of 4-lane 64-bit integer shuffles.
16813///
16814/// This routine is only called when we have AVX2 and thus a reasonable
16815/// instruction set for v4i64 shuffling..
16817 const APInt &Zeroable, SDValue V1, SDValue V2,
16818 const X86Subtarget &Subtarget,
16819 SelectionDAG &DAG) {
16820 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16821 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16822 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16823 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16824
16825 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16826 Subtarget, DAG))
16827 return V;
16828
16829 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16830 Zeroable, Subtarget, DAG))
16831 return Blend;
16832
16833 // Check for being able to broadcast a single element.
16834 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16835 Subtarget, DAG))
16836 return Broadcast;
16837
16838 // Try to use shift instructions if fast.
16839 if (Subtarget.preferLowerShuffleAsShift())
16840 if (SDValue Shift =
16841 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16842 Subtarget, DAG, /*BitwiseOnly*/ true))
16843 return Shift;
16844
16845 if (V2.isUndef()) {
16846 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16847 // can use lower latency instructions that will operate on both lanes.
16848 SmallVector<int, 2> RepeatedMask;
16849 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16850 SmallVector<int, 4> PSHUFDMask;
16851 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16852 return DAG.getBitcast(
16853 MVT::v4i64,
16854 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16855 DAG.getBitcast(MVT::v8i32, V1),
16856 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16857 }
16858
16859 // AVX2 provides a direct instruction for permuting a single input across
16860 // lanes.
16861 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16862 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16863 }
16864
16865 // Try to use shift instructions.
16866 if (SDValue Shift =
16867 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16868 DAG, /*BitwiseOnly*/ false))
16869 return Shift;
16870
16871 // If we have VLX support, we can use VALIGN or VEXPAND.
16872 if (Subtarget.hasVLX()) {
16873 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16874 Zeroable, Subtarget, DAG))
16875 return Rotate;
16876
16877 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16878 Zeroable, Subtarget, DAG))
16879 return V;
16880 }
16881
16882 // Try to use PALIGNR.
16883 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16884 Subtarget, DAG))
16885 return Rotate;
16886
16887 // Use dedicated unpack instructions for masks that match their pattern.
16888 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16889 return V;
16890
16891 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16892 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16893
16894 // If we have one input in place, then we can permute the other input and
16895 // blend the result.
16896 if (V1IsInPlace || V2IsInPlace)
16897 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16898 Zeroable, Subtarget, DAG);
16899
16900 // Try to create an in-lane repeating shuffle mask and then shuffle the
16901 // results into the target lanes.
16903 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16904 return V;
16905
16906 // Try to lower to PERMQ(BLENDD(V1,V2)).
16907 if (SDValue V =
16908 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16909 return V;
16910
16911 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16912 // shuffle. However, if we have AVX2 and either inputs are already in place,
16913 // we will be able to shuffle even across lanes the other input in a single
16914 // instruction so skip this pattern.
16915 if (!V1IsInPlace && !V2IsInPlace)
16917 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16918 return Result;
16919
16920 // Otherwise fall back on generic blend lowering.
16921 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16922 Zeroable, Subtarget, DAG);
16923}
16924
16925 /// Handle lowering of 8-lane 32-bit floating point shuffles.
16926 ///
16927 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16928 /// isn't available.
// NOTE(review): the signature-opening line (original line 16929, presumably
// "static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was dropped during extraction — verify against the upstream file.
16930 const APInt &Zeroable, SDValue V1, SDValue V2,
16931 const X86Subtarget &Subtarget,
16932 SelectionDAG &DAG) {
16933 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16934 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16935 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16936
// Strategy ordering: cheapest forms first (blend, broadcast), then
// lane-repeated patterns, then progressively more expensive cross-lane
// permutes, finally generic split/blend decomposition.
16937 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16938 Zeroable, Subtarget, DAG))
16939 return Blend;
16940
16941 // Check for being able to broadcast a single element.
16942 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16943 Subtarget, DAG))
16944 return Broadcast;
16945
// Without AVX2, prefer a simple 128-bit split when the in-lane pattern is not
// repeated — the per-lane halves lower cheaply on AVX1.
16946 if (!Subtarget.hasAVX2()) {
16947 SmallVector<int> InLaneMask;
16948 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16949
16950 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16951 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16952 /*SimpleOnly*/ true))
16953 return R;
16954 }
16955 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16956 Zeroable, Subtarget, DAG))
16957 return DAG.getBitcast(MVT::v8f32, ZExt);
16958
16959 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16960 // options to efficiently lower the shuffle.
16961 SmallVector<int, 4> RepeatedMask;
16962 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16963 assert(RepeatedMask.size() == 4 &&
16964 "Repeated masks must be half the mask width!");
16965
16966 // Use even/odd duplicate instructions for masks that match their pattern.
16967 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16968 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16969 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16970 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16971
16972 if (V2.isUndef())
16973 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16974 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16975
16976 // Use dedicated unpack instructions for masks that match their pattern.
16977 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16978 return V;
16979
16980 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16981 // have already handled any direct blends.
16982 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16983 }
16984
16985 // Try to create an in-lane repeating shuffle mask and then shuffle the
16986 // results into the target lanes.
// NOTE(review): the "if (SDValue V = <callee>(" opener (original line 16987)
// was dropped by extraction — presumably
// lowerShuffleAsRepeatedMaskAndLanePermute; confirm against upstream.
16988 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16989 return V;
16990
16991 // If we have a single input shuffle with different shuffle patterns in the
16992 // two 128-bit lanes use the variable mask to VPERMILPS.
16993 if (V2.isUndef()) {
16994 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16995 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16996 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16997 }
16998 if (Subtarget.hasAVX2()) {
16999 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17000 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17001 }
17002 // Otherwise, fall back.
17003 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17004 DAG, Subtarget);
17005 }
17006
17007 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17008 // shuffle.
// NOTE(review): the "if (SDValue Result = <callee>(" opener (original line
// 17009) was dropped — presumably lowerShuffleAsLanePermuteAndRepeatedMask;
// confirm against upstream.
17010 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17011 return Result;
17012
17013 // If we have VLX support, we can use VEXPAND.
17014 if (Subtarget.hasVLX())
17015 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
17016 Zeroable, Subtarget, DAG))
17017 return V;
17018
17019 // Try to match an interleave of two v8f32s and lower them as unpck and
17020 // permutes using ymms. This needs to go before we try to split the vectors.
17021 // Don't attempt on AVX1 if we're likely to split vectors anyway.
17022 if ((Subtarget.hasAVX2() ||
// NOTE(review): the middle of this condition (original lines 17023-17024,
// the non-AVX2 "free to split" check) was lost in extraction — confirm the
// full predicate against upstream before relying on this guard.
17025 !Subtarget.hasAVX512())
17026 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
17027 Mask, DAG))
17028 return V;
17029
17030 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17031 // since after split we get a more efficient code using vpunpcklwd and
17032 // vpunpckhwd instrs than vblend.
17033 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17034 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17035 Subtarget, DAG);
17036
17037 // If we have AVX2 then we always want to lower with a blend because at v8 we
17038 // can fully permute the elements.
17039 if (Subtarget.hasAVX2())
17040 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17041 Zeroable, Subtarget, DAG);
17042
17043 // Otherwise fall back on generic lowering.
17044 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
17045 Subtarget, DAG);
17046}
17047
17048 /// Handle lowering of 8-lane 32-bit integer shuffles.
17049 ///
17050 /// This routine is only called when we have AVX2 and thus a reasonable
17051 /// instruction set for v8i32 shuffling..
// NOTE(review): the signature-opening line (original line 17052, presumably
// "static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was dropped during extraction — verify against the upstream file.
17053 const APInt &Zeroable, SDValue V1, SDValue V2,
17054 const X86Subtarget &Subtarget,
17055 SelectionDAG &DAG) {
17056 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17057 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17058 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17059 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17060
// Count mask entries referencing V2 (indices 8..15) — used to gate the
// single-input-only rotate paths below.
17061 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
17062
17063 // Whenever we can lower this as a zext, that instruction is strictly faster
17064 // than any alternative. It also allows us to fold memory operands into the
17065 // shuffle in many cases.
17066 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17067 Zeroable, Subtarget, DAG))
17068 return ZExt;
17069
17070 // Try to match an interleave of two v8i32s and lower them as unpck and
17071 // permutes using ymms. This needs to go before we try to split the vectors.
17072 if (!Subtarget.hasAVX512())
17073 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
17074 Mask, DAG))
17075 return V;
17076
17077 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
17078 // since after split we get a more efficient code than vblend by using
17079 // vpunpcklwd and vpunpckhwd instrs.
17080 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
17081 !Subtarget.hasAVX512())
17082 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17083 Subtarget, DAG);
17084
17085 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17086 Zeroable, Subtarget, DAG))
17087 return Blend;
17088
17089 // Check for being able to broadcast a single element.
17090 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17091 Subtarget, DAG))
17092 return Broadcast;
17093
17094 // Try to use shift instructions if fast.
17095 if (Subtarget.preferLowerShuffleAsShift()) {
17096 if (SDValue Shift =
17097 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
17098 Subtarget, DAG, /*BitwiseOnly*/ true))
17099 return Shift;
17100 if (NumV2Elements == 0)
17101 if (SDValue Rotate =
17102 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17103 return Rotate;
17104 }
17105
17106 // If the shuffle mask is repeated in each 128-bit lane we can use more
17107 // efficient instructions that mirror the shuffles across the two 128-bit
17108 // lanes.
17109 SmallVector<int, 4> RepeatedMask;
17110 bool Is128BitLaneRepeatedShuffle =
17111 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17112 if (Is128BitLaneRepeatedShuffle) {
17113 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17114 if (V2.isUndef())
17115 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17116 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17117
17118 // Use dedicated unpack instructions for masks that match their pattern.
17119 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
17120 return V;
17121 }
17122
17123 // Try to use shift instructions.
17124 if (SDValue Shift =
17125 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
17126 DAG, /*BitwiseOnly*/ false))
17127 return Shift;
17128
17129 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17130 if (SDValue Rotate =
17131 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
17132 return Rotate;
17133
17134 // If we have VLX support, we can use VALIGN or EXPAND.
17135 if (Subtarget.hasVLX()) {
17136 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17137 Zeroable, Subtarget, DAG))
17138 return Rotate;
17139
17140 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
17141 Zeroable, Subtarget, DAG))
17142 return V;
17143 }
17144
17145 // Try to use byte rotation instructions.
17146 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17147 Subtarget, DAG))
17148 return Rotate;
17149
17150 // Try to create an in-lane repeating shuffle mask and then shuffle the
17151 // results into the target lanes.
// NOTE(review): the "if (SDValue V = <callee>(" opener (original line 17152)
// was dropped by extraction — presumably
// lowerShuffleAsRepeatedMaskAndLanePermute; confirm against upstream.
17153 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17154 return V;
17155
17156 if (V2.isUndef()) {
17157 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17158 // because that should be faster than the variable permute alternatives.
17159 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
17160 return V;
17161
17162 // If the shuffle patterns aren't repeated but it's a single input, directly
17163 // generate a cross-lane VPERMD instruction.
17164 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17165 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17166 }
17167
17168 // Assume that a single SHUFPS is faster than an alternative sequence of
17169 // multiple instructions (even if the CPU has a domain penalty).
17170 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17171 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17172 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17173 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17174 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17175 CastV1, CastV2, DAG);
17176 return DAG.getBitcast(MVT::v8i32, ShufPS);
17177 }
17178
17179 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17180 // shuffle.
// NOTE(review): the "if (SDValue Result = <callee>(" opener (original line
// 17181) was dropped — presumably lowerShuffleAsLanePermuteAndRepeatedMask;
// confirm against upstream.
17182 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17183 return Result;
17184
17185 // Otherwise fall back on generic blend lowering.
17186 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17187 Zeroable, Subtarget, DAG);
17188}
17189
17190 /// Handle lowering of 16-lane 16-bit integer shuffles.
17191 ///
17192 /// This routine is only called when we have AVX2 and thus a reasonable
17193 /// instruction set for v16i16 shuffling..
// NOTE(review): the signature-opening line (original line 17194, presumably
// "static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was dropped during extraction — verify against the upstream file.
17195 const APInt &Zeroable, SDValue V1, SDValue V2,
17196 const X86Subtarget &Subtarget,
17197 SelectionDAG &DAG) {
17198 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17199 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17200 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17201 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17202
17203 // Whenever we can lower this as a zext, that instruction is strictly faster
17204 // than any alternative. It also allows us to fold memory operands into the
17205 // shuffle in many cases.
// NOTE(review): the "if (SDValue ZExt = <callee>(" opener (original line
// 17206) was dropped by extraction — presumably lowerShuffleAsZeroOrAnyExtend
// (matching the sibling v8i32/v32i8 routines); confirm against upstream.
17207 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17208 return ZExt;
17209
17210 // Check for being able to broadcast a single element.
17211 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17212 Subtarget, DAG))
17213 return Broadcast;
17214
17215 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17216 Zeroable, Subtarget, DAG))
17217 return Blend;
17218
17219 // Use dedicated unpack instructions for masks that match their pattern.
17220 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
17221 return V;
17222
17223 // Use dedicated pack instructions for masks that match their pattern.
17224 if (SDValue V =
17225 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17226 return V;
17227
17228 // Try to use lower using a truncation.
17229 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17230 Subtarget, DAG))
17231 return V;
17232
17233 // Try to use shift instructions.
17234 if (SDValue Shift =
17235 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17236 Subtarget, DAG, /*BitwiseOnly*/ false))
17237 return Shift;
17238
17239 // Try to use byte rotation instructions.
17240 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17241 Subtarget, DAG))
17242 return Rotate;
17243
17244 // Try to create an in-lane repeating shuffle mask and then shuffle the
17245 // results into the target lanes.
// NOTE(review): call opener (original line 17246) was dropped — presumably
// lowerShuffleAsRepeatedMaskAndLanePermute; confirm against upstream.
17247 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17248 return V;
17249
17250 if (V2.isUndef()) {
17251 // Try to use bit rotation instructions.
17252 if (SDValue Rotate =
17253 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17254 return Rotate;
17255
17256 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17257 // because that should be faster than the variable permute alternatives.
17258 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17259 return V;
17260
17261 // There are no generalized cross-lane shuffle operations available on i16
17262 // element types.
17263 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
// NOTE(review): call opener (original line 17264) was dropped — presumably
// lowerShuffleAsLanePermuteAndPermute; confirm against upstream.
17265 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17266 return V;
17267
17268 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17269 DAG, Subtarget);
17270 }
17271
17272 SmallVector<int, 8> RepeatedMask;
17273 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17274 // As this is a single-input shuffle, the repeated mask should be
17275 // a strictly valid v8i16 mask that we can pass through to the v8i16
17276 // lowering to handle even the v16 case.
// NOTE(review): the "return <callee>(" opener (original line 17277) was
// dropped — presumably lowerV8I16GeneralSingleInputShuffle; confirm
// against upstream.
17278 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17279 }
17280 }
17281
17282 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17283 Zeroable, Subtarget, DAG))
17284 return PSHUFB;
17285
17286 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17287 if (Subtarget.hasBWI())
17288 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17289
17290 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17291 // shuffle.
// NOTE(review): call opener (original line 17292) was dropped — presumably
// lowerShuffleAsLanePermuteAndRepeatedMask; confirm against upstream.
17293 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17294 return Result;
17295
17296 // Try to permute the lanes and then use a per-lane permute.
// NOTE(review): call opener (original line 17297) was dropped — presumably
// lowerShuffleAsLanePermuteAndPermute; confirm against upstream.
17298 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17299 return V;
17300
17301 // Try to match an interleave of two v16i16s and lower them as unpck and
17302 // permutes using ymms.
17303 if (!Subtarget.hasAVX512())
17304 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17305 Mask, DAG))
17306 return V;
17307
17308 // Otherwise fall back on generic lowering.
17309 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17310 Subtarget, DAG);
17311}
17312
17313 /// Handle lowering of 32-lane 8-bit integer shuffles.
17314 ///
17315 /// This routine is only called when we have AVX2 and thus a reasonable
17316 /// instruction set for v32i8 shuffling..
// NOTE(review): the signature-opening line (original line 17317, presumably
// "static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was dropped during extraction — verify against the upstream file.
17318 const APInt &Zeroable, SDValue V1, SDValue V2,
17319 const X86Subtarget &Subtarget,
17320 SelectionDAG &DAG) {
17321 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17322 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17323 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17324 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17325
17326 // Whenever we can lower this as a zext, that instruction is strictly faster
17327 // than any alternative. It also allows us to fold memory operands into the
17328 // shuffle in many cases.
17329 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17330 Zeroable, Subtarget, DAG))
17331 return ZExt;
17332
17333 // Check for being able to broadcast a single element.
17334 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17335 Subtarget, DAG))
17336 return Broadcast;
17337
17338 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17339 Zeroable, Subtarget, DAG))
17340 return Blend;
17341
17342 // Use dedicated unpack instructions for masks that match their pattern.
17343 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17344 return V;
17345
17346 // Use dedicated pack instructions for masks that match their pattern.
17347 if (SDValue V =
17348 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17349 return V;
17350
17351 // Try to use lower using a truncation.
17352 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17353 Subtarget, DAG))
17354 return V;
17355
17356 // Try to use shift instructions.
17357 if (SDValue Shift =
17358 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17359 DAG, /*BitwiseOnly*/ false))
17360 return Shift;
17361
17362 // Try to use byte rotation instructions.
17363 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17364 Subtarget, DAG))
17365 return Rotate;
17366
17367 // Try to use bit rotation instructions.
17368 if (V2.isUndef())
17369 if (SDValue Rotate =
17370 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17371 return Rotate;
17372
17373 // Try to create an in-lane repeating shuffle mask and then shuffle the
17374 // results into the target lanes.
// NOTE(review): call opener (original line 17375) was dropped — presumably
// lowerShuffleAsRepeatedMaskAndLanePermute; confirm against upstream.
17376 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17377 return V;
17378
17379 // There are no generalized cross-lane shuffle operations available on i8
17380 // element types.
17381 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17382 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17383 // because that should be faster than the variable permute alternatives.
17384 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17385 return V;
17386
// NOTE(review): call opener (original line 17387) was dropped — presumably
// lowerShuffleAsLanePermuteAndPermute; confirm against upstream.
17388 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17389 return V;
17390
17391 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17392 DAG, Subtarget);
17393 }
17394
17395 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17396 Zeroable, Subtarget, DAG))
17397 return PSHUFB;
17398
17399 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17400 if (Subtarget.hasVBMI())
17401 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17402
17403 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17404 // shuffle.
// NOTE(review): call opener (original line 17405) was dropped — presumably
// lowerShuffleAsLanePermuteAndRepeatedMask; confirm against upstream.
17406 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17407 return Result;
17408
17409 // Try to permute the lanes and then use a per-lane permute.
// NOTE(review): call opener (original line 17410) was dropped — presumably
// lowerShuffleAsLanePermuteAndPermute; confirm against upstream.
17411 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17412 return V;
17413
17414 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
17415 // by zeroable elements in the remaining 24 elements. Turn this into two
17416 // vmovqb instructions shuffled together.
17417 if (Subtarget.hasVLX())
17418 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17419 Mask, Zeroable, DAG))
17420 return V;
17421
17422 // Try to match an interleave of two v32i8s and lower them as unpck and
17423 // permutes using ymms.
17424 if (!Subtarget.hasAVX512())
17425 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17426 Mask, DAG))
17427 return V;
17428
17429 // Otherwise fall back on generic lowering.
17430 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17431 Subtarget, DAG);
17432}
17433
17434 /// High-level routine to lower various 256-bit x86 vector shuffles.
17435 ///
17436 /// This routine either breaks down the specific type of a 256-bit x86 vector
17437 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
17438 /// together based on the available instructions.
// NOTE(review): the signature-opening line (original line 17439, presumably
// "static SDValue lower256BitShuffle(const SDLoc &DL, MVT VT, ArrayRef<int>
// Mask,") was dropped during extraction — verify against the upstream file.
17440 SDValue V1, SDValue V2, const APInt &Zeroable,
17441 const X86Subtarget &Subtarget,
17442 SelectionDAG &DAG) {
17443 // If we have a single input to the zero element, insert that into V1 if we
17444 // can do so cheaply.
17445 int NumElts = VT.getVectorNumElements();
17446 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17447
17448 if (NumV2Elements == 1 && Mask[0] >= NumElts)
// NOTE(review): call opener (original line 17449) was dropped — presumably
// "if (SDValue Insertion = lowerShuffleAsElementInsertion("; confirm
// against upstream.
17450 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17451 return Insertion;
17452
17453 // Handle special cases where the lower or upper half is UNDEF.
17454 if (SDValue V =
17455 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17456 return V;
17457
17458 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17459 // can check for those subtargets here and avoid much of the subtarget
17460 // querying in the per-vector-type lowering routines. With AVX1 we have
17461 // essentially *zero* ability to manipulate a 256-bit vector with integer
17462 // types. Since we'll use floating point types there eventually, just
17463 // immediately cast everything to a float and operate entirely in that domain.
17464 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17465 int ElementBits = VT.getScalarSizeInBits();
17466 if (ElementBits < 32) {
17467 // No floating point type available, if we can't use the bit operations
17468 // for masking/blending then decompose into 128-bit vectors.
17469 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17470 Subtarget, DAG))
17471 return V;
17472 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17473 return V;
17474 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17475 }
17476
17477 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
// NOTE(review): the continuation line (original line 17478, presumably
// "VT.getVectorNumElements());") was dropped during extraction — confirm
// against upstream.
17479 V1 = DAG.getBitcast(FpVT, V1);
17480 V2 = DAG.getBitcast(FpVT, V2);
17481 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17482 }
17483
// f16/bf16 shuffles are lowered in the equivalent-width i16 domain and
// bitcast back afterwards.
17484 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17485 V1 = DAG.getBitcast(MVT::v16i16, V1);
17486 V2 = DAG.getBitcast(MVT::v16i16, V2);
17487 return DAG.getBitcast(VT,
17488 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17489 }
17490
// Dispatch to the per-element-type lowering routine.
17491 switch (VT.SimpleTy) {
17492 case MVT::v4f64:
17493 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17494 case MVT::v4i64:
17495 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17496 case MVT::v8f32:
17497 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17498 case MVT::v8i32:
17499 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17500 case MVT::v16i16:
17501 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17502 case MVT::v32i8:
17503 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17504
17505 default:
17506 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17507 }
17508}
17509
17510 /// Try to lower a vector shuffle as a 128-bit shuffles.
// NOTE(review): the signature-opening line (original line 17511, presumably
// "static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int>
// Mask,") was dropped during extraction — verify against the upstream file.
17512 const APInt &Zeroable, SDValue V1, SDValue V2,
17513 const X86Subtarget &Subtarget,
17514 SelectionDAG &DAG) {
17515 assert(VT.getScalarSizeInBits() == 64 &&
17516 "Unexpected element type size for 128bit shuffle.");
17517
17518 // To handle 256 bit vector requires VLX and most probably
17519 // function lowerV2X128VectorShuffle() is better solution.
17520 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17521
17522 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
// Widen pairs of 64-bit elements into 128-bit "lanes": Widened128Mask[i]
// selects which 128-bit quarter of {V1,V2} fills quarter i of the result.
17523 SmallVector<int, 4> Widened128Mask;
17524 if (!canWidenShuffleElements(Mask, Widened128Mask))
17525 return SDValue();
17526 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17527
17528 // Try to use an insert into a zero vector.
// Zeroable bits here are per 64-bit element: 0xf0 = upper 256 bits zero,
// 0x0c = elements 2-3 (the second 128-bit quarter) zero.
17529 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17530 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17531 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17532 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17533 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17534 DAG.getVectorIdxConstant(0, DL));
17535 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17536 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17537 DAG.getVectorIdxConstant(0, DL));
17538 }
17539
17540 // Check for patterns which can be matched with a single insert of a 256-bit
17541 // subvector.
17542 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17543 if (OnlyUsesV1 ||
17544 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17545 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17546 SDValue SubVec =
17547 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17548 DAG.getVectorIdxConstant(0, DL));
17549 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17550 DAG.getVectorIdxConstant(4, DL));
17551 }
17552
17553 // See if this is an insertion of the lower 128-bits of V2 into V1.
17554 bool IsInsert = true;
17555 int V2Index = -1;
17556 for (int i = 0; i < 4; ++i) {
17557 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17558 if (Widened128Mask[i] < 0)
17559 continue;
17560
17561 // Make sure all V1 subvectors are in place.
17562 if (Widened128Mask[i] < 4) {
17563 if (Widened128Mask[i] != i) {
17564 IsInsert = false;
17565 break;
17566 }
17567 } else {
17568 // Make sure we only have a single V2 index and its the lowest 128-bits.
17569 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17570 IsInsert = false;
17571 break;
17572 }
17573 V2Index = i;
17574 }
17575 }
17576 if (IsInsert && V2Index >= 0) {
17577 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17578 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17579 DAG.getVectorIdxConstant(0, DL));
17580 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17581 }
17582
17583 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17584 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17585 // possible we at least ensure the lanes stay sequential to help later
17586 // combines.
17587 SmallVector<int, 2> Widened256Mask;
17588 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17589 Widened128Mask.clear();
17590 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17591 }
17592
17593 // Try to lower to vshuf64x2/vshuf32x4.
17594 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17595 int PermMask[4] = {-1, -1, -1, -1};
17596 // Ensure elements came from the same Op.
// SHUF128 reads its low two result quarters from Ops[0] and the high two
// from Ops[1], so each half of the mask must draw from a single source.
17597 for (int i = 0; i < 4; ++i) {
17598 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17599 if (Widened128Mask[i] < 0)
17600 continue;
17601
17602 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17603 unsigned OpIndex = i / 2;
17604 if (Ops[OpIndex].isUndef())
17605 Ops[OpIndex] = Op;
17606 else if (Ops[OpIndex] != Op)
17607 return SDValue();
17608
17609 PermMask[i] = Widened128Mask[i] % 4;
17610 }
17611
17612 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17613 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17614}
17615
17616 /// Handle lowering of 8-lane 64-bit floating point shuffles.
// NOTE(review): the signature-opening line (original line 17617, presumably
// "static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,")
// was dropped during extraction — verify against the upstream file.
17618 const APInt &Zeroable, SDValue V1, SDValue V2,
17619 const X86Subtarget &Subtarget,
17620 SelectionDAG &DAG) {
17621 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17622 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17623 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17624
17625 if (V2.isUndef()) {
17626 // Use low duplicate instructions for masks that match their pattern.
17627 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17628 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17629
17630 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17631 // Non-half-crossing single input shuffles can be lowered with an
17632 // interleaved permutation.
// Each in-lane pair contributes two immediate bits; a set bit selects the
// odd element of that pair (VPERMILPD per-lane select encoding).
17633 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17634 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17635 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17636 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17637 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17638 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17639 }
17640
17641 SmallVector<int, 4> RepeatedMask;
17642 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17643 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17644 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17645 }
17646
17647 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17648 V2, Subtarget, DAG))
17649 return Shuf128;
17650
17651 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17652 return Unpck;
17653
17654 // Check if the blend happens to exactly fit that of SHUFPD.
17655 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17656 Zeroable, Subtarget, DAG))
17657 return Op;
17658
17659 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17660 Subtarget, DAG))
17661 return V;
17662
17663 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17664 Zeroable, Subtarget, DAG))
17665 return Blend;
17666
// Last resort: fully general variable permute (VPERMPD via PERMV).
17667 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17668}
17669
17670/// Handle lowering of 16-lane 32-bit floating point shuffles.
// NOTE(review): the opening signature line (orig. 17671) was dropped by the
// doc extraction; presumably
//   `static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
17672 const APInt &Zeroable, SDValue V1, SDValue V2,
17673 const X86Subtarget &Subtarget,
17674 SelectionDAG &DAG) {
17675 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17676 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17677 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17678
17679 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17680 // options to efficiently lower the shuffle.
17681 SmallVector<int, 4> RepeatedMask;
17682 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17683 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17684
17685 // Use even/odd duplicate instructions for masks that match their pattern.
17686 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17687 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17688 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17689 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17690
// Single-input repeated mask: a VPERMILPS immediate covers all four lanes.
17691 if (V2.isUndef())
17692 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17693 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17694
17695 // Use dedicated unpack instructions for masks that match their pattern.
17696 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17697 return V;
17698
17699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17700 Zeroable, Subtarget, DAG))
17701 return Blend;
17702
17703 // Otherwise, fall back to a SHUFPS sequence.
17704 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17705 }
17706
17707 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17708 Zeroable, Subtarget, DAG))
17709 return Blend;
17710
// NOTE(review): a call line (orig. 17711) was dropped by the doc extraction;
// presumably `if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(` -- confirm.
17712 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17713 return DAG.getBitcast(MVT::v16f32, ZExt);
17714
17715 // Try to create an in-lane repeating shuffle mask and then shuffle the
17716 // results into the target lanes.
// NOTE(review): a call line (orig. 17717) was dropped by the doc extraction;
// presumably `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(`.
17718 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17719 return V;
17720
17721 // If we have a single input shuffle with different shuffle patterns in the
17722 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17723 if (V2.isUndef() &&
17724 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17725 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17726 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17727 }
17728
17729 // If we have AVX512F support, we can use VEXPAND.
17730 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17731 Zeroable, Subtarget, DAG))
17732 return V;
17733
// Last resort: a fully general variable permute (VPERMPS / VPERMT2PS).
17734 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17735}
17736
17737/// Handle lowering of 8-lane 64-bit integer shuffles.
// NOTE(review): the opening signature line (orig. 17738) was dropped by the
// doc extraction; presumably
//   `static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
17739 const APInt &Zeroable, SDValue V1, SDValue V2,
17740 const X86Subtarget &Subtarget,
17741 SelectionDAG &DAG) {
17742 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17743 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17744 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17745
17746 // Try to use shift instructions if fast.
17747 if (Subtarget.preferLowerShuffleAsShift())
17748 if (SDValue Shift =
17749 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17750 Subtarget, DAG, /*BitwiseOnly*/ true))
17751 return Shift;
17752
17753 if (V2.isUndef()) {
17754 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17755 // can use lower latency instructions that will operate on all four
17756 // 128-bit lanes.
17757 SmallVector<int, 2> Repeated128Mask;
17758 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17759 SmallVector<int, 4> PSHUFDMask;
// Widen the 2-element repeated mask to a 4-element i32 mask so it can be
// expressed as a PSHUFD immediate over the bitcast v16i32 value.
17760 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17761 return DAG.getBitcast(
17762 MVT::v8i64,
17763 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17764 DAG.getBitcast(MVT::v16i32, V1),
17765 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17766 }
17767
17768 SmallVector<int, 4> Repeated256Mask;
17769 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17770 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17771 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17772 }
17773
// Try lowering as a shuffle of whole 128-bit subvectors (VSHUFI64x2 etc.).
17774 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17775 V2, Subtarget, DAG))
17776 return Shuf128;
17777
17778 // Try to use shift instructions.
17779 if (SDValue Shift =
17780 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17781 DAG, /*BitwiseOnly*/ false))
17782 return Shift;
17783
17784 // Try to use VALIGN.
17785 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17786 Zeroable, Subtarget, DAG))
17787 return Rotate;
17788
17789 // Try to use PALIGNR.
17790 if (Subtarget.hasBWI())
17791 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17792 Subtarget, DAG))
17793 return Rotate;
17794
17795 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17796 return Unpck;
17797
17798 // If we have AVX512F support, we can use VEXPAND.
17799 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17800 Subtarget, DAG))
17801 return V;
17802
17803 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17804 Zeroable, Subtarget, DAG))
17805 return Blend;
17806
// Last resort: a fully general variable permute.
17807 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17808}
17809
17810/// Handle lowering of 16-lane 32-bit integer shuffles.
// NOTE(review): the opening signature line (orig. 17811) was dropped by the
// doc extraction; presumably
//   `static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
17812 const APInt &Zeroable, SDValue V1, SDValue V2,
17813 const X86Subtarget &Subtarget,
17814 SelectionDAG &DAG) {
17815 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17816 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17817 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17818
// Count how many mask elements select from V2 (indices >= 16).
17819 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17820
17821 // Whenever we can lower this as a zext, that instruction is strictly faster
17822 // than any alternative. It also allows us to fold memory operands into the
17823 // shuffle in many cases.
// NOTE(review): a call line (orig. 17824) was dropped by the doc extraction;
// presumably `if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(` -- confirm.
17825 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17826 return ZExt;
17827
17828 // Try to use shift instructions if fast.
17829 if (Subtarget.preferLowerShuffleAsShift()) {
17830 if (SDValue Shift =
17831 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17832 Subtarget, DAG, /*BitwiseOnly*/ true))
17833 return Shift;
// Bit-rotate is only applicable to single-input shuffles.
17834 if (NumV2Elements == 0)
17835 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17836 Subtarget, DAG))
17837 return Rotate;
17838 }
17839
17840 // If the shuffle mask is repeated in each 128-bit lane we can use more
17841 // efficient instructions that mirror the shuffles across the four 128-bit
17842 // lanes.
17843 SmallVector<int, 4> RepeatedMask;
17844 bool Is128BitLaneRepeatedShuffle =
17845 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17846 if (Is128BitLaneRepeatedShuffle) {
17847 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17848 if (V2.isUndef())
17849 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17850 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17851
17852 // Use dedicated unpack instructions for masks that match their pattern.
17853 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17854 return V;
17855 }
17856
17857 // Try to use shift instructions.
17858 if (SDValue Shift =
17859 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17860 Subtarget, DAG, /*BitwiseOnly*/ false))
17861 return Shift;
17862
// NOTE(review): this retries bit-rotate guarded by `NumV2Elements != 0`,
// whereas the earlier attempt (line 17834) requires NumV2Elements == 0 --
// verify the condition against the original source; the rotate helper itself
// only takes V1.
17863 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17864 if (SDValue Rotate =
17865 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17866 return Rotate;
17867
17868 // Try to use VALIGN.
17869 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17870 Zeroable, Subtarget, DAG))
17871 return Rotate;
17872
17873 // Try to use byte rotation instructions.
17874 if (Subtarget.hasBWI())
17875 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17876 Subtarget, DAG))
17877 return Rotate;
17878
17879 // Assume that a single SHUFPS is faster than using a permv shuffle.
17880 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17881 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17882 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17883 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17884 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17885 CastV1, CastV2, DAG);
17886 return DAG.getBitcast(MVT::v16i32, ShufPS);
17887 }
17888
17889 // Try to create an in-lane repeating shuffle mask and then shuffle the
17890 // results into the target lanes.
// NOTE(review): a call line (orig. 17891) was dropped by the doc extraction;
// presumably `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(`.
17892 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17893 return V;
17894
17895 // If we have AVX512F support, we can use VEXPAND.
17896 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17897 Zeroable, Subtarget, DAG))
17898 return V;
17899
17900 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17901 Zeroable, Subtarget, DAG))
17902 return Blend;
17903
// Last resort: a fully general variable permute.
17904 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17905}
17906
17907/// Handle lowering of 32-lane 16-bit integer shuffles.
// NOTE(review): the opening signature line (orig. 17908) was dropped by the
// doc extraction; presumably
//   `static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
17909 const APInt &Zeroable, SDValue V1, SDValue V2,
17910 const X86Subtarget &Subtarget,
17911 SelectionDAG &DAG) {
17912 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17913 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17914 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17915 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17916
17917 // Whenever we can lower this as a zext, that instruction is strictly faster
17918 // than any alternative. It also allows us to fold memory operands into the
17919 // shuffle in many cases.
// NOTE(review): a call line (orig. 17920) was dropped by the doc extraction;
// presumably `if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(` -- confirm.
17921 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17922 return ZExt;
17923
17924 // Use dedicated unpack instructions for masks that match their pattern.
17925 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17926 return V;
17927
17928 // Use dedicated pack instructions for masks that match their pattern.
17929 if (SDValue V =
17930 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17931 return V;
17932
17933 // Try to use shift instructions.
17934 if (SDValue Shift =
17935 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17936 Subtarget, DAG, /*BitwiseOnly*/ false))
17937 return Shift;
17938
17939 // Try to use byte rotation instructions.
17940 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17941 Subtarget, DAG))
17942 return Rotate;
17943
17944 if (V2.isUndef()) {
17945 // Try to use bit rotation instructions.
17946 if (SDValue Rotate =
17947 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17948 return Rotate;
17949
17950 SmallVector<int, 8> RepeatedMask;
17951 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17952 // As this is a single-input shuffle, the repeated mask should be
17953 // a strictly valid v8i16 mask that we can pass through to the v8i16
17954 // lowering to handle even the v32 case.
17955 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17956 RepeatedMask, Subtarget, DAG);
17957 }
17958 }
17959
17960 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17961 Zeroable, Subtarget, DAG))
17962 return Blend;
17963
17964 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17965 Zeroable, Subtarget, DAG))
17966 return PSHUFB;
17967
17968 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17969 // shuffle.
17970 if (!V2.isUndef())
// NOTE(review): a call line (orig. 17971) was dropped by the doc extraction;
// presumably `if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(`.
17972 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17973 return Result;
17974
// Last resort: a fully general variable permute (VPERMW / VPERMT2W).
17975 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17976}
17977
17978/// Handle lowering of 64-lane 8-bit integer shuffles.
// NOTE(review): the opening signature line (orig. 17979) was dropped by the
// doc extraction; presumably
//   `static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
17980 const APInt &Zeroable, SDValue V1, SDValue V2,
17981 const X86Subtarget &Subtarget,
17982 SelectionDAG &DAG) {
17983 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17984 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17985 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17986 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17987
17988 // Whenever we can lower this as a zext, that instruction is strictly faster
17989 // than any alternative. It also allows us to fold memory operands into the
17990 // shuffle in many cases.
// NOTE(review): a call line (orig. 17991) was dropped by the doc extraction;
// presumably `if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(` -- confirm.
17992 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17993 return ZExt;
17994
17995 // Use dedicated unpack instructions for masks that match their pattern.
17996 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17997 return V;
17998
17999 // Use dedicated pack instructions for masks that match their pattern.
18000 if (SDValue V =
18001 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18002 return V;
18003
18004 // Try to use shift instructions.
18005 if (SDValue Shift =
18006 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
18007 DAG, /*BitwiseOnly*/ false))
18008 return Shift;
18009
18010 // Try to use byte rotation instructions.
18011 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18012 Subtarget, DAG))
18013 return Rotate;
18014
18015 // Try to use bit rotation instructions.
18016 if (V2.isUndef())
18017 if (SDValue Rotate =
18018 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18019 return Rotate;
18020
18021 // Lower as AND if possible.
18022 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18023 Zeroable, Subtarget, DAG))
18024 return Masked;
18025
18026 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18027 Zeroable, Subtarget, DAG))
18028 return PSHUFB;
18029
18030 // Try to create an in-lane repeating shuffle mask and then shuffle the
18031 // results into the target lanes.
// NOTE(review): a call line (orig. 18032) was dropped by the doc extraction;
// presumably `if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(`.
18033 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18034 return V;
18035
// NOTE(review): a call line (orig. 18036) was dropped by the doc extraction;
// presumably `if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(`
// (argument order `DAG, Subtarget` matches that helper) -- confirm.
18037 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18038 return Result;
18039
18040 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18041 Zeroable, Subtarget, DAG))
18042 return Blend;
18043
18044 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18045 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18046 // PALIGNR will be cheaper than the second PSHUFB+OR.
18047 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18048 Mask, Subtarget, DAG))
18049 return V;
18050
18051 // If we can't directly blend but can use PSHUFB, that will be better as it
18052 // can both shuffle and set up the inefficient blend.
18053 bool V1InUse, V2InUse;
18054 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18055 DAG, V1InUse, V2InUse);
18056 }
18057
18058 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18059 // shuffle.
18060 if (!V2.isUndef())
// NOTE(review): a call line (orig. 18061) was dropped by the doc extraction;
// presumably `if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(`.
18062 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18063 return Result;
18064
18065 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18066 if (Subtarget.hasVBMI())
18067 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18068
// Without VBMI there is no 512-bit byte permute; split into two 256-bit
// shuffles instead.
18069 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18070}
18071
18072/// High-level routine to lower various 512-bit x86 vector shuffles.
18073///
18074/// This routine either breaks down the specific type of a 512-bit x86 vector
18075/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18076/// together based on the available instructions.
// NOTE(review): the opening signature line (orig. 18077) was dropped by the
// doc extraction; presumably
//   `static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
18078 MVT VT, SDValue V1, SDValue V2,
18079 const APInt &Zeroable,
18080 const X86Subtarget &Subtarget,
18081 SelectionDAG &DAG) {
18082 assert(Subtarget.hasAVX512() &&
18083 "Cannot lower 512-bit vectors w/ basic ISA!");
18084
18085 // If we have a single input to the zero element, insert that into V1 if we
18086 // can do so cheaply.
18087 int NumElts = Mask.size();
18088 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18089
18090 if (NumV2Elements == 1 && Mask[0] >= NumElts)
// NOTE(review): a call line (orig. 18091) was dropped by the doc extraction;
// presumably `if (SDValue Insertion = lowerShuffleAsElementInsertion(`.
18092 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18093 return Insertion;
18094
18095 // Handle special cases where the lower or upper half is UNDEF.
18096 if (SDValue V =
18097 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18098 return V;
18099
18100 // Check for being able to broadcast a single element.
18101 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18102 Subtarget, DAG))
18103 return Broadcast;
18104
// Without BWI there are no 512-bit instructions for 8/16-bit elements;
// try cheap bit ops first, then split into two 256-bit shuffles.
18105 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18106 // Try using bit ops for masking and blending before falling back to
18107 // splitting.
18108 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18109 Subtarget, DAG))
18110 return V;
18111 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18112 return V;
18113
18114 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18115 }
18116
// f16/bf16 shuffles are handled by bitcasting to the same-width integer
// type and shuffling there (requires BWI for 512-bit i16 ops).
18117 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18118 if (!Subtarget.hasBWI())
18119 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
18120 /*SimpleOnly*/ false);
18121
18122 V1 = DAG.getBitcast(MVT::v32i16, V1);
18123 V2 = DAG.getBitcast(MVT::v32i16, V2);
18124 return DAG.getBitcast(VT,
18125 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18126 }
18127
18128 // Dispatch to each element type for lowering. If we don't have support for
18129 // specific element type shuffles at 512 bits, immediately split them and
18130 // lower them. Each lowering routine of a given type is allowed to assume that
18131 // the requisite ISA extensions for that element type are available.
18132 switch (VT.SimpleTy) {
18133 case MVT::v8f64:
18134 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18135 case MVT::v16f32:
18136 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18137 case MVT::v8i64:
18138 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18139 case MVT::v16i32:
18140 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18141 case MVT::v32i16:
18142 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18143 case MVT::v64i8:
18144 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18145
18146 default:
18147 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18148 }
18149}
18150
// Try to lower a unary vXi1 shuffle as a single KSHIFTR of the (possibly
// widened) mask register. Succeeds only when every non-undef mask element
// reads from a constant positive offset `ShiftAmt` to its right.
// NOTE(review): the opening signature line (orig. 18151) was dropped by the
// doc extraction; presumably
//   `static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
18152 MVT VT, SDValue V1, SDValue V2,
18153 const X86Subtarget &Subtarget,
18154 SelectionDAG &DAG) {
18155 // Shuffle should be unary.
18156 if (!V2.isUndef())
18157 return SDValue();
18158
18159 int ShiftAmt = -1;
18160 int NumElts = Mask.size();
18161 for (int i = 0; i != NumElts; ++i) {
18162 int M = Mask[i];
18163 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18164 "Unexpected mask index.");
18165 if (M < 0)
18166 continue;
18167
18168 // The first non-undef element determines our shift amount.
18169 if (ShiftAmt < 0) {
18170 ShiftAmt = M - i;
18171 // Need to be shifting right.
18172 if (ShiftAmt <= 0)
18173 return SDValue();
18174 }
18175 // All non-undef elements must shift by the same amount.
18176 if (ShiftAmt != M - i)
18177 return SDValue();
18178 }
18179 assert(ShiftAmt >= 0 && "All undef?");
18180
18181 // Great we found a shift right.
// KSHIFT operates on full mask registers, so widen to a legal mask width,
// shift, then extract the original-width subvector from element 0.
18182 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
18183 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
18184 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18185 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18186 DAG.getVectorIdxConstant(0, DL));
18187}
18188
18189// Determine if this shuffle can be implemented with a KSHIFT instruction.
18190// Returns the shift amount if possible or -1 if not. This is a simplified
18191// version of matchShuffleAsShift.
18192static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18193 int MaskOffset, const APInt &Zeroable) {
18194 int Size = Mask.size();
18195
// A left shift must bring zeros in at the low end, a right shift at the
// high end; verify those positions are zeroable.
18196 auto CheckZeros = [&](int Shift, bool Left) {
18197 for (int j = 0; j < Shift; ++j)
18198 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18199 return false;
18200
18201 return true;
18202 };
18203
// The surviving Len elements must be a sequential run (offset by
// MaskOffset to select which source operand the indices refer to).
18204 auto MatchShift = [&](int Shift, bool Left) {
18205 unsigned Pos = Left ? Shift : 0;
18206 unsigned Low = Left ? 0 : Shift;
18207 unsigned Len = Size - Shift;
18208 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18209 };
18210
18211 for (int Shift = 1; Shift != Size; ++Shift)
18212 for (bool Left : {true, false})
18213 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
// NOTE(review): a line (orig. 18214) was dropped by the doc extraction;
// presumably `Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;` -- it must
// set the out-parameter `Opcode` before returning. Confirm against source.
18215 return Shift;
18216 }
18217
18218 return -1;
18219}
18220
18221
18222// Lower vXi1 vector shuffles.
18223// There is no a dedicated instruction on AVX-512 that shuffles the masks.
18224// The only way to shuffle bits is to sign-extend the mask vector to SIMD
18225// vector, shuffle and then truncate it back.
// NOTE(review): the opening signature line (orig. 18226) was dropped by the
// doc extraction; presumably
//   `static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,`
// -- confirm against the original source file.
18227 MVT VT, SDValue V1, SDValue V2,
18228 const APInt &Zeroable,
18229 const X86Subtarget &Subtarget,
18230 SelectionDAG &DAG) {
18231 assert(Subtarget.hasAVX512() &&
18232 "Cannot lower 512-bit vectors w/o basic ISA!");
18233
18234 int NumElts = Mask.size();
18235 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18236
18237 // Try to recognize shuffles that are just padding a subvector with zeros.
// Walk the mask counting the leading identity run from a single source;
// the loop breaks at the first element that is neither undef nor the
// identity element of that source.
18238 int SubvecElts = 0;
18239 int Src = -1;
18240 for (int i = 0; i != NumElts; ++i) {
18241 if (Mask[i] >= 0) {
18242 // Grab the source from the first valid mask. All subsequent elements need
18243 // to use this same source.
18244 if (Src < 0)
18245 Src = Mask[i] / NumElts;
18246 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18247 break;
18248 }
18249
18250 ++SubvecElts;
18251 }
18252 assert(SubvecElts != NumElts && "Identity shuffle?");
18253
18254 // Clip to a power 2.
18255 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18256
18257 // Make sure the number of zeroable bits in the top at least covers the bits
18258 // not covered by the subvector.
18259 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18260 assert(Src >= 0 && "Expected a source!");
18261 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18262 SDValue Extract =
18263 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18264 DAG.getVectorIdxConstant(0, DL))18264 DAG.getVectorIdxConstant(0, DL));
18265 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18266 DAG.getConstant(0, DL, VT), Extract,
18267 DAG.getVectorIdxConstant(0, DL));
18268 }
18269
18270 // Try a simple shift right with undef elements. Later we'll try with zeros.
18271 if (SDValue Shift =
18272 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18273 return Shift;
18274
18275 // Try to match KSHIFTs.
18276 unsigned Offset = 0;
18277 for (SDValue V : {V1, V2}) {
18278 unsigned Opcode;
18279 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18280 if (ShiftAmt >= 0) {
18281 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18282 MVT WideVT = Res.getSimpleValueType();
18283 // Widened right shifts need two shifts to ensure we shift in zeroes.
18284 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18285 int WideElts = WideVT.getVectorNumElements();
18286 // Shift left to put the original vector in the MSBs of the new size.
18287 Res =
18288 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18289 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18290 // Increase the shift amount to account for the left shift.
18291 ShiftAmt += WideElts - NumElts;
18292 }
18293
18294 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18295 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18296 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18297 DAG.getVectorIdxConstant(0, DL));
18298 }
18299 Offset += NumElts; // Increment for next iteration.
18300 }
18301
18302 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18303 // ops instead.
18304 // TODO: What other unary shuffles would benefit from this?
18305 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18306 SDValue Op0 = V1.getOperand(0);
18307 SDValue Op1 = V1.getOperand(1);
// NOTE(review): a line (orig. 18308) was dropped by the doc extraction;
// presumably `ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();`
// since `CC` is used in the getSetCC call below. Confirm against source.
18309 EVT OpVT = Op0.getValueType();
18310 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18311 return DAG.getSetCC(
18312 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18313 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18314 }
18315
// Fallback: pick a SIMD vector type to sign-extend the i1 elements into,
// shuffle there, and convert back to a mask.
18316 MVT ExtVT;
18317 switch (VT.SimpleTy) {
18318 default:
18319 llvm_unreachable("Expected a vector of i1 elements");
18320 case MVT::v2i1:
18321 ExtVT = MVT::v2i64;
18322 break;
18323 case MVT::v4i1:
18324 ExtVT = MVT::v4i32;
18325 break;
18326 case MVT::v8i1:
18327 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18328 // shuffle.
18329 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18330 break;
18331 case MVT::v16i1:
18332 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18333 // 256-bit operation available.
18334 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18335 break;
18336 case MVT::v32i1:
18337 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18338 // 256-bit operation available.
18339 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18340 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18341 break;
18342 case MVT::v64i1:
18343 // Fall back to scalarization. FIXME: We can do better if the shuffle
18344 // can be partitioned cleanly.
18345 if (!Subtarget.useBWIRegs())
18346 return SDValue();
18347 ExtVT = MVT::v64i8;
18348 break;
18349 }
18350
18351 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18352 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18353
18354 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18355 // i1 was sign extended we can use X86ISD::CVT2MASK.
18356 int NumElems = VT.getVectorNumElements();
18357 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18358 (Subtarget.hasDQI() && (NumElems < 32)))
18359 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18360 Shuffle, ISD::SETGT);
18361
18362 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18363}
18364
18365/// Helper function that returns true if the shuffle mask should be
18366/// commuted to improve canonicalization.
// NOTE(review): the opening signature line (orig. 18367) was dropped by the
// doc extraction; presumably
//   `static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {`
// -- confirm against the original source file.
18368 int NumElements = Mask.size();
18369
18370 int NumV1Elements = 0, NumV2Elements = 0;
18371 for (int M : Mask)
18372 if (M < 0)
18373 continue;
18374 else if (M < NumElements)
18375 ++NumV1Elements;
18376 else
18377 ++NumV2Elements;
18378
18379 // Commute the shuffle as needed such that more elements come from V1 than
18380 // V2. This allows us to match the shuffle pattern strictly on how many
18381 // elements come from V1 without handling the symmetric cases.
18382 if (NumV2Elements > NumV1Elements)
18383 return true;
18384
18385 assert(NumV1Elements > 0 && "No V1 indices");
18386
18387 if (NumV2Elements == 0)
18388 return false;
18389
18390 // When the number of V1 and V2 elements are the same, try to minimize the
18391 // number of uses of V2 in the low half of the vector. When that is tied,
18392 // ensure that the sum of indices for V1 is equal to or lower than the sum
18393 // indices for V2. When those are equal, try to ensure that the number of odd
18394 // indices for V1 is lower than the number of odd indices for V2.
// Three-level tie-break, each level only consulted when the previous is tied.
18395 if (NumV1Elements == NumV2Elements) {
18396 int LowV1Elements = 0, LowV2Elements = 0;
18397 for (int M : Mask.slice(0, NumElements / 2))
18398 if (M >= NumElements)
18399 ++LowV2Elements;
18400 else if (M >= 0)
18401 ++LowV1Elements;
18402 if (LowV2Elements > LowV1Elements)
18403 return true;
18404 if (LowV2Elements == LowV1Elements) {
18405 int SumV1Indices = 0, SumV2Indices = 0;
18406 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18407 if (Mask[i] >= NumElements)
18408 SumV2Indices += i;
18409 else if (Mask[i] >= 0)
18410 SumV1Indices += i;
18411 if (SumV2Indices < SumV1Indices)
18412 return true;
18413 if (SumV2Indices == SumV1Indices) {
18414 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18415 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18416 if (Mask[i] >= NumElements)
18417 NumV2OddIndices += i % 2;
18418 else if (Mask[i] >= 0)
18419 NumV1OddIndices += i % 2;
18420 if (NumV2OddIndices < NumV1OddIndices)
18421 return true;
18422 }
18423 }
18424 }
18425
18426 return false;
18427}
18428
// Returns true if this shuffle's result could instead be folded into an
// AVX-512 masked operation (so shuffle-mask widening should be avoided).
// Requires AVX512 (and BWI for 512-bit i8/i16 elements), a simple value
// type, and a single-use maskable binary/unary op feeding the shuffle.
// NOTE(review): the opening signature line (orig. 18429) was dropped by the
// doc extraction; presumably
//   `static bool canCombineAsMaskOperation(SDValue V,`
// -- confirm against the original source file.
18430 const X86Subtarget &Subtarget) {
18431 if (!Subtarget.hasAVX512())
18432 return false;
18433
18434 if (!V.getValueType().isSimple())
18435 return false;
18436
18437 MVT VT = V.getSimpleValueType().getScalarType();
18438 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18439 return false;
18440
18441 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18442 // are preferable to blendw/blendvb/masked-mov.
18443 if ((VT == MVT::i16 || VT == MVT::i8) &&
18444 V.getSimpleValueType().getSizeInBits() < 512)
18445 return false;
18446
18447 auto HasMaskOperation = [&](SDValue V) {
18448 // TODO: Currently we only check limited opcode. We probably extend
18449 // it to all binary operation by checking TLI.isBinOp().
18450 switch (V->getOpcode()) {
18451 default:
18452 return false;
18453 case ISD::ADD:
18454 case ISD::SUB:
18455 case ISD::AND:
18456 case ISD::XOR:
18457 case ISD::OR:
18458 case ISD::SMAX:
18459 case ISD::SMIN:
18460 case ISD::UMAX:
18461 case ISD::UMIN:
18462 case ISD::ABS:
18463 case ISD::SHL:
18464 case ISD::SRL:
18465 case ISD::SRA:
18466 case ISD::MUL:
18467 break;
18468 }
// Only fold when the op has a single use; otherwise the unmasked value is
// still needed elsewhere and masking buys nothing.
18469 if (!V->hasOneUse())
18470 return false;
18471
18472 return true;
18473 };
18474
18475 if (HasMaskOperation(V))
18476 return true;
18477
18478 return false;
18479}
18480
18481// Forward declaration.
18484 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18485 const X86Subtarget &Subtarget);
18486
18487 /// Top-level lowering for x86 vector shuffles.
18488///
18489/// This handles decomposition, canonicalization, and lowering of all x86
18490/// vector shuffles. Most of the specific lowering strategies are encapsulated
18491/// above in helper routines. The canonicalization attempts to widen shuffles
18492/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18493/// s.t. only one of the two inputs needs to be tested, etc.
18495 SelectionDAG &DAG) {
18497 ArrayRef<int> OrigMask = SVOp->getMask();
18498 SDValue V1 = Op.getOperand(0);
18499 SDValue V2 = Op.getOperand(1);
18500 MVT VT = Op.getSimpleValueType();
18501 int NumElements = VT.getVectorNumElements();
18502 SDLoc DL(Op);
18503 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18504
18505 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18506 "Can't lower MMX shuffles");
18507
18508 bool V1IsUndef = V1.isUndef();
18509 bool V2IsUndef = V2.isUndef();
18510 if (V1IsUndef && V2IsUndef)
18511 return DAG.getUNDEF(VT);
18512
18513 // When we create a shuffle node we put the UNDEF node to second operand,
18514 // but in some cases the first operand may be transformed to UNDEF.
18515 // In this case we should just commute the node.
18516 if (V1IsUndef)
18517 return DAG.getCommutedVectorShuffle(*SVOp);
18518
18519 // Check for non-undef masks pointing at an undef vector and make the masks
18520 // undef as well. This makes it easier to match the shuffle based solely on
18521 // the mask.
18522 if (V2IsUndef &&
18523 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18524 SmallVector<int, 8> NewMask(OrigMask);
18525 for (int &M : NewMask)
18526 if (M >= NumElements)
18527 M = -1;
18528 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18529 }
18530
18531 // Check for illegal shuffle mask element index values.
18532 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18533 (void)MaskUpperLimit;
18534 assert(llvm::all_of(OrigMask,
18535 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18536 "Out of bounds shuffle index");
18537
18538 // We actually see shuffles that are entirely re-arrangements of a set of
18539 // zero inputs. This mostly happens while decomposing complex shuffles into
18540 // simple ones. Directly lower these as a buildvector of zeros.
18541 APInt KnownUndef, KnownZero;
18542 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18543
18544 APInt Zeroable = KnownUndef | KnownZero;
18545 if (Zeroable.isAllOnes())
18546 return getZeroVector(VT, Subtarget, DAG, DL);
18547
18548 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18549
18550 // Try to collapse shuffles into using a vector type with fewer elements but
18551 // wider element types. We cap this to not form integers or floating point
18552 // elements wider than 64 bits. It does not seem beneficial to form i128
18553 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18554 SmallVector<int, 16> WidenedMask;
18555 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18556 !canCombineAsMaskOperation(V1, Subtarget) &&
18557 !canCombineAsMaskOperation(V2, Subtarget) &&
18558 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18559 // Shuffle mask widening should not interfere with a broadcast opportunity
18560 // by obfuscating the operands with bitcasts.
18561 // TODO: Avoid lowering directly from this top-level function: make this
18562 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18563 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18564 Subtarget, DAG))
18565 return Broadcast;
18566
18567 MVT NewEltVT = VT.isFloatingPoint()
18570 int NewNumElts = NumElements / 2;
18571 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18572 // Make sure that the new vector type is legal. For example, v2f64 isn't
18573 // legal on SSE1.
18574 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18575 if (V2IsZero) {
18576 // Modify the new Mask to take all zeros from the all-zero vector.
18577 // Choose indices that are blend-friendly.
18578 bool UsedZeroVector = false;
18579 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18580 "V2's non-undef elements are used?!");
18581 for (int i = 0; i != NewNumElts; ++i)
18582 if (WidenedMask[i] == SM_SentinelZero) {
18583 WidenedMask[i] = i + NewNumElts;
18584 UsedZeroVector = true;
18585 }
18586 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18587 // some elements to be undef.
18588 if (UsedZeroVector)
18589 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18590 }
18591 V1 = DAG.getBitcast(NewVT, V1);
18592 V2 = DAG.getBitcast(NewVT, V2);
18593 return DAG.getBitcast(
18594 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18595 }
18596 }
18597
18598 SmallVector<SDValue> Ops = {V1, V2};
18599 SmallVector<int> Mask(OrigMask);
18600
18601 // Canonicalize the shuffle with any horizontal ops inputs.
18602 // Don't attempt this if the shuffle can still be widened as we may lose
18603 // whole lane shuffle patterns.
18604 // NOTE: This may update Ops and Mask.
18605 if (!canWidenShuffleElements(Mask)) {
18607 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18608 return DAG.getBitcast(VT, HOp);
18609
18610 V1 = DAG.getBitcast(VT, Ops[0]);
18611 V2 = DAG.getBitcast(VT, Ops[1]);
18612 assert(NumElements == (int)Mask.size() &&
18613 "canonicalizeShuffleMaskWithHorizOp "
18614 "shouldn't alter the shuffle mask size");
18615 }
18616
18617 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18618 // These will be materialized uniformly anyway, so make splat matching easier.
18619 // TODO: Allow all int constants?
18620 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18621 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18622 BitVector Undefs;
18623 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18624 if (Undefs.any() &&
18627 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18628 }
18629 }
18630 }
18631 return V;
18632 };
18633 V1 = CanonicalizeConstant(V1);
18634 V2 = CanonicalizeConstant(V2);
18635
18636 // Commute the shuffle if it will improve canonicalization.
18639 std::swap(V1, V2);
18640 }
18641
18642 // For each vector width, delegate to a specialized lowering routine.
18643 if (VT.is128BitVector())
18644 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18645
18646 if (VT.is256BitVector())
18647 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18648
18649 if (VT.is512BitVector())
18650 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18651
18652 if (Is1BitVector)
18653 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18654
18655 llvm_unreachable("Unimplemented!");
18656}
18657
18658// As legal vpcompress instructions depend on various AVX512 extensions, try to
18659// convert illegal vector sizes to legal ones to avoid expansion.
18661 SelectionDAG &DAG) {
18662 assert(Subtarget.hasAVX512() &&
18663 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18664
18665 SDLoc DL(Op);
18666 SDValue Vec = Op.getOperand(0);
18667 SDValue Mask = Op.getOperand(1);
18668 SDValue Passthru = Op.getOperand(2);
18669
18670 EVT VecVT = Vec.getValueType();
18671 EVT ElementVT = VecVT.getVectorElementType();
18672 unsigned NumElements = VecVT.getVectorNumElements();
18673 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18674 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18675
18676 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18677 // compressed as 512-bit vectors in AVX512F.
18678 if (NumVecBits != 128 && NumVecBits != 256)
18679 return SDValue();
18680
18681 if (NumElementBits == 32 || NumElementBits == 64) {
18682 unsigned NumLargeElements = 512 / NumElementBits;
18683 MVT LargeVecVT =
18684 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18685 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18686
18687 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18688 DAG, DL);
18689 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18690 Subtarget, DAG, DL);
18691 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18692 : widenSubVector(LargeVecVT, Passthru,
18693 /*ZeroNewElements=*/false,
18694 Subtarget, DAG, DL);
18695
18696 SDValue Compressed =
18697 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18698 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18699 DAG.getConstant(0, DL, MVT::i64));
18700 }
18701
18702 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18703 VecVT == MVT::v16i16) {
18704 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18705 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18706
18707 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18708 Passthru = Passthru.isUndef()
18709 ? DAG.getUNDEF(LargeVecVT)
18710 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18711
18712 SDValue Compressed =
18713 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18714 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18715 }
18716
18717 return SDValue();
18718}
18719
18720/// Try to lower a VSELECT instruction to a vector shuffle.
18722 const X86Subtarget &Subtarget,
18723 SelectionDAG &DAG) {
18724 SDValue Cond = Op.getOperand(0);
18725 SDValue LHS = Op.getOperand(1);
18726 SDValue RHS = Op.getOperand(2);
18727 MVT VT = Op.getSimpleValueType();
18728
18729 // Only non-legal VSELECTs reach this lowering, convert those into generic
18730 // shuffles and re-use the shuffle lowering path for blends.
18734 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18735 }
18736
18737 return SDValue();
18738}
18739
// Custom lowering for ISD::VSELECT.
//
// Returns Op itself when the select is already matchable as-is, a new DAG
// sequence when a cheaper equivalent exists, or a null SDValue() to request
// the generic expansion.
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  // Soft-f16 vectors: perform the select on the same-sized integer vector
  // type and bitcast the result back.
  if (isSoftF16(VT, Subtarget)) {
    MVT NVT = VT.changeVectorElementTypeToInteger();
    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
                                          DAG.getBitcast(NVT, LHS),
                                          DAG.getBitcast(NVT, RHS)));
  }

  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  // NOTE(review): the guarding all-constant-operands check appears to be
  // elided in this chunk — confirm against upstream before relying on this
  // early-out being unconditional.
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
  if (CondEltSize == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  unsigned EltSize = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();

  // Expand v32i16/v64i8 without BWI.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return SDValue();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    // Build a mask by testing the condition against zero.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
                                DAG.getConstant(0, dl, CondVT),
                                ISD::SETNE);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
  }

  // SEXT/TRUNC cases where the mask doesn't match the destination size.
  if (CondEltSize != EltSize) {
    // If we don't have a sign splat, rely on the expansion.
    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
      return SDValue();

    // The condition is a sign splat, so it is safe to resize it to the data
    // element width and re-emit the VSELECT as a legal variable blend.
    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
  }

  // v16i16/v32i8 selects without AVX2, if the condition and another operand
  // are free to split, then better to split before expanding the
  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
  // TODO: This is very similar to narrowVectorSelect.
  // TODO: Add Load splitting to isFreeToSplitVector ?
  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
      !Subtarget.hasXOP()) {
    bool FreeCond = isFreeToSplitVector(Cond, DAG);
    bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
    bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
    if (FreeCond && (FreeLHS || FreeRHS))
      return splitVectorOp(Op, DAG, dl);
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
  // a null value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
  case MVT::v8f16:
  case MVT::v16f16: {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
    Cond = DAG.getBitcast(CastVT, Cond);
    LHS = DAG.getBitcast(CastVT, LHS);
    RHS = DAG.getBitcast(CastVT, RHS);
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
    return DAG.getBitcast(VT, Select);
  }
  }
}
18854
18856 MVT VT = Op.getSimpleValueType();
18857 SDValue Vec = Op.getOperand(0);
18858 SDValue Idx = Op.getOperand(1);
18859 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18860 SDLoc dl(Op);
18861
18863 return SDValue();
18864
18865 if (VT.getSizeInBits() == 8) {
18866 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18867 // we're going to zero extend the register or fold the store.
18870 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18871 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18872 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18873
18874 unsigned IdxVal = Idx->getAsZExtVal();
18875 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18876 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18877 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18878 }
18879
18880 if (VT == MVT::f32) {
18881 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18882 // the result back to FR32 register. It's only worth matching if the
18883 // result has a single use which is a store or a bitcast to i32. And in
18884 // the case of a store, it's not worth it if the index is a constant 0,
18885 // because a MOVSSmr can be used instead, which is smaller and faster.
18886 if (!Op.hasOneUse())
18887 return SDValue();
18888 SDNode *User = *Op.getNode()->user_begin();
18889 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18890 (User->getOpcode() != ISD::BITCAST ||
18891 User->getValueType(0) != MVT::i32))
18892 return SDValue();
18893 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18894 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18895 return DAG.getBitcast(MVT::f32, Extract);
18896 }
18897
18898 if (VT == MVT::i32 || VT == MVT::i64)
18899 return Op;
18900
18901 return SDValue();
18902}
18903
18904/// Extract one bit from mask vector, like v16i1 or v8i1.
18905/// AVX-512 feature.
18907 const X86Subtarget &Subtarget) {
18908 SDValue Vec = Op.getOperand(0);
18909 SDLoc dl(Vec);
18910 MVT VecVT = Vec.getSimpleValueType();
18911 SDValue Idx = Op.getOperand(1);
18912 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18913 MVT EltVT = Op.getSimpleValueType();
18914
18915 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18916 "Unexpected vector type in ExtractBitFromMaskVector");
18917
18918 // variable index can't be handled in mask registers,
18919 // extend vector to VR512/128
18920 if (!IdxC) {
18921 unsigned NumElts = VecVT.getVectorNumElements();
18922 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18923 // than extending to 128/256bit.
18924 if (NumElts == 1) {
18925 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18927 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18928 }
18929 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18930 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18931 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18932 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18933 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18934 }
18935
18936 unsigned IdxVal = IdxC->getZExtValue();
18937 if (IdxVal == 0) // the operation is legal
18938 return Op;
18939
18940 // Extend to natively supported kshift.
18941 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18942
18943 // Use kshiftr instruction to move to the lower element.
18944 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18945 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18946
18947 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18948 DAG.getVectorIdxConstant(0, dl));
18949}
18950
18951// Helper to find all the extracted elements from a vector.
18953 MVT VT = N->getSimpleValueType(0);
18954 unsigned NumElts = VT.getVectorNumElements();
18955 APInt DemandedElts = APInt::getZero(NumElts);
18956 for (SDNode *User : N->users()) {
18957 switch (User->getOpcode()) {
18958 case X86ISD::PEXTRB:
18959 case X86ISD::PEXTRW:
18962 DemandedElts.setAllBits();
18963 return DemandedElts;
18964 }
18965 DemandedElts.setBit(User->getConstantOperandVal(1));
18966 break;
18967 case ISD::BITCAST: {
18968 if (!User->getValueType(0).isSimple() ||
18969 !User->getValueType(0).isVector()) {
18970 DemandedElts.setAllBits();
18971 return DemandedElts;
18972 }
18973 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18974 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18975 break;
18976 }
18977 default:
18978 DemandedElts.setAllBits();
18979 return DemandedElts;
18980 }
18981 }
18982 return DemandedElts;
18983}
18984
18985SDValue
18986X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18987 SelectionDAG &DAG) const {
18988 SDLoc dl(Op);
18989 SDValue Vec = Op.getOperand(0);
18990 MVT VecVT = Vec.getSimpleValueType();
18991 SDValue Idx = Op.getOperand(1);
18992 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18993
18994 if (VecVT.getVectorElementType() == MVT::i1)
18995 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18996
18997 if (!IdxC) {
18998 // Its more profitable to go through memory (1 cycles throughput)
18999 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
19000 // IACA tool was used to get performance estimation
19001 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19002 //
19003 // example : extractelement <16 x i8> %a, i32 %i
19004 //
19005 // Block Throughput: 3.00 Cycles
19006 // Throughput Bottleneck: Port5
19007 //
19008 // | Num Of | Ports pressure in cycles | |
19009 // | Uops | 0 - DV | 5 | 6 | 7 | |
19010 // ---------------------------------------------
19011 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19012 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19013 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19014 // Total Num Of Uops: 4
19015 //
19016 //
19017 // Block Throughput: 1.00 Cycles
19018 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19019 //
19020 // | | Ports pressure in cycles | |
19021 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19022 // ---------------------------------------------------------
19023 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19024 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19025 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19026 // Total Num Of Uops: 4
19027
19028 return SDValue();
19029 }
19030
19031 unsigned IdxVal = IdxC->getZExtValue();
19032
19033 // If this is a 256-bit vector result, first extract the 128-bit vector and
19034 // then extract the element from the 128-bit vector.
19035 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19036 // Get the 128-bit vector.
19037 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19038 MVT EltVT = VecVT.getVectorElementType();
19039
19040 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19041 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19042
19043 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19044 // this can be done with a mask.
19045 IdxVal &= ElemsPerChunk - 1;
19046 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19047 DAG.getVectorIdxConstant(IdxVal, dl));
19048 }
19049
19050 assert(VecVT.is128BitVector() && "Unexpected vector length");
19051
19052 MVT VT = Op.getSimpleValueType();
19053
19054 if (VT == MVT::i16) {
19055 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19056 // we're going to zero extend the register or fold the store (SSE41 only).
19057 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19058 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19059 if (Subtarget.hasFP16())
19060 return Op;
19061
19062 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19063 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19064 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19065 }
19066
19067 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19068 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19069 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19070 }
19071
19072 if (Subtarget.hasSSE41())
19073 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19074 return Res;
19075
19076 // Only extract a single element from a v16i8 source - determine the common
19077 // DWORD/WORD that all extractions share, and extract the sub-byte.
19078 // TODO: Add QWORD MOVQ extraction?
19079 if (VT == MVT::i8) {
19080 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
19081 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
19082
19083 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19084 int DWordIdx = IdxVal / 4;
19085 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19086 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19087 DAG.getBitcast(MVT::v4i32, Vec),
19088 DAG.getVectorIdxConstant(DWordIdx, dl));
19089 int ShiftVal = (IdxVal % 4) * 8;
19090 if (ShiftVal != 0)
19091 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19092 DAG.getConstant(ShiftVal, dl, MVT::i8));
19093 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19094 }
19095
19096 int WordIdx = IdxVal / 2;
19097 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19098 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19099 DAG.getBitcast(MVT::v8i16, Vec),
19100 DAG.getVectorIdxConstant(WordIdx, dl));
19101 int ShiftVal = (IdxVal % 2) * 8;
19102 if (ShiftVal != 0)
19103 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19104 DAG.getConstant(ShiftVal, dl, MVT::i8));
19105 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19106 }
19107 }
19108
19109 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19110 if (IdxVal == 0)
19111 return Op;
19112
19113 // Shuffle the element to the lowest element, then movss or movsh.
19114 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19115 Mask[0] = static_cast<int>(IdxVal);
19116 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19117 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19118 DAG.getVectorIdxConstant(0, dl));
19119 }
19120
19121 if (VT.getSizeInBits() == 64) {
19122 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19123 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19124 // to match extract_elt for f64.
19125 if (IdxVal == 0)
19126 return Op;
19127
19128 // UNPCKHPD the element to the lowest double word, then movsd.
19129 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19130 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19131 int Mask[2] = { 1, -1 };
19132 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19134 DAG.getVectorIdxConstant(0, dl));
19135 }
19136
19137 return SDValue();
19138}
19139
19140/// Insert one bit to mask vector, like v16i1 or v8i1.
19141/// AVX-512 feature.
19143 const X86Subtarget &Subtarget) {
19144 SDLoc dl(Op);
19145 SDValue Vec = Op.getOperand(0);
19146 SDValue Elt = Op.getOperand(1);
19147 SDValue Idx = Op.getOperand(2);
19148 MVT VecVT = Vec.getSimpleValueType();
19149
19150 if (!isa<ConstantSDNode>(Idx)) {
19151 // Non constant index. Extend source and destination,
19152 // insert element and then truncate the result.
19153 unsigned NumElts = VecVT.getVectorNumElements();
19154 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19155 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19156 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19157 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19158 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19159 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19160 }
19161
19162 // Copy into a k-register, extract to v1i1 and insert_subvector.
19163 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19164 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19165}
19166
// Custom lowering for ISD::INSERT_VECTOR_ELT.
//
// Dispatches i1 (mask) inserts to InsertBitToMaskVector, handles bf16 by
// bitcasting to the equivalent integer vector type, lowers variable-index
// inserts as a compare+select against an index splat on capable subtargets,
// and otherwise matches constant-index inserts to blends, PINSR*, INSERTPS
// or extract/insert-128-bit-subvector patterns.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();

  // vXi1 inserts have a dedicated AVX-512 mask-register path.
  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0); // vector being inserted into
  SDValue N1 = Op.getOperand(1); // scalar element to insert
  SDValue N2 = Op.getOperand(2); // insertion index
  auto *N2C = dyn_cast<ConstantSDNode>(N2);

  // bf16: do the insert as an i16 element on the integer vector type and
  // bitcast the result back.
  if (EltVT == MVT::bf16) {
    MVT IVT = VT.changeVectorElementTypeToInteger();
    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
                              DAG.getBitcast(IVT, N0),
                              DAG.getBitcast(MVT::i16, N1), N2);
    return DAG.getBitcast(VT, Res);
  }

  if (!N2C) {
    // Variable insertion indices, usually we're better off spilling to stack,
    // but AVX512 can use a variable compare+select by comparing against all
    // possible vector indices, and FP insertion has less gpr->simd traffic.
    if (!(Subtarget.hasBWI() ||
          (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
          (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
      return SDValue();

    MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
    MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
    if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
      return SDValue();

    SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
    SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
    SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);

    // Constant vector {0,1,2,...} the splatted index is compared against.
    SmallVector<SDValue, 16> RawIndices;
    for (unsigned I = 0; I != NumElts; ++I)
      RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
    SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);

    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
    // NOTE(review): the trailing condition-code argument of getSelectCC
    // appears truncated in this chunk — confirm against upstream.
    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
  }

  // Out-of-range constant index: defer to the generic path.
  if (N2C->getAPIntValue().uge(NumElts))
    return SDValue();
  uint64_t IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  if (IsZeroElt || IsAllOnesElt) {
    // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
    // We don't deal with i8 0 since it appears to be handled elsewhere.
    if (IsAllOnesElt &&
        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
      SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
      SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
      SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
      CstVectorElts[IdxVal] = OnesCst;
      SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
      return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
    }
    // See if we can do this more efficiently with a blend shuffle with a
    // rematerializable vector.
    if (Subtarget.hasSSE41() &&
        (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                    : getOnesVector(VT, DAG, dl);
      return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
    }
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
    }

    unsigned NumEltsIn128 = 128 / EltSizeInBits;
    assert(isPowerOf2_32(NumEltsIn128) &&
           "Vectors will always have power-of-two number of elements.");

    // If we are not inserting into the low 128-bit vector chunk,
    // then prefer the broadcast+blend sequence.
    // FIXME: relax the profitability check iff all N1 uses are insertions.
    if (IdxVal >= NumEltsIn128 &&
        ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
         (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
          X86::mayFoldLoad(N1, Subtarget)))) {
      SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getVectorIdxConstant(IdxIn128, dl));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // This will be just movw/movd/movq/movsh/movss/movsd.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
        EltVT == MVT::f16 || EltVT == MVT::i64) {
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
    }

    // We can't directly insert an i8 or i16 into a vector, so zero extend
    // it to i32 first.
    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
      return DAG.getBitcast(VT, N1);
    }
  }

  // Transform it so it match pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    // NOTE(review): the Opc assignments (presumably X86ISD::PINSRW /
    // X86ISD::PINSRB) are not visible in this chunk — confirm upstream.
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
    }

    // PINSR* takes the scalar in a GR32, so any-extend i8/i16 to i32.
    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
      if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
      // Create this as a scalar to vector..
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}
19377
19378static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
19379 SelectionDAG &DAG) {
19380 SDLoc DL(Op);
19381 SDValue X = Op.getOperand(0);
19382 MVT XTy = X.getSimpleValueType();
19383 SDValue Exp = Op.getOperand(1);
19384
19385 switch (XTy.SimpleTy) {
19386 default:
19387 return SDValue();
19388 case MVT::f16:
19389 if (!Subtarget.hasFP16())
19390 X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
19391 [[fallthrough]];
19392 case MVT::f32:
19393 case MVT::f64: {
19394 MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
19395 128 / X.getSimpleValueType().getSizeInBits());
19396 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
19397 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
19398 SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
19399 SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
19400 SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
19401 return DAG.getFPExtendOrRound(Final, DL, XTy);
19402 }
19403 case MVT::v4f32:
19404 case MVT::v2f64:
19405 case MVT::v8f32:
19406 case MVT::v4f64:
19407 case MVT::v16f32:
19408 case MVT::v8f64:
19409 if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
19410 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19411 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19412 }
19413 break;
19414 case MVT::v8f16:
19415 case MVT::v16f16:
19416 if (Subtarget.hasFP16()) {
19417 if (Subtarget.hasVLX()) {
19418 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19419 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19420 }
19421 break;
19422 }
19423 X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
19424 Exp = DAG.getSExtOrTrunc(Exp, DL,
19425 X.getSimpleValueType().changeTypeToInteger());
19426 break;
19427 case MVT::v32f16:
19428 if (Subtarget.hasFP16()) {
19429 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
19430 return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
19431 }
19432 return splitVectorOp(Op, DAG, DL);
19433 }
19434 SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
19435 SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
19436 Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp);
19437 SDValue Scalef =
19438 DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
19439 SDValue Final =
19440 DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
19441 return DAG.getFPExtendOrRound(Final, DL, XTy);
19442}
19443
// Lower ISD::SCALAR_TO_VECTOR: put a scalar into lane 0 of a vector whose
// upper lanes are undef.
// NOTE(review): doc line 19444 — the first signature line (presumably
// "static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget
// &Subtarget,") — was dropped by the HTML extraction; restore from upstream.
19445                                     SelectionDAG &DAG) {
19446  SDLoc dl(Op);
19447  MVT OpVT = Op.getSimpleValueType();
19448
19449  // It's always cheaper to replace a xor+movd with xorps and simplifies further
19450  // combines.
19451  if (X86::isZeroNode(Op.getOperand(0)))
19452    return getZeroVector(OpVT, Subtarget, DAG, dl);
19453
19454  // If this is a 256-bit vector result, first insert into a 128-bit
19455  // vector and then insert into the 256-bit vector.
19456  if (!OpVT.is128BitVector()) {
19457    // Insert into a 128-bit vector.
19458    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
// NOTE(review): doc line 19459 — presumably the start of the VT128
// declaration ("MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),")
// — was dropped by the extraction; restore from upstream.
19460                                    OpVT.getVectorNumElements() / SizeFactor);
19461
19462    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19463
19464    // Insert the 128-bit vector.
19465    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19466  }
19467  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19468         "Expected an SSE type!");
19469
19470  // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
19471  // tblgen.
19472  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19473    return Op;
19474
// Narrow integer scalars: any-extend to i32, build a v4i32, and bitcast to
// the requested 128-bit type.
19475  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19476  return DAG.getBitcast(
19477      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19478}
19479
19480// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19481// simple superregister reference or explicit instructions to insert
19482// the upper bits of a vector.
// NOTE(review): doc line 19483 — the first signature line (presumably
// "static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget
// &Subtarget,") — was dropped by the HTML extraction; restore from upstream.
19484                                     SelectionDAG &DAG) {
// Only vXi1 (AVX-512 mask) subvector inserts reach this custom hook.
19485  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19486
19487  return insert1BitVector(Op, DAG, Subtarget);
19488}
19489
// Lower EXTRACT_SUBVECTOR of a vXi1 mask vector: shift the desired bits down
// to the LSBs with KSHIFTR, then take the subvector at index 0.
// NOTE(review): doc line 19490 — the first signature line (presumably
// "static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget
// &Subtarget,") — was dropped by the HTML extraction; restore from upstream.
19491                                      SelectionDAG &DAG) {
19492  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19493         "Only vXi1 extract_subvectors need custom lowering");
19494
19495  SDLoc dl(Op);
19496  SDValue Vec = Op.getOperand(0);
19497  uint64_t IdxVal = Op.getConstantOperandVal(1);
19498
19499  if (IdxVal == 0) // the operation is legal
19500    return Op;
19501
19502  // Extend to natively supported kshift.
19503  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19504
19505  // Shift to the LSB.
19506  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19507                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19508
// The wanted bits now start at element 0 of the widened mask.
19509  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19510                     DAG.getVectorIdxConstant(0, dl));
19511}
19512
19513// Returns the appropriate wrapper opcode for a global reference.
19514unsigned X86TargetLowering::getGlobalWrapperKind(
19515 const GlobalValue *GV, const unsigned char OpFlags) const {
19516 // References to absolute symbols are never PC-relative.
19517 if (GV && GV->isAbsoluteSymbolRef())
19518 return X86ISD::Wrapper;
19519
19520 // The following OpFlags under RIP-rel PIC use RIP.
19521 if (Subtarget.isPICStyleRIPRel() &&
19522 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19523 OpFlags == X86II::MO_DLLIMPORT))
19524 return X86ISD::WrapperRIP;
19525
19526 // GOTPCREL references must always use RIP.
19527 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19528 return X86ISD::WrapperRIP;
19529
19530 return X86ISD::Wrapper;
19531}
19532
19533// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19534// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19535// one of the above mentioned nodes. It has to be wrapped because otherwise
19536// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19537// be used to form addressing mode. These wrapped nodes will be selected
19538// into MOV32ri.
19539SDValue
19540X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19541  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19542
19543  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19544  // global base reg.
19545  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19546
19547  auto PtrVT = getPointerTy(DAG.getDataLayout());
// NOTE(review): doc line 19548 — presumably "SDValue Result =
// DAG.getTargetConstantPool(" — was dropped by the HTML extraction; the line
// below is its argument continuation. Restore from upstream.
19549      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19550  SDLoc DL(CP);
19551  Result =
19552      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19553  // With PIC, the address is actually $g + Offset.
19554  if (OpFlag) {
19555    Result =
19556        DAG.getNode(ISD::ADD, DL, PtrVT,
19557                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19558  }
19559
19560  return Result;
19561}
19562
19563SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19564 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19565
19566 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19567 // global base reg.
19568 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19569
19570 EVT PtrVT = Op.getValueType();
19571 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19572 SDLoc DL(JT);
19573 Result =
19574 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19575
19576 // With PIC, the address is actually $g + Offset.
19577 if (OpFlag)
19578 Result =
19579 DAG.getNode(ISD::ADD, DL, PtrVT,
19580 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19581
19582 return Result;
19583}
19584
19585SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19586 SelectionDAG &DAG) const {
19587 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19588}
19589
19590SDValue
19591X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19592 // Create the TargetBlockAddressAddress node.
19593 unsigned char OpFlags =
19594 Subtarget.classifyBlockAddressReference();
19595 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19596 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19597 SDLoc dl(Op);
19598 EVT PtrVT = Op.getValueType();
19599 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19600 Result =
19601 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19602
19603 // With PIC, the address is actually $g + Offset.
19604 if (isGlobalRelativeToPICBase(OpFlags)) {
19605 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19606 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19607 }
19608
19609 return Result;
19610}
19611
19612/// Creates target global address or external symbol nodes for calls or
19613/// other uses.
///
/// \param ForCall   true when lowering the callee of a call; enables the
///                  direct-call fast path and import-call optimization.
/// \param IsImpCall out-param set when import-call optimization applies;
///                  may be null when not lowering a call.
19614SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19615                                                 bool ForCall,
19616                                                 bool *IsImpCall) const {
19617  // Unpack the global address or external symbol.
19618  SDLoc dl(Op);
19619  const GlobalValue *GV = nullptr;
19620  int64_t Offset = 0;
19621  const char *ExternalSym = nullptr;
19622  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19623    GV = G->getGlobal();
19624    Offset = G->getOffset();
19625  } else {
19626    const auto *ES = cast<ExternalSymbolSDNode>(Op);
19627    ExternalSym = ES->getSymbol();
19628  }
19629
19630  // Calculate some flags for address lowering.
// NOTE(review): doc line 19631 — presumably the declaration of 'Mod' (the
// enclosing Module reference used below) — was dropped by the HTML
// extraction; restore from upstream.
19632  unsigned char OpFlags;
19633  if (ForCall)
19634    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19635  else
19636    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19637  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19638  bool NeedsLoad = isGlobalStubReference(OpFlags);
19639
// NOTE(review): doc lines 19640 and 19642 were dropped by the extraction;
// one of them presumably declares 'Result' used below. Restore from upstream.
19641  EVT PtrVT = Op.getValueType();
19643
19644  if (GV) {
19645    // Create a target global address if this is a global. If possible, fold the
19646    // offset into the global address reference. Otherwise, ADD it on later.
19647    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19648    // allowed because if the address of foo is 0, the ELF R_X86_64_32
19649    // relocation will compute to a negative value, which is invalid.
19650    int64_t GlobalOffset = 0;
19651    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
// NOTE(review): doc line 19652 — the remainder of this condition — was
// dropped by the HTML extraction; restore from upstream.
19653      std::swap(GlobalOffset, Offset);
19654    }
19655    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19656  } else {
19657    // If this is not a global address, this must be an external symbol.
19658    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19659  }
19660
19661  // If this is a direct call, avoid the wrapper if we don't need to do any
19662  // loads or adds. This allows SDAG ISel to match direct calls.
19663  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19664    return Result;
19665
19666  // If Import Call Optimization is enabled and this is an imported function
19667  // then make a note of it and return the global address without wrapping.
19668  if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19669      Mod.getModuleFlag("import-call-optimization")) {
19670    assert(ForCall && "Should only enable import call optimization if we are "
19671                      "lowering a call");
19672    *IsImpCall = true;
19673    return Result;
19674  }
19675
19676  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19677
19678  // With PIC, the address is actually $g + Offset.
19679  if (HasPICReg) {
19680    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19681                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19682  }
19683
19684  // For globals that require a load from a stub to get the address, emit the
19685  // load.
19686  if (NeedsLoad)
19687    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
// NOTE(review): doc line 19688 — the final argument of this getLoad call
// (presumably a MachinePointerInfo for the GOT) — was dropped by the HTML
// extraction; restore from upstream.
19689
19690  // If there was a non-zero offset that we didn't fold, create an explicit
19691  // addition for it.
19692  if (Offset != 0)
19693    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19694                         DAG.getSignedConstant(Offset, dl, PtrVT));
19695
19696  return Result;
19697}
19698
19699SDValue
19700X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19701 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19702}
19703
// Emit the TLSADDR/TLSBASEADDR/TLSDESC call sequence and return the address
// it produces. For TLSDESC, additionally adds the thread-pointer load.
// NOTE(review): doc line 19704 — the first signature line (presumably
// "static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,")
// — was dropped by the HTML extraction; restore from upstream.
19705                           const EVT PtrVT, unsigned ReturnReg,
19706                           unsigned char OperandFlags,
19707                           bool LoadGlobalBaseReg = false,
19708                           bool LocalDynamic = false) {
// NOTE(review): doc line 19709 — presumably the MachineFrameInfo reference
// 'MFI' used below — was dropped by the extraction; restore from upstream.
19710  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19711  SDLoc dl(GA);
19712  SDValue TGA;
19713  bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19714  SDValue Chain = DAG.getEntryNode();
19715  SDValue Ret;
19716  if (LocalDynamic && UseTLSDESC) {
19717    TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19718    // Reuse existing GetTLSADDR node if we can find it.
19719    if (TGA->hasOneUse()) {
19720      // TLSDESC uses TGA.
19721      SDNode *TLSDescOp = *TGA->user_begin();
19722      assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19723             "Unexpected TLSDESC DAG");
19724      // CALLSEQ_END uses TGA via a chain and glue.
19725      auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19726      assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19727             "Unexpected TLSDESC DAG");
19728      // CopyFromReg uses CALLSEQ_END via a chain and glue.
19729      auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19730      assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19731             "Unexpected TLSDESC DAG");
19732      Ret = SDValue(CopyFromRegOp, 0);
19733    }
19734  } else {
19735    TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19736                                     GA->getOffset(), OperandFlags);
19737  }
19738
19739  if (!Ret) {
19740    X86ISD::NodeType CallType = UseTLSDESC     ? X86ISD::TLSDESC
19741                                : LocalDynamic ? X86ISD::TLSBASEADDR
// NOTE(review): doc line 19742 — the final arm of this conditional operator
// (presumably ": X86ISD::TLSADDR;") — was dropped by the extraction.
19743
19744    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19745    if (LoadGlobalBaseReg) {
19746      SDValue InGlue;
19747      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19748                               DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19749                               InGlue);
19750      InGlue = Chain.getValue(1);
19751      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19752    } else {
19753      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19754    }
19755    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19756
19757    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19758    MFI.setHasCalls(true);
19759
19760    SDValue Glue = Chain.getValue(1);
19761    Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19762  }
19763
19764  if (!UseTLSDESC)
19765    return Ret;
19766
// TLSDESC returns an offset from the thread pointer; load %fs:0 / %gs:0 and
// add it in.
19767  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19768  unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19769
// NOTE(review): doc line 19770 — presumably the declaration of 'Ptr' (a
// pointer value in address space 'Seg' used by MachinePointerInfo below) —
// was dropped by the extraction; restore from upstream.
19771  SDValue Offset =
19772      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19773                  MachinePointerInfo(Ptr));
19774  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19775}
19776
19777// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19778static SDValue
// NOTE(review): doc line 19779 — the function-name line (presumably
// "LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG
// &DAG,") — was dropped by the HTML extraction; restore from upstream.
19780                                 const EVT PtrVT) {
// 32-bit GD: result in EAX; EBX must hold the GOT base for the TLSGD call.
19781  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19782                    /*LoadGlobalBaseReg=*/true);
19783}
19784
19785// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19786static SDValue
// NOTE(review): doc line 19787 — the function-name line (presumably
// "LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG
// &DAG,") — was dropped by the HTML extraction; restore from upstream.
19788                                 const EVT PtrVT) {
// LP64: the __tls_get_addr result comes back in RAX; no base reg load.
19789  return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19790}
19791
19792// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19793static SDValue
// NOTE(review): doc line 19794 — the function-name line (presumably
// "LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG
// &DAG,") — was dropped by the HTML extraction; restore from upstream.
19795                                 const EVT PtrVT) {
// X32 (ILP32 on x86-64): 32-bit pointers, so the result lands in EAX.
19796  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19797}
19798
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: compute the
// module's TLS base once, then add the variable's @dtpoff offset.
// NOTE(review): doc line 19799 — the first signature line (presumably
// "static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,") —
// was dropped by the HTML extraction; restore from upstream.
19800                                            SelectionDAG &DAG, const EVT PtrVT,
19801                                            bool Is64Bit, bool Is64BitLP64) {
19802  SDLoc dl(GA);
19803
19804  // Get the start address of the TLS block for this module.
// NOTE(review): doc lines 19805-19807 were dropped by the HTML extraction —
// presumably the X86MachineFunctionInfo bookkeeping that counts
// local-dynamic TLS accesses; restore from upstream.
19808
19809  SDValue Base;
19810  if (Is64Bit) {
19811    unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19812    Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19813                      /*LoadGlobalBaseReg=*/false,
19814                      /*LocalDynamic=*/true);
19815  } else {
19816    Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19817                      /*LoadGlobalBaseReg=*/true,
19818                      /*LocalDynamic=*/true);
19819  }
19820
19821  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19822  // of Base.
19823
19824  // Build x@dtpoff.
19825  unsigned char OperandFlags = X86II::MO_DTPOFF;
19826  unsigned WrapperKind = X86ISD::Wrapper;
19827  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19828                                           GA->getValueType(0),
19829                                           GA->getOffset(), OperandFlags);
19830  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19831
19832  // Add x@dtpoff with the base.
19833  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19834}
19835
19836// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// NOTE(review): doc line 19837 — the first signature line (presumably
// "static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG
// &DAG,") — was dropped by the HTML extraction; restore from upstream.
19838                                   const EVT PtrVT, TLSModel::Model model,
19839                                   bool is64Bit, bool isPIC) {
19840  SDLoc dl(GA);
19841
19842  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
// NOTE(review): doc lines 19843-19844 were dropped by the HTML extraction —
// presumably the declaration of 'Ptr' (a null pointer value in the FS/GS
// address space used by MachinePointerInfo below); restore from upstream.
19845
19846  SDValue ThreadPointer =
19847      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19848                  MachinePointerInfo(Ptr));
19849
19850  unsigned char OperandFlags = 0;
19851  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19852  // initialexec.
19853  unsigned WrapperKind = X86ISD::Wrapper;
19854  if (model == TLSModel::LocalExec) {
19855    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19856  } else if (model == TLSModel::InitialExec) {
19857    if (is64Bit) {
19858      OperandFlags = X86II::MO_GOTTPOFF;
19859      WrapperKind = X86ISD::WrapperRIP;
19860    } else {
19861      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19862    }
19863  } else {
19864    llvm_unreachable("Unexpected model");
19865  }
19866
19867  // emit "addl x@ntpoff,%eax" (local exec)
19868  // or "addl x@indntpoff,%eax" (initial exec)
19869  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19870  SDValue TGA =
19871      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19872                                 GA->getOffset(), OperandFlags);
19873  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19874
19875  if (model == TLSModel::InitialExec) {
19876    if (isPIC && !is64Bit) {
19877      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19878                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19879                           Offset);
19880    }
19881
// Initial exec: the @gottpoff slot holds the offset, so load through it.
19882    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
// NOTE(review): doc line 19883 — the final argument of this getLoad call
// (presumably a GOT MachinePointerInfo) — was dropped by the extraction.
19884  }
19885
19886  // The address of the thread local variable is the add of the thread
19887  // pointer with the offset of the variable.
19888  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19889}
19890
// Top-level TLS address lowering: dispatches per target OS (ELF, Darwin,
// Windows) and per TLS model.
19891SDValue
19892X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19893
19894  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19895
19896  if (DAG.getTarget().useEmulatedTLS())
19897    return LowerToTLSEmulatedModel(GA, DAG);
19898
19899  const GlobalValue *GV = GA->getGlobal();
19900  EVT PtrVT = Op.getValueType();
19901  bool PositionIndependent = isPositionIndependent();
19902
19903  if (Subtarget.isTargetELF()) {
19904    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19905    switch (model) {
// NOTE(review): doc line 19906 — a case label (presumably
// "case TLSModel::GeneralDynamic:") — was dropped by the HTML extraction.
19907      if (Subtarget.is64Bit()) {
19908        if (Subtarget.isTarget64BitLP64())
19909          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19910        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19911      }
19912      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
// NOTE(review): doc line 19913 — a case label (presumably
// "case TLSModel::LocalDynamic:") — was dropped by the extraction.
19914      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19915                                         Subtarget.isTarget64BitLP64());
// NOTE(review): doc lines 19916-19917 — two case labels (presumably
// "case TLSModel::InitialExec:" / "case TLSModel::LocalExec:") — were
// dropped by the extraction.
19918      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19919                                 PositionIndependent);
19920    }
19921    llvm_unreachable("Unknown TLS model.");
19922  }
19923
19924  if (Subtarget.isTargetDarwin()) {
19925    // Darwin only has one model of TLS. Lower to that.
19926    unsigned char OpFlag = 0;
19927    unsigned WrapperKind = 0;
19928
19929    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19930    // global base reg.
19931    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19932    if (PIC32) {
19933      OpFlag = X86II::MO_TLVP_PIC_BASE;
19934      WrapperKind = X86ISD::Wrapper;
19935    } else {
19936      OpFlag = X86II::MO_TLVP;
19937      WrapperKind = X86ISD::WrapperRIP;
19938    }
19939    SDLoc DL(Op);
// NOTE(review): doc line 19940 — presumably "SDValue Result =
// DAG.getTargetGlobalAddress(GA->getGlobal(), DL," — was dropped by the
// extraction; the two lines below are its argument continuation.
19941                                                GA->getValueType(0),
19942                                                GA->getOffset(), OpFlag);
19943    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19944
19945    // With PIC32, the address is actually $g + Offset.
19946    if (PIC32)
19947      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19948                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19949                           Offset);
19950
19951    // Lowering the machine isd will make sure everything is in the right
19952    // location.
19953    SDValue Chain = DAG.getEntryNode();
19954    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19955    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19956    SDValue Args[] = { Chain, Offset };
19957    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19958    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19959
19960    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19961    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19962    MFI.setAdjustsStack(true);
19963
19964    // And our return value (tls address) is in the standard call return value
19965    // location.
19966    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19967    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19968  }
19969
19970  if (Subtarget.isOSWindows()) {
19971    // Just use the implicit TLS architecture
19972    // Need to generate something similar to:
19973    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19974    //                                  ; from TEB
19975    //   mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19976    //   mov rcx, qword [rdx+rcx*8]
19977    //   mov eax, .tls$:tlsvar
19978    //   [rax+rcx] contains the address
19979    // Windows 64bit: gs:0x58
19980    // Windows 32bit: fs:__tls_array
19981
19982    SDLoc dl(GA);
19983    SDValue Chain = DAG.getEntryNode();
19984
19985    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19986    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19987    // use its literal value of 0x2C.
// NOTE(review): doc line 19988 — presumably the start of the 'Ptr'
// declaration whose conditional continues below — was dropped by the
// extraction; restore from upstream.
19989        Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
// NOTE(review): doc line 19990 — the ':' arm of this conditional (the FS
// address-space pointer type) — was dropped by the extraction.
19991
19992    SDValue TlsArray = Subtarget.is64Bit()
19993                           ? DAG.getIntPtrConstant(0x58, dl)
19994                           : (Subtarget.isTargetWindowsGNU()
19995                                  ? DAG.getIntPtrConstant(0x2C, dl)
19996                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
19997
19998    SDValue ThreadPointer =
19999        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20000
20001    SDValue res;
// NOTE(review): doc line 20002 — the condition guarding this branch
// (presumably a local-exec TLS-mode check on GV) — was dropped by the
// extraction; restore from upstream.
20003      res = ThreadPointer;
20004    } else {
20005      // Load the _tls_index variable
20006      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20007      if (Subtarget.is64Bit())
20008        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20009                             MachinePointerInfo(), MVT::i32);
20010      else
20011        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20012
20013      const DataLayout &DL = DAG.getDataLayout();
20014      SDValue Scale =
20015          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20016      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20017
20018      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20019    }
20020
20021    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20022
20023    // Get the offset of start of .tls section
20024    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20025                                             GA->getValueType(0),
// NOTE(review): doc line 20026 — the final arguments of this call
// (presumably the offset plus a SECREL operand flag) — was dropped by the
// extraction; restore from upstream.
20027    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20028
20029    // The address of the thread local variable is the add of the thread
20030    // pointer with the offset of the variable.
20031    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20032  }
20033
20034  llvm_unreachable("TLS not implemented for this target.");
20035}
20036
// Returns true when a TLS access to GV can be folded into an addressing mode
// (i.e. uses a plain %fs-relative address on 64-bit ELF).
// NOTE(review): doc line 20037 — the signature line (presumably
// "bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV)
// const {") — was dropped by the HTML extraction; restore from upstream.
20038  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20039    const TargetMachine &TM = getTargetMachine();
20040    TLSModel::Model Model = TM.getTLSModel(&GV);
20041    switch (Model) {
// NOTE(review): doc lines 20042-20043 — two case labels (presumably the
// exec models, LocalExec/InitialExec) — were dropped by the extraction.
20044      // We can include the %fs segment register in addressing modes.
20045      return true;
// NOTE(review): doc lines 20046-20047 — two case labels (presumably
// GeneralDynamic/LocalDynamic) — were dropped by the extraction.
20048      // These models do not result in %fs relative addresses unless
20049      // TLS descriptior are used.
20050      //
20051      // Even in the case of TLS descriptors we currently have no way to model
20052      // the difference between %fs access and the computations needed for the
20053      // offset and returning `true` for TLS-desc currently duplicates both
20054      // which is detrimental :-/
20055      return false;
20056    }
20057  }
20058  return false;
20059}
20060
20061/// Lower SRA_PARTS and friends, which return two i32 values
20062/// and take a 2 x i32 value to shift plus a shift amount.
20063/// TODO: Can this be moved to general expansion code?
// NOTE(review): doc line 20064 — the signature line (presumably
// "static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {") — was
// dropped by the HTML extraction; restore from upstream.
20065  SDValue Lo, Hi;
// Delegate to the target-independent expansion and repackage the two halves.
20066  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20067  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20068}
20069
20070// Try to use a packed vector operation to handle i64 on 32-bit targets when
20071// AVX512DQ is enabled.
// NOTE(review): doc line 20072 — the first signature line (presumably
// "static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,") —
// was dropped by the HTML extraction; restore from upstream.
20073                                      SelectionDAG &DAG,
20074                                      const X86Subtarget &Subtarget) {
20075  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20076          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20077          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20078          Op.getOpcode() == ISD::UINT_TO_FP) &&
20079         "Unexpected opcode!");
// Strict FP nodes carry the chain as operand 0, so the source shifts by one.
20080  bool IsStrict = Op->isStrictFPOpcode();
20081  unsigned OpNo = IsStrict ? 1 : 0;
20082  SDValue Src = Op.getOperand(OpNo);
20083  MVT SrcVT = Src.getSimpleValueType();
20084  MVT VT = Op.getSimpleValueType();
20085
20086  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20087      (VT != MVT::f32 && VT != MVT::f64))
20088    return SDValue();
20089
20090  // Pack the i64 into a vector, do the operation and extract.
20091
20092  // Using 256-bit to ensure result is 128-bits for f32 case.
20093  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20094  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20095  MVT VecVT = MVT::getVectorVT(VT, NumElts);
20096
20097  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20098  if (IsStrict) {
20099    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20100                                 {Op.getOperand(0), InVec});
20101    SDValue Chain = CvtVec.getValue(1);
20102    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20103                                DAG.getVectorIdxConstant(0, dl));
20104    return DAG.getMergeValues({Value, Chain}, dl);
20105  }
20106
20107  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20108
20109  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20110                     DAG.getVectorIdxConstant(0, dl));
20111}
20112
20113// Try to use a packed vector operation to handle i64 on 32-bit targets.
// NOTE(review): doc line 20114 — the first signature line (presumably
// "static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl,
// SelectionDAG &DAG,") — was dropped by the HTML extraction; restore from
// upstream.
20115                                 const X86Subtarget &Subtarget) {
20116  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20117          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20118          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20119          Op.getOpcode() == ISD::UINT_TO_FP) &&
20120         "Unexpected opcode!");
20121  bool IsStrict = Op->isStrictFPOpcode();
20122  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20123  MVT SrcVT = Src.getSimpleValueType();
20124  MVT VT = Op.getSimpleValueType();
20125
20126  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20127    return SDValue();
20128
20129  // Pack the i64 into a vector, do the operation and extract.
20130
20131  assert(Subtarget.hasFP16() && "Expected FP16");
20132
20133  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20134  if (IsStrict) {
20135    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20136                                 {Op.getOperand(0), InVec});
20137    SDValue Chain = CvtVec.getValue(1);
20138    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20139                                DAG.getVectorIdxConstant(0, dl));
20140    return DAG.getMergeValues({Value, Chain}, dl);
20141  }
20142
20143  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20144
20145  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20146                     DAG.getVectorIdxConstant(0, dl));
20147}
20148
20149static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20150 const X86Subtarget &Subtarget) {
20151 switch (Opcode) {
20152 case ISD::SINT_TO_FP:
20153 // TODO: Handle wider types with AVX/AVX512.
20154 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20155 return false;
20156 // CVTDQ2PS or (V)CVTDQ2PD
20157 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20158
20159 case ISD::UINT_TO_FP:
20160 // TODO: Handle wider types and i64 elements.
20161 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20162 return false;
20163 // VCVTUDQ2PS or VCVTUDQ2PD
20164 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20165
20166 default:
20167 return false;
20168 }
20169}
20170
20171/// Given a scalar cast operation that is extracted from a vector, try to
20172/// vectorize the cast op followed by extraction. This will avoid an expensive
20173/// round-trip between XMM and GPR.
// NOTE(review): doc line 20174 — the first signature line (presumably
// "static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,") —
// was dropped by the HTML extraction; restore from upstream.
20175                                      SelectionDAG &DAG,
20176                                      const X86Subtarget &Subtarget) {
20177  // TODO: This could be enhanced to handle smaller integer types by peeking
20178  // through an extend.
20179  SDValue Extract = Cast.getOperand(0);
20180  MVT DestVT = Cast.getSimpleValueType();
20181  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20182      !isa<ConstantSDNode>(Extract.getOperand(1)))
20183    return SDValue();
20184
20185  // See if we have a 128-bit vector cast op for this type of cast.
20186  SDValue VecOp = Extract.getOperand(0);
20187  EVT FromVT = VecOp.getValueType();
20188  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20189  MVT Vec128VT =
20190      MVT::getVectorVT(FromVT.getScalarType().getSimpleVT(), NumEltsInXMM);
20191  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20192  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20193    return SDValue();
20194
20195  // If we are extracting from a non-zero element, first shuffle the source
20196  // vector to allow extracting from element zero.
20197  if (!isNullConstant(Extract.getOperand(1))) {
20198    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20199    Mask[0] = Extract.getConstantOperandVal(1);
20200    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20201  }
20202  // If the source vector is wider than 128-bits, extract the low part. Do not
20203  // create an unnecessarily wide vector cast op.
20204  if (FromVT != Vec128VT)
20205    VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20206
20207  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20208  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20209  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20210  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20211                     DAG.getVectorIdxConstant(0, DL));
20212}
20213
20214/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20215/// try to vectorize the cast ops. This will avoid an expensive round-trip
20216/// between XMM and GPR.
20217static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
20218                                SelectionDAG &DAG,
20219                                const X86Subtarget &Subtarget) {
  // Match the pattern (s/uint_to_fp (fp_to_s/uint X)) on scalars only;
  // vector forms are handled elsewhere.
20220  SDValue CastToInt = CastToFP.getOperand(0);
20221  MVT VT = CastToFP.getSimpleValueType();
20222  if ((CastToInt.getOpcode() != ISD::FP_TO_SINT &&
20223       CastToInt.getOpcode() != ISD::FP_TO_UINT) ||
20224      VT.isVector())
20225    return SDValue();
20226
20227  MVT IntVT = CastToInt.getSimpleValueType();
20228  SDValue X = CastToInt.getOperand(0);
20229  MVT SrcVT = X.getSimpleValueType();
20230  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20231    return SDValue();
20232
20233  // See if we have 128-bit vector cast instructions for this type of cast.
20234  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20235  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20236      (IntVT != MVT::i32 && IntVT != MVT::i64))
20237    return SDValue();
20238
20239  unsigned SrcSize = SrcVT.getSizeInBits();
20240  unsigned IntSize = IntVT.getSizeInBits();
20241  unsigned VTSize = VT.getSizeInBits();
20242  bool IsUnsigned = CastToInt.getOpcode() == ISD::FP_TO_UINT;
  // When source and destination element sizes differ, the element counts of
  // the vector types will not match, so the generic ISD opcodes cannot be
  // used; fall back to the X86-specific packed-conversion nodes instead.
20243  unsigned ToIntOpcode =
20244      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20245  unsigned ToFPOpcode =
20246      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20247  unsigned Width = 128;
20248
20249  if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20250    // AVX512DQ+VLX
    // Unsigned conversions are directly available at 128-bit width here.
20251    if (IsUnsigned) {
20252      ToIntOpcode =
20253          SrcSize != IntSize ? X86ISD::CVTTP2UI : (unsigned)ISD::FP_TO_UINT;
20254      ToFPOpcode =
20255          IntSize != VTSize ? X86ISD::CVTUI2P : (unsigned)ISD::UINT_TO_FP;
20256    }
20257  } else {
20258    if (IsUnsigned || IntVT == MVT::i64) {
20259      // SSE2 can only perform f64/f32 <-> i32 signed.
20260      if (!Subtarget.useAVX512Regs() || !Subtarget.hasDQI())
20261        return SDValue();
20262
20263      // Need to extend width for AVX512DQ without AVX512VL.
20264      Width = 512;
20265      ToIntOpcode = CastToInt.getOpcode();
20266      ToFPOpcode = IsUnsigned ? ISD::UINT_TO_FP : ISD::SINT_TO_FP;
20267    }
20268  }
20269
20270  MVT VecSrcVT, VecIntVT, VecVT;
20271  unsigned NumElts;
20272  unsigned SrcElts, VTElts;
20273  // Some conversions are only legal with uniform vector sizes on AVX512DQ.
20274  if (Width == 512) {
    // Use a single element count for all three vector types so the generic
    // ISD conversion nodes see matching element counts.
20275    NumElts = std::min(Width / IntSize, Width / SrcSize);
20276    SrcElts = NumElts;
20277    VTElts = NumElts;
20278  } else {
20279    NumElts = Width / IntSize;
20280    SrcElts = Width / SrcSize;
20281    VTElts = Width / VTSize;
20282  }
20283  VecIntVT = MVT::getVectorVT(IntVT, NumElts);
20284  VecSrcVT = MVT::getVectorVT(SrcVT, SrcElts);
20285  VecVT = MVT::getVectorVT(VT, VTElts);
20286  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20287  //
20288  // We are not defining the high elements (for example, zero them) because
20289  // that could nullify any performance advantage that we hoped to gain from
20290  // this vector op hack. We do not expect any adverse effects (like denorm
20291  // penalties) with cast ops.
20292  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20293  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20294  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20295  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20296  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20297}
20298
/// Lower v2i64/v4i64 [STRICT_]SINT_TO_FP / [STRICT_]UINT_TO_FP.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; callers in this file invoke it as
// lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget).
20300                                    SelectionDAG &DAG,
20301                                    const X86Subtarget &Subtarget) {
20302  bool IsStrict = Op->isStrictFPOpcode();
20303  MVT VT = Op->getSimpleValueType(0);
20304  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20305
20306  if (Subtarget.hasDQI()) {
20307    assert(!Subtarget.hasVLX() && "Unexpected features");
20308
20309    assert((Src.getSimpleValueType() == MVT::v2i64 ||
20310            Src.getSimpleValueType() == MVT::v4i64) &&
20311           "Unsupported custom type");
20312
20313    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20314    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20315           "Unexpected VT!");
20316    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20317
20318    // Need to concat with zero vector for strict fp to avoid spurious
20319    // exceptions.
20320    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20321                           : DAG.getUNDEF(MVT::v8i64);
20322    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20323                      DAG.getVectorIdxConstant(0, DL));
20324    SDValue Res, Chain;
20325    if (IsStrict) {
20326      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20327                        {Op->getOperand(0), Src});
20328      Chain = Res.getValue(1);
20329    } else {
20330      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20331    }
20332
    // Extract the originally-requested narrow result from the wide op.
20333    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20334                      DAG.getVectorIdxConstant(0, DL));
20335
20336    if (IsStrict)
20337      return DAG.getMergeValues({Res, Chain}, DL);
20338    return Res;
20339  }
20340
20341  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20342                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
  // Without DQI, only the unsigned v4i64 -> v4f32 case is handled here.
20343  if (VT != MVT::v4f32 || IsSigned)
20344    return SDValue();
20345
  // Unsigned expansion: each element is converted via a *signed* i64->f32
  // scalar conversion. For inputs with the sign bit set, first halve the
  // value as (Src >> 1) | (Src & 1) — the OR of the low bit preserves the
  // round-to-nearest result — convert, then double with an fadd. A final
  // vselect picks the doubled path only for negative (sign-bit-set) inputs.
20346  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20347  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20348  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20349                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20350                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20351  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20352  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20353  SmallVector<SDValue, 4> SignCvts(4);
20354  SmallVector<SDValue, 4> Chains(4);
20355  for (int i = 0; i != 4; ++i) {
20356    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20357                              DAG.getVectorIdxConstant(i, DL));
20358    if (IsStrict) {
20359      SignCvts[i] =
20360          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20361                      {Op.getOperand(0), Elt});
20362      Chains[i] = SignCvts[i].getValue(1);
20363    } else {
20364      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20365    }
20366  }
20367  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20368
20369  SDValue Slow, Chain;
20370  if (IsStrict) {
20371    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20372    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20373                       {Chain, SignCvt, SignCvt});
20374    Chain = Slow.getValue(1);
20375  } else {
20376    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20377  }
20378
  // Narrow the v4i64 compare result to a v4i32 select mask.
20379  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20380  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20381
20382  if (IsStrict)
20383    return DAG.getMergeValues({Cvt, Chain}, DL);
20384
20385  return Cvt;
20386}
20387
/// Promote an int-to-fp op whose FP result type is not natively supported
/// (soft f16): perform the conversion at f32 (element) precision, then round
/// the result back down with [STRICT_]FP_ROUND.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; callers in this file invoke it as
// promoteXINT_TO_FP(Op, dl, DAG).
20389                                   SelectionDAG &DAG) {
20390  bool IsStrict = Op->isStrictFPOpcode();
20391  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20392  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20393  MVT VT = Op.getSimpleValueType();
  // Widen the result element type to f32, keeping the vector shape.
20394  MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20395
  // The 0 "rounding" operand of FP_ROUND marks this as a normal (not
  // truncating-only) round; isTarget distinguishes the target constant form.
20396  SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20397  if (IsStrict)
20398    return DAG.getNode(
20399        ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20400        {Chain,
20401         DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20402         Rnd});
20403  return DAG.getNode(ISD::FP_ROUND, dl, VT,
20404                     DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20405}
20406
20407static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20408 const X86Subtarget &Subtarget) {
20409 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20410 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20411 return true;
20412 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20413 return true;
20414 }
20415 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20416 return true;
20417 if (Subtarget.useAVX512Regs()) {
20418 if (VT == MVT::v16i32)
20419 return true;
20420 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20421 return true;
20422 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20423 return true;
20424 }
20425 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20426 (VT == MVT::v2i64 || VT == MVT::v4i64))
20427 return true;
20428 return false;
20429}
20430
/// Custom-lower [STRICT_]SINT_TO_FP: try cheap vectorized forms first, then
/// fall back to an x87 FILD through a stack temporary.
20431SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20432                                           SelectionDAG &DAG) const {
20433  bool IsStrict = Op->isStrictFPOpcode();
20434  unsigned OpNo = IsStrict ? 1 : 0;
20435  SDValue Src = Op.getOperand(OpNo);
20436  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20437  MVT SrcVT = Src.getSimpleValueType();
20438  MVT VT = Op.getSimpleValueType();
20439  SDLoc dl(Op);
20440
  // Soft f16 results: promote to f32 and round back.
20441  if (isSoftF16(VT, Subtarget))
20442    return promoteXINT_TO_FP(Op, dl, DAG);
  // Already natively legal: return the node unchanged.
20443  else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20444    return Op;
20445
20446  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20447    return LowerWin64_INT128_TO_FP(Op, DAG);
20448
  // Avoid XMM<->GPR round-trips when the scalar source came out of a vector.
20449  if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20450    return Extract;
20451
20452  if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20453    return R;
20454
20455  if (SrcVT.isVector()) {
20456    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20457      // Note: Since v2f64 is a legal type. We don't need to zero extend the
20458      // source for strict FP.
20459      if (IsStrict)
20460        return DAG.getNode(
20461            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20462            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20463                                DAG.getUNDEF(SrcVT))});
20464      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20465                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20466                                     DAG.getUNDEF(SrcVT)));
20467    }
20468    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20469      return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20470
20471    return SDValue();
20472  }
20473
20474  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20475         "Unknown SINT_TO_FP to lower!");
20476
20477  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20478
20479  // These are really Legal; return the operand so the caller accepts it as
20480  // Legal.
20481  if (SrcVT == MVT::i32 && UseSSEReg)
20482    return Op;
20483  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20484    return Op;
20485
20486  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20487    return V;
20488  if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20489    return V;
20490
20491  // SSE doesn't have an i16 conversion so we need to promote.
20492  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20493    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20494    if (IsStrict)
20495      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20496                         {Chain, Ext});
20497
20498    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20499  }
20500
  // No further lowering possible without x87.
20501  if (VT == MVT::f128 || !Subtarget.hasX87())
20502    return SDValue();
20503
20504  SDValue ValueToStore = Src;
20505  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20506    // Bitcasting to f64 here allows us to do a single 64-bit store from
20507    // an SSE register, avoiding the store forwarding penalty that would come
20508    // with two 32-bit stores.
20509    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20510
  // Spill the integer to a stack slot and reload it with FILD (see BuildFILD).
20511  unsigned Size = SrcVT.getStoreSize();
20512  Align Alignment(Size);
20513  MachineFunction &MF = DAG.getMachineFunction();
20514  auto PtrVT = getPointerTy(MF.getDataLayout());
20515  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
  // NOTE(review): the initializer line for MPI (line 20517, presumably
  // MachinePointerInfo::getFixedStack(MF, SSFI)) was lost in extraction.
20516  MachinePointerInfo MPI =
20518  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20519  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20520  std::pair<SDValue, SDValue> Tmp =
20521      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20522
20523  if (IsStrict)
20524    return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20525
20526  return Tmp.first;
20527}
20528
/// Build an x87 FILD (integer load) of \p SrcVT from \p Pointer producing
/// \p DstVT. If the destination lives in an SSE register, the f80 FILD result
/// is spilled with FST and reloaded into the SSE domain via a stack slot.
/// Returns {result, chain}.
20529std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20530    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20531    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20532  // Build the FILD
20533  SDVTList Tys;
20534  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
  // For SSE destinations, FILD still produces an x87 f80 value; it is moved
  // to the SSE domain through memory below.
20535  if (useSSE)
20536    Tys = DAG.getVTList(MVT::f80, MVT::Other);
20537  else
20538    Tys = DAG.getVTList(DstVT, MVT::Other);
20539
20540  SDValue FILDOps[] = {Chain, Pointer};
20541  SDValue Result =
20542      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20543                              Alignment, MachineMemOperand::MOLoad);
20544  Chain = Result.getValue(1);
20545
20546  if (useSSE) {
    // NOTE(review): line 20547 (presumably declaring MF as
    // DAG.getMachineFunction()) was lost in extraction; MF is used below.
20548    unsigned SSFISize = DstVT.getStoreSize();
20549    int SSFI =
20550        MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20551    auto PtrVT = getPointerTy(MF.getDataLayout());
20552    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20553    Tys = DAG.getVTList(MVT::Other);
20554    SDValue FSTOps[] = {Chain, Result, StackSlot};
    // NOTE(review): lines 20555-20556 (the creation of StoreMMO, presumably a
    // MachineMemOperand for the fixed stack slot) were lost in extraction.
20557        MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20558
    // FST truncates the f80 value to DstVT in memory; reload it as SSE.
20559    Chain =
20560        DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20561    Result = DAG.getLoad(
20562        DstVT, DL, Chain, StackSlot,
20564    Chain = Result.getValue(1);
20565  }
20566
20567  return { Result, Chain };
20568}
20569
20570/// Horizontal vector math instructions may be slower than normal math with
20571/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20572/// implementation, and likely shuffle complexity of the alternate sequence.
20573static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20574 const X86Subtarget &Subtarget) {
20575 bool IsOptimizingSize = DAG.shouldOptForSize();
20576 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20577 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20578}
20579
20580/// 64-bit unsigned integer to double expansion.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; the caller invokes it as
// LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget).
20582                                   SelectionDAG &DAG,
20583                                   const X86Subtarget &Subtarget) {
20584  // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20585  // when converting 0 when rounding toward negative infinity. Caller will
20586  // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
20587  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20588  // This algorithm is not obvious. Here it is what we're trying to output:
20589  /*
20590     movq %rax, %xmm0
20591     punpckldq (c0), %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20592     subpd (c1), %xmm0      // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20593     #ifdef __SSE3__
20594       haddpd %xmm0, %xmm0
20595     #else
20596       pshufd $0x4e, %xmm0, %xmm1
20597       addpd %xmm1, %xmm0
20598     #endif
20599  */
20600
20601  LLVMContext *Context = DAG.getContext();
20602
20603  // Build some magic constants.
  // 0x433... and 0x453... are the f64 exponents for 2^52 and 2^84: interleaving
  // each 32-bit half of the input with these produces two doubles whose values
  // are (2^52 + lo) and (2^84 + hi * 2^32) before bias subtraction.
20604  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20605  Constant *C0 = ConstantDataVector::get(*Context, CV0);
20606  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20607  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20608
  // NOTE(review): line 20609 (presumably declaring the CV1 SmallVector of
  // Constant* elements) was lost in extraction.
20610  CV1.push_back(
20611      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20612                                        APInt(64, 0x4330000000000000ULL))));
20613  CV1.push_back(
20614      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20615                                        APInt(64, 0x4530000000000000ULL))));
20616  Constant *C1 = ConstantVector::get(CV1);
20617  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20618
20619  // Load the 64-bit value into an XMM register.
20620  SDValue XR1 =
20621      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20622  SDValue CLod0 = DAG.getLoad(
20623      MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20625  SDValue Unpck1 =
20626      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20627
20628  SDValue CLod1 = DAG.getLoad(
20629      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20631  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20632  // TODO: Are there any fast-math-flags to propagate here?
  // Subtract the biases; the two lanes now hold the low and high
  // contributions, which are summed (horizontally) below.
20633  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20634  SDValue Result;
20635
20636  if (Subtarget.hasSSE3() &&
20637      shouldUseHorizontalOp(true, DAG, Subtarget)) {
20638    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20639  } else {
20640    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20641    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20642  }
20643  Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20644                       DAG.getVectorIdxConstant(0, dl));
20645  return Result;
20646}
20647
20648/// 32-bit unsigned integer to float expansion.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; the caller invokes it as
// LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget).
20650                                   SelectionDAG &DAG,
20651                                   const X86Subtarget &Subtarget) {
20652  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20653  // FP constant to bias correct the final result.
  // 0x4330... is 2^52 as an f64: OR-ing the zero-extended u32 into its low
  // mantissa bits yields exactly 2^52 + x, so subtracting the bias recovers x.
20654  SDValue Bias = DAG.getConstantFP(
20655      llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20656
20657  // Load the 32-bit value into an XMM register.
20658  SDValue Load =
20659      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20660
20661  // Zero out the upper parts of the register.
20662  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20663
20664  // Or the load with the bias.
20665  SDValue Or = DAG.getNode(
20666      ISD::OR, dl, MVT::v2i64,
20667      DAG.getBitcast(MVT::v2i64, Load),
20668      DAG.getBitcast(MVT::v2i64,
20669                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20670  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20671                   DAG.getBitcast(MVT::v2f64, Or),
20672                   DAG.getVectorIdxConstant(0, dl));
20673
20674  if (Op.getNode()->isStrictFPOpcode()) {
20675    // Subtract the bias.
20676    // TODO: Are there any fast-math-flags to propagate here?
20677    SDValue Chain = Op.getOperand(0);
20678    SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20679                              {Chain, Or, Bias});
20680
    // Already the requested type (f64): no rounding step needed.
20681    if (Op.getValueType() == Sub.getValueType())
20682      return Sub;
20683
20684    // Handle final rounding.
20685    std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20686        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20687
20688    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20689  }
20690
20691  // Subtract the bias.
20692  // TODO: Are there any fast-math-flags to propagate here?
20693  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20694
20695  // Handle final rounding.
20696  return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20697}
20698
/// Lower [STRICT_]UINT_TO_FP for v2i32 -> v2f64.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; the dispatcher below invokes it as
// lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget).
20700                                     SelectionDAG &DAG,
20701                                     const X86Subtarget &Subtarget) {
20702  if (Op.getSimpleValueType() != MVT::v2f64)
20703    return SDValue();
20704
20705  bool IsStrict = Op->isStrictFPOpcode();
20706
20707  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20708  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20709
20710  if (Subtarget.hasAVX512()) {
20711    if (!Subtarget.hasVLX()) {
20712      // Let generic type legalization widen this.
20713      if (!IsStrict)
20714        return SDValue();
20715      // Otherwise pad the integer input with 0s and widen the operation.
      // Zero padding (rather than undef) avoids spurious FP exceptions on the
      // extra lanes under strict FP.
20716      N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20717                       DAG.getConstant(0, DL, MVT::v2i32));
20718      SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20719                                {Op.getOperand(0), N0});
20720      SDValue Chain = Res.getValue(1);
20721      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20722                        DAG.getVectorIdxConstant(0, DL));
20723      return DAG.getMergeValues({Res, Chain}, DL);
20724    }
20725
20726    // Legalize to v4i32 type.
20727    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20728                     DAG.getUNDEF(MVT::v2i32));
20729    if (IsStrict)
20730      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20731                         {Op.getOperand(0), N0});
20732    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20733  }
20734
20735  // Zero extend to 2i64, OR with the floating point representation of 2^52.
20736  // This gives us the floating point equivalent of 2^52 + the i32 integer
20737  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20738  // point leaving just our i32 integers in double format.
20739  SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20740  SDValue VBias = DAG.getConstantFP(
20741      llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20742  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20743                           DAG.getBitcast(MVT::v2i64, VBias));
20744  Or = DAG.getBitcast(MVT::v2f64, Or);
20745
20746  if (IsStrict)
20747    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20748                       {Op.getOperand(0), Or, VBias});
20749  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20750}
20751
/// Lower [STRICT_]UINT_TO_FP for v4i32/v8i32 sources.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; the dispatcher below invokes it as
// lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget).
20753                                     SelectionDAG &DAG,
20754                                     const X86Subtarget &Subtarget) {
20755  bool IsStrict = Op->isStrictFPOpcode();
20756  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20757  MVT VecIntVT = V.getSimpleValueType();
20758  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20759         "Unsupported custom type");
20760
20761  if (Subtarget.hasAVX512()) {
20762    // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20763    assert(!Subtarget.hasVLX() && "Unexpected features");
20764    MVT VT = Op->getSimpleValueType(0);
20765
20766    // v8i32->v8f64 is legal with AVX512 so just return it.
20767    if (VT == MVT::v8f64)
20768      return Op;
20769
20770    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20771            VT == MVT::v8f16) &&
20772           "Unexpected VT!");
20773    MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20774    MVT WideIntVT = MVT::v16i32;
20775    if (VT == MVT::v4f64) {
20776      WideVT = MVT::v8f64;
20777      WideIntVT = MVT::v8i32;
20778    }
20779
20780    // Need to concat with zero vector for strict fp to avoid spurious
20781    // exceptions.
20782    SDValue Tmp =
20783        IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20784    V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20785                    DAG.getVectorIdxConstant(0, DL));
20786    SDValue Res, Chain;
20787    if (IsStrict) {
20788      Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20789                        {Op->getOperand(0), V});
20790      Chain = Res.getValue(1);
20791    } else {
20792      Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20793    }
20794
    // Extract the originally-requested narrow result from the wide op.
20795    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20796                      DAG.getVectorIdxConstant(0, DL));
20797
20798    if (IsStrict)
20799      return DAG.getMergeValues({Res, Chain}, DL);
20800    return Res;
20801  }
20802
  // AVX (no AVX512) v4i32 -> v4f64: zero-extend to i64 lanes and use the
  // 2^52 mantissa-bias trick, with the bias broadcast-loaded from the
  // constant pool.
20803  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20804      Op->getSimpleValueType(0) == MVT::v4f64) {
20805    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20806    Constant *Bias = ConstantFP::get(
20807        *DAG.getContext(),
20808        APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20809    auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20810    SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20811    SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20812    SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
    // NOTE(review): lines 20815-20816 (remaining arguments of this call,
    // presumably the MachinePointerInfo/alignment) were lost in extraction.
20813    SDValue VBias = DAG.getMemIntrinsicNode(
20814        X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20817
20818    SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20819                             DAG.getBitcast(MVT::v4i64, VBias));
20820    Or = DAG.getBitcast(MVT::v4f64, Or);
20821
20822    if (IsStrict)
20823      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20824                         {Op.getOperand(0), Or, VBias});
20825    return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20826  }
20827
20828  // The algorithm is the following:
20829  // #ifdef __SSE4_1__
20830  //   uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20831  //   uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20832  //                                 (uint4) 0x53000000, 0xaa);
20833  // #else
20834  //   uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20835  //   uint4 hi = (v >> 16) | (uint4) 0x53000000;
20836  // #endif
20837  //   float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20838  //   return (float4) lo + fhi;
20839
20840  bool Is128 = VecIntVT == MVT::v4i32;
20841  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20842  // If we convert to something else than the supported type, e.g., to v4f64,
20843  // abort early.
20844  if (VecFloatVT != Op->getSimpleValueType(0))
20845    return SDValue();
20846
20847  // In the #idef/#else code, we have in common:
20848  // - The vector of constants:
20849  // -- 0x4b000000
20850  // -- 0x53000000
20851  // - A shift:
20852  // -- v >> 16
20853
20854  // Create the splat vector for 0x4b000000.
20855  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20856  // Create the splat vector for 0x53000000.
20857  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20858
20859  // Create the right shift.
20860  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20861  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20862
20863  SDValue Low, High;
20864  if (Subtarget.hasSSE41()) {
20865    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20866    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20867    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20868    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20869    // Low will be bitcasted right away, so do not bother bitcasting back to its
20870    // original type.
20871    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20872                      VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20873    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20874    //                                 (uint4) 0x53000000, 0xaa);
20875    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20876    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20877    // High will be bitcasted right away, so do not bother bitcasting back to
20878    // its original type.
20879    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20880                       VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20881  } else {
20882    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20883    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20884    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20885    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20886
20887    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20888    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20889  }
20890
20891  // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20892  SDValue VecCstFSub = DAG.getConstantFP(
20893      APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20894
20895  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20896  // NOTE: By using fsub of a positive constant instead of fadd of a negative
20897  // constant, we avoid reassociation in MachineCombiner when reassoc is
20898  // enabled. See PR24512.
20899  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20900  // TODO: Are there any fast-math-flags to propagate here?
20901  // (float4) lo;
20902  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20903  // return (float4) lo + fhi;
20904  if (IsStrict) {
20905    SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20906                                {Op.getOperand(0), HighBitcast, VecCstFSub});
20907    return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20908                       {FHigh.getValue(1), LowBitcast, FHigh});
20909  }
20910
20911  SDValue FHigh =
20912      DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20913  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20914}
20915
/// Dispatch vector [STRICT_]UINT_TO_FP lowering by source vector type.
// NOTE(review): the declaring line (function name and leading parameters) was
// lost in extraction; LowerUINT_TO_FP below invokes it as
// lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget).
20917                                   const X86Subtarget &Subtarget) {
20918  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20919  SDValue N0 = Op.getOperand(OpNo);
20920  MVT SrcVT = N0.getSimpleValueType();
20921
20922  switch (SrcVT.SimpleTy) {
20923  default:
20924    llvm_unreachable("Custom UINT_TO_FP is not supported!");
20925  case MVT::v2i32:
20926    return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20927  case MVT::v4i32:
20928  case MVT::v8i32:
20929    return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20930  case MVT::v2i64:
20931  case MVT::v4i64:
    // i64 elements share the signed/unsigned lowering helper.
20932    return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20933  }
20934}
20935
20936SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20937 SelectionDAG &DAG) const {
20938 bool IsStrict = Op->isStrictFPOpcode();
20939 unsigned OpNo = IsStrict ? 1 : 0;
20940 SDValue Src = Op.getOperand(OpNo);
20941 SDLoc dl(Op);
20942 auto PtrVT = getPointerTy(DAG.getDataLayout());
20943 MVT SrcVT = Src.getSimpleValueType();
20944 MVT DstVT = Op->getSimpleValueType(0);
20945 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20946
20947 // Bail out when we don't have native conversion instructions.
20948 if (DstVT == MVT::f128)
20949 return SDValue();
20950
20951 if (isSoftF16(DstVT, Subtarget))
20952 return promoteXINT_TO_FP(Op, dl, DAG);
20953 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20954 return Op;
20955
20956 if (SDValue V = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20957 return V;
20958
20959 if (DstVT.isVector())
20960 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20961
20962 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20963 return LowerWin64_INT128_TO_FP(Op, DAG);
20964
20965 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20966 return Extract;
20967
20968 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20969 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20970 // Conversions from unsigned i32 to f32/f64 are legal,
20971 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20972 return Op;
20973 }
20974
20975 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20976 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20977 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20978 if (IsStrict)
20979 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20980 {Chain, Src});
20981 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20982 }
20983
20984 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20985 return V;
20986 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20987 return V;
20988
20989 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20990 // infinity. It produces -0.0, so disable under strictfp.
20991 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20992 !IsStrict)
20993 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20994 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20995 // negative infinity. So disable under strictfp. Using FILD instead.
20996 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20997 !IsStrict)
20998 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20999 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21000 (DstVT == MVT::f32 || DstVT == MVT::f64))
21001 return SDValue();
21002
21003 // Make a 64-bit buffer, and use it to build an FILD.
21004 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21005 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21006 Align SlotAlign(8);
21007 MachinePointerInfo MPI =
21009 if (SrcVT == MVT::i32) {
21010 SDValue OffsetSlot =
21011 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
21012 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21013 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21014 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21015 std::pair<SDValue, SDValue> Tmp =
21016 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21017 if (IsStrict)
21018 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21019
21020 return Tmp.first;
21021 }
21022
21023 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21024 SDValue ValueToStore = Src;
21025 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21026 // Bitcasting to f64 here allows us to do a single 64-bit store from
21027 // an SSE register, avoiding the store forwarding penalty that would come
21028 // with two 32-bit stores.
21029 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21030 }
21031 SDValue Store =
21032 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21033 // For i64 source, we need to add the appropriate power of 2 if the input
21034 // was negative. We must be careful to do the computation in x87 extended
21035 // precision, not in SSE.
21036 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21037 SDValue Ops[] = {Store, StackSlot};
21038 SDValue Fild =
21039 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21040 SlotAlign, MachineMemOperand::MOLoad);
21041 Chain = Fild.getValue(1);
21042
21043 // Check whether the sign bit is set.
21044 SDValue SignSet = DAG.getSetCC(
21045 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21046 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21047
21048 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21049 APInt FF(64, 0x5F80000000000000ULL);
21050 SDValue FudgePtr =
21051 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21052 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21053
21054 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21055 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21056 SDValue Four = DAG.getIntPtrConstant(4, dl);
21057 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21058 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21059
21060 // Load the value out, extending it from f32 to f80.
21061 SDValue Fudge = DAG.getExtLoad(
21062 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21064 CPAlignment);
21065 Chain = Fudge.getValue(1);
21066 // Extend everything to 80 bits to force it to be done on x87.
21067 // TODO: Are there any fast-math-flags to propagate here?
21068 if (IsStrict) {
21069 unsigned Opc = ISD::STRICT_FADD;
21070 // Windows needs the precision control changed to 80bits around this add.
21071 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21073
21074 SDValue Add =
21075 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21076 // STRICT_FP_ROUND can't handle equal types.
21077 if (DstVT == MVT::f80)
21078 return Add;
21079 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21080 {Add.getValue(1), Add,
21081 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
21082 }
21083 unsigned Opc = ISD::FADD;
21084 // Windows needs the precision control changed to 80bits around this add.
21085 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21087
21088 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
21089 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21090 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21091}
21092
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
//
// The lowering spills the FP value to a stack slot via an x87
// FP_TO_INT_IN_MEM node (FIST/FISTP) and reloads the integer result.
// On return, Chain is set to the output chain of the final load so
// strict-FP callers can thread it through.
SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                           bool IsSigned,
                                           SDValue &Chain) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  // Strict nodes carry the chain in operand 0; the FP value follows it.
  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
  EVT TheVT = Value.getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i32. PR44019
  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getStoreSize();
  int SSFI =
      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
    // FltOfs = (Value >= Thresh) ? Thresh : 0;
    // FistSrc = (Value - FltOfs);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    // 0x5f000000 is the IEEE-single encoding of 2^63 (the signed-i64 limit).
    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
                              &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), TheVT);
    SDValue Cmp;
    if (IsStrict) {
      // Use a signaling compare so strict semantics raise FP-invalid on
      // unordered (NaN) inputs, and thread the compare's chain.
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
                         /*IsSignaling*/ true);
      Chain = Cmp.getValue(1);
    } else {
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
    }

    // Our preferred lowering of
    //
    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
    //
    // is
    //
    // (Value >= Thresh) << 63
    //
    // but since we can get here after LegalOperations, DAGCombine might do the
    // wrong thing if we create a select. So, directly create the preferred
    // version.
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);

    // Subtract 2^63 from over-range inputs so the FIST sees an in-range
    // signed value; Adjust re-establishes the top bit afterwards.
    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
                                   DAG.getConstantFP(0.0, DL, TheVT));

    if (IsStrict) {
      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
                          { Chain, Value, FltOfs });
      Chain = Value.getValue(1);
    } else
      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
  }

  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // Spill the SSE value and reload it into the x87 stack with an FLD so
    // the FIST below can consume it.
    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackSlot };

    unsigned FLDSize = TheVT.getStoreSize();
    assert(FLDSize <= MemSize && "Stack slot not big enough");
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
    Chain = Value.getValue(1);
  }

  // Build the FP_TO_INT*_IN_MEM
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
                                         DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
  Chain = Res.getValue(1);

  // If we need an unsigned fixup, XOR the result with adjust.
  if (UnsignedFixup)
    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

  return Res;
}
21252
                              const X86Subtarget &Subtarget) {
  // Lower a vector any/zero-extend for AVX1 targets (no native 256-bit
  // integer extends): extend the low half in-register and synthesize the
  // high half with an unpack-high, then concatenate.
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  // Map ISD::ANY/ZERO_EXTEND to the matching *_EXTEND_VECTOR_INREG opcode
  // used for the low half below.
  unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);

  // Without BWI there is no v32i16 type support; emit as two v16i16 halves.
  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG, dl);
  }

  // AVX2 (Int256) has native 256-bit extends; leave the node as-is.
  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  // v8i16 -> v8i32
  // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
  // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
  // Concat upper and lower parts.
  //
  // v4i32 -> v4i64
  // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
  // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
  // Concat upper and lower parts.
  //
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

  // High half: interleave the upper elements with zeros (zext) or undef
  // (anyext), then reinterpret the widened lanes as HalfVT.
  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Opc == ISD::ZERO_EXTEND;
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  OpHi = DAG.getBitcast(HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
21313
21314// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21315static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21316 const SDLoc &dl, SelectionDAG &DAG) {
21317 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21318 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21319 DAG.getVectorIdxConstant(0, dl));
21320 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21321 DAG.getVectorIdxConstant(8, dl));
21322 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21323 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21324 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21325 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21326}
21327
21329 const X86Subtarget &Subtarget,
21330 SelectionDAG &DAG) {
21331 MVT VT = Op->getSimpleValueType(0);
21332 SDValue In = Op->getOperand(0);
21333 MVT InVT = In.getSimpleValueType();
21334 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21335 unsigned NumElts = VT.getVectorNumElements();
21336
21337 // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
21338 // avoids a constant pool load.
21339 if (VT.getVectorElementType() != MVT::i8) {
21340 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21341 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21342 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21343 }
21344
21345 // Extend VT if BWI is not supported.
21346 MVT ExtVT = VT;
21347 if (!Subtarget.hasBWI()) {
21348 // If v16i32 is to be avoided, we'll need to split and concatenate.
21349 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21350 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21351
21352 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21353 }
21354
21355 // Widen to 512-bits if VLX is not supported.
21356 MVT WideVT = ExtVT;
21357 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21358 NumElts *= 512 / ExtVT.getSizeInBits();
21359 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21360 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21361 DAG.getVectorIdxConstant(0, DL));
21362 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21363 }
21364
21365 SDValue One = DAG.getConstant(1, DL, WideVT);
21366 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21367
21368 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21369
21370 // Truncate if we had to extend above.
21371 if (VT != ExtVT) {
21372 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21373 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21374 }
21375
21376 // Extract back to 128/256-bit if we widened.
21377 if (WideVT != VT)
21378 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR,