#define DEBUG_TYPE "x86-isel"
71 "x86-experimental-pref-innermost-loop-alignment",
cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
79 "x86-br-merging-base-cost",
cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
90 "x86-br-merging-ccmp-bias",
cl::init(6),
91 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
    cl::desc("Replace narrow shifts with wider shifts."),
101 "x86-br-merging-likely-bias",
cl::init(0),
102 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
112 "x86-br-merging-unlikely-bias",
cl::init(-1),
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
124 "mul-constant-optimization",
cl::init(
true),
125 cl::desc(
"Replace 'mul x, Const' with more effective instructions like "
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  if (Subtarget.isAtom())
  else if (Subtarget.is64Bit())
  if (Subtarget.hasSlowDivide32())
  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
  static const struct {
    const char *const Name;
  for (const auto &LC : LibraryCalls) {
  if (Subtarget.is64Bit())
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())
  if (!Subtarget.useSoftFloat()) {
  if (!Subtarget.is64Bit()) {
  for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
  if (Subtarget.is64Bit()) {
  if (Subtarget.hasAVX10_2()) {
    for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
    if (Subtarget.hasAVX10_2_512()) {
    if (Subtarget.is64Bit()) {
  if (Subtarget.is64Bit()) {
  } else if (!Subtarget.is64Bit())
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  if (Subtarget.is64Bit())
  if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
  if (!Subtarget.hasBMI()) {
    if (Subtarget.is64Bit()) {
  if (Subtarget.hasLZCNT()) {
  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
                     (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
  for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
  if (Subtarget.is64Bit())
  if (Subtarget.hasPOPCNT()) {
  if (!Subtarget.hasMOVBE())
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  if (!Subtarget.is64Bit())
  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
  bool Is64Bit = Subtarget.is64Bit();
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
                     : &X86::FR16RegClass);
                     : &X86::FR32RegClass);
                     : &X86::FR64RegClass);
    for (auto VT : { MVT::f32, MVT::f64 }) {
    setF16Action(MVT::f16, Promote);
  } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
             (UseX87 || Is64Bit)) {
    for (auto VT : { MVT::f32, MVT::f64 }) {
  if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
    addLegalFPImmediate(APFloat(+0.0f));
    addLegalFPImmediate(APFloat(+1.0f));
    addLegalFPImmediate(APFloat(-0.0f));
    addLegalFPImmediate(APFloat(-1.0f));
    addLegalFPImmediate(APFloat(+0.0f));
    addLegalFPImmediate(APFloat(+0.0));
    addLegalFPImmediate(APFloat(+1.0));
    addLegalFPImmediate(APFloat(-0.0));
    addLegalFPImmediate(APFloat(-1.0));
    addLegalFPImmediate(APFloat(+0.0));
      addLegalFPImmediate(TmpFlt);
      addLegalFPImmediate(TmpFlt);
      addLegalFPImmediate(TmpFlt2);
      addLegalFPImmediate(TmpFlt2);
  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
                       : &X86::VR128RegClass);
    for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                     MVT::v4f32, MVT::v8f32, MVT::v16f32,
                     MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
                       : &X86::VR128RegClass);
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
                       : &X86::VR128RegClass);
                       : &X86::VR128RegClass);
                       : &X86::VR128RegClass);
                       : &X86::VR128RegClass);
                       : &X86::VR128RegClass);
                       : &X86::VR128RegClass);
    for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
    for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
    setF16Action(MVT::v8f16, Expand);
    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      if (VT == MVT::v2i64)
        continue;
    if (Subtarget.hasGFNI()) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
                       : &X86::VR256RegClass);
    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      if (VT == MVT::v4i64)
        continue;
    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                     MVT::v2f64, MVT::v4f64 }) {
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
    setF16Action(MVT::v16f16, Expand);
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
  if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
      Subtarget.hasF16C()) {
    for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    if (!Subtarget.hasDQI()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
    if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
      for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
    bool HasBWI = Subtarget.hasBWI();
    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
    if (Subtarget.hasDQI())
    for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
    for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
    if (!Subtarget.hasVLX()) {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
    for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
    if (Subtarget.hasDQI()) {
    if (Subtarget.hasCDI()) {
      for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v16i32, MVT::v8i64 })
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v16f16, MVT::v8f32, MVT::v4f64 })
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
                     MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
    setF16Action(MVT::v32f16, Expand);
    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
    if (Subtarget.hasVBMI2()) {
      for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
    for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    if (Subtarget.hasDQI()) {
             "Unexpected operation action!");
    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
    if (Subtarget.hasDQI()) {
    if (Subtarget.hasCDI()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
    for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
                   MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
                   MVT::v16i16, MVT::v8i8})
    for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
    if (Subtarget.hasVLX())
      for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
                     MVT::v4f64, MVT::v2i64, MVT::v2f64})
    if (Subtarget.hasVBMI2())
      for (MVT VT : {MVT::v32i16, MVT::v64i8})
    if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
      for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
    for (auto VT : { MVT::v16i1, MVT::v32i1 })
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
  if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
    auto setGroup = [&] (MVT VT) {
    setGroup(MVT::v32f16);
    if (Subtarget.hasVLX()) {
      setGroup(MVT::v8f16);
      setGroup(MVT::v16f16);
  if (!Subtarget.useSoftFloat() &&
      (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
                       : &X86::VR128RegClass);
                       : &X86::VR256RegClass);
    for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
      setF16Action(VT, Expand);
      if (!Subtarget.hasBF16())
  if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
    setF16Action(MVT::v32bf16, Expand);
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
    for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
    if (Subtarget.hasAVX10_2_512()) {
    for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    if (Subtarget.hasBWI()) {
    if (Subtarget.hasFP16()) {
  if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
  if (!Subtarget.is64Bit()) {
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
  if (Subtarget.is32Bit() &&
  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
      !Subtarget.hasBWI())
                                   bool AssumeSingleUse) {
  if (!AssumeSingleUse && !Op.hasOneUse())
  auto *Ld = cast<LoadSDNode>(Op.getNode());
  if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
      Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
                                          bool AssumeSingleUse) {
  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
  auto *Ld = cast<LoadSDNode>(Op.getNode());
  return !Ld->isVolatile() ||
  if (!Op.hasOneUse())
  if (Op.hasOneUse()) {
    unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
  default:
    return false;
  default:
    return false;
  int ReturnAddrIndex = FuncInfo->getRAIndex();
  if (ReturnAddrIndex == 0) {
                                    bool HasSymbolicDisplacement) {
  if (!HasSymbolicDisplacement)
  return Offset < 16 * 1024 * 1024;
  switch (SetCCOpcode) {
    if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
    if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
    if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
    if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
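// Sketch of the immediate canonicalizations above (illustrative, not the
// complete set): a signed compare "x > -1" (SETGT against all-ones) is
// treated like "x >= 0", and "x < 1" (SETLT against one) like "x <= 0", so
// the later X86 condition-code selection only has to handle the
// zero-compared forms.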
  switch (SetCCOpcode) {
  switch (SetCCOpcode) {
                                            unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::x86_aesenc128kl:
  case Intrinsic::x86_aesdec128kl:
    Info.ptrVal = I.getArgOperand(1);
  case Intrinsic::x86_aesenc256kl:
  case Intrinsic::x86_aesdec256kl:
    Info.ptrVal = I.getArgOperand(1);
  case Intrinsic::x86_aesencwide128kl:
  case Intrinsic::x86_aesdecwide128kl:
    Info.ptrVal = I.getArgOperand(0);
  case Intrinsic::x86_aesencwide256kl:
  case Intrinsic::x86_aesdecwide256kl:
    Info.ptrVal = I.getArgOperand(0);
  case Intrinsic::x86_cmpccxadd32:
  case Intrinsic::x86_cmpccxadd64:
  case Intrinsic::x86_atomic_bts:
  case Intrinsic::x86_atomic_btc:
  case Intrinsic::x86_atomic_btr: {
    Info.ptrVal = I.getArgOperand(0);
    unsigned Size = I.getType()->getScalarSizeInBits();
  case Intrinsic::x86_atomic_bts_rm:
  case Intrinsic::x86_atomic_btc_rm:
  case Intrinsic::x86_atomic_btr_rm: {
    Info.ptrVal = I.getArgOperand(0);
    unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
  case Intrinsic::x86_aadd32:
  case Intrinsic::x86_aadd64:
  case Intrinsic::x86_aand32:
  case Intrinsic::x86_aand64:
  case Intrinsic::x86_aor32:
  case Intrinsic::x86_aor64:
  case Intrinsic::x86_axor32:
  case Intrinsic::x86_axor64:
  case Intrinsic::x86_atomic_add_cc:
  case Intrinsic::x86_atomic_sub_cc:
  case Intrinsic::x86_atomic_or_cc:
  case Intrinsic::x86_atomic_and_cc:
  case Intrinsic::x86_atomic_xor_cc: {
    Info.ptrVal = I.getArgOperand(0);
    unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
  switch (IntrData->Type) {
    Info.ptrVal = I.getArgOperand(0);
      ScalarVT = MVT::i16;
      ScalarVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.ptrVal = nullptr;
                                     bool ForCodeSize) const {
  for (const APFloat &FPImm : LegalFPImmediates)
    if (Imm.bitwiseIsEqual(FPImm))
  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
  EVT VT = Load->getValueType(0);
    if (Use.getResNo() != 0)
  if (BitSize == 0 || BitSize > 64)
      (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
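// Examples of constants accepted by the check above (illustrative): MulC == 7
// passes because (7 + 1) is a power of two, so the multiply can be lowered as
// (x << 3) - x; MulC == 9 passes via (9 - 1), giving (x << 3) + x.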
                                  unsigned Index) const {
  return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
  return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
  return !Subtarget.hasSSE2() || VT == MVT::f80;
  return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
         (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
  unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
  return Subtarget.hasFastLZCNT();
  EVT VT = Y.getValueType();
  if (!Subtarget.hasBMI())
  if (VT != MVT::i32 && VT != MVT::i64)
  return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
  EVT VT = Y.getValueType();
  if (VT == MVT::v4i32)
  return X.getValueType().isScalarInteger();
    unsigned OldShiftOpcode, unsigned NewShiftOpcode,
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
  if (X.getValueType().isScalarInteger())
    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
    const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
  bool PreferRotate = false;
    PreferRotate = Subtarget.hasBMI2();
    if (!PreferRotate) {
      PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
    assert(AndMask.has_value() &&
           "Null andmask when querying about shift+and");
    if (PreferRotate && MayTransformRotate)
    if (PreferRotate || !MayTransformRotate || VT.isVector())
                                  const Value *Rhs) const {
  if (BaseCost >= 0 && Subtarget.hasCCMP())
  if (BaseCost >= 0 && Opc == Instruction::And &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  EVT VT = N->getValueType(0);
  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
    return N->getOperand(1) == N->getOperand(0).getOperand(1);
  EVT VT = Y.getValueType();
  if (VT == MVT::i64 && !Subtarget.is64Bit())
      [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
      [](int M) { return M == SM_SentinelUndef; });
  unsigned NumElts = Mask.size();
  unsigned NumElts = Mask.size();
  return (Val >= Low && Val < Hi);
  unsigned NumElts = Mask.size();
                                unsigned Size, int Low, int Step = 1) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
  unsigned NumElts = Mask.size();
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M1 = Mask[i + 1];
      WidenedMask[i / 2] = M1 / 2;
      WidenedMask[i / 2] = M0 / 2;
      WidenedMask[i / 2] = M0 / 2;
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");
                                    const APInt &Zeroable,
  assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
  for (int i = 0, Size = Mask.size(); i != Size; ++i)
  unsigned NumSrcElts = Mask.size();
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Illegal shuffle scale factor");
  if (NumDstElts >= NumSrcElts) {
    int Scale = NumDstElts / NumSrcElts;
  while (ScaledMask.size() > NumDstElts) {
    ScaledMask = std::move(WidenedMask);
                           const SDLoc &dl, bool IsMask = false) {
  MVT ConstVecVT = VT;
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
         "Unequal constant and undef arrays");
  MVT ConstVecVT = VT;
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    const APInt &V = Bits[i];
    } else if (EltVT == MVT::f32) {
    } else if (EltVT == MVT::f64) {
         "Unexpected vector type");
         "Unexpected vector type");
      LHS.getValueType() != RHS.getValueType() ||
      LHS.getOperand(0) != RHS.getOperand(0))
  if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
  unsigned NumElts = LHS.getValueType().getVectorNumElements();
  if ((LHS.getConstantOperandAPInt(1) == 0 &&
       RHS.getConstantOperandAPInt(1) == NumElts) ||
      (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
       LHS.getConstantOperandAPInt(1) == NumElts))
                                   const SDLoc &dl, unsigned vectorWidth) {
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  IdxVal &= ~(ElemsPerChunk - 1);
                               Vec->ops().slice(IdxVal, ElemsPerChunk));
                                  unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT ResultVT = Result.getValueType();
  IdxVal &= ~(ElemsPerChunk - 1);
         "Unsupported vector widening type");
                              const SDLoc &dl, unsigned WideSizeInBits) {
         "Unsupported vector widening type");
  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
    return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
  assert(Ops.empty() && "Expected an empty ops vector");
    Ops.append(N->op_begin(), N->op_end());
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    if (Idx == 0 && Src.isUndef()) {
        Src.getOperand(1).getValueType() == SubVT &&
    if (Src.isUndef()) {
  unsigned NumSubOps = SubOps.size();
  unsigned HalfNumSubOps = NumSubOps / 2;
  assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
  EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
  EVT VT = Op.getValueType();
  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
         "Can't split odd sized vector");
    return std::make_pair(Lo, Lo);
  return std::make_pair(Lo, Hi);
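// Illustrative: splitting a v8i32 value yields two v4i32 halves, conceptually
// extracted at element indices 0 and 4; the (Lo, Lo) pair above covers the
// case where both halves are known to be identical. This is a sketch of the
// intent, not a restatement of every special case handled here.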
  EVT VT = Op.getValueType();
  for (unsigned I = 0; I != NumOps; ++I) {
    if (!SrcOp.getValueType().isVector()) {
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
template <typename F>
                              F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  unsigned NumSubs = 1;
  } else if (Subtarget.hasAVX2()) {
    return Builder(DAG, DL, Ops);
  for (unsigned i = 0; i != NumSubs; ++i) {
  EVT OpVT = Op.getValueType();
  if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                          HasAnyUndefs, OpEltSizeInBits) &&
      !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
    MVT OpVT = Op.getSimpleValueType();
    assert(OpVT == VT && "Vector type mismatch");
    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
  unsigned IdxVal = Op.getConstantOperandVal(2);
  if (IdxVal == 0 && Vec.isUndef())
  MVT OpVT = Op.getSimpleValueType();
  assert(IdxVal + SubVecNumElems <= NumElems &&
         "Unexpected index value in INSERT_SUBVECTOR");
                       Undef, SubVec, ZeroIdx);
    assert(IdxVal != 0 && "Unexpected index");
    assert(IdxVal != 0 && "Unexpected index");
        [](SDValue V) { return V.isUndef(); })) {
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    if (ShiftRight != 0)
  if (IdxVal + SubVecNumElems == NumElems) {
    if (SubVecNumElems * 2 == NumElems) {
                      Undef, Vec, ZeroIdx);
  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
  unsigned LowShift = NumElems - IdxVal;
  unsigned HighShift = IdxVal + SubVecNumElems;
         "Expected a 128/256/512-bit vector type");
  EVT InVT = In.getValueType();
         "Unknown extension opcode");
         "Expected VTs to be the same size!");
  InVT = In.getValueType();
                                bool Lo, bool Unary) {
         "Illegal vector type to unpack");
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
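// Example masks produced for v4i32 (one 128-bit lane): unpack-lo gives
// <0,4,1,5> (an UNPCKLPS/UNPCKLDQ-style interleave of the low halves) and
// unpack-hi gives <2,6,3,7>; with Unary set, both interleave lanes read from
// the first source instead, e.g. <0,0,1,1> for the low unpack.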
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  for (int i = 0; i < NumElts; ++i) {
    Pos += (Lo ? 0 : NumElts / 2);
    Mask.push_back(Pos);
  for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
    SDValue V = (M < NumElts) ? V1 : V2;
    Ops[I] = V.getOperand(M % NumElts);
                      bool PackHiHalf = false) {
  MVT OpVT = LHS.getSimpleValueType();
  bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
  assert(OpVT == RHS.getSimpleValueType() &&
         "Unexpected PACK operand types");
  assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
         "Unexpected PACK result type");
  if (EltSizeInBits == 32) {
    int Offset = PackHiHalf ? 1 : 0;
    for (int I = 0; I != NumElts; I += 4) {
  MVT VT = V2.getSimpleValueType();
  for (int i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return dyn_cast<ConstantPoolSDNode>(Ptr);
  assert(LD && "Unexpected null LoadSDNode");
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = false) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");
  EVT VT = Op.getValueType();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
           "Constant bit sizes don't match");
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (NumSrcElts == NumElts) {
      UndefElts = UndefSrcElts;
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
    APInt UndefBits(SizeInBits, 0);
    APInt MaskBits(SizeInBits, 0);
    for (unsigned i = 0; i != NumSrcElts; ++i) {
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (UndefSrcElts[i])
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
      MaskBits.insertBits(SrcEltBits[i], BitOffset);
    UndefElts = APInt(NumElts, 0);
    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
        if (!AllowWholeUndefs)
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
                                  unsigned UndefBitIndex) {
    if (isa<UndefValue>(Cst)) {
      Undefs.setBit(UndefBitIndex);
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask = CInt->getValue();
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask = CFP->getValueAPF().bitcastToAPInt();
    if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
      Type *Ty = CDS->getType();
      Type *EltTy = CDS->getElementType();
      if (!IsInteger && !IsFP)
      for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
          Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
          Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
    return CastBitData(UndefSrcElts, SrcEltBits);
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return CastBitData(UndefSrcElts, SrcEltBits);
  if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
    APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
    return CastBitData(UndefSrcElts, SrcEltBits);
  if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
    if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
      for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
      return CastBitData(UndefSrcElts, SrcEltBits);
    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
    if ((SizeInBits % SrcEltSizeInBits) != 0)
    APInt UndefSrcElts(NumSrcElts, 0);
    for (unsigned i = 0; i != NumSrcElts; ++i)
    return CastBitData(UndefSrcElts, SrcEltBits);
    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
    APInt UndefSrcElts(NumSrcElts, 0);
    if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
      if (UndefSrcElts[0])
        UndefSrcElts.setBits(0, NumSrcElts);
      if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
        SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
      SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
      return CastBitData(UndefSrcElts, SrcEltBits);
    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
    unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
    if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
        (SizeInBits % SubVecSizeInBits) != 0)
    unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
    unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
    APInt UndefSubElts(NumSubElts, 0);
                                      APInt(CstEltSizeInBits, 0));
    for (unsigned i = 0; i != NumSubElts; ++i) {
      for (unsigned j = 1; j != NumSubVecs; ++j)
        SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
    return CastBitData(UndefSubElts, SubEltBits);
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
    APInt UndefSrcElts(NumSrcElts, 0);
    const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
    SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
    bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
    APInt UndefSrcElts, UndefSubElts;
                                      UndefSubElts, EltSubBits,
                                      AllowWholeUndefs && AllowUndefs,
                                      AllowPartialUndefs && AllowUndefs) &&
                                      UndefSrcElts, EltSrcBits,
                                      AllowWholeUndefs && AllowUndefs,
                                      AllowPartialUndefs && AllowUndefs)) {
      unsigned BaseIdx = Op.getConstantOperandVal(2);
      UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
      for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
        EltSrcBits[BaseIdx + i] = EltSubBits[i];
      return CastBitData(UndefSrcElts, EltSrcBits);
                                    UndefElts, EltBits, AllowWholeUndefs,
                                    AllowPartialUndefs)) {
      EVT SrcVT = Op.getOperand(0).getValueType();
      unsigned BaseIdx = Op.getConstantOperandVal(1);
      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
      if ((BaseIdx + NumSubElts) != NumSrcElts)
        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
    if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
    APInt UndefElts0, UndefElts1;
                                    UndefElts0, EltBits0, AllowWholeUndefs,
                                    AllowPartialUndefs))
                                    UndefElts1, EltBits1, AllowWholeUndefs,
                                    AllowPartialUndefs))
    for (int i = 0; i != (int)NumElts; ++i) {
      } else if (M < (int)NumElts) {
        if (UndefElts1[M - NumElts])
        EltBits.push_back(EltBits1[M - NumElts]);
          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
          true, AllowPartialUndefs)) {
    int SplatIndex = -1;
    for (int i = 0, e = EltBits.size(); i != e; ++i) {
      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
    if (0 <= SplatIndex) {
      SplatVal = EltBits[SplatIndex];
                                        unsigned MaskEltSizeInBits,
    for (const APInt &Elt : EltBits)
    bool IsPow2OrUndef = true;
    for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
      IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
    return IsPow2OrUndef;
  EVT VT = V.getValueType();
    return V.getOperand(0);
      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
      V.getOperand(0).hasOneUse()) {
                                      V.getScalarValueSizeInBits(), UndefElts,
    bool MinSigned = false;
    for (APInt &Elt : EltBits) {
      MinSigned |= Elt.isMinSignedValue();
    MVT VT = V.getSimpleValueType();
    for (SDValue &CatOp : CatOps) {
      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
      V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
                                 bool Unary, unsigned NumStages = 1) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  unsigned Offset = Unary ? 0 : NumElts;
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane));
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
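// Example (illustrative): for a v16i8 PACKUSWB-style pack (one 128-bit lane,
// NumStages == 1) the mask built above is
// <0,2,4,...,14, 16,18,...,30>, i.e. every other element of the first source
// followed by every other element of the second, matching how PACKSS/PACKUS
// narrow each input element into a half-width result.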
  int NumInnerElts = NumElts / 2;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumInnerEltsPerLane = NumInnerElts / NumLanes;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      if (DemandedElts[OuterIdx])
        DemandedLHS.setBit(InnerIdx);
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS.setBit(InnerIdx);
                        DemandedLHS, DemandedRHS);
  DemandedLHS |= DemandedLHS << 1;
  DemandedRHS |= DemandedRHS << 1;
  MVT VT = N.getSimpleValueType();
  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
  bool IsFakeUnary = false;
  switch (N.getOpcode()) {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N.getOperand(1)) &&
        isa<ConstantSDNode>(N.getOperand(2))) {
      int BitLen = N.getConstantOperandVal(1);
      int BitIdx = N.getConstantOperandVal(2);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N.getOperand(2)) &&
        isa<ConstantSDNode>(N.getOperand(3))) {
      int BitLen = N.getConstantOperandVal(2);
      int BitIdx = N.getConstantOperandVal(3);
      IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
           "Only 32-bit and 64-bit elements are supported!");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    if (N.getOperand(0).getValueType() == VT) {
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    SDValue MaskNode = N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    SDValue MaskNode = N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    SDValue MaskNode = N.getOperand(2);
    SDValue CtrlNode = N.getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
    SDValue MaskNode = N.getOperand(2);
    assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
    SDValue MaskNode = N.getOperand(0);
    assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
    SDValue MaskNode = N.getOperand(1);
  if (!AllowSentinelZero && isAnyZero(Mask))
    if (M >= (int)Mask.size())
  if (!IsUnary || IsFakeUnary)
  int Size = Mask.size();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
  for (int i = 0; i < Size; ++i) {
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      APInt Val = Cst->getAPIntValue();
      Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
      APInt Val = Cst->getValueAPF().bitcastToAPInt();
      Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
  MVT VT = N.getSimpleValueType();
  int Size = Mask.size();
  SDValue V2 = IsUnary ? V1 : Ops[1];
         "Illegal split of shuffle value type");
  APInt UndefSrcElts[2];
  bool IsSrcConstant[2] = {
                                    SrcEltBits[0], true,
                                    SrcEltBits[1], true,
  for (int i = 0; i < Size; ++i) {
    unsigned SrcIdx = M / Size;
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      SDValue Vec = V.getOperand(0);
      int Idx = V.getConstantOperandVal(2);
      int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
      if (M < Idx || (Idx + NumSubElts) <= M)
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
      else if (SrcEltBits[SrcIdx][M] == 0)
         "Different mask size from vector size!");
                                       const APInt &KnownUndef,
                                       const APInt &KnownZero,
                                       bool ResolveKnownZeros = true) {
  unsigned NumElts = Mask.size();
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
  for (unsigned i = 0; i != NumElts; ++i) {
    else if (ResolveKnownZeros && KnownZero[i])
  unsigned NumElts = Mask.size();
  for (unsigned i = 0; i != NumElts; ++i) {
  EVT CondVT = Cond.getValueType();
  for (int i = 0; i != (int)NumElts; ++i) {
    if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
        (IsBLENDV && EltBits[i].isNonNegative()))
                                   bool ResolveKnownElts);
                                   bool ResolveKnownElts) {
  MVT VT = N.getSimpleValueType();
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
  unsigned NumSizeInBytes = NumSizeInBits / 8;
  unsigned NumBytesPerElt = NumBitsPerElt / 8;
  unsigned Opcode = N.getOpcode();
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
    Mask.append(ShuffleMask.begin(), ShuffleMask.end());
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      const APInt &ByteBits = EltBits[i];
      if (ByteBits != 0 && ByteBits != 255)
    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    for (int i = 0; i != (int)MaskSize; ++i) {
        Mask.push_back(i + MaskSize);
    if (!N->isOnlyUserOf(Sub.getNode()))
    uint64_t InsertIdx = N.getConstantOperandVal(2);
      unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
      assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
             "Subvector valuetype mismatch");
      InsertIdx *= (MaxElts / NumElts);
      ExtractIdx *= (MaxElts / NumSubSrcBCElts);
      NumSubElts *= (MaxElts / NumElts);
      bool SrcIsUndef = Src.isUndef();
      for (int i = 0; i != (int)MaxElts; ++i)
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
    if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
        NumBitsPerElt == 64 && NumSizeInBits == 512 &&
        Src.getOperand(0).isUndef() &&
        Src.getOperand(1).getValueType() == SubVT &&
        Src.getConstantOperandVal(2) == 0) {
      for (int i = 0; i != (int)NumSubElts; ++i)
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask.push_back(i + NumElts);
                                Depth + 1, ResolveKnownElts))
    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||
              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
      if ((NumSubElts % SubMask.size()) == 0) {
        int Scale = NumSubElts / SubMask.size();
        SubMask = ScaledSubMask;
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
    for (int i = 0; i != (int)NumElts; ++i)
    for (int i = 0; i != (int)NumSubElts; ++i) {
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      Mask[i + InsertIdx] = M;
    unsigned DstIdx = 0;
      if (!isa<ConstantSDNode>(N.getOperand(2)) ||
          N.getConstantOperandAPInt(2).uge(NumElts))
      DstIdx = N.getConstantOperandVal(2);
      for (unsigned i = 0; i != NumElts; ++i)
    if ((MinBitsPerElt % 8) != 0)
    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
    unsigned DstByte = DstIdx * NumBytesPerElt;
    for (int i = 0; i != (int)NumSizeInBytes; ++i)
      Mask.push_back(NumSizeInBytes + i);
    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
           "Unexpected input value type");
    APInt EltsLHS, EltsRHS;
    bool Offset0 = false, Offset1 = false;
    bool IsUnary = (N0 == N1);
    if (Offset0 || Offset1) {
        if ((Offset0 && isInRange(M, 0, NumElts)) ||
            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
    EVT SrcVT = Src.getValueType();
    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Mask.push_back(i * Scale);
    if (!Amt || (*Amt % 8) != 0)
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    if (NumBitsPerElt <= ShiftVal) {
    if ((ShiftVal % 8) != 0)
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
    if ((RotateVal % 8) != 0)
    int Offset = RotateVal / 8;
    for (int i = 0; i != (int)NumElts; ++i) {
      int BaseIdx = i * NumBytesPerElt;
      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
    if (!Src.getSimpleValueType().isVector()) {
          Src.getOperand(0).getValueType().getScalarType() !=
      Src = Src.getOperand(0);
    Mask.append(NumElts, 0);
    EVT SrcVT = Src.getValueType();
        (NumBitsPerSrcElt % 8) != 0)
    APInt DemandedSrcElts =
    assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
    unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
    for (unsigned I = 0; I != NumElts; ++I)
      Mask.append(Scale, I);
    EVT SrcVT = Src.getValueType();
  int MaskWidth = Mask.size();
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;
      if ((lo <= M) && (M < hi))
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
  Inputs = UsedInputs;
                                   bool ResolveKnownElts) {
  EVT VT = Op.getValueType();
  if (ResolveKnownElts)
                             ResolveKnownElts)) {
                                   bool ResolveKnownElts) {
  APInt KnownUndef, KnownZero;
                               KnownZero, DAG, Depth, ResolveKnownElts);
                                 bool ResolveKnownElts = true) {
  EVT VT = Op.getValueType();
  unsigned NumElts = Op.getValueType().getVectorNumElements();
         "Unknown broadcast load type");
                                  Opcode, DL, Tys, Ops, MemVT,
  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
    int Elt = SV->getMaskElt(Index);
    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    int Elt = ShuffleMask[Index];
    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
    EVT SubVT = Op.getOperand(0).getValueType();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    EVT SrcVT = Src.getValueType();
      isa<ConstantSDNode>(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return (Index == 0) ? Op.getOperand(0)
    return Op.getOperand(Index);
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
  MVT VT = Op.getSimpleValueType();
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = NonZeroMask[i];
      if (NumZero || 0 != i)
        assert(0 == i && "Expected insertion into zero-index");
                                       const APInt &NonZeroMask,
                                       unsigned NumNonZero, unsigned NumZero,
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
  for (unsigned I = 0; I != 4; ++I) {
    if (!NonZeroMask[I])
    assert(V && "Failed to fold v16i8 vector to zero");
  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
    if (ThisIsNonZero) {
      if (NumZero || NextIsNonZero)
    if (NextIsNonZero) {
      if (i == 0 && NumZero)
    if (i != 0 || NumZero)
                                       const APInt &NonZeroMask,
                                       unsigned NumNonZero, unsigned NumZero,
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    MVT VT = Op.getSimpleValueType();
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (!FirstNonZero.getNode()) {
      FirstNonZeroIdx = i;
  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  unsigned EltMaskIdx, EltIdx;
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      Mask[EltIdx] = EltIdx + 4;
    Elt = Op->getOperand(EltIdx);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
    Mask[EltIdx] = EltIdx;
  SDValue VZeroOrUndef = (Zeroable == Undefs)
    if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    bool CanFold = true;
    for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V2.getSimpleValueType() != MVT::v4f32)
  unsigned ZMask = Zeroable.to_ulong();
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6990 MVT ShVT = MVT::v16i8;
6993 assert(NumBits % 8 == 0 &&
"Only support byte sized shifts");
7008 EVT PVT = LD->getValueType(0);
7009 if (PVT != MVT::i32 && PVT != MVT::f32)
7015 FI = FINode->getIndex();
7018 isa<FrameIndexSDNode>(
Ptr.getOperand(0))) {
7019 FI = cast<FrameIndexSDNode>(
Ptr.getOperand(0))->getIndex();
7029 SDValue Chain = LD->getChain();
7033 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7050 int64_t StartOffset =
Offset & ~int64_t(RequiredAlign.
value() - 1);
7057 int EltNo = (
Offset - StartOffset) >> 2;
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7075 auto *BaseLd = cast<LoadSDNode>(Elt);
7076 if (!BaseLd->isSimple())
7089 if (
auto *AmtC = dyn_cast<ConstantSDNode>(Elt.
getOperand(1))) {
7090 uint64_t Amt = AmtC->getZExtValue();
7092 ByteOffset += Amt / 8;
7098 if (
auto *IdxC = dyn_cast<ConstantSDNode>(Elt.
getOperand(1))) {
7100 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7102 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7105 ByteOffset +=
Idx * (SrcSizeInBits / 8);
7123 bool IsAfterLegalize) {
7127 unsigned NumElems = Elts.
size();
7129 int LastLoadedElt = -1;
7139 for (
unsigned i = 0; i < NumElems; ++i) {
7158 if (!
findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7161 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7169 "Incomplete element masks");
7172 if (UndefMask.
popcount() == NumElems)
7183 "Register/Memory size mismatch");
7185 assert(LDBase &&
"Did not find base load for merging consecutive loads");
7187 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7189 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7190 assert((BaseSizeInBits % 8) == 0 &&
"Sub-byte element loads detected");
7193 if (ByteOffsets[FirstLoadedElt] != 0)
7200 int64_t ByteOffset = ByteOffsets[EltIdx];
7201 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7203 return (0 <= BaseIdx && BaseIdx < (
int)NumElems && LoadMask[BaseIdx] &&
7204 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7207 EltIdx - FirstLoadedElt);
7213   bool IsConsecutiveLoad = true;
7214   bool IsConsecutiveLoadWithZeros = true;
7215   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7217     if (!CheckConsecutiveLoad(LDBase, i)) {
7218       IsConsecutiveLoad = false;
7219       IsConsecutiveLoadWithZeros = false;
7222     } else if (ZeroMask[i]) {
7223       IsConsecutiveLoad = false;
7230            "Cannot merge volatile or atomic loads.");
7235     for (auto *LD : Loads)
7250   if (FirstLoadedElt == 0 &&
7251       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7252       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7263       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7266     return CreateLoad(VT, LDBase);
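    // When every used element traces back to consecutive offsets from one
    // base load, the whole BUILD_VECTOR folds into a single wide load. For
    // example, four i32 loads from p, p+4, p+8 and p+12 (with compatible
    // chains and a dereferenceable range) become one v4i32 load from p.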
7270   if (!IsAfterLegalize && VT.isVector()) {
7272     if ((NumMaskElts % NumElems) == 0) {
7273       unsigned Scale = NumMaskElts / NumElems;
7275       for (unsigned i = 0; i < NumElems; ++i) {
7278         int Offset = ZeroMask[i] ? NumMaskElts : 0;
7279         for (unsigned j = 0; j != Scale; ++j)
7280           ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7282       SDValue V = CreateLoad(VT, LDBase);
7292   unsigned HalfNumElems = NumElems / 2;
7298                                  DAG, Subtarget, IsAfterLegalize);
7306   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7307       ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7308        LoadSizeInBits == 64) &&
7315     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7323     for (auto *LD : Loads)
7334   for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7335     unsigned RepeatSize = SubElems * BaseSizeInBits;
7336     unsigned ScalarSize = std::min(RepeatSize, 64u);
7337     if (!Subtarget.hasAVX2() && ScalarSize < 32)
7342     if (RepeatSize > ScalarSize && SubElems == 1)
7347     for (unsigned i = 0; i != NumElems && Match; ++i) {
7351       if (RepeatedLoads[i % SubElems].isUndef())
7352         RepeatedLoads[i % SubElems] = Elt;
7354         Match &= (RepeatedLoads[i % SubElems] == Elt);
7359     Match &= !RepeatedLoads.back().isUndef();
7367       if (RepeatSize > ScalarSize)
7369                                     RepeatSize / ScalarSize);
7375               RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7376         SDValue Broadcast = RepeatLoad;
7377         if (RepeatSize > ScalarSize) {
7405 bool IsAfterLegalize) {
7424   auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7426     if (ScalarSize == 16)
7428     if (ScalarSize == 32)
7430     assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7437   for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7439                                 : getConstantScalar(Bits[I]));
7448   auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7450     if (ScalarSize == 16)
7452     if (ScalarSize == 32)
7454     assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7460   if (ScalarSize == SplatBitSize)
7461     return getConstantScalar(SplatValue);
7463   unsigned NumElm = SplatBitSize / ScalarSize;
7465   for (unsigned I = 0; I != NumElm; ++I) {
7467     ConstantVec.push_back(getConstantScalar(Val));
7473   for (auto *U : N->users()) {
7474     unsigned Opc = U->getOpcode();
7484   if (N->hasOneUse()) {
7516          "Unsupported vector type for broadcast.");
7523   assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7524   if (Sequence.size() == 1)
7534   if (!Sequence.empty() && Subtarget.hasCDI()) {
7536     unsigned SeqLen = Sequence.size();
7537     bool UpperZeroOrUndef =
7542     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7547                         : Op0.getOperand(0).getOperand(0);
7550       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||
7551           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) {
7565   unsigned NumUndefElts = UndefElements.count();
7566   if (!Ld || (NumElts - NumUndefElts) <= 1) {
7567     APInt SplatValue, Undef;
7568     unsigned SplatBitSize;
7571     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7581       if (SplatBitSize == 32 || SplatBitSize == 64 ||
7582           (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7589         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7599       if (SplatBitSize > 64) {
7605         Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7611             Ops, VVT, MPI, Alignment,
7621   if (!Ld || NumElts - NumUndefElts != 1)
7624   if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7628   bool ConstSplatVal =
7656   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7664     if (ScalarSize == 32 ||
7665         (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7666         (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7667         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7670         C = CI->getConstantIntValue();
7672         C = CF->getConstantFPValue();
7674       assert(C && "Invalid constant type");
7678       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7691       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7702   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7703       (Subtarget.hasVLX() && ScalarSize == 64)) {
7704     auto *LN = cast<LoadSDNode>(Ld);
7706     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7709                                       LN->getMemoryVT(), LN->getMemOperand());
7717       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7718     auto *LN = cast<LoadSDNode>(Ld);
7720     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7723                                       LN->getMemoryVT(), LN->getMemOperand());
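  // Broadcasting straight from memory avoids materializing the scalar in a
  // register first: the {Chain, BasePtr} operands feed an
  // X86ISD::VBROADCAST_LOAD memory intrinsic node, which selects to
  // vbroadcastss/vbroadcastsd (or the vpbroadcast* forms) and replicates the
  // loaded scalar into every lane.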
7728 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7743 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7764 ExtractedFromVec = ShuffleVec;
7772   MVT VT = Op.getSimpleValueType();
7785   for (unsigned i = 0; i != NumElems; ++i) {
7786     unsigned Opc = Op.getOperand(i).getOpcode();
7793       if (InsertIndices.size() > 1)
7800     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7801     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7804     if (!isa<ConstantSDNode>(ExtIdx))
7813       VecIn1 = ExtractedFromVec;
7814     else if (VecIn1 != ExtractedFromVec) {
7816         VecIn2 = ExtractedFromVec;
7817       else if (VecIn2 != ExtractedFromVec)
7822     if (ExtractedFromVec == VecIn1)
7824     else if (ExtractedFromVec == VecIn2)
7825       Mask[i] = Idx + NumElems;
7834   for (unsigned Idx : InsertIndices)
7844   MVT VT = Op.getSimpleValueType();
7860   MVT VT = Op.getSimpleValueType();
7862          "Unexpected type in LowerBUILD_VECTORvXi1!");
7869   bool IsSplat = true;
7870   bool HasConstElts = false;
7876     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7877       Immediate |= (InC->getZExtValue() & 0x1) << idx;
7878       HasConstElts = true;
7884     else if (In != Op.getOperand(SplatIdx))
7895   assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7901 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7922 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7939   for (unsigned InsertIdx : NonConstIdx) {
7941                      Op.getOperand(InsertIdx),
7982                               unsigned BaseIdx, unsigned LastIdx,
7984   EVT VT = N->getValueType(0);
7986   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7988          "Invalid Vector in input!");
7991   bool CanFold = true;
7992   unsigned ExpectedVExtractIdx = BaseIdx;
7993   unsigned NumElts = LastIdx - BaseIdx;
7998   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8002     if (Op->isUndef()) {
8004       if (i * 2 == NumElts)
8005         ExpectedVExtractIdx = BaseIdx;
8006       ExpectedVExtractIdx += 2;
8010     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8031 if (i * 2 < NumElts) {
8043 if (i * 2 == NumElts)
8044 ExpectedVExtractIdx = BaseIdx;
8048 if (I0 == ExpectedVExtractIdx)
8050 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8057 ExpectedVExtractIdx += 2;
8096                                    unsigned X86Opcode, bool Mode,
8097                                    bool isUndefLO, bool isUndefHI) {
8100          "Invalid nodes in input!");
8114     if (!isUndefLO && !V0->isUndef())
8115       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8116     if (!isUndefHI && !V1->isUndef())
8117       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8121       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8124       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8138 unsigned &NumExtracts,
8155 unsigned Opc[2] = {0, 0};
8156   for (unsigned i = 0, e = NumElts; i != e; ++i) {
8160     unsigned Opcode = Op.getOpcode();
8186     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8188     Opc[i % 2] = Opcode;
8225   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8265                                unsigned ExpectedUses) {
8295   unsigned NumExtracts;
8307   return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8320     Mask.push_back(I + E + 1);
8344   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8345   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8346   for (unsigned i = 0; i != Num128BitChunks; ++i) {
8347     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8359       GenericOpcode = Op.getOpcode();
8360       switch (GenericOpcode) {
8366       default: return false;
8377           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8382       if (j < NumEltsIn64Bits) {
8390       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8397       unsigned ExpectedIndex = i * NumEltsIn128Bits +
8398                                (j % NumEltsIn64Bits) * 2;
8399       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8408       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8439   for (unsigned i = 0; i != NumElts; ++i)
8444   unsigned HalfNumElts = NumElts / 2;
8453   return DAG.getNode(HOpcode, DL, VT, V0, V1);
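  // Horizontal ops combine adjacent element pairs from both sources; e.g.
  // haddps on <a0,a1,a2,a3> and <b0,b1,b2,b3> produces
  // <a0+a1, a2+a3, b0+b1, b2+b3>, which is why the matcher above requires the
  // extract indices to appear as consecutive even/odd pairs.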
8461   unsigned NumNonUndefs =
8463   if (NumNonUndefs < 2)
8470   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8471       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8472       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8473       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8486   unsigned Half = NumElts / 2;
8487   unsigned NumUndefsLO = 0;
8488   unsigned NumUndefsHI = 0;
8489   for (unsigned i = 0, e = Half; i != e; ++i)
8493   for (unsigned i = Half, e = NumElts; i != e; ++i)
8498   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8501     bool CanFold = true;
8522 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8531 bool isUndefLO = NumUndefsLO == Half;
8532 bool isUndefHI = NumUndefsHI == Half;
8538 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8539 VT == MVT::v16i16) {
8558 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8563 bool isUndefLO = NumUndefsLO == Half;
8564 bool isUndefHI = NumUndefsHI == Half;
8566 isUndefLO, isUndefHI);
8584   MVT VT = Op->getSimpleValueType(0);
8590   unsigned Opcode = Op->getOperand(0).getOpcode();
8591   for (unsigned i = 1; i < NumElems; ++i)
8592     if (Opcode != Op->getOperand(i).getOpcode())
8596   bool IsShift = false;
8610   if (Op->getSplatValue())
8623     if (!isa<ConstantSDNode>(RHS))
8640   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8661   MVT VT = Op.getSimpleValueType();
8671 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8693 "Illegal variable permute mask size");
8701 SDLoc(IndicesVec), SizeInBits);
8705 IndicesVT, IndicesVec);
8717                             Subtarget, DAG, SDLoc(IndicesVec));
8732     EVT SrcVT = Idx.getValueType();
8742     for (uint64_t i = 0; i != Scale; ++i) {
8743       IndexScale |= Scale << (i * NumDstBits);
8744       IndexOffset |= i << (i * NumDstBits);
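    // Scaling turns element indices into per-byte indices for a byte shuffle:
    // conceptually, byte j of element index k maps to k * Scale + j. For
    // example, permuting 32-bit elements through a byte shuffle (Scale = 4)
    // expands index 2 into the byte indices {8, 9, 10, 11}.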
8754   unsigned Opcode = 0;
8763     if (Subtarget.hasVLX() && Subtarget.hasBWI())
8767       ShuffleVT = MVT::v16i8;
8772     if (Subtarget.hasAVX()) {
8774       ShuffleVT = MVT::v4f32;
8777       ShuffleVT = MVT::v16i8;
8782     if (Subtarget.hasAVX()) {
8786       ShuffleVT = MVT::v2f64;
8792           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8798     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8800     else if (Subtarget.hasXOP()) {
8809     } else if (Subtarget.hasAVX()) {
8820       EVT VT = Idx.getValueType();
8826       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8832     if (Subtarget.hasVLX() && Subtarget.hasBWI())
8834     else if (Subtarget.hasAVX()) {
8836       IndicesVec = ScaleIndices(IndicesVec, 2);
8839           MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8840           DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8845 if (Subtarget.hasAVX2())
8847 else if (Subtarget.hasAVX()) {
8850 {0, 1, 2, 3, 0, 1, 2, 3});
8852 {4, 5, 6, 7, 4, 5, 6, 7});
8853 if (Subtarget.hasXOP())
8869 if (Subtarget.hasAVX512()) {
8870 if (!Subtarget.hasVLX()) {
8872       SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8874       IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8875                                   DAG, SDLoc(IndicesVec));
8881     } else if (Subtarget.hasAVX()) {
8889     if (Subtarget.hasXOP())
8904     if (Subtarget.hasVBMI())
8908     if (Subtarget.hasBWI())
8915     if (Subtarget.hasAVX512())
8924          "Illegal variable permute shuffle type");
8928     IndicesVec = ScaleIndices(IndicesVec, Scale);
8931   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8935                   ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8936                   : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8959   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8968       SrcVec = Op.getOperand(0);
8969     else if (SrcVec != Op.getOperand(0))
8971     SDValue ExtractedIndex = Op->getOperand(1);
8975       ExtractedIndex = ExtractedIndex.getOperand(0);
8984     else if (IndicesVec != ExtractedIndex.getOperand(0))
8987     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8988     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8992   MVT VT = V.getSimpleValueType();
9000   MVT VT = Op.getSimpleValueType();
9002   MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9010       (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9021   bool IsAllConstants = true;
9022   bool OneUseFrozenUndefs = true;
9024   unsigned NumConstants = NumElems;
9025   for (unsigned i = 0; i < NumElems; ++i) {
9032       OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9033       FrozenUndefMask.setBit(i);
9038       IsAllConstants = false;
9053 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9057 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9065   if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9066       NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9069     for (unsigned i = 0; i < NumElems; ++i) {
9075       if (!FrozenUndefMask[i])
9076         Elts[i] = Op.getOperand(i);
9078         BlendMask[i] += NumElems;
9093   unsigned UpperElems = NumElems / 2;
9094   APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9095   unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9096   if (NumUpperUndefsOrZeros >= UpperElems) {
9098         NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099       UpperElems = NumElems - (NumElems / 4);
9101     bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9105     return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9112     return HorizontalOp;
9118   unsigned NumZero = ZeroMask.popcount();
9119   unsigned NumNonZero = NonZeroMask.popcount();
9127   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9128       FrozenUndefMask.isZero() &&
9135     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9139     for (unsigned i = 0; i != NumElems; ++i) {
9141       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9142         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9143       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9144         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9147 "Expected one variable element in this vector");
9161 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9167 if (InsertC < NumEltsInLow128Bits)
9173     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9176     for (unsigned i = 0; i != NumElts; ++i)
9177       ShuffleMask.push_back(i == InsertC ? NumElts : i);
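    // With exactly one non-constant element, the constants are materialized
    // as a single constant-pool vector and the variable element is shuffled
    // in afterwards: the mask above keeps element i for i != InsertC and
    // takes element 0 of the second shuffle operand (index NumElts) at
    // InsertC.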
9183 if (NumNonZero == 1) {
9195 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9196 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9197 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9200 "Expected an SSE value type!");
9209 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9219 if (NumElems == 2 &&
Idx == 1 &&
9225 VT,
Op.getOperand(1)),
9226 NumBits/2, DAG, *
this, dl);
9237 if (EVTBits == 32) {
9244 if (Values.
size() == 1) {
9245 if (EVTBits == 32) {
9252 if (
Op.getNode()->isOnlyUserOf(Item.
getNode()))
9277 if (Subtarget.
hasAVX2() && EVTBits == 32 && Values.
size() == 2) {
9278 SDValue Ops[4] = {
Op.getOperand(0),
Op.getOperand(1),
9282 for (
unsigned i = 2; i != NumElems; ++i)
9283 if (Ops[i % 2] !=
Op.getOperand(i))
9287 if (CanSplat(
Op, NumElems, Ops)) {
9309 HVT, dl,
Op->ops().slice(NumElems / 2, NumElems /2));
9316 if (EVTBits == 64) {
9317 if (NumNonZero == 1) {
9321 Op.getOperand(
Idx));
9328 if (EVTBits == 8 && NumElems == 16)
9330 NumZero, DAG, Subtarget))
9333 if (EltVT == MVT::i16 && NumElems == 8)
9335 NumZero, DAG, Subtarget))
9339 if (EVTBits == 32 && NumElems == 4)
9344 if (NumElems == 4 && NumZero > 0) {
9346 for (
unsigned i = 0; i < 4; ++i) {
9347 bool isZero = !NonZeroMask[i];
9354 for (
unsigned i = 0; i < 2; ++i) {
9361 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9364 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9367 Ops[i] =
getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9377 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9378 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9383 assert(Values.
size() > 1 &&
"Expected non-undef and non-splat vector");
9390 if (Subtarget.
hasSSE41() && EltVT != MVT::f16) {
9392 if (!
Op.getOperand(0).isUndef())
9397 for (
unsigned i = 1; i < NumElems; ++i) {
9398 if (
Op.getOperand(i).isUndef())
continue;
9409 for (
unsigned i = 0; i < NumElems; ++i) {
9410 if (!
Op.getOperand(i).isUndef())
9420 for (
unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9423 for(
unsigned i = 0; i != Scale; ++i)
9425 for (
unsigned i = 0; i != Scale; ++i)
9426 Mask.push_back(NumElems+i);
9429 for (
unsigned i = 0, e = NumElems / (2 * Scale); i !=
e; ++i)
9441 MVT ResVT =
Op.getSimpleValueType();
9444 ResVT.
is512BitVector()) &&
"Value type must be 256-/512-bit wide");
9447 unsigned NumFreezeUndef = 0;
9448 unsigned NumZero = 0;
9449 unsigned NumNonZero = 0;
9450 unsigned NonZeros = 0;
9451 for (
unsigned i = 0; i != NumOperands; ++i) {
9465 assert(i <
sizeof(NonZeros) * CHAR_BIT);
9472 if (NumNonZero > 2) {
9476 Ops.
slice(0, NumOperands/2));
9478 Ops.
slice(NumOperands/2));
9487 MVT SubVT =
Op.getOperand(0).getSimpleValueType();
9489 for (
unsigned i = 0; i != NumOperands; ++i) {
9490 if ((NonZeros & (1 << i)) == 0)
9508 MVT ResVT =
Op.getSimpleValueType();
9512 "Unexpected number of operands in CONCAT_VECTORS");
9516 for (
unsigned i = 0; i != NumOperands; ++i) {
9520 assert(i <
sizeof(NonZeros) * CHAR_BIT);
9532 if (
isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9533 Log2_64(NonZeros) != NumOperands - 1) {
9557 if (NumOperands > 2) {
9561 Ops.
slice(0, NumOperands / 2));
9563 Ops.
slice(NumOperands / 2));
9582   MVT VT = Op.getSimpleValueType();
9616   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9617     assert(Mask[i] >= -1 && "Out of bound mask element!");
9618     if (Mask[i] >= 0 && Mask[i] != i)
9630                                   unsigned ScalarSizeInBits,
9632   assert(LaneSizeInBits && ScalarSizeInBits &&
9633          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9634          "Illegal shuffle lane size");
9635   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9636   int Size = Mask.size();
9637   for (int i = 0; i < Size; ++i)
9638     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9653 unsigned ScalarSizeInBits,
9655 assert(LaneSizeInBits && ScalarSizeInBits &&
9656 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9657 "Illegal shuffle lane size");
9658 int NumElts = Mask.size();
9659 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9660 int NumLanes = NumElts / NumEltsPerLane;
9662 for (
int i = 0; i != NumLanes; ++i) {
9664 for (
int j = 0; j != NumEltsPerLane; ++j) {
9665 int M = Mask[(i * NumEltsPerLane) + j];
9668 int Lane = (M % NumElts) / NumEltsPerLane;
9669 if (SrcLane >= 0 && SrcLane != Lane)
9693 RepeatedMask.
assign(LaneSize, -1);
9694 int Size = Mask.size();
9695 for (
int i = 0; i <
Size; ++i) {
9699 if ((Mask[i] %
Size) / LaneSize != i / LaneSize)
9705 int LocalM = Mask[i] <
Size ? Mask[i] % LaneSize
9706 : Mask[i] % LaneSize + LaneSize;
9707 if (RepeatedMask[i % LaneSize] < 0)
9709 RepeatedMask[i % LaneSize] = LocalM;
9710 else if (RepeatedMask[i % LaneSize] != LocalM)
9740 unsigned EltSizeInBits,
9743 int LaneSize = LaneSizeInBits / EltSizeInBits;
9745 int Size = Mask.size();
9746 for (
int i = 0; i <
Size; ++i) {
9756 if ((Mask[i] %
Size) / LaneSize != i / LaneSize)
9762 int LaneM = Mask[i] /
Size;
9763 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9766 RepeatedMask[i % LaneSize] = LocalM;
9767 else if (RepeatedMask[i % LaneSize] != LocalM)
9780 Mask, RepeatedMask);
9786 int Idx,
int ExpectedIdx) {
9787 assert(0 <=
Idx &&
Idx < MaskSize && 0 <= ExpectedIdx &&
9788 ExpectedIdx < MaskSize &&
"Out of range element index");
9789 if (!
Op || !ExpectedOp ||
Op.getOpcode() != ExpectedOp.
getOpcode())
9792 switch (
Op.getOpcode()) {
9804 return (
Op == ExpectedOp &&
9805 (
int)
Op.getValueType().getVectorNumElements() == MaskSize);
9815 if (
Op == ExpectedOp &&
Op.getOperand(0) ==
Op.getOperand(1)) {
9816 MVT VT =
Op.getSimpleValueType();
9818 if (MaskSize == NumElts) {
9820 int NumEltsPerLane = NumElts / NumLanes;
9821 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9823 (
Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9825 (
Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9826 return SameLane && SameElt;
9848 int Size = Mask.size();
9849 if (
Size != (
int)ExpectedMask.
size())
9852 for (
int i = 0; i <
Size; ++i) {
9853 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
9854 int MaskIdx = Mask[i];
9855 int ExpectedIdx = ExpectedMask[i];
9856 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9859 MaskIdx = MaskIdx <
Size ? MaskIdx : (MaskIdx -
Size);
9860 ExpectedIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
9882 int Size = Mask.size();
9883 if (
Size != (
int)ExpectedMask.
size())
9887 "Illegal target shuffle mask");
9895 !V1.getValueType().isVector()))
9898 !V2.getValueType().isVector()))
9904 for (
int i = 0; i <
Size; ++i) {
9905 int MaskIdx = Mask[i];
9906 int ExpectedIdx = ExpectedMask[i];
9916 int BitIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
9917 APInt &ZeroMask = ExpectedIdx <
Size ? ZeroV1 : ZeroV2;
9925 MaskIdx = MaskIdx <
Size ? MaskIdx : (MaskIdx -
Size);
9926 ExpectedIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
9940 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9951 return IsUnpackwdMask;
9965 for (
unsigned i = 0; i != 4; ++i) {
9980 assert(Mask.size() % 2 == 0 &&
"Expecting even number of elements in mask");
9981 unsigned HalfSize = Mask.size() / 2;
9982 for (
unsigned i = 0; i != HalfSize; ++i) {
9983 if (Mask[i] != Mask[i + HalfSize])
9998 assert(Mask.size() == 4 &&
"Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 &&
"Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 &&
"Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 &&
"Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 &&
"Out of bound mask element!");
10006   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10007   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10009   int FirstElt = Mask[FirstIndex];
10010   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10011 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10014 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10015 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10016 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10017 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
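  // The 8-bit shuffle immediate holds four 2-bit source selectors, one per
  // destination element (Mask[i] lands in bits [2*i+1 : 2*i]); undef elements
  // default to an identity selector. For example, the mask <2,3,0,1> encodes
  // as 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E, the usual "swap 64-bit
  // halves" pshufd/shufps immediate.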
10029   assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10030          "Unexpected SHUFPD mask size");
10031   assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10032          "Unexpected SHUFPD mask elements");
10036   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10037   assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10038          "All undef shuffle mask");
10040   int FirstElt = Mask[FirstIndex];
10041   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10042       count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10044     for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10045       Imm |= FirstElt << I;
10052   for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10053     Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
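  // SHUFPD uses one immediate bit per double element: bit I selects the low
  // (0) or high (1) element of the corresponding source lane, with undef
  // elements falling back to the in-place choice (I & 1). For example, the
  // mask <1, -1, 0, 1> encodes as 0b1011 = 0xB.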
10072 bool &IsZeroSideLeft) {
10073 int NextElement = -1;
10075 for (
int i = 0, e = Mask.size(); i < e; i++) {
10077 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
10083 if (NextElement < 0) {
10084 NextElement = Mask[i] != 0 ?
VectorType.getVectorNumElements() : 0;
10085 IsZeroSideLeft = NextElement != 0;
10088 if (NextElement != Mask[i])
10101 int Size = Mask.size();
10115 for (
int i = 0; i < NumBytes; ++i) {
10116 int M = Mask[i / NumEltBytes];
10118 PSHUFBMask[i] = DAG.
getUNDEF(MVT::i8);
10121 if (Zeroable[i / NumEltBytes]) {
10122 PSHUFBMask[i] = ZeroMask;
10128 if (V && V != SrcV)
10134 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10138 M = M * NumEltBytes + (i % NumEltBytes);
10141 assert(V &&
"Failed to find a source input");
10156 const APInt &Zeroable,
10159   bool IsLeftZeroSide = true;
10163   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10168   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10169          "Unexpected number of vector elements");
10171                                      Subtarget, DAG, DL);
10173   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10178                                  unsigned &UnpackOpcode, bool IsUnary,
10184   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10185 for (
int i = 0; i != NumElts; i += 2) {
10186 int M1 = TargetMask[i + 0];
10187 int M2 = TargetMask[i + 1];
10193 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10194 "Zeroable shuffle detected");
10200 (IsUnary ? V1 : V2))) {
10202 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
10203 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
10209 (IsUnary ? V1 : V2))) {
10211 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
10212 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
10217 if (IsUnary && (Zero1 || Zero2)) {
10219 if ((Subtarget.
hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10223 bool MatchLo =
true, MatchHi =
true;
10224 for (
int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10225 int M = TargetMask[i];
10228 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10232 MatchLo &= (M == Unpckl[i]);
10233 MatchHi &= (M == Unpckh[i]);
10236 if (MatchLo || MatchHi) {
10300 unsigned UnpackOpcode;
10312 DAG.
getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10314 return DAG.
getNode(UnpackOpcode,
DL, VT, V1, V1);
10325 unsigned NumElts = Mask.size();
10327 unsigned MaxScale = 64 / EltSizeInBits;
10329 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10330 unsigned SrcEltBits = EltSizeInBits * Scale;
10331 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10333 unsigned NumSrcElts = NumElts / Scale;
10336 unsigned UpperElts = NumElts - NumSrcElts;
10342 if ((NumSrcElts * EltSizeInBits) >= 128) {
10360 MVT SrcVT = Src.getSimpleValueType();
10370 if (NumSrcElts == NumDstElts)
10373 if (NumSrcElts > NumDstElts) {
10379 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10396 if (DstVT != TruncVT)
10420 const APInt &Zeroable,
10423 assert((VT == MVT::v16i8 || VT == MVT::v8i16) &&
"Unexpected VTRUNC type");
10429 unsigned MaxScale = 64 / EltSizeInBits;
10430 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10431 unsigned SrcEltBits = EltSizeInBits * Scale;
10432 unsigned NumSrcElts = NumElts / Scale;
10433 unsigned UpperElts = NumElts - NumSrcElts;
10442 Src.getScalarValueSizeInBits() == SrcEltBits) {
10443 Src = Src.getOperand(0);
10444 }
else if (Subtarget.hasVLX()) {
10457 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10470 const APInt &Zeroable,
10474 "Unexpected VTRUNC type");
10480 unsigned MaxScale = 64 / EltSizeInBits;
10481 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10483 unsigned SrcEltBits = EltSizeInBits * Scale;
10484 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10489 unsigned NumHalfSrcElts = NumElts / Scale;
10490 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10497 unsigned UpperElts = NumElts - NumSrcElts;
10498 if (UpperElts > 0 &&
10509 return Lo.getOperand(0) ==
Hi.getOperand(0);
10512 auto *LDLo = cast<LoadSDNode>(
Lo);
10513 auto *LDHi = cast<LoadSDNode>(
Hi);
10515 LDHi, LDLo,
Lo.getValueType().getStoreSize(), 1);
10573 bool IsSingleInput) {
10576 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10578 "We should only be called with masks with a power-of-2 size!");
10581 int Offset = MatchEven ? 0 : 1;
10586 bool ViableForN[3] = {
true,
true,
true};
10588 for (
int i = 0, e = Mask.size(); i < e; ++i) {
10594 bool IsAnyViable =
false;
10595 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
10596 if (ViableForN[j]) {
10601 IsAnyViable =
true;
10603 ViableForN[j] =
false;
10610 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
10626 unsigned MaxStages = 1) {
10629 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10630 "Illegal maximum compaction");
10633 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10638 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10641 if ((!N1.
isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10642 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10644 if (Subtarget.
hasSSE41() || BitSize == 8) {
10657 if ((N1.
isUndef() || IsZero1 || IsAllOnes1 ||
10659 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10671 for (
unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10679 if (MatchPACK(V1, V2, PackVT))
10686 if (MatchPACK(V1, V1, PackVT))
10698 unsigned PackOpcode;
10701 unsigned MaxStages =
Log2_32(64 / EltBits);
10703 Subtarget, MaxStages))
10707 unsigned NumStages =
Log2_32(CurrentEltBits / EltBits);
10710 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10715 unsigned MaxPackBits = 16;
10716 if (CurrentEltBits > 16 &&
10722 for (
unsigned i = 0; i != NumStages; ++i) {
10723 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10724 unsigned NumSrcElts = SizeBits / SrcEltBits;
10732 CurrentEltBits /= 2;
10735 "Failed to lower compaction shuffle");
10745 const APInt &Zeroable,
10752 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10758 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10771 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
10774 if (Mask[i] %
Size != i)
10777 V = Mask[i] <
Size ? V1 : V2;
10778 else if (V != (Mask[i] <
Size ? V1 : V2))
10806 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
10807 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i +
Size)
10823 const APInt &Zeroable,
bool &ForceV1Zero,
10824 bool &ForceV2Zero,
uint64_t &BlendMask) {
10825 bool V1IsZeroOrUndef =
10827 bool V2IsZeroOrUndef =
10831 ForceV1Zero =
false, ForceV2Zero =
false;
10832 assert(Mask.size() <= 64 &&
"Shuffle mask too big for blend mask");
10834 int NumElts = Mask.size();
10836 int NumEltsPerLane = NumElts / NumLanes;
10837 assert((NumLanes * NumEltsPerLane) == NumElts &&
"Value type mismatch");
10841   bool ForceWholeLaneMasks =
10846   for (int Lane = 0; Lane != NumLanes; ++Lane) {
10848     bool LaneV1InUse = false;
10849     bool LaneV2InUse = false;
10851     for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10852       int Elt = (Lane * NumEltsPerLane) + LaneElt;
10856       if (M == Elt || (0 <= M && M < NumElts &&
10859         LaneV1InUse = true;
10862       if (M == (Elt + NumElts) ||
10865         LaneBlendMask |= 1ull << LaneElt;
10866         Mask[Elt] = Elt + NumElts;
10867         LaneV2InUse = true;
10870       if (Zeroable[Elt]) {
10871         if (V1IsZeroOrUndef) {
10872           ForceV1Zero = true;
10874           LaneV1InUse = true;
10877         if (V2IsZeroOrUndef) {
10878           ForceV2Zero = true;
10879           LaneBlendMask |= 1ull << LaneElt;
10880           Mask[Elt] = Elt + NumElts;
10881           LaneV2InUse = true;
10891     if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10892       LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10894     BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
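  // The resulting blend mask has one bit per element: a set bit takes the
  // element from V2, a clear bit takes it from V1. For example, a v4f32 mask
  // <0, 5, 2, 7> uses V2 for elements 1 and 3 and encodes as 0b1010, which
  // maps directly onto the blendps immediate.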
10907 const APInt &Zeroable,
10911 bool ForceV1Zero =
false, ForceV2Zero =
false;
10928 assert(Subtarget.
hasAVX2() &&
"256-bit integer blends require AVX2!");
10932 assert(Subtarget.
hasAVX() &&
"256-bit float blends require AVX!");
10939 assert(Subtarget.
hasSSE41() &&
"128-bit blends require SSE41!");
10942 case MVT::v16i16: {
10943 assert(Subtarget.
hasAVX2() &&
"v16i16 blends require AVX2!");
10947 assert(RepeatedMask.
size() == 8 &&
"Repeated mask size doesn't match!");
10949 for (
int i = 0; i < 8; ++i)
10950 if (RepeatedMask[i] >= 8)
10951 BlendMask |= 1ull << i;
10958 uint64_t LoMask = BlendMask & 0xFF;
10959 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10960 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10966 MVT::v16i16,
DL,
Lo,
Hi,
10967 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10972 assert(Subtarget.
hasAVX2() &&
"256-bit byte-blends require AVX2!");
10975 assert(Subtarget.
hasSSE41() &&
"128-bit byte-blends require SSE41!");
10982 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10989 if (Subtarget.hasVLX())
11022   for (int i = 0, Size = Mask.size(); i < Size; ++i)
11023     for (int j = 0; j < Scale; ++j)
11070                                            bool ImmBlends = false) {
11076   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11080     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11082     if (BlendMask[Mask[i] % Size] < 0)
11083       BlendMask[Mask[i] % Size] = Mask[i];
11084     else if (BlendMask[Mask[i] % Size] != Mask[i])
11087     PermuteMask[i] = Mask[i] % Size;
11109 int NumElts = Mask.size();
11111 int NumLaneElts = NumElts / NumLanes;
11112 int NumHalfLaneElts = NumLaneElts / 2;
11114 bool MatchLo =
true, MatchHi =
true;
11118 for (
int Elt = 0; Elt != NumElts; ++Elt) {
11126 if (M < NumElts && (
Op.isUndef() ||
Op == V1))
11128 else if (NumElts <= M && (
Op.isUndef() ||
Op == V2)) {
11134 bool MatchLoAnyLane =
false, MatchHiAnyLane =
false;
11135 for (
int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11136 int Lo = Lane, Mid = Lane + NumHalfLaneElts,
Hi = Lane + NumLaneElts;
11139 if (MatchLoAnyLane || MatchHiAnyLane) {
11140 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11141 "Failed to match UNPCKLO/UNPCKHI");
11145 MatchLo &= MatchLoAnyLane;
11146 MatchHi &= MatchHiAnyLane;
11147 if (!MatchLo && !MatchHi)
11150 assert((MatchLo ^ MatchHi) &&
"Failed to match UNPCKLO/UNPCKHI");
11156 for (
int Elt = 0; Elt != NumElts; ++Elt) {
11163 bool IsFirstOp = M < NumElts;
11165 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11166 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11167 PermuteMask[Elt] = BaseMaskElt;
11168 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11169 PermuteMask[Elt] = BaseMaskElt + 1;
11170 assert(PermuteMask[Elt] != -1 &&
11171 "Input mask element is defined but failed to assign permute mask");
11193 int Size = Mask.size();
11194 assert(Mask.size() >= 2 &&
"Single element masks are invalid.");
11205 bool UnpackLo = NumLoInputs >= NumHiInputs;
11207 auto TryUnpack = [&](
int ScalarSize,
int Scale) {
11211 for (
int i = 0; i <
Size; ++i) {
11216 int UnpackIdx = i / Scale;
11220 if ((UnpackIdx % 2 == 0) != (Mask[i] <
Size))
11226 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 :
Size / 2)] =
11249 UnpackVT, V1, V2));
11255 for (
int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11256 if (
SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11267 if (NumLoInputs == 0 || NumHiInputs == 0) {
11268 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11269 "We have to have *some* inputs!");
11270 int HalfOffset = NumLoInputs == 0 ?
Size / 2 : 0;
11278 for (
int i = 0; i <
Size; ++i) {
11282 assert(Mask[i] %
Size >= HalfOffset &&
"Found input from wrong half!");
11285 2 * ((Mask[i] %
Size) - HalfOffset) + (Mask[i] <
Size ? 0 : 1);
11314 int NumEltsPerLane = NumElts / NumLanes;
11317 bool Blend1 =
true;
11318 bool Blend2 =
true;
11319 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11320 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11321 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11322 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11323 int M = Mask[Lane + Elt];
11327 Blend1 &= (M == (Lane + Elt));
11328 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
11329 M = M % NumEltsPerLane;
11330 Range1.first = std::min(Range1.first, M);
11331 Range1.second = std::max(Range1.second, M);
11334 Blend2 &= (M == (Lane + Elt));
11335 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
11336 M = M % NumEltsPerLane;
11337 Range2.first = std::min(Range2.first, M);
11338 Range2.second = std::max(Range2.second, M);
11346 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11347 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11361 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11362 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11363 int M = Mask[Lane + Elt];
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11376 if (Range2.second < Range1.first)
11377 return RotateAndPermute(V1, V2, Range1.first, 0);
11378 if (Range1.second < Range2.first)
11379 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11393 size_t NumUndefs = 0;
11394 std::optional<int> UniqueElt;
11395   for (int Elt : Mask) {
11400 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11406 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11419 int NumElts = Mask.size();
11421 int NumEltsPerLane = NumElts / NumLanes;
11425 bool IsAlternating =
true;
11426 bool V1Zero =
true, V2Zero =
true;
11430 for (
int i = 0; i < NumElts; ++i) {
11432 if (M >= 0 && M < NumElts) {
11435 V1Zero &= Zeroable[i];
11436 IsAlternating &= (i & 1) == 0;
11437 }
else if (M >= NumElts) {
11438 V2Mask[i] = M - NumElts;
11439 FinalMask[i] = i + NumElts;
11440 V2Zero &= Zeroable[i];
11441 IsAlternating &= (i & 1) == 1;
11448 auto canonicalizeBroadcastableInput = [
DL, VT, &Subtarget,
11451 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11452 if (!Subtarget.
hasAVX2() && (!Subtarget.
hasAVX() || EltSizeInBits < 32 ||
11458 "Expected to demand only the 0'th element.");
11461 int &InputMaskElt =
I.value();
11462 if (InputMaskElt >= 0)
11463 InputMaskElt =
I.index();
11473 canonicalizeBroadcastableInput(V1, V1Mask);
11474 canonicalizeBroadcastableInput(V2, V2Mask);
11499 DL, VT, V1, V2, Mask, Subtarget, DAG))
11507 DL, VT, V1, V2, Mask, Subtarget, DAG))
11516 V1Mask.
assign(NumElts, -1);
11517 V2Mask.
assign(NumElts, -1);
11518 FinalMask.
assign(NumElts, -1);
11519 for (
int i = 0; i != NumElts; i += NumEltsPerLane)
11520 for (
int j = 0; j != NumEltsPerLane; ++j) {
11521 int M = Mask[i + j];
11522 if (M >= 0 && M < NumElts) {
11523 V1Mask[i + (j / 2)] = M;
11524 FinalMask[i + j] = i + (j / 2);
11525 }
else if (M >= NumElts) {
11526 V2Mask[i + (j / 2)] = M - NumElts;
11527 FinalMask[i + j] = i + (j / 2) + NumElts;
11541 assert(EltSizeInBits < 64 &&
"Can't rotate 64-bit integers");
11544 int MinSubElts = Subtarget.
hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11545 int MaxSubElts = 64 / EltSizeInBits;
11546 unsigned RotateAmt, NumSubElts;
11548 MaxSubElts, NumSubElts, RotateAmt))
11550 unsigned NumElts = Mask.size();
11565 if (!IsLegal && Subtarget.
hasSSE3())
11578 if ((RotateAmt % 16) == 0)
11581 unsigned ShlAmt = RotateAmt;
11603 int NumElts = Mask.size();
11614 for (
int i = 0; i < NumElts; ++i) {
11617 "Unexpected mask index.");
11622 int StartIdx = i - (M % NumElts);
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11633 Rotation = CandidateRotation;
11634 else if (Rotation != CandidateRotation)
11639 SDValue MaskV = M < NumElts ? V1 : V2;
11650 else if (TargetV != MaskV)
11657 assert(Rotation != 0 &&
"Failed to locate a viable rotation!");
11658 assert((
Lo ||
Hi) &&
"Failed to find a rotated input vector!");
11703   int NumElts = RepeatedMask.size();
11704 int Scale = 16 / NumElts;
11705 return Rotation * Scale;
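  // The element rotation is rescaled to bytes because PALIGNR always rotates
  // within 16-byte lanes; e.g. rotating a repeated v8i16 mask by 3 elements
  // becomes a byte rotation of 3 * 2 = 6.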
11716 if (ByteRotation <= 0)
11728 "512-bit PALIGNR requires BWI instructions");
11735 "Rotate-based lowering only supports 128-bit lowering!");
11736 assert(Mask.size() <= 16 &&
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11738 assert(ByteVT == MVT::v16i8 &&
11739 "SSE2 rotate lowering only needed for v16i8!");
11742 int LoByteShift = 16 - ByteRotation;
11743 int HiByteShift = ByteRotation;
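  // Without SSSE3 there is no PALIGNR, so the byte rotation is emulated with
  // whole-vector byte shifts plus an OR: the low input is shifted left by
  // (16 - rotation) bytes, the high input is shifted right by the rotation
  // amount, and the two results are merged.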
11767 const APInt &Zeroable,
11771 "Only 32-bit and 64-bit elements are supported!");
11775 &&
"VLX required for 128/256-bit vectors");
11787 unsigned NumElts = Mask.size();
11790 assert((ZeroLo + ZeroHi) < NumElts &&
"Zeroable shuffle detected");
11791 if (!ZeroLo && !ZeroHi)
11795 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11796 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11804 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11805 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11818 const APInt &Zeroable,
11828 if (!ZeroLo && !ZeroHi)
11831 unsigned NumElts = Mask.size();
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11842 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11856 }
else if (ZeroHi == 0) {
11857 unsigned Shift = Mask[ZeroLo] % NumElts;
11862 }
else if (!Subtarget.
hasSSSE3()) {
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11869 Shift += Mask[ZeroLo] % NumElts;
11905 int MaskOffset,
const APInt &Zeroable,
11907 int Size = Mask.size();
11908 unsigned SizeInBits =
Size * ScalarSizeInBits;
11910 auto CheckZeros = [&](
int Shift,
int Scale,
bool Left) {
11911 for (
int i = 0; i <
Size; i += Scale)
11912 for (
int j = 0; j < Shift; ++j)
11913 if (!Zeroable[i + j + (
Left ? 0 : (Scale - Shift))])
11919 auto MatchShift = [&](
int Shift,
int Scale,
bool Left) {
11920 for (
int i = 0; i !=
Size; i += Scale) {
11921 unsigned Pos =
Left ? i + Shift : i;
11922 unsigned Low =
Left ? i : i + Shift;
11923 unsigned Len = Scale - Shift;
11928 int ShiftEltBits = ScalarSizeInBits * Scale;
11929 bool ByteShift = ShiftEltBits > 64;
11932 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11936 Scale = ByteShift ? Scale / 2 : Scale;
11942 return (
int)ShiftAmt;
11951 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11952 for (
int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11953 for (
int Shift = 1; Shift != Scale; ++Shift)
11954 for (
bool Left : {
true,
false})
11955 if (CheckZeros(Shift, Scale,
Left)) {
11956 int ShiftAmt = MatchShift(Shift, Scale,
Left);
11967 const APInt &Zeroable,
11970 int Size = Mask.size();
11979 Mask, 0, Zeroable, Subtarget);
11982 if (ShiftAmt < 0) {
11984 Mask,
Size, Zeroable, Subtarget);
11995 "Illegal integer vector type");
11997 V = DAG.
getNode(Opcode,
DL, ShiftVT, V,
12007 int Size = Mask.size();
12008 int HalfSize =
Size / 2;
12018 int Len = HalfSize;
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12022 assert(Len > 0 &&
"Zeroable shuffle mask");
12027 for (
int i = 0; i != Len; ++i) {
12036 if (i > M || M >= HalfSize)
12039 if (
Idx < 0 || (Src == V &&
Idx == (M - i))) {
12047 if (!Src ||
Idx < 0)
12050 assert((
Idx + Len) <= HalfSize &&
"Illegal extraction mask");
12063 int Size = Mask.size();
12064 int HalfSize =
Size / 2;
12071 for (
int Idx = 0;
Idx != HalfSize; ++
Idx) {
12087 for (
int Hi =
Idx + 1;
Hi <= HalfSize; ++
Hi) {
12089 int Len =
Hi -
Idx;
12103 }
else if ((!
Base || (
Base == V1)) &&
12106 }
else if ((!
Base || (
Base == V2)) &&
12156 assert(Scale > 1 &&
"Need a scale to extend.");
12159 int NumEltsPerLane = 128 / EltBits;
12160 int OffsetLane =
Offset / NumEltsPerLane;
12161 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12162 "Only 8, 16, and 32 bit elements can be extended.");
12163 assert(Scale * EltBits <= 64 &&
"Cannot zero extend past 64 bits.");
12164 assert(0 <=
Offset &&
"Extension offset must be positive.");
12166 "Extension offset must be in the first lane or start an upper lane.");
12169 auto SafeOffset = [&](
int Idx) {
12170 return OffsetLane == (
Idx / NumEltsPerLane);
12174   auto ShuffleOffset = [&](SDValue V) {
12179     for (int i = 0; i * Scale < NumElements; ++i) {
12180       int SrcIdx = i + Offset;
12181       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12194 NumElements / Scale);
12196 InputV = ShuffleOffset(InputV);
12198 DL, ExtVT, InputV, DAG);
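  // With SSE4.1 the extension itself is a single pmovzx, e.g. pmovzxbd
  // widens the low four bytes of a v16i8 to a v4i32; the shuffle above only
  // has to move the desired elements down to offset 0 first.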
12207 if (AnyExt && EltBits == 32) {
12215 if (AnyExt && EltBits == 16 && Scale > 2) {
12216 int PSHUFDMask[4] = {
Offset / 2, -1,
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12224 VT, DAG.
getNode(OddEvenOp,
DL, MVT::v8i16,
12231 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12232 assert(NumElements == (
int)Mask.size() &&
"Unexpected shuffle mask size!");
12235 int LoIdx =
Offset * EltBits;
12244 int HiIdx = (
Offset + 1) * EltBits;
12256 if (Scale > 4 && EltBits == 8 && Subtarget.
hasSSSE3()) {
12257 assert(NumElements == 16 &&
"Unexpected byte vector width!");
12259 for (
int i = 0; i < 16; ++i) {
12261 if ((i % Scale == 0 && SafeOffset(
Idx))) {
12268 InputV = DAG.
getBitcast(MVT::v16i8, InputV);
12276 int AlignToUnpack =
Offset % (NumElements / Scale);
12277 if (AlignToUnpack) {
12279 for (
int i = AlignToUnpack; i < NumElements; ++i)
12280 ShMask[i - AlignToUnpack] = i;
12282 Offset -= AlignToUnpack;
12288 if (
Offset >= (NumElements / 2)) {
12290 Offset -= (NumElements / 2);
12297 InputV = DAG.
getNode(UnpackLoHi,
DL, InputVT, InputV, Ext);
12301 }
while (Scale > 1);
12322 int NumLanes = Bits / 128;
12324 int NumEltsPerLane = NumElements / NumLanes;
12326 "Exceeds 32-bit integer zero extension limit");
12327 assert((
int)Mask.size() == NumElements &&
"Unexpected shuffle mask size");
12333 bool AnyExt =
true;
12336 for (
int i = 0; i < NumElements; ++i) {
12340 if (i % Scale != 0) {
12352 SDValue V = M < NumElements ? V1 : V2;
12353 M = M % NumElements;
12356 Offset = M - (i / Scale);
12357 }
else if (InputV != V)
12364 (
Offset % NumEltsPerLane) == 0))
12369 if (
Offset && (
Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12372 if ((M % NumElements) != (
Offset + (i / Scale)))
12385 if (
Offset != 0 && Matches < 2)
12389 InputV, Mask, Subtarget, DAG);
12393 assert(Bits % 64 == 0 &&
12394 "The number of bits in a vector must be divisible by 64 on x86!");
12395 int NumExtElements = Bits / 64;
12399 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12400 assert(NumElements % NumExtElements == 0 &&
12401 "The input vector size must be divisible by the extended size.");
12412 auto CanZExtLowHalf = [&]() {
12413 for (
int i = NumElements / 2; i != NumElements; ++i)
12423 if (
SDValue V = CanZExtLowHalf()) {
12438 MVT VT = V.getSimpleValueType();
12444 MVT NewVT = V.getSimpleValueType();
12465 return V->hasOneUse() &&
12469template <typename T>
12471   T EltVT = VT.getScalarType();
12472   return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12473          (EltVT == MVT::f16 && !Subtarget.hasFP16());
12493       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12496   bool IsV1Zeroable = true;
12497   for (int i = 0, Size = Mask.size(); i < Size; ++i)
12498     if (i != V2Index && !Zeroable[i]) {
12499       IsV1Zeroable = false;
12504 if (!IsV1Zeroable) {
12506 V1Mask[V2Index] = -1;
12521 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12525 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12534 if (!IsV1Zeroable) {
12545 }
else if (Mask[V2Index] != (
int)Mask.size() || EltVT == MVT::i8 ||
12546 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12552 if (!IsV1Zeroable) {
12555 assert(VT == ExtVT &&
"Cannot change extended type when non-zeroable!");
12562 unsigned MovOpc = 0;
12563 if (EltVT == MVT::f16)
12565 else if (EltVT == MVT::f32)
12567 else if (EltVT == MVT::f64)
12571 return DAG.
getNode(MovOpc,
DL, ExtVT, V1, V2);
12582 if (V2Index != 0) {
12589 V2Shuffle[V2Index] = 0;
12611 "We can only lower integer broadcasts with AVX2!");
12617 assert(V0VT.
isVector() &&
"Unexpected non-vector vector-sized value!");
12627 if (V0EltSize <= EltSize)
12630 assert(((V0EltSize % EltSize) == 0) &&
12631 "Scalar type sizes must all be powers of 2 on x86!");
12634 const unsigned Scale = V0EltSize / EltSize;
12635 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12647 if (
const int OffsetIdx = BroadcastIdx % Scale)
12661 assert(Mask.size() == 4 &&
"Unsupported mask size!");
12662 assert(Mask[0] >= -1 && Mask[0] < 8 &&
"Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 &&
"Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 &&
"Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 &&
"Out of bound mask element!");
12669 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12671 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12683 assert((Input == 0 || Input == 1) &&
"Only two inputs to shuffles.");
12684 int Size = Mask.size();
12685 for (
int i = 0; i <
Size; ++i)
12686 if (Mask[i] >= 0 && Mask[i] /
Size == Input && Mask[i] %
Size != i)
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12721 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12723 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12729 if (NumElts == 4 &&
12734 NewMask.
append(NumElts, -1);
12754 if (!((Subtarget.
hasSSE3() && VT == MVT::v2f64) ||
12755 (Subtarget.
hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12762 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.
hasAVX2())
12769 if (BroadcastIdx < 0)
12771 assert(BroadcastIdx < (
int)Mask.size() &&
"We only expect to be called with "
12772 "a sorted mask where the broadcast "
12774 int NumActiveElts =
count_if(Mask, [](
int M) {
return M >= 0; });
12780 int BitOffset = BroadcastIdx * NumEltBits;
12783 switch (V.getOpcode()) {
12785 V = V.getOperand(0);
12789 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12790 int OpIdx = BitOffset / OpBitWidth;
12791 V = V.getOperand(OpIdx);
12792 BitOffset %= OpBitWidth;
12797 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12798 unsigned Idx = V.getConstantOperandVal(1);
12799 unsigned BeginOffset =
Idx * EltBitWidth;
12800 BitOffset += BeginOffset;
12801 V = V.getOperand(0);
12805 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12807 int Idx = (int)V.getConstantOperandVal(2);
12808 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12809 int BeginOffset =
Idx * EltBitWidth;
12810 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12811 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12812 BitOffset -= BeginOffset;
12822 assert((BitOffset % NumEltBits) == 0 &&
"Illegal bit-offset");
12823 BroadcastIdx = BitOffset / NumEltBits;
12826 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12835 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12836 return TruncBroadcast;
12842 V = V.getOperand(BroadcastIdx);
12848 cast<LoadSDNode>(V)->isSimple()) {
12858 assert((
int)(
Offset * 8) == BitOffset &&
"Unexpected bit-offset");
12875 assert(SVT == MVT::f64 &&
"Unexpected VT!");
12880 }
else if (!BroadcastFromReg) {
12883 }
else if (BitOffset != 0) {
12891 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12896 if (BitOffset < 128 && NumActiveElts > 1 &&
12897 V.getScalarValueSizeInBits() == NumEltBits) {
12898 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12899 "Unexpected bit-offset");
12901 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12906 if ((BitOffset % 128) != 0)
12909 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12910 "Unexpected bit-offset");
12911 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12912 "Unexpected vector size");
12913 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12921 if (Subtarget.
hasAVX()) {
12929 if (!V.getValueType().isVector()) {
12930 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12931 "Unexpected scalar size");
12940 if (V.getValueSizeInBits() > 128)
12945 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12957 unsigned &InsertPSMask,
12958 const APInt &Zeroable,
12961 assert(V2.getSimpleValueType().is128BitVector() &&
"Bad operand type!");
12962 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
12969 unsigned ZMask = 0;
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12972 bool VAUsedInPlace =
false;
12974 for (
int i = 0; i < 4; ++i) {
12982 if (i == CandidateMask[i]) {
12983 VAUsedInPlace =
true;
12988 if (VADstIndex >= 0 || VBDstIndex >= 0)
12991 if (CandidateMask[i] < 4) {
13001 if (VADstIndex < 0 && VBDstIndex < 0)
13006 unsigned VBSrcIndex = 0;
13007 if (VADstIndex >= 0) {
13010 VBSrcIndex = CandidateMask[VADstIndex];
13011 VBDstIndex = VADstIndex;
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13019 if (!VAUsedInPlace)
13027 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13028 assert((InsertPSMask & ~0xFFu) == 0 &&
"Invalid mask!");
13032 if (matchAsInsertPS(V1, V2, Mask))
13038 if (matchAsInsertPS(V2, V1, CommutedMask))
13048 assert(V2.getSimpleValueType() == MVT::v4f32 &&
"Bad operand type!");
13051 unsigned InsertPSMask = 0;
13072 assert(V2.getSimpleValueType() == MVT::v2f64 &&
"Bad operand type!");
13073 assert(Mask.size() == 2 &&
"Unexpected mask size for v2 shuffle!");
13075 if (V2.isUndef()) {
13078 Mask, Subtarget, DAG))
13083 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
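    // For a single-input v2f64 shuffle the two immediate bits simply pick the
    // source element for each lane: <1, 1> encodes as 0b11 and splats the
    // high double, while <1, 0> encodes as 0b01 and swaps the two elements.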
13085   if (Subtarget.hasAVX()) {
13098   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13100   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13101   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13110 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13117 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13133 Zeroable, Subtarget, DAG))
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13156 assert(V2.getSimpleValueType() == MVT::v2i64 &&
"Bad operand type!");
13157 assert(Mask.size() == 2 &&
"Unexpected mask size for v2 shuffle!");
13159 if (V2.isUndef()) {
13162 Mask, Subtarget, DAG))
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13178 assert(Mask[0] != -1 &&
"No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 &&
"No undef lanes in multi-input v2 shuffles!");
13180 assert(Mask[0] < 2 &&
"We sort V1 to be the first input.");
13181 assert(Mask[1] >= 2 &&
"We sort V2 to be the second input.");
13196 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13200 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13202 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13207 bool IsBlendSupported = Subtarget.
hasSSE41();
13208 if (IsBlendSupported)
13210 Zeroable, Subtarget, DAG))
13220 if (Subtarget.hasVLX())
13222 Zeroable, Subtarget, DAG))
13232 if (IsBlendSupported)
13234 Zeroable, Subtarget, DAG);
13254 SDValue LowV = V1, HighV = V2;
13256 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
13258 if (NumV2Elements == 1) {
13259 int V2Index =
find_if(Mask, [](
int M) {
return M >= 4; }) - Mask.begin();
13263 int V2AdjIndex = V2Index ^ 1;
13265 if (Mask[V2AdjIndex] < 0) {
13271 NewMask[V2Index] -= 4;
13275 int V1Index = V2AdjIndex;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13288 NewMask[V1Index] = 2;
13289 NewMask[V2Index] = 0;
13291 }
else if (NumV2Elements == 2) {
13292 if (Mask[0] < 4 && Mask[1] < 4) {
13297 }
else if (Mask[2] < 4 && Mask[3] < 4) {
13312 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13313 Mask[2] < 4 ? Mask[2] : Mask[3],
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13322 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13323 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13324 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13325 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13327 }
else if (NumV2Elements == 3) {
13348 assert(V2.getSimpleValueType() == MVT::v4f32 &&
"Bad operand type!");
13349 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
13353 Zeroable, Subtarget, DAG))
13356 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
13358 if (NumV2Elements == 0) {
13361 Mask, Subtarget, DAG))
13372 if (Subtarget.
hasAVX()) {
13396 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13410 if (NumV2Elements == 1 && Mask[0] >= 4)
13412 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13452 assert(V2.getSimpleValueType() == MVT::v4i32 &&
"Bad operand type!");
13453 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
13459 Zeroable, Subtarget, DAG))
13462 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
13465 if (Subtarget.preferLowerShuffleAsShift()) {
13468 Subtarget, DAG,
true))
13470 if (NumV2Elements == 0)
13476 if (NumV2Elements == 0) {
13478 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13480 Mask, Subtarget, DAG))
13489 const int UnpackLoMask[] = {0, 0, 1, 1};
13490 const int UnpackHiMask[] = {2, 2, 3, 3};
13492 Mask = UnpackLoMask;
13494 Mask = UnpackHiMask;
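// Single-input masks of the form <0,0,1,1> or <2,2,3,3> duplicate adjacent
// elements and are handled by unpacking the register with itself (the
// UnpackLoMask / UnpackHiMask patterns above).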
13511 if (NumV2Elements == 1)
13513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13518 bool IsBlendSupported = Subtarget.hasSSE41();
13519 if (IsBlendSupported)
13521 Zeroable, Subtarget, DAG))
13525 Zeroable, Subtarget, DAG))
13535 if (Subtarget.hasVLX())
13536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13537 Zeroable, Subtarget, DAG))
13540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13551 if (IsBlendSupported)
13553 Zeroable, Subtarget, DAG);
13557 Mask, Subtarget, DAG))
13594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13606 for (int i = 0; i != 4; ++i)
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13621 int NumHToL = LoInputs.size() - NumLToL;
13623 int NumHToH = HiInputs.size() - NumLToH;
13642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13648 for (int DWord = 0; DWord != 4; ++DWord) {
13649 int M0 = Mask[2 * DWord + 0];
13650 int M1 = Mask[2 * DWord + 1];
13653 if (M0 < 0 && M1 < 0)
13656 bool Match = false;
13657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13658 auto &DWordPair = DWordPairs[j];
13661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13663 PSHUFDMask[DWord] = DOffset + j;
13669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13674 if (DWordPairs.size() <= 2) {
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13677 DWordPairs[1].first, DWordPairs[1].second};
13678 if ((NumHToL + NumHToH) == 0)
13679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13680 if ((NumLToL + NumLToH) == 0)
13681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
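// If every input comes from one half of the v8i16 vector, that half is first
// rearranged with PSHUFLW or PSHUFHW so each wanted word pair occupies a
// dword, and a PSHUFD then places those dwords where the mask wants them.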
13717 int AOffset, int BOffset) {
13719 "Must call this with A having 3 or 1 inputs from the A half.");
13721 "Must call this with B having 1 or 3 inputs from the B half.");
13723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13725 bool ThreeAInputs = AToAInputs.size() == 3;
13731 int ADWord = 0, BDWord = 0;
13732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13738 int TripleNonInputIdx =
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13740 TripleDWord = TripleNonInputIdx / 2;
13744 OneInputDWord = (OneInput / 2) ^ 1;
13751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13756 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13758 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13760 if ((NumFlippedAToBInputs == 1 &&
13761 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13762 (NumFlippedBToBInputs == 1 &&
13763 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13768 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13770 int FixIdx = PinnedIdx ^ 1;
13771 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13775 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13776 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13777 if (IsFixIdxInput == IsFixFreeIdxInput)
13780 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13781 "We need to be changing the number of flipped inputs!");
13782 int PSHUFHalfMask[] = {0, 1, 2, 3};
13783 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13789 for (int &M : Mask)
13790 if (M >= 0 && M == FixIdx)
13792 else if (M >= 0 && M == FixFreeIdx)
13795 if (NumFlippedBToBInputs != 0) {
13797 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13798 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13800 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13801 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13802 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13807 int PSHUFDMask[] = {0, 1, 2, 3};
13808 PSHUFDMask[ADWord] = BDWord;
13809 PSHUFDMask[BDWord] = ADWord;
13816 for (int &M : Mask)
13817 if (M >= 0 && M/2 == ADWord)
13818 M = 2 * BDWord + M % 2;
13819 else if (M >= 0 && M/2 == BDWord)
13820 M = 2 * ADWord + M % 2;
13826 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13827 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13828 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13829 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
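// General single-input v8i16 lowering: build three masks that will become a
// PSHUFLW (low half), a PSHUFHW (high half) and a PSHUFD (dword move), first
// fixing inputs that already sit in the right half and then moving the
// remaining inputs across.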
13843 auto fixInPlaceInputs =
13847 if (InPlaceInputs.empty())
13849 if (InPlaceInputs.size() == 1) {
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13852 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13855 if (IncomingInputs.empty()) {
13857 for (int Input : InPlaceInputs) {
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13859 PSHUFDMask[Input / 2] = Input / 2;
13864 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13869 int AdjIndex = InPlaceInputs[0] ^ 1;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13871 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13872 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13874 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13875 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13881 auto moveInputsToRightHalf = [&PSHUFDMask](
13886 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13887 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13889 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13891 int LowWord = Word & ~1;
13892 int HighWord = Word | 1;
13893 return isWordClobbered(SourceHalfMask, LowWord) ||
13894 isWordClobbered(SourceHalfMask, HighWord);
13897 if (IncomingInputs.empty())
13900 if (ExistingInputs.empty()) {
13902 for (int Input : IncomingInputs) {
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13910 for (int &M : HalfMask)
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13913 else if (M == Input)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13918 "Previous placement doesn't match!");
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13932 "Previous placement doesn't match!");
13938 for (int &M : HalfMask)
13939 if (M >= SourceOffset && M < SourceOffset + 4) {
13940 M = M - SourceOffset + DestOffset;
13941 assert(M >= 0 && "This should never wrap below zero!");
13949 if (IncomingInputs.size() == 1) {
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13955 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13957 IncomingInputs[0] = InputFixed;
13959 } else if (IncomingInputs.size() == 2) {
13960 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13971 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13972 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13973 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13974 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13975 InputsFixed[1] = InputsFixed[0] ^ 1;
13976 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13977 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13978 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13979 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13980 InputsFixed[0] = InputsFixed[1] ^ 1;
13981 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13982 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13986 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13987 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13988 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13989 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13995 for (int i = 0; i < 4; ++i)
13996 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13997 "We can't handle any clobbers here!");
13998 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13999 "Cannot have adjacent inputs here!");
14001 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14002 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14006 for (int &M : FinalSourceHalfMask)
14007 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14008 M = InputsFixed[1] + SourceOffset;
14009 else if (M == InputsFixed[1] + SourceOffset)
14010 M = (InputsFixed[0] ^ 1) + SourceOffset;
14012 InputsFixed[1] = InputsFixed[0] ^ 1;
14016 for (int &M : HalfMask)
14017 if (M == IncomingInputs[0])
14018 M = InputsFixed[0] + SourceOffset;
14019 else if (M == IncomingInputs[1])
14020 M = InputsFixed[1] + SourceOffset;
14022 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14023 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14030 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14031 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14032 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14033 for (int &M : HalfMask)
14034 for (int Input : IncomingInputs)
14036 M = FreeDWord * 2 + Input % 2;
14038 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14040 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14060 "Failed to lift all the high half inputs to the low mask!");
14061 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14062 "Failed to lift all the low half inputs to the high mask!");
14070 for (int &M : HiMask)
14086 "Lane crossing shuffle masks not supported");
14089 int Size = Mask.size();
14090 int Scale = NumBytes / Size;
14097 for (int i = 0; i < NumBytes; ++i) {
14098 int M = Mask[i / Scale];
14102 const int ZeroMask = 0x80;
14103 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14105 if (Zeroable[i / Scale])
14106 V1Idx = V2Idx = ZeroMask;
14110 V1InUse |= (ZeroMask != V1Idx);
14111 V2InUse |= (ZeroMask != V2Idx);
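// PSHUFB byte masks: a control byte of 0x80 zeroes the destination byte, so
// zeroable lanes get 0x80 in both control vectors, while V1InUse/V2InUse
// track whether each source actually contributes any byte.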
14124 if (V1InUse && V2InUse)
14127 V = V1InUse ? V1 : V2;
14150 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14151 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14156 Zeroable, Subtarget, DAG))
14164 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14166 if (NumV2Inputs == 0) {
14170 Subtarget, DAG,
false))
14175 Mask, Subtarget, DAG))
14204 "All single-input shuffles should be canonicalized to be V1-input "
14214 if (Subtarget.hasSSE4A())
14220 if (NumV2Inputs == 1)
14222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14227 bool IsBlendSupported = Subtarget.hasSSE41();
14228 if (IsBlendSupported)
14230 Zeroable, Subtarget, DAG))
14234 Zeroable, Subtarget, DAG))
14262 Zeroable, Subtarget, DAG))
14267 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14268 !Subtarget.hasVLX()) {
14270 unsigned PackOpc = 0;
14271 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14282 } else if (Subtarget.hasSSE41()) {
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14294 } else if (!Subtarget.hasSSSE3()) {
14307 if (NumEvenDrops == 2) {
14308 Result = DAG.getBitcast(MVT::v4i32, Result);
14309 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14317 if (NumOddDrops == 1) {
14318 bool HasSSE41 = Subtarget.hasSSE41();
14326 MVT::v8i16, V1, V2);
14331 Mask, Subtarget, DAG))
14336 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14337 bool V1InUse, V2InUse;
14339 Zeroable, DAG, V1InUse, V2InUse);
14345 Zeroable, Subtarget, DAG);
14354 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14355 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14358 if (Subtarget.hasFP16()) {
14359 if (NumV2Elements == 0) {
14362 Mask, Subtarget, DAG))
14365 if (NumV2Elements == 1 && Mask[0] >= 8)
14367 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14394 MVT ShuffleVT = VT;
14404 for (int &M : AdjustedMask)
14406 M += (Scale - 1) * NumElts;
14419 if (VT != ShuffleVT)
14437 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14458 Zeroable, Subtarget, DAG))
14471 if (Subtarget.hasSSE4A())
14476 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14479 if (NumV2Elements == 0) {
14482 Mask, Subtarget, DAG))
14502 for (int i = 0; i < 16; i += 2)
14503 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14509 if (!canWidenViaDuplication(Mask))
14512 copy_if(Mask, std::back_inserter(LoInputs),
14513 [](int M) { return M >= 0 && M < 8; });
14517 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14521 bool TargetLo = LoInputs.size() >= HiInputs.size();
14522 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14523 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14527 for (int I : InPlaceInputs) {
14528 PreDupI16Shuffle[I/2] = I/2;
14531 int j = TargetLo ? 0 : 4, je = j + 4;
14532 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14535 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14538 while (j < je && PreDupI16Shuffle[j] >= 0)
14546 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14550 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14555 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14558 bool EvenInUse = false, OddInUse = false;
14559 for (int i = 0; i < 16; i += 2) {
14560 EvenInUse |= (Mask[i + 0] >= 0);
14561 OddInUse |= (Mask[i + 1] >= 0);
14562 if (EvenInUse && OddInUse)
14566 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14567 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14570 for (int i = 0; i < 16; ++i)
14571 if (Mask[i] >= 0) {
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14573 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14574 if (PostDupI16Shuffle[i / 2] < 0)
14575 PostDupI16Shuffle[i / 2] = MappedMask;
14577 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14578 "Conflicting entries in the original shuffle!");
14583 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
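// tryToWidenViaDuplication: when every byte pair reads a single source byte,
// a pre-duplication v8i16 shuffle packs the used bytes into one half, that
// half is widened against itself, and a post-duplication v8i16 shuffle then
// places the duplicated words in their final positions.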
14585 if (SDValue V = tryToWidenViaDuplication())
14590 Zeroable, Subtarget, DAG))
14599 Zeroable, Subtarget, DAG))
14603 bool IsSingleInput = V2.isUndef();
14622 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14623 bool V1InUse = false;
14624 bool V2InUse = false;
14627 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14632 if (V1InUse && V2InUse) {
14635 Zeroable, Subtarget, DAG))
14647 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14651 if (Subtarget.hasVBMI())
14656 if (Subtarget.hasXOP()) {
14664 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14672 if (NumV2Elements == 1)
14674 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14687 if (NumEvenDrops) {
14693 assert(NumEvenDrops <= 3 &&
14694 "No support for dropping even elements more than 3 times.");
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14701 if (!IsSingleInput)
14707 IsSingleInput ? V1 : V2);
14708 for (int i = 1; i < NumEvenDrops; ++i) {
14709 Result = DAG.getBitcast(MVT::v8i16, Result);
14716 if (NumOddDrops == 1) {
14720 if (!IsSingleInput)
14725 IsSingleInput ? V1 : V2);
14729 if (NumV2Elements > 0)
14731 Zeroable, Subtarget, DAG);
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14740 for (int i = 0; i < 16; ++i)
14742 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14748 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14749 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14756 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14759 for (int &M : LoBlendMask)
14762 for (int &M : HiBlendMask)
14788 const APInt &Zeroable,
14791 if (VT == MVT::v8bf16) {
14828 "Only for 256-bit or wider vector shuffles!");
14830 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14836 int SplitNumElements = NumElements / 2;
14842 auto SplitVector = [&](SDValue V) {
14845 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14849 SDValue LoV1, HiV1, LoV2, HiV2;
14850 std::tie(LoV1, HiV1) = SplitVector(V1);
14851 std::tie(LoV2, HiV2) = SplitVector(V2);
14854 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14855 bool &UseHiV1, bool &UseLoV2,
14857 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14858 for (int i = 0; i < SplitNumElements; ++i) {
14859 int M = HalfMask[i];
14860 if (M >= NumElements) {
14861 if (M >= NumElements + SplitNumElements)
14865 } else if (M >= 0) {
14866 if (M >= SplitNumElements)
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14878 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14879 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14881 return !(UseHiV1 || UseHiV2);
14888 for (int i = 0; i < SplitNumElements; ++i) {
14889 int M = HalfMask[i];
14890 if (M >= NumElements) {
14891 V2BlendMask[i] = M - NumElements;
14892 BlendMask[i] = SplitNumElements + i;
14893 } else if (M >= 0) {
14894 V1BlendMask[i] = M;
14899 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14900 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14905 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14908 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14910 if (!UseLoV2 && !UseHiV2)
14912 if (!UseLoV1 && !UseHiV1)
14916 if (UseLoV1 && UseHiV1) {
14920 V1Blend = UseLoV1 ? LoV1 : HiV1;
14921 for (int i = 0; i < SplitNumElements; ++i)
14922 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14925 if (UseLoV2 && UseHiV2) {
14929 V2Blend = UseLoV2 ? LoV2 : HiV2;
14930 for (int i = 0; i < SplitNumElements; ++i)
14931 if (BlendMask[i] >= SplitNumElements)
14932 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
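// Each half of the wide shuffle is rebuilt from the split operands: the V1/V2
// blend masks record which source element feeds each slot, and if a half
// needs both the low and high pieces of a source, that source is re-shuffled
// first before the final blend.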
14937 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14955 const APInt &Zeroable,
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14959 "shuffles as it could then recurse on itself.");
14960 int Size = Mask.size();
14965 auto DoBothBroadcast = [&] {
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14969 if (V2BroadcastIdx < 0)
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14973 } else if (M >= 0) {
14974 if (V1BroadcastIdx < 0)
14975 V1BroadcastIdx = M;
14976 else if (M != V1BroadcastIdx)
14981 if (DoBothBroadcast())
14989 int LaneSize = Size / LaneCount;
14991 LaneInputs[0].resize(LaneCount, false);
14992 LaneInputs[1].resize(LaneCount, false);
14993 for (int i = 0; i < Size; ++i)
14995 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14996 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15012 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15020 for (int i = 0; i != 4; ++i) {
15024 int LaneBase = i & ~1;
15025 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15026 LaneMask[LaneBase + (M & 1)] = M;
15027 SHUFPDMask[i] = M & 1;
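// v4f64 lane-permute + SHUFPD: each mask element is routed into the LHS or
// RHS lane mask based on the parity of its destination slot, while
// SHUFPDMask keeps the low/high bit (M & 1) that the final SHUFPD immediate
// will encode.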
15049 int NumEltsPerLane = NumElts / NumLanes;
15050 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15058 int NumSublanesPerLane = NumSublanes / NumLanes;
15059 int NumEltsPerSublane = NumElts / NumSublanes;
15067 for (int i = 0; i != NumElts; ++i) {
15072 int SrcSublane = M / NumEltsPerSublane;
15073 int DstLane = i / NumEltsPerLane;
15077 bool Found = false;
15078 int DstSubStart = DstLane * NumSublanesPerLane;
15079 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15080 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15081 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15085 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15086 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15087 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15088 DemandedCrossLane.setBit(InLaneMask[i]);
15098 if (!CanUseSublanes) {
15103 int NumIdentityLanes = 0;
15104 bool OnlyShuffleLowestLane = true;
15105 for (int i = 0; i != NumLanes; ++i) {
15106 int LaneOffset = i * NumEltsPerLane;
15108 i * NumEltsPerLane))
15109 NumIdentityLanes++;
15110 else if (CrossLaneMask[LaneOffset] != 0)
15111 OnlyShuffleLowestLane = false;
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15120 if (CrossLaneMask == Mask || InLaneMask == Mask)
15125 for (int i = 0; i != NumElts; ++i)
15126 if (!DemandedCrossLane[i])
15135 if (SDValue V = getSublanePermute(NumLanes))
15139 if (!CanUseSublanes)
15143 if (SDValue V = getSublanePermute(NumLanes * 2))
15148 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15151 return getSublanePermute(NumLanes * 4);
15157 int Size = Mask.size();
15158 InLaneMask.assign(Mask.begin(), Mask.end());
15159 for (int i = 0; i < Size; ++i) {
15160 int &M = InLaneMask[i];
15163 if (((M % Size) / LaneSize) != (i / LaneSize))
15164 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15180 int Size = Mask.size();
15181 int LaneSize = Size / 2;
15186 if (VT == MVT::v4f64 &&
15187 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15195 bool LaneCrossing[2] = {false, false};
15196 for (int i = 0; i < Size; ++i)
15197 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15198 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15199 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15201 bool LaneUsed[2] = {false, false};
15202 for (int i = 0; i < Size; ++i)
15204 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15205 AllLanes = LaneUsed[0] && LaneUsed[1];
15210 "This last part of this routine only works on single input shuffles");
15216 "In-lane shuffle mask expected");
15236 const APInt &Zeroable,
15239 if (V2.isUndef()) {
15249 VT, MemVT, Ld, Ofs, DAG))
15264 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15265 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15268 if (WidenedMask[0] == 0 && IsHighZero) {
15288 if (!IsLowZero && !IsHighZero) {
15307 if (Subtarget.hasVLX()) {
15308 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15309 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15310 ((WidenedMask[1] % 2) << 1);
15330 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15331 (WidenedMask[1] >= 0 || IsHighZero) &&
"Undef half?");
15333 unsigned PermMask = 0;
15334 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15335 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
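// VPERM2X128-style immediate: each nibble selects one of the four 128-bit
// halves (0-3), and setting bit 3 of a nibble (0x08 / 0x80) zeroes that half
// instead, which is how the IsLowZero/IsHighZero cases fold into the same
// permute.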
15338 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15340 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15357 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15362 int NumElts = Mask.size();
15370 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15371 int Srcs[2] = {-1, -1};
15373 for (int i = 0; i != NumLaneElts; ++i) {
15374 int M = Mask[(Lane * NumLaneElts) + i];
15381 int LaneSrc = M / NumLaneElts;
15383 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15385 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15390 Srcs[Src] = LaneSrc;
15391 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15398 LaneSrcs[Lane][0] = Srcs[0];
15399 LaneSrcs[Lane][1] = Srcs[1];
15402 assert(M1.size() == M2.size() && "Unexpected mask size");
15403 for (int i = 0, e = M1.size(); i != e; ++i)
15404 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15410 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15411 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15415 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15416 "Unexpected mask element");
15421 if (MatchMasks(InLaneMask, RepeatMask)) {
15423 MergeMasks(InLaneMask, RepeatMask);
15428 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15431 if (MatchMasks(InLaneMask, RepeatMask)) {
15433 MergeMasks(InLaneMask, RepeatMask);
15442 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15444 if (LaneSrcs[Lane][0] >= 0)
15447 for (int i = 0; i != NumLaneElts; ++i) {
15448 int M = Mask[(Lane * NumLaneElts) + i];
15453 if (RepeatMask[i] < 0)
15454 RepeatMask[i] = M % NumLaneElts;
15456 if (RepeatMask[i] < NumElts) {
15457 if (RepeatMask[i] != M % NumLaneElts)
15459 LaneSrcs[Lane][0] = M / NumLaneElts;
15461 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15463 LaneSrcs[Lane][1] = M / NumLaneElts;
15467 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15472 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15473 int Src = LaneSrcs[Lane][0];
15474 for (int i = 0; i != NumLaneElts; ++i) {
15477 M = Src * NumLaneElts + i;
15478 NewMask[Lane * NumLaneElts + i] = M;
15485 if (isa<ShuffleVectorSDNode>(NewV1) &&
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15489 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15490 int Src = LaneSrcs[Lane][1];
15491 for (int i = 0; i != NumLaneElts; ++i) {
15494 M = Src * NumLaneElts + i;
15495 NewMask[Lane * NumLaneElts + i] = M;
15502 if (isa<ShuffleVectorSDNode>(NewV2) &&
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15506 for (int i = 0; i != NumElts; ++i) {
15511 NewMask[i] = RepeatMask[i % NumLaneElts];
15512 if (NewMask[i] < 0)
15515 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15528 int &HalfIdx1, int &HalfIdx2) {
15529 assert((Mask.size() == HalfMask.size() * 2) &&
15530 "Expected input mask to be twice as long as output");
15535 if (UndefLower == UndefUpper)
15538 unsigned HalfNumElts = HalfMask.size();
15539 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15542 for (unsigned i = 0; i != HalfNumElts; ++i) {
15543 int M = Mask[i + MaskIndexOffset];
15551 int HalfIdx = M / HalfNumElts;
15554 int HalfElt = M % HalfNumElts;
15558 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15559 HalfMask[i] = HalfElt;
15560 HalfIdx1 = HalfIdx;
15563 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15564 HalfMask[i] = HalfElt + HalfNumElts;
15565 HalfIdx2 = HalfIdx;
15580 int HalfIdx2, bool UndefLower,
15589 auto getHalfVector = [&](int HalfIdx) {
15592 SDValue V = (HalfIdx < 2 ? V1 : V2);
15593 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15599 SDValue Half1 = getHalfVector(HalfIdx1);
15600 SDValue Half2 = getHalfVector(HalfIdx2);
15610 unsigned Offset = UndefLower ? HalfNumElts : 0;
15623 "Expected 256-bit or 512-bit vector");
15630 "Completely undef shuffle mask should have been simplified already");
15654 int HalfIdx1, HalfIdx2;
15659 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15662 unsigned NumLowerHalves =
15663 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15664 unsigned NumUpperHalves =
15665 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15666 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15674 if (NumUpperHalves == 0)
15678 if (NumUpperHalves == 1) {
15682 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15685 Subtarget.hasFastVariableCrossLaneShuffle()))
15691 if (EltWidth == 64 && V2.isUndef())
15695 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15707 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15712 if (NumUpperHalves == 0) {
15715 if (Subtarget.hasAVX2() && EltWidth == 64)
15738 int NumLaneElts = NumElts / NumLanes;
15743 for (unsigned BroadcastSize : {16, 32, 64}) {
15752 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15753 for (int j = 0; j != NumBroadcastElts; ++j) {
15754 int M = Mask[i + j];
15757 int &R = RepeatMask[j];
15758 if (0 != ((M % NumElts) / NumLaneElts))
15760 if (0 <= R && R != M)
15768 if (!FindRepeatingBroadcastMask(RepeatMask))
15776 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15777 for (int j = 0; j != NumBroadcastElts; ++j)
15778 BroadcastMask[i + j] = j;
15782 if (BroadcastMask == Mask)
15800 auto ShuffleSubLanes = [&](int SubLaneScale) {
15801 int NumSubLanes = NumLanes * SubLaneScale;
15802 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15807 int TopSrcSubLane = -1;
15813 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15818 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15819 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15822 int Lane = (M % NumElts) / NumLaneElts;
15823 if ((0 <= SrcLane) && (SrcLane != Lane))
15826 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15827 SubLaneMask[Elt] = LocalM;
15835 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15837 for (int i = 0; i != NumSubLaneElts; ++i) {
15838 if (M1[i] < 0 || M2[i] < 0)
15840 if (M1[i] != M2[i])
15846 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15847 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15851 for (int i = 0; i != NumSubLaneElts; ++i) {
15852 int M = SubLaneMask[i];
15855 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15856 "Unexpected mask element");
15857 RepeatedSubLaneMask[i] = M;
15862 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15863 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15864 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15869 if (Dst2SrcSubLanes[DstSubLane] < 0)
15872 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15873 "Unexpected source lane");
15877 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15878 int Lane = SubLane / SubLaneScale;
15879 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15880 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15881 int M = RepeatedSubLaneMask[Elt];
15884 int Idx = (SubLane * NumSubLaneElts) + Elt;
15885 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15891 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15892 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15893 if (SrcSubLane < 0)
15895 for (int j = 0; j != NumSubLaneElts; ++j)
15896 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15901 if (RepeatedMask == Mask || SubLaneMask == Mask)
15915 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15918 MinSubLaneScale = 2;
15920 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15922 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15923 MinSubLaneScale = MaxSubLaneScale = 4;
15925 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15926 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15933 bool &ForceV1Zero, bool &ForceV2Zero,
15935 const APInt &Zeroable) {
15938 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15939 "Unexpected data type for VSHUFPD");
15941 "Illegal shuffle mask");
15943 bool ZeroLane[2] = { true, true };
15944 for (int i = 0; i < NumElts; ++i)
15945 ZeroLane[i & 1] &= Zeroable[i];
15949 bool IsSHUFPD = true;
15950 bool IsCommutable = true;
15952 for (int i = 0; i < NumElts; ++i) {
15957 int Val = (i & 6) + NumElts * (i & 1);
15958 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15959 if (Mask[i] < Val || Mask[i] > Val + 1)
15961 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15962 IsCommutable = false;
15963 SHUFPDMask[i] = Mask[i] % 2;
15966 if (!IsSHUFPD && !IsCommutable)
15969 if (!IsSHUFPD && IsCommutable)
15972 ForceV1Zero = ZeroLane[0];
15973 ForceV2Zero = ZeroLane[1];
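// SHUFPD can only take element 0/1 of each adjacent pair from V1 for even
// result slots and from V2 for odd ones (or the commuted arrangement); the
// per-element bit stored in SHUFPDMask is simply Mask[i] % 2, and fully
// zeroable even/odd lanes force the corresponding source to a zero vector.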
15980 const APInt &Zeroable,
15983 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15984 "Unexpected data type for VSHUFPD");
15986 unsigned Immediate = 0;
15987 bool ForceV1Zero = false, ForceV2Zero = false;
16008 const APInt &Zeroable,
16010 assert(VT == MVT::v32i8 && "Unexpected type!");
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16029 { 0, 1, 2, 3, 16, 17, 18, 19,
16030 4, 5, 6, 7, 20, 21, 22, 23 });
16057 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16061 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16063 size_t Size = Mask.size();
16064 assert(Size % 2 == 0 && "Expected even mask size");
16065 for (unsigned I = 0; I < Size; I += 2) {
16066 if (Mask[I] != (int)(Begin0 + I / 2) ||
16067 Mask[I + 1] != (int)(Begin1 + I / 2))
16074 size_t FirstQtr = NumElts / 2;
16075 size_t ThirdQtr = NumElts + NumElts / 2;
16076 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16077 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16078 if (!IsFirstHalf && !IsSecondHalf)
16088 if (Shuffles.size() != 2)
16091 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16092 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16097 FirstHalf = Shuffles[0];
16098 SecondHalf = Shuffles[1];
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16101 FirstHalf = Shuffles[1];
16102 SecondHalf = Shuffles[0];
16131 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16132 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16138 if (V2.isUndef()) {
16141 Mask, Subtarget, DAG))
16151 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16152 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16170 Mask, DAG, Subtarget))
16183 Zeroable, Subtarget, DAG))
16188 Zeroable, Subtarget, DAG))
16199 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16206 if (V1IsInPlace || V2IsInPlace)
16208 Zeroable, Subtarget, DAG);
16213 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16220 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16226 if (Subtarget.hasVLX())
16228 Zeroable, Subtarget, DAG))
16235 Zeroable, Subtarget, DAG);
16251 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16252 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16253 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16260 Zeroable, Subtarget, DAG))
16269 if (Subtarget.preferLowerShuffleAsShift())
16272 Subtarget, DAG,
true))
16275 if (V2.isUndef()) {
16302 if (Subtarget.hasVLX()) {
16304 Zeroable, Subtarget, DAG))
16308 Zeroable, Subtarget, DAG))
16326 if (V1IsInPlace || V2IsInPlace)
16328 Zeroable, Subtarget, DAG);
16333 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16345 if (!V1IsInPlace && !V2IsInPlace)
16347 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16352 Zeroable, Subtarget, DAG);
16364 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16365 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16368 Zeroable, Subtarget, DAG))
16386 Zeroable, Subtarget, DAG))
16394 "Repeated masks must be half the mask width!");
16418 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16423 if (V2.isUndef()) {
16440 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16444 if (Subtarget.hasVLX())
16446 Zeroable, Subtarget, DAG))
16470 Zeroable, Subtarget, DAG);
16486 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16487 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16488 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16490 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16496 Zeroable, Subtarget, DAG))
16515 Zeroable, Subtarget, DAG))
16524 if (Subtarget.preferLowerShuffleAsShift()) {
16527 Subtarget, DAG,
true))
16529 if (NumV2Elements == 0)
16539 bool Is128BitLaneRepeatedShuffle =
16541 if (Is128BitLaneRepeatedShuffle) {
16542 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16558 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16564 if (Subtarget.hasVLX()) {
16566 Zeroable, Subtarget, DAG))
16570 Zeroable, Subtarget, DAG))
16582 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16585 if (V2.isUndef()) {
16604 CastV1, CastV2, DAG);
16611 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16616 Zeroable, Subtarget, DAG);
16628 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16629 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16630 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16636 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16645 Zeroable, Subtarget, DAG))
16665 Subtarget, DAG,
false))
16676 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16679 if (V2.isUndef()) {
16694 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16707 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16712 Zeroable, Subtarget, DAG))
16716 if (Subtarget.hasBWI())
16722 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16727 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16751 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16752 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16753 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16759 Zeroable, Subtarget, DAG))
16768 Zeroable, Subtarget, DAG))
16805 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16817 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16825 Zeroable, Subtarget, DAG))
16829 if (Subtarget.hasVBMI())
16835 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16840 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16846 if (Subtarget.hasVLX())
16848 Mask, Zeroable, DAG))
16875 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16877 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16879 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16895 if (ElementBits < 32) {
16913 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16945 "Unexpected element type size for 128bit shuffle.");
16955 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16958 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16959 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16960 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16971 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16973 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16983 bool IsInsert = true;
16985 for (int i = 0; i < 4; ++i) {
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16987 if (Widened128Mask[i] < 0)
16991 if (Widened128Mask[i] < 4) {
16992 if (Widened128Mask[i] != i) {
16998 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17005 if (IsInsert && V2Index >= 0) {
17018 Widened128Mask.clear();
17024 int PermMask[4] = {-1, -1, -1, -1};
17026 for (int i = 0; i < 4; ++i) {
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17028 if (Widened128Mask[i] < 0)
17031 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17038 PermMask[i] = Widened128Mask[i] % 4;
17051 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17052 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17054 if (V2.isUndef()) {
17056 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17062 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17063 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17064 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17065 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17077 V2, Subtarget, DAG))
17085 Zeroable, Subtarget, DAG))
17093 Zeroable, Subtarget, DAG))
17105 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17106 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17112 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17129 Zeroable, Subtarget, DAG))
17137 Zeroable, Subtarget, DAG))
17141 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17147 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17152 if (V2.isUndef() &&
17160 Zeroable, Subtarget, DAG))
17172 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17173 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17176 if (Subtarget.preferLowerShuffleAsShift())
17179 Subtarget, DAG,
true))
17182 if (V2.isUndef()) {
17204 V2, Subtarget, DAG))
17215 Zeroable, Subtarget, DAG))
17219 if (Subtarget.hasBWI())
17233 Zeroable, Subtarget, DAG))
17245 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17246 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17248 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17254 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17258 if (Subtarget.preferLowerShuffleAsShift()) {
17261 Subtarget, DAG,
true))
17263 if (NumV2Elements == 0)
17273 bool Is128BitLaneRepeatedShuffle =
17275 if (Is128BitLaneRepeatedShuffle) {
17276 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17289 Subtarget, DAG,
false))
17292 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17299 Zeroable, Subtarget, DAG))
17303 if (Subtarget.hasBWI())
17314 CastV1, CastV2, DAG);
17321 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17326 Zeroable, Subtarget, DAG))
17330 Zeroable, Subtarget, DAG))
17342 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17343 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17344 assert(Subtarget.hasBWI() &&
"We can only lower v32i16 with AVX-512-BWI!");
17350 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17365 Subtarget, DAG,
false))
17373 if (V2.isUndef()) {
17385 RepeatedMask, Subtarget, DAG);
17390 Zeroable, Subtarget, DAG))
17394 Zeroable, Subtarget, DAG))
17401 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17413 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17414 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17415 assert(Subtarget.hasBWI() &&
"We can only lower v64i8 with AVX-512-BWI!");
17421 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17452 Zeroable, Subtarget, DAG))
17456 Zeroable, Subtarget, DAG))
17462 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17466 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17470 Zeroable, Subtarget, DAG))
17477 Mask, Subtarget, DAG))
17482 bool V1InUse, V2InUse;
17484 DAG, V1InUse, V2InUse);
17491 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17495 if (Subtarget.hasVBMI())
17508 const APInt &Zeroable,
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17516 int NumElts = Mask.size();
17517 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17519 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17521 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17534 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17546 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17547 if (!Subtarget.hasBWI())
17589 int NumElts = Mask.size();
17590 for (int i = 0; i != NumElts; ++i) {
17593 "Unexpected mask index.");
17598 if (ShiftAmt < 0) {
17605 if (ShiftAmt != M - i)
17608 assert(ShiftAmt >= 0 && "All undef?");
17622 int MaskOffset, const APInt &Zeroable) {
17623 int Size = Mask.size();
17625 auto CheckZeros = [&](int Shift, bool Left) {
17626 for (int j = 0; j < Shift; ++j)
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17633 auto MatchShift = [&](int Shift, bool Left) {
17634 unsigned Pos = Left ? Shift : 0;
17635 unsigned Low = Left ? 0 : Shift;
17636 unsigned Len = Size - Shift;
17640 for (int Shift = 1; Shift != Size; ++Shift)
17641 for (bool Left : {true, false})
17642 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17657 const APInt &Zeroable,
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17663 int NumElts = Mask.size();
17664 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17667 int SubvecElts = 0;
17669 for (int i = 0; i != NumElts; ++i) {
17670 if (Mask[i] >= 0) {
17674 Src = Mask[i] / NumElts;
17675 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17681 assert(SubvecElts != NumElts && "Identity shuffle?");
17684 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17689 assert(Src >= 0 && "Expected a source!");
17709 if (ShiftAmt >= 0) {
17720 ShiftAmt += WideElts - NumElts;
17723 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17750 ExtVT = MVT::v2i64;
17753 ExtVT = MVT::v4i32;
17758 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17768 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17776 ExtVT = MVT::v64i8;
17786 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17787 (Subtarget.hasDQI() && (NumElems < 32)))
17797 int NumElements = Mask.size();
17799 int NumV1Elements = 0, NumV2Elements = 0;
17803 else if (M < NumElements)
17811 if (NumV2Elements > NumV1Elements)
17814 assert(NumV1Elements > 0 && "No V1 indices");
17816 if (NumV2Elements == 0)
17824 if (NumV1Elements == NumV2Elements) {
17825 int LowV1Elements = 0, LowV2Elements = 0;
17826 for (int M : Mask.slice(0, NumElements / 2))
17827 if (M >= NumElements)
17831 if (LowV2Elements > LowV1Elements)
17833 if (LowV2Elements == LowV1Elements) {
17834 int SumV1Indices = 0, SumV2Indices = 0;
17835 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17836 if (Mask[i] >= NumElements)
17838 else if (Mask[i] >= 0)
17840 if (SumV2Indices < SumV1Indices)
17842 if (SumV2Indices == SumV1Indices) {
17843 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17844 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17845 if (Mask[i] >= NumElements)
17846 NumV2OddIndices += i % 2;
17847 else if (Mask[i] >= 0)
17848 NumV1OddIndices += i % 2;
17849 if (NumV2OddIndices < NumV1OddIndices)
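// Commutation heuristic: prefer making V1 the busier operand. Ties are broken
// by comparing, in order, how many elements each source supplies, how many of
// them land in the low half, the sum of the used indices, and finally how
// many fall in odd positions.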
17863 if (!V.getValueType().isSimple())
17866 MVT VT = V.getSimpleValueType().getScalarType();
17867 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17872 if ((VT == MVT::i16 || VT == MVT::i8) &&
17873 V.getSimpleValueType().getSizeInBits() < 512)
17876 auto HasMaskOperation = [&](SDValue V) {
17879 switch (V->getOpcode()) {
17898 if (!V->hasOneUse())
17904 if (HasMaskOperation(V))
17929 MVT VT = Op.getSimpleValueType();
17935 "Can't lower MMX shuffles");
17937 bool V1IsUndef = V1.isUndef();
17938 bool V2IsUndef = V2.isUndef();
17939 if (V1IsUndef && V2IsUndef)
17952 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17954 for (int &M : NewMask)
17955 if (M >= NumElements)
17961 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17962 (void)MaskUpperLimit;
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17965 "Out of bounds shuffle index");
17970 APInt KnownUndef, KnownZero;
17973 APInt Zeroable = KnownUndef | KnownZero;
17999 int NewNumElts = NumElements / 2;
18007 bool UsedZeroVector = false;
18009 "V2's non-undef elements are used?!");
18010 for (int i = 0; i != NewNumElts; ++i)
18012 WidenedMask[i] = i + NewNumElts;
18013 UsedZeroVector = true;
18017 if (UsedZeroVector)
18038 assert(NumElements == (int)Mask.size() &&
18039 "canonicalizeShuffleMaskWithHorizOp "
18040 "shouldn't alter the shuffle mask size");
18069 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18084 if (NumVecBits != 128 && NumVecBits != 256)
18087 if (NumElementBits == 32 || NumElementBits == 64) {
18088 unsigned NumLargeElements = 512 / NumElementBits;
18096 Subtarget, DAG, DL);
18100 Subtarget, DAG, DL);
18108 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18109 VecVT == MVT::v16i16) {
18114 Passthru = Passthru.isUndef()
18133 MVT VT = Op.getSimpleValueType();
18152 MVT VT = Op.getSimpleValueType();
18174 MVT CondVT = Cond.getSimpleValueType();
18175 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18176 if (CondEltSize == 1)
18187 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18200 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18204 if (CondEltSize != EltSize) {
18221 !Subtarget.hasXOP()) {
18227 if (FreeCond && (FreeLHS || FreeRHS))
18247 case MVT::v16i16: {
18260 MVT VT = Op.getSimpleValueType();
18263 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18278 unsigned IdxVal = Idx->getAsZExtVal();
18284 if (VT == MVT::f32) {
18290 if (!Op.hasOneUse())
18295 User->getValueType(0) != MVT::i32))
18302 if (VT == MVT::i32 || VT == MVT::i64)
18316 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18317 MVT EltVT = Op.getSimpleValueType();
18320 "Unexpected vector type in ExtractBitFromMaskVector");
18328 if (NumElts == 1) {
18340 unsigned IdxVal = IdxC->getZExtValue();
18357 MVT VT = N->getSimpleValueType(0);
18361 switch (User->getOpcode()) {
18367 return DemandedElts;
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18375 return DemandedElts;
18383 return DemandedElts;
18386 return DemandedElts;
18390 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18396 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18435 unsigned IdxVal = IdxC->getZExtValue();
18449 IdxVal &= ElemsPerChunk - 1;
18456 MVT VT = Op.getSimpleValueType();
18458 if (VT == MVT::i16) {
18463 if (Subtarget.hasFP16())
18483 if (VT == MVT::i8) {
18488 int DWordIdx = IdxVal / 4;
18489 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18493 int ShiftVal = (IdxVal % 4) * 8;
18500 int WordIdx = IdxVal / 2;
18501 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18505 int ShiftVal = (IdxVal % 2) * 8;
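// An i8 extract can reuse a cheaper i32 or i16 element extract when all
// demanded bytes live in that wider element; the byte is then isolated by
// shifting right by (IdxVal % 4) * 8 or (IdxVal % 2) * 8 bits and truncating.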
18519 Mask[0] = static_cast<int>(IdxVal);
18535 int Mask[2] = { 1, -1 };
18554 if (!isa<ConstantSDNode>(Idx)) {
18573 MVT VT = Op.getSimpleValueType();
18578 if (EltVT == MVT::i1)
18585 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18587 if (EltVT == MVT::bf16) {
18599 if (!(Subtarget.hasBWI() ||
18600 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18601 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18614 for (unsigned I = 0; I != NumElts; ++I)
18619 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18623 if (N2C->getAPIntValue().uge(NumElts))
18625 uint64_t IdxVal = N2C->getZExtValue();
18630 if (IsZeroElt || IsAllOnesElt) {
18633 if (IsAllOnesElt &&
18634 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18635 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18639 CstVectorElts[IdxVal] = OnesCst;
18648 for (unsigned i = 0; i != NumElts; ++i)
18649 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18665 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18666 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18673 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18675 "Vectors will always have power-of-two number of elements.");
18680 if (IdxVal >= NumEltsIn128 &&
18681 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18682 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18686 for (unsigned i = 0; i != NumElts; ++i)
18687 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
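// Inserting into an upper 128-bit lane of a wide vector: either blend the
// broadcast element directly (the AVX/AVX2 path above) or extract the 128-bit
// chunk that holds the slot, insert at IdxIn128 within it, and put the chunk
// back.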
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18708 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18709 EltVT == MVT::f16 || EltVT == MVT::i64) {
18716 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18727 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18729 if (VT == MVT::v8i16) {
18733 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18738 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18741 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18745 if (EltVT == MVT::f32) {
18775 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18785 MVT OpVT = Op.getSimpleValueType();
18806 "Expected an SSE type!");
18810 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18823 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18830 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18831 "Only vXi1 extract_subvectors need custom lowering");
18835 uint64_t IdxVal = Op.getConstantOperandVal(1);
18852 unsigned X86TargetLowering::getGlobalWrapperKind(
18853 const GlobalValue *GV, const unsigned char OpFlags) const {
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18890 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18912 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18925 return LowerGlobalOrExternal(Op, DAG, false);
18931 unsigned char OpFlags =
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18939 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18953 bool ForCall) const {
18958 const char *ExternalSym = nullptr;
18959 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18960 GV = G->getGlobal();
18963 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18964 ExternalSym = ES->getSymbol();
18969 unsigned char OpFlags;
18987 int64_t GlobalOffset = 0;
19000 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19003 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19028 return LowerGlobalOrExternal(Op, DAG, false);
19032 const EVT PtrVT, unsigned ReturnReg,
19033 unsigned char OperandFlags,
19034 bool LoadGlobalBaseReg = false,
19035 bool LocalDynamic = false) {
19043 if (LocalDynamic && UseTLSDESC) {
19050 "Unexpected TLSDESC DAG");
19054 "Unexpected TLSDESC DAG");
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19058 "Unexpected TLSDESC DAG");
19059 Ret = SDValue(CopyFromRegOp, 0);
19072 if (LoadGlobalBaseReg) {
19078 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19080 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19128 bool Is64Bit, bool Is64BitLP64) {
19138 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19177 unsigned char OperandFlags = 0;
19234 if (Subtarget.is64Bit()) {
19246 PositionIndependent);
19253 unsigned char OpFlag = 0;
19254 unsigned WrapperKind = 0;
19258 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19293 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19319 SDValue TlsArray = Subtarget.is64Bit()
19334 if (Subtarget.is64Bit())
19365 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19406 "Unexpected opcode!");
19407 bool IsStrict = Op->isStrictFPOpcode();
19408 unsigned OpNo = IsStrict ? 1 : 0;
19410 MVT SrcVT = Src.getSimpleValueType();
19411 MVT VT = Op.getSimpleValueType();
19413 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19414 (VT != MVT::f32 && VT != MVT::f64))
19420 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19427 {Op.getOperand(0), InVec});
19447 "Unexpected opcode!");
19448 bool IsStrict = Op->isStrictFPOpcode();
19449 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19450 MVT SrcVT = Src.getSimpleValueType();
19451 MVT VT = Op.getSimpleValueType();
19453 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19458 assert(Subtarget.hasFP16() && "Expected FP16");
19462 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19463 {Op.getOperand(0), InVec});
19481 if (!Subtarget.
hasSSE2() || FromVT != MVT::v4i32)
19484 return ToVT == MVT::v4f32 || (Subtarget.
hasAVX() && ToVT == MVT::v4f64);
19488 if (!Subtarget.
hasAVX512() || FromVT != MVT::v4i32)
19491 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19509 !isa<ConstantSDNode>(Extract.
getOperand(1)))
19530 if (FromVT != Vec128VT)
19554 MVT SrcVT =
X.getSimpleValueType();
19555 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19560 if (!Subtarget.
hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19572 unsigned ToIntOpcode =
19574 unsigned ToFPOpcode =
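// v2i64/v4i64 integer-to-FP without VLX: widen the source to 512 bits, convert
// with the AVX-512 DQ instructions, then extract the low subvector.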
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
  if (Subtarget.hasDQI()) {
    assert(!Subtarget.hasVLX() && "Unexpected features");
    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
    Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                      {Op->getOperand(0), Src});
    Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
  if (VT != MVT::v4f32 || IsSigned)
  for (int i = 0; i != 4; ++i) {
                              {Op.getOperand(0), Elt});
    Chains[i] = SignCvts[i].getValue(1);
                     {Chain, SignCvt, SignCvt});
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT VT = Op.getSimpleValueType();
        DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
        DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
  if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
  if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
  if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
  if (VT == MVT::v16i32)
  if (VT == MVT::v8i64 && Subtarget.hasDQI())
  if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
      (VT == MVT::v2i64 || VT == MVT::v4i64))
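// Scalar SINT_TO_FP: use SSE conversions for i32/i64 where legal, promote i16,
// and otherwise spill the value to a stack slot and build an x87 FILD.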
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
    return LowerWin64_INT128_TO_FP(Op, DAG);
  if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
  if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");
  if (SrcVT == MVT::i32 && UseSSEReg)
  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
  if (VT == MVT::f128 || !Subtarget.hasX87())
  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
  std::pair<SDValue, SDValue> Tmp =
      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
    Tys = DAG.getVTList(MVT::f80, MVT::Other);
    Tys = DAG.getVTList(DstVT, MVT::Other);
  SDValue FILDOps[] = {Chain, Pointer};
  Chain = Result.getValue(1);
    SDValue FSTOps[] = {Chain, Result, StackSlot};
                         DstVT, DL, Chain, StackSlot,
    Chain = Result.getValue(1);
  return { Result, Chain };
  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
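// u64 -> f64 bit trick: bias the two 32-bit halves by 2^52 and 2^84
// (0x43300000 / 0x45300000), reinterpret them as doubles, subtract the biases
// and add the halves.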
  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
                                        APInt(64, 0x4330000000000000ULL))));
                                        APInt(64, 0x4530000000000000ULL))));
      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
      llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
  if (Op.getNode()->isStrictFPOpcode()) {
                      {Chain, Or, Bias});
        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
  if (Op.getSimpleValueType() != MVT::v2f64)
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
  if (!Subtarget.hasVLX()) {
                        {Op.getOperand(0), N0});
                        {Op.getOperand(0), N0});
      llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
                      {Op.getOperand(0), Or, VBias});
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
  MVT VecIntVT = V.getSimpleValueType();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");
    assert(!Subtarget.hasVLX() && "Unexpected features");
    MVT VT = Op->getSimpleValueType(0);
    if (VT == MVT::v8f64)
    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
    MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
    MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
                        {Op->getOperand(0), V});
  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
      Op->getSimpleValueType(0) == MVT::v4f64) {
                      {Op.getOperand(0), Or, VBias});
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  if (VecFloatVT != Op->getSimpleValueType(0))
  MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
                      {Op.getOperand(0), HighBitcast, VecCstFSub});
                     {FHigh.getValue(1), LowBitcast, FHigh});
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
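// Scalar UINT_TO_FP: reuse the signed conversions when that is safe, otherwise
// store to a stack slot, FILD the value, and add a fudge-factor constant
// loaded from the constant pool.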
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  if (DstVT == MVT::f128)
    return LowerWin64_INT128_TO_FP(Op, DAG);
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
      (DstVT == MVT::f32 || DstVT == MVT::f64))
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  Align SlotAlign(8);
  if (SrcVT == MVT::i32) {
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    std::pair<SDValue, SDValue> Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  APInt FF(64, 0x5F80000000000000ULL);
  Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
  if (Subtarget.isOSWindows() && DstVT == MVT::f32)
      DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
  if (DstVT == MVT::f80)
  if (Subtarget.isOSWindows() && DstVT == MVT::f32)
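// x87 FP_TO_SINT/FP_TO_UINT helper: unsigned i64 results are adjusted around
// 2^63 when needed, then the value is stored out through a stack slot.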
  bool IsStrict = Op->isStrictFPOpcode();
  EVT DstTy = Op.getValueType();
  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
  if (!IsSigned && DstTy != MVT::i64) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
  assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");
  unsigned MemSize = DstTy.getStoreSize();
  if (UnsignedFixup) {
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
    else if (TheVT == MVT::f80)
           "FP conversion should have been exact");
    Chain = Cmp.getValue(1);
                          { Chain, Value, FltOfs });
      Chain = Value.getValue(1);
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    SDValue Ops[] = { Chain, StackSlot };
    assert(FLDSize <= MemSize && "Stack slot not big enough");
    Chain = Value.getValue(1);
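// Vector ANY/ZERO/SIGN_EXTEND lowering: v32i16 results without BWI are split
// and each half is extended separately.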
  MVT VT = Op.getSimpleValueType();
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();
         "Unexpected extension opcode");
         "Expected same number of elements");
         "Unexpected element type");
         "Unexpected element type");
  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  if (!Subtarget.hasBWI()) {
  MVT WideVT = ExtVT;
  return SelectedVal;
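// Truncation via PACKSS/PACKUS: repeatedly pack adjacent element pairs, which
// is only legal when the bits being dropped are known sign or zero bits.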
  MVT SVT = In.getSimpleValueType();
         "Unexpected PACK opcode");
  EVT SrcVT = In.getValueType();
  if (SrcVT == DstVT)
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcSizeInBits <= 128) {
  if (Hi.isUndef()) {
  unsigned SubSizeInBits = SrcSizeInBits / 2;
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
  int Scale = 64 / OutVT.getScalarSizeInBits();
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
  EVT SrcVT = In.getValueType();
  EVT SrcVT = In.getValueType();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
  assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
  unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
  if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
      (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
      (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
  if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
  if (Subtarget.hasAVX512() && NumStages > 1)
  unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
  if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
  if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
    unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
    if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
    if (In.getOpcode() == ISD::SRL && In->hasOneUse())
      if (*ShAmt == MinSignBits) {
  MVT SrcVT = In.getSimpleValueType();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
  unsigned PackOpcode;
  MVT SrcVT = In.getSimpleValueType();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
  if (Subtarget.hasSSSE3() && NumElems == 8) {
    if (SrcSVT == MVT::i16)
    if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
  if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
  if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
  if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
  MVT VT = Op.getSimpleValueType();
  MVT InVT = In.getSimpleValueType();
  if (Subtarget.hasBWI()) {
         "Unexpected vector type.");
  assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
  if (InVT == MVT::v16i8) {
        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
    assert(InVT == MVT::v16i16 && "Unexpected VT!");
  if (Subtarget.hasDQI())
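// General vector TRUNCATE lowering: prefer native AVX-512 truncates, then the
// PACK-based paths, then explicit shuffle sequences for the 256-bit cases.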
  MVT VT = Op.getSimpleValueType();
  MVT InVT = In.getSimpleValueType();
         "Invalid TRUNCATE operation");
  if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
    assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
           "Unexpected subtarget!");
          VT, In, DL, Subtarget, DAG, Op->getFlags()))
          VT, In, DL, Subtarget, DAG, Op->getFlags()))
  if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(VT == MVT::v32i8 && "Unexpected VT!");
  if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
    static const int ShufMask[] = {0, 2, 4, 6};
                               DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1,
                                    16, 17, 20, 21, 24, 25, 28, 29,
                                    -1, -1, -1, -1, -1, -1, -1, -1 };
    static const int ShufMask2[] = {0, 2, -1, -1};
  if (VT == MVT::v16i8 && InVT == MVT::v16i16)
  MVT SrcVT = Src.getSimpleValueType();
  assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
  if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
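// FP_TO_SINT/FP_TO_UINT lowering: vectors are widened for the AVX-512
// converters, scalars use SSE cvtt where legal, f128 goes through a libcall,
// and anything left is handled by FP_TO_INTHelper.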
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
                                             {NVT, MVT::Other}, {Chain, Src})});
    return DAG.getNode(Op.getOpcode(), dl, VT,
  if (VT.isVector()) {
    if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
      MVT ResVT = MVT::v4i32;
      MVT TruncVT = MVT::v4i1;
      if (!IsSigned && !Subtarget.hasVLX()) {
        ResVT = MVT::v8i32;
        TruncVT = MVT::v8i1;
        Opc = Op.getOpcode();
        Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
        Res = DAG.getNode(Opc, dl, ResVT, Src);
    if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
      if (EleVT != MVT::i64)
        ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
      if (SrcVT != MVT::v8f16) {
                        dl, {ResVT, MVT::Other}, {Chain, Src});
    if (VT.getVectorElementType() == MVT::i16) {
             "Expected f32/f64 vector!");
                        dl, {NVT, MVT::Other}, {Chain, Src});
    if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
      assert(!IsSigned && "Expected unsigned conversion!");
    if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
      assert(!IsSigned && "Expected unsigned conversion!");
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
      MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
    if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
        Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
        Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
    if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
      if (!Subtarget.hasVLX()) {
        Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
      assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
        return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
      return DAG.getNode(Opc, dl, VT, Tmp);
    if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
        (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
        (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
      assert(!IsSigned && "Expected unsigned conversion!");
  bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
  if (!IsSigned && UseSSEReg) {
    if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
                      (VT == MVT::i64 && Subtarget.is64Bit()))) {
      unsigned DstBits = VT.getScalarSizeInBits();
      if (VT == MVT::i64)
      assert(VT == MVT::i32 && "Unexpected VT!");
    if (Subtarget.is64Bit()) {
  if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
  if (UseSSEReg && IsSigned)
  if (SrcVT == MVT::f128) {
    MakeLibCallOptions CallOptions;
    std::pair<SDValue, SDValue> Tmp =
        makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
  if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
  EVT DstVT = Op.getSimpleValueType();
  MVT SrcVT = Src.getSimpleValueType();
  if (SrcVT == MVT::f16)
  return LRINT_LLRINTHelper(Op.getNode(), DAG);
  EVT DstVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
  EVT OtherVT = UseSSE ? SrcVT : DstVT;
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
  Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);