#define DEBUG_TYPE "x86-isel"

    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),

    "x86-br-merging-base-cost", cl::init(2),
    cl::desc(
        "Sets the cost threshold for when multiple conditionals will be merged "
        "into one branch versus be split in multiple branches. Merging "
        "conditionals saves branches at the cost of additional instructions. "
        "This value sets the instruction cost limit, below which conditionals "
        "will be merged, and above which conditionals will be split. Set to -1 "
        "to never merge branches."),

    "x86-br-merging-ccmp-bias", cl::init(6),
    cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
             "supports conditional compare instructions."),

    cl::desc("Replace narrow shifts with wider shifts."),

    "x86-br-merging-likely-bias", cl::init(0),
    cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
             "that all conditionals will be executed. For example, for merging "
             "the conditionals (a == b && c > d), if it is known that a == b is "
             "likely, then it is likely that if the conditionals are split "
             "both sides will be executed, so it may be desirable to increase "
             "the instruction cost threshold. Set to -1 to never merge likely "
             "conditionals."),

    "x86-br-merging-unlikely-bias", cl::init(-1),
    cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
             "that all conditionals will be executed. For example, for merging "
             "the conditionals (a == b && c > d), if it is known that a == b is "
             "unlikely, then it is unlikely that if the conditionals are split "
             "both sides will be executed, so it may be desirable to decrease "
             "the instruction cost threshold. Set to -1 to never merge unlikely "
             "conditionals."),

    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();

if (Subtarget.isAtom())
else if (Subtarget.is64Bit())

if (Subtarget.hasSlowDivide32())
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())

if (Subtarget.canUseCMPXCHG16B())
else if (Subtarget.canUseCMPXCHG8B())

if (Subtarget.is64Bit())

for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {

if (Subtarget.canUseCMOV()) {
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())
  if (Subtarget.is64Bit())

if (!Subtarget.useSoftFloat()) {
if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
if (Subtarget.hasSSE2()) {
for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
if (Subtarget.is64Bit()) {
if (Subtarget.hasAVX10_2()) {
  for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
  if (Subtarget.is64Bit()) {
if (!Subtarget.hasSSE2()) {
  if (Subtarget.is64Bit()) {
} else if (!Subtarget.is64Bit())

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (Subtarget.is64Bit())
if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
if (!Subtarget.hasBMI()) {
  if (Subtarget.is64Bit()) {
if (Subtarget.hasLZCNT()) {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())
(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
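// FP16 conversions are custom-lowered only when F16C is available; otherwise
// (and under soft-float) the Expand action typically falls back to library
// calls for the half <-> float conversions.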
for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
if (Subtarget.is64Bit())
if (Subtarget.hasPOPCNT()) {
if (!Subtarget.hasMOVBE())
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())
if (Subtarget.hasSSEPrefetch())
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (!Subtarget.is64Bit())
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
if (Subtarget.canUseCMPXCHG16B())
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
    !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
if (Subtarget.isTargetPS())

bool Is64Bit = Subtarget.is64Bit();

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
  addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
                                                   : &X86::FR16RegClass);
  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                   : &X86::FR32RegClass);
  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                   : &X86::FR64RegClass);
  for (auto VT : { MVT::f32, MVT::f64 }) {
  setF16Action(MVT::f16, Promote);
} else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
           (UseX87 || Is64Bit)) {
  for (auto VT : { MVT::f32, MVT::f64 }) {

if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
  addLegalFPImmediate(APFloat(+0.0f));
  addLegalFPImmediate(APFloat(+1.0f));
  addLegalFPImmediate(APFloat(-0.0f));
  addLegalFPImmediate(APFloat(-1.0f));
  addLegalFPImmediate(APFloat(+0.0f));

addLegalFPImmediate(APFloat(+0.0));
addLegalFPImmediate(APFloat(+1.0));
addLegalFPImmediate(APFloat(-0.0));
addLegalFPImmediate(APFloat(-1.0));
addLegalFPImmediate(APFloat(+0.0));

addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt2);
addLegalFPImmediate(TmpFlt2);
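// addLegalFPImmediate() records constants that isFPImmLegal() will later
// accept: with x87 these are the values FLD0/FLD1 can materialize directly
// (+0.0 and +1.0, plus their negations via a following FCHS); TmpFlt/TmpFlt2
// are the same constants converted to the x87 extended format.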
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
                               : &X86::VR128RegClass);
  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                   MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {

if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
                               : &X86::VR128RegClass);

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
                               : &X86::VR128RegClass);
                               : &X86::VR128RegClass);
                               : &X86::VR128RegClass);
                               : &X86::VR128RegClass);
                               : &X86::VR128RegClass);
                               : &X86::VR128RegClass);
  for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
  if (Subtarget.hasPCLMUL()) {
    for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
  for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
    if (VT == MVT::v2i64 && !Subtarget.is64Bit())
  setF16Action(MVT::v8f16, Expand);
  for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
  if (!Subtarget.hasAVX512())
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
    if (VT == MVT::v2i64)
      continue;

if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {

if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
  if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {

if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {

if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
  for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                  MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
  bool HasInt256 = Subtarget.hasInt256();
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
                               : &X86::VR256RegClass);
  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
  if (!Subtarget.hasAVX512())
  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    if (VT == MVT::v4i64)
      continue;
  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
  if (Subtarget.hasAnyFMA()) {
    for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                     MVT::v2f64, MVT::v4f64 }) {
  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
  for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                   MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                   MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
  for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                  MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
  setF16Action(MVT::v16f16, Expand);
  if (Subtarget.hasPCLMUL()) {
    for (auto VT : {MVT::v8i32, MVT::v4i64}) {
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                   MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
  if (Subtarget.hasGFNI()) {

if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
    Subtarget.hasF16C()) {
  for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
  for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
  if (!Subtarget.hasDQI()) {
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
    for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {

if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
  bool HasBWI = Subtarget.hasBWI();
  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
  if (Subtarget.hasDQI())
  for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
  for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
  if (!Subtarget.hasVLX()) {
    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                    MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
  for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
  if (Subtarget.hasDQI() || Subtarget.hasFP16())
  if (Subtarget.hasDQI())
  if (Subtarget.hasCDI()) {
    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
  if (Subtarget.hasVPOPCNTDQ()) {
    for (auto VT : { MVT::v16i32, MVT::v8i64 })
  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                   MVT::v16f16, MVT::v8f32, MVT::v4f64 })
  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
                   MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
  setF16Action(MVT::v32f16, Expand);
  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
  if (Subtarget.hasVBMI2()) {
    for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
  if (Subtarget.hasPCLMUL()) {
    for (auto VT : {MVT::v16i32, MVT::v8i64}) {
  if (Subtarget.hasGFNI()) {

if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
  for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
  for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
                 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
                 MVT::v16f32, MVT::v8f64})
  if (Subtarget.hasDQI()) {
         "Unexpected operation action!");
  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                   MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
  if (Subtarget.hasDQI()) {
  if (Subtarget.hasCDI()) {
  for (auto VT : {MVT::i256, MVT::i512}) {
    if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
  if (Subtarget.hasVPOPCNTDQ()) {
    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
                 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
                 MVT::v16i16, MVT::v8i8})
  for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
  if (Subtarget.hasVLX())
    for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
                   MVT::v4f64, MVT::v2i64, MVT::v2f64})
  if (Subtarget.hasVBMI2())
    for (MVT VT : {MVT::v32i16, MVT::v64i8})
  if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
    for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})

if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
  for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
  for (auto VT : { MVT::v16i1, MVT::v32i1 })
  for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
                  MVT::v16f16, MVT::v8f16}) {
  if (Subtarget.hasBITALG()) {
    for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })

if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
  auto setGroup = [&](MVT VT) {
  if (Subtarget.useAVX512Regs()) {
    setGroup(MVT::v32f16);
  if (Subtarget.hasVLX()) {
    setGroup(MVT::v8f16);
    setGroup(MVT::v16f16);
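// setGroup applies one common block of operation actions to each legal f16
// vector type: v32f16 when full 512-bit registers are usable, and the
// narrower v8f16/v16f16 forms only when AVX512VL is present.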
if (!Subtarget.useSoftFloat() &&
    (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
                               : &X86::VR128RegClass);
  addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
                                                       : &X86::VR256RegClass);
  for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
    setF16Action(VT, Expand);
    if (!Subtarget.hasBF16())

if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
    Subtarget.useAVX512Regs()) {
  setF16Action(MVT::v32bf16, Expand);

if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
  for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
  for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {

if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
  if (Subtarget.hasBWI()) {
  if (Subtarget.hasFP16()) {

if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {

if (!Subtarget.is64Bit()) {

for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())

if (Subtarget.isTargetWin64()) {

if (Subtarget.is32Bit() &&
    (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))

if (Subtarget.isOSWindows()) {

return Subtarget.isTargetMachO() && Subtarget.is64Bit();

return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();

unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;

if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
    !Subtarget.hasBWI())

                             bool AssumeSingleUse, bool IgnoreAlignment) {
if (!AssumeSingleUse && !Op.hasOneUse())
if (!IgnoreAlignment && !Subtarget.hasAVX() &&
    !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
    Ld->getAlign() < Align(16))
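// Pre-AVX, SSE instructions can only fold a 128-bit memory operand that is
// 16-byte aligned, so an under-aligned load (absent SSEUnalignedMem) must
// remain a separate instruction instead of being folded into its user.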
                             bool AssumeSingleUse) {
assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
return !Ld->isVolatile() ||

if (!Op.hasOneUse())

if (Op.hasOneUse()) {
  unsigned Opcode = Op.getNode()->user_begin()->getOpcode();

EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&

default:
  return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
case X86ISD::COMPRESS:
case X86ISD::EXPAND:

default:
  return false;
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:

int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
  unsigned SlotSize = RegInfo->getSlotSize();

                                     bool HasSymbolicDisplacement) {
if (!HasSymbolicDisplacement)
return Offset < 16 * 1024 * 1024;
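// With a symbolic displacement in the small code model, objects are assumed
// to live in the low 2 GiB with the last small object at least 16 MiB below
// the 2^31 boundary, so symbol+offset remains representable as a signed
// 32-bit displacement.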
switch (SetCCOpcode) {

if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
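// These rewrites nudge a signed compare against a boundary constant onto an
// adjacent predicate that maps to a cheaper condition code, e.g. (x > -1)
// becomes (x >= 0) and (x < 1) becomes (x <= 0), both sign-bit tests.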
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
switch (SetCCOpcode) {
switch (SetCCOpcode) {

case Intrinsic::x86_aesenc128kl:
case Intrinsic::x86_aesdec128kl:
  Info.ptrVal = I.getArgOperand(1);
  Info.align = Align(1);
case Intrinsic::x86_aesenc256kl:
case Intrinsic::x86_aesdec256kl:
  Info.ptrVal = I.getArgOperand(1);
  Info.align = Align(1);
case Intrinsic::x86_aesencwide128kl:
case Intrinsic::x86_aesdecwide128kl:
  Info.ptrVal = I.getArgOperand(0);
  Info.align = Align(1);
case Intrinsic::x86_aesencwide256kl:
case Intrinsic::x86_aesdecwide256kl:
  Info.ptrVal = I.getArgOperand(0);
  Info.align = Align(1);
case Intrinsic::x86_cmpccxadd32:
case Intrinsic::x86_cmpccxadd64:
case Intrinsic::x86_atomic_bts:
case Intrinsic::x86_atomic_btc:
case Intrinsic::x86_atomic_btr: {
  Info.ptrVal = I.getArgOperand(0);
  unsigned Size = I.getType()->getScalarSizeInBits();
case Intrinsic::x86_atomic_bts_rm:
case Intrinsic::x86_atomic_btc_rm:
case Intrinsic::x86_atomic_btr_rm: {
  Info.ptrVal = I.getArgOperand(0);
  unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
case Intrinsic::x86_aadd32:
case Intrinsic::x86_aadd64:
case Intrinsic::x86_aand32:
case Intrinsic::x86_aand64:
case Intrinsic::x86_aor32:
case Intrinsic::x86_aor64:
case Intrinsic::x86_axor32:
case Intrinsic::x86_axor64:
case Intrinsic::x86_atomic_add_cc:
case Intrinsic::x86_atomic_sub_cc:
case Intrinsic::x86_atomic_or_cc:
case Intrinsic::x86_atomic_and_cc:
case Intrinsic::x86_atomic_xor_cc: {
  Info.ptrVal = I.getArgOperand(0);
  unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();

switch (IntrData->Type) {
  Info.ptrVal = I.getArgOperand(0);
  ScalarVT = MVT::i16;
  ScalarVT = MVT::i32;
  Info.align = Align(1);
  Info.ptrVal = nullptr;
  Info.align = Align(1);
  Info.ptrVal = nullptr;
  Info.align = Align(1);
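// getTgtMemIntrinsic-style bookkeeping: each intrinsic records which operand
// is the pointer and how wide the access is; Align(1) deliberately claims no
// alignment so later passes never assume more than the ISA guarantees.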
                   bool ForCodeSize) const {
for (const APFloat &FPImm : LegalFPImmediates)
  if (Imm.bitwiseIsEqual(FPImm))

                   std::optional<unsigned> ByteOffset) const {
auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
  N = *N->user_begin();

if (BasePtr.getOpcode() == X86ISD::WrapperRIP)

EVT VT = Load->getValueType(0);
    !SDValue(Load, 0).hasOneUse()) {
  bool FullWidthUse = false;
  bool AllExtractStores = true;
  if (Use.getResNo() != 0)
    const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
    return Inner->getOpcode() == ISD::STORE;
  AllExtractStores = false;
  FullWidthUse = true;
  if (AllExtractStores)

assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)

return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();

if (VT.isVector() && Subtarget.hasAVX512())

unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
    (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
       (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
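// A multiply by constant C is cheap to decompose when C (after stripping
// trailing zeros) sits next to a power of two: x * 9 becomes (x << 3) + x
// and x * 7 becomes (x << 3) - x, with LEA often absorbing the add for the
// 3/5/9 family.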
                   unsigned Index) const {

return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
       Subtarget.hasBitScanPassThrough() ||
       (!Ty->isVectorTy() &&
        Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));

return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
       Subtarget.hasBitScanPassThrough();

return !Subtarget.hasSSE2() || VT == MVT::f80;

return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
       (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;

if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)

unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())

return Subtarget.hasFastLZCNT();

return Y.getValueType().isScalarInteger();

EVT VT = Y.getValueType();
if (!Subtarget.hasBMI())
if (VT != MVT::i32 && VT != MVT::i64)
if (VT == MVT::v4i32)
return Subtarget.hasSSE2();

return X.getValueType().isScalarInteger();

    unsigned OldShiftOpcode, unsigned NewShiftOpcode,
    X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
if (X.getValueType().isScalarInteger())
if (Subtarget.hasAVX2())

    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
    const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
bool PreferRotate = false;
PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
PreferRotate = Subtarget.hasBMI2();
if (!PreferRotate) {
  PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
assert(AndMask.has_value() && "Null andmask when querying about shift+and");
if (PreferRotate && MayTransformRotate)
if (PreferRotate || !MayTransformRotate || VT.isVector())

    const Value *Rhs) const {
if (BaseCost >= 0 && Subtarget.hasCCMP())
if (BaseCost >= 0 && Opc == Instruction::And &&
if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
return {-1, -1, -1};

         N->getOperand(0).getOpcode() == ISD::SRL) ||
         N->getOperand(0).getOpcode() == ISD::SHL)) &&
       "Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
    (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
return N->getOperand(1) == N->getOperand(0).getOperand(1);

EVT VT = Y.getValueType();
unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
    !Subtarget.isOSWindows())

    [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });

unsigned NumElts = Mask.size();
unsigned NumElts = Mask.size();
return (Val >= Low && Val < Hi);
unsigned NumElts = Mask.size();

    unsigned Size, int Low, int Step = 1) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)

unsigned NumElts = Mask.size();
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
  int M1 = Mask[i + 1];
  WidenedMask[i / 2] = M1 / 2;
  WidenedMask[i / 2] = M0 / 2;
  WidenedMask[i / 2] = M0 / 2;
assert(WidenedMask.size() == Mask.size() / 2 &&
       "Incorrect size of mask after widening the elements!");
                   const APInt &Zeroable,
assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
for (int i = 0, Size = Mask.size(); i != Size; ++i)

unsigned NumSrcElts = Mask.size();
assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
       "Illegal shuffle scale factor");
if (NumDstElts >= NumSrcElts) {
  int Scale = NumDstElts / NumSrcElts;
while (ScaledMask.size() > NumDstElts) {
  ScaledMask = std::move(WidenedMask);

                   unsigned SrcSizeInBits, unsigned DstSizeInBits) {
assert(DstMask.empty() && "Expected an empty shuffle mask");
assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
unsigned Scale = DstSizeInBits / SrcSizeInBits;
unsigned NumSrcElts = SrcMask.size();
for (int &M : DstMask) {
  M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);

                   const SDLoc &dl, bool IsMask = false) {
MVT ConstVecVT = VT;
for (unsigned i = 0; i < NumElts; ++i) {
  bool IsUndef = Values[i] < 0 && IsMask;
  Ops.push_back(OpNode);

       "Unequal constant and undef arrays");
MVT ConstVecVT = VT;
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
  const APInt &V = Bits[i];
  Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));

       "Unexpected vector type");
       "Unexpected vector type");

    LHS.getValueType() != RHS.getValueType() ||
    LHS.getOperand(0) != RHS.getOperand(0))
if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
unsigned NumElts = LHS.getValueType().getVectorNumElements();
if ((LHS.getConstantOperandAPInt(1) == 0 &&
     RHS.getConstantOperandAPInt(1) == NumElts) ||
    (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
     LHS.getConstantOperandAPInt(1) == NumElts))

                   const SDLoc &dl, unsigned vectorWidth) {
unsigned ResultNumElts =
       "Illegal subvector extraction");
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
IdxVal &= ~(ElemsPerChunk - 1);
                   Vec->ops().slice(IdxVal, ElemsPerChunk));

       "Unexpected vector size!");
                   unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
       "Unsupported vector width");
IdxVal &= ~(ElemsPerChunk - 1);

       "Unsupported vector widening type");
                   const SDLoc &dl, unsigned WideSizeInBits) {
       "Unsupported vector widening type");
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);

if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);

assert(Ops.empty() && "Expected an empty ops vector");
Ops.append(N->op_begin(), N->op_end());

const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
if (Idx == 0 && Src.isUndef()) {
    Src.getOperand(1).getValueType() == SubVT &&
if (Src.isUndef()) {

EVT VT = N->getValueType(0);
uint64_t Idx = N->getConstantOperandVal(1);
    (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
    (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
  unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
  unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
  Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);

assert(Ops.empty() && "Expected an empty ops vector");
unsigned NumSubOps = SubOps.size();
unsigned HalfNumSubOps = NumSubOps / 2;
assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");

EVT VT = Op.getValueType();
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
       "Can't split odd sized vector");
assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
unsigned HalfOps = SubOps.size() / 2;
return std::make_pair(Lo, Hi);
return std::make_pair(Lo, Lo);
return std::make_pair(Lo, Hi);

unsigned NumOps = Op.getNumOperands();
EVT VT = Op.getValueType();
for (unsigned I = 0; I != NumOps; ++I) {
  if (!SrcOp.getValueType().isVector()) {
DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));

[[maybe_unused]] EVT VT = Op.getValueType();
assert((Op.getOperand(0).getValueType().is256BitVector() ||
        Op.getOperand(0).getValueType().is512BitVector()) &&
assert(Op.getOperand(0).getValueType().getVectorNumElements() ==

[[maybe_unused]] EVT VT = Op.getValueType();
assert(Op.getOperand(0).getValueType() == VT &&
       Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");

template <typename F>
                   F Builder, bool CheckBWI = true,
                   bool AllowAVX512 = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
} else if (Subtarget.hasAVX2()) {
return Builder(DAG, DL, Ops);
for (unsigned i = 0; i != NumSubs; ++i) {

EVT OpVT = Op.getValueType();
unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                        HasAnyUndefs, OpEltSizeInBits) &&
    !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)

MVT OpVT = Op.getSimpleValueType();
assert(OpVT == VT && "Vector type mismatch");
if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {

unsigned IdxVal = Op.getConstantOperandVal(2);
if (IdxVal == 0 && Vec.isUndef())
MVT OpVT = Op.getSimpleValueType();
assert(IdxVal + SubVecNumElems <= NumElems &&
       "Unexpected index value in INSERT_SUBVECTOR");
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
                  Undef, SubVec, ZeroIdx);
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
assert(IdxVal != 0 && "Unexpected index");
            [](SDValue V) { return V.isUndef(); })) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
if (ShiftRight != 0)
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
if (IdxVal + SubVecNumElems == NumElems) {
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
  if (SubVecNumElems * 2 == NumElems) {
                  Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
unsigned LowShift = NumElems - IdxVal;
unsigned HighShift = IdxVal + SubVecNumElems;

       "Expected a 128/256/512-bit vector type");
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;

if (VT != SrcOp.getSimpleValueType())
if (ShiftAmt >= ElementType.getSizeInBits()) {
  if (Opc == X86ISD::VSRAI)
    ShiftAmt = ElementType.getSizeInBits() - 1;
       (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
       "Unknown target vector shift-by-constant node");

       "Illegal vector splat index");
if (ShAmtIdx != 0) {
bool IsMasked = false;
ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
                  {ShAmt.getOperand(1), Mask}))) {
if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
                            ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
  ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,

EVT InVT = In.getValueType();
       "Expected VTs to be the same size!");
InVT = In.getValueType();

                   bool Lo, bool Unary) {
       "Illegal vector type to unpack");
assert(Mask.empty() && "Expected an empty shuffle mask vector");
for (int i = 0; i < NumElts; ++i) {
  unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
  int Pos = (i % NumEltsInLane) / 2 + LaneStart;
  Pos += (Unary ? 0 : NumElts * (i % 2));
  Pos += (Lo ? 0 : NumEltsInLane / 2);
  Mask.push_back(Pos);
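// Unpack masks interleave within each 128-bit lane: element i alternates
// between the two sources (unless Unary), drawing from the low or high half
// of the lane depending on Lo, matching PUNPCKL/PUNPCKH semantics.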
assert(Mask.empty() && "Expected an empty shuffle mask vector");
for (int i = 0; i < NumElts; ++i) {
  Pos += (Lo ? 0 : NumElts / 2);
  Mask.push_back(Pos);

for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
  SDValue V = (M < NumElts) ? V1 : V2;
  Ops[I] = V.getOperand(M % NumElts);

                   bool PackHiHalf = false) {
MVT OpVT = LHS.getSimpleValueType();
bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
assert(OpVT == RHS.getSimpleValueType() &&
       "Unexpected PACK operand types");
assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
       "Unexpected PACK result type");
if (EltSizeInBits == 32) {
  int Offset = PackHiHalf ? 1 : 0;
  for (int I = 0; I != NumElts; I += 4) {

for (int i = 0; i != NumElems; ++i)
  MaskVec[i] = (i == Idx) ? NumElems : i;

if (Ptr.getOpcode() == X86ISD::Wrapper ||

assert(LD && "Unexpected null LoadSDNode");

EVT CondVT = Cond.getValueType();
return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&

                   bool AllowWholeUndefs = true,
                   bool AllowPartialUndefs = false) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
EVT VT = Op.getValueType();
unsigned NumElts = SizeInBits / EltSizeInBits;
if ((SizeInBits % EltSizeInBits) != 0)
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
       "Constant bit sizes don't match");
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (NumSrcElts == NumElts) {
  UndefElts = UndefSrcElts;
  EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
  unsigned BitOffset = i * SrcEltSizeInBits;
  if (UndefSrcElts[i])
    UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
  MaskBits.insertBits(SrcEltBits[i], BitOffset);
UndefElts = APInt(NumElts, 0);
for (unsigned i = 0; i != NumElts; ++i) {
  unsigned BitOffset = i * EltSizeInBits;
  if (!AllowWholeUndefs)
  if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
  EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);

                   unsigned UndefBitIndex) {
Undefs.setBit(UndefBitIndex);
Mask = CInt->getValue();
Mask = CFP->getValueAPF().bitcastToAPInt();
Type *Ty = CDS->getType();
Type *EltTy = CDS->getElementType();
if (!IsInteger && !IsFP)
for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
  Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
  Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),

return CastBitData(UndefSrcElts, SrcEltBits);
return CastBitData(UndefSrcElts, SrcEltBits);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
return CastBitData(UndefSrcElts, SrcEltBits);

if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
  for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
  return CastBitData(UndefSrcElts, SrcEltBits);

if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
if ((SizeInBits % SrcEltSizeInBits) != 0)
APInt UndefSrcElts(NumSrcElts, 0);
for (unsigned i = 0; i != NumSrcElts; ++i)
return CastBitData(UndefSrcElts, SrcEltBits);

if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
  SDValue Ptr = MemIntr->getBasePtr();
  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
  APInt UndefSrcElts(NumSrcElts, 0);
  if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
    if (UndefSrcElts[0])
      UndefSrcElts.setBits(0, NumSrcElts);
    if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
      SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
    SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
    return CastBitData(UndefSrcElts, SrcEltBits);

if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
  SDValue Ptr = MemIntr->getBasePtr();
  unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
  if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
      (SizeInBits % SubVecSizeInBits) != 0)
  unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
  unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
  APInt UndefSubElts(NumSubElts, 0);
                 APInt(CstEltSizeInBits, 0));
  for (unsigned i = 0; i != NumSubElts; ++i) {
    for (unsigned j = 1; j != NumSubVecs; ++j)
      SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
  return CastBitData(UndefSubElts, SubEltBits);

if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
  APInt UndefSrcElts(NumSrcElts, 0);
  const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
  SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
  SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
  return CastBitData(UndefSrcElts, SrcEltBits);
bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
APInt UndefSrcElts, UndefSubElts;
                          UndefSubElts, EltSubBits,
                          AllowWholeUndefs && AllowUndefs,
                          AllowPartialUndefs && AllowUndefs) &&
                          UndefSrcElts, EltSrcBits,
                          AllowWholeUndefs && AllowUndefs,
                          AllowPartialUndefs && AllowUndefs)) {
  unsigned BaseIdx = Op.getConstantOperandVal(2);
  UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
  for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
    EltSrcBits[BaseIdx + i] = EltSubBits[i];
  return CastBitData(UndefSrcElts, EltSrcBits);

                          EltBits, AllowWholeUndefs,
                          AllowPartialUndefs)) {
  EVT SrcVT = Op.getOperand(0).getValueType();
  unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
  unsigned BaseIdx = BaseOfs / EltSizeInBits;
         (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
  UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
  if ((BaseIdx + NumSubElts) != NumSrcElts)
    EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());

if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
APInt UndefElts0, UndefElts1;
                        UndefElts0, EltBits0, AllowWholeUndefs,
                        AllowPartialUndefs))
                        UndefElts1, EltBits1, AllowWholeUndefs,
                        AllowPartialUndefs))
for (int i = 0; i != (int)NumElts; ++i) {
  } else if (M < (int)NumElts) {
    if (UndefElts1[M - NumElts])
    EltBits.push_back(EltBits1[M - NumElts]);

        Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
        true, AllowPartialUndefs)) {
  int SplatIndex = -1;
  for (int i = 0, e = EltBits.size(); i != e; ++i) {
    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
  if (0 <= SplatIndex) {
    SplatVal = EltBits[SplatIndex];

case ::llvm::RoundingMode::TowardPositive:
  return X86::rmUpward;

                   unsigned MaskEltSizeInBits,
for (const APInt &Elt : EltBits)

bool IsPow2OrUndef = true;
for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
  IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
return IsPow2OrUndef;

EVT VT = V.getValueType();
return V.getOperand(0);
    (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
  Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
if (V.getOpcode() == X86ISD::PCMPGT &&
    V.getOperand(0).hasOneUse()) {
                          V.getScalarValueSizeInBits(), UndefElts,
  bool MinSigned = false;
  for (APInt &Elt : EltBits) {
    MinSigned |= Elt.isMinSignedValue();
  MVT VT = V.getSimpleValueType();
  return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
for (SDValue &CatOp : CatOps) {
  CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
    V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {

                   bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned Offset = Unary ? 0 : NumElts;
unsigned Repetitions = 1u << (NumStages - 1);
assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
  for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      Mask.push_back(Elt + (Lane * NumEltsPerLane));
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);

int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
for (int Lane = 0; Lane != NumLanes; ++Lane) {
  for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
    int OuterIdx = (Lane * NumEltsPerLane) + Elt;
    int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
    if (DemandedElts[OuterIdx])
      DemandedLHS.setBit(InnerIdx);
    if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
      DemandedRHS.setBit(InnerIdx);

                   DemandedLHS, DemandedRHS);
DemandedLHS |= DemandedLHS << 1;
DemandedRHS |= DemandedRHS << 1;
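// For horizontal ops each output element consumes an adjacent pair of input
// elements, so after mapping demanded outputs back to input positions, the
// "<< 1" widens each demanded bit to cover both elements of its pair.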
MVT VT = N.getSimpleValueType();
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
bool IsFakeUnary = false;
switch (N.getOpcode()) {
case X86ISD::BLENDI:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::INSERTPS:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::EXTRQI:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  int BitLen = N.getConstantOperandVal(1);
  int BitIdx = N.getConstantOperandVal(2);
case X86ISD::INSERTQI:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  int BitLen = N.getConstantOperandVal(2);
  int BitIdx = N.getConstantOperandVal(3);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::UNPCKH:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::UNPCKL:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVHLPS:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVLHPS:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::VALIGN:
         "Only 32-bit and 64-bit elements are supported!");
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
  Ops.push_back(N.getOperand(1));
  Ops.push_back(N.getOperand(0));
case X86ISD::PALIGNR:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
  Ops.push_back(N.getOperand(1));
  Ops.push_back(N.getOperand(0));
case X86ISD::VSHLDQ:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::VSRLDQ:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFHW:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFLW:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::VZEXT_MOVL:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::VBROADCAST:
  if (N.getOperand(0).getValueType() == VT) {
case X86ISD::VPERMILPV: {
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  SDValue MaskNode = N.getOperand(1);
case X86ISD::PSHUFB: {
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  SDValue MaskNode = N.getOperand(1);
case X86ISD::VPERMI:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
case X86ISD::VPERM2X128:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::SHUF128:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVSLDUP:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::MOVSHDUP:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::MOVDDUP:
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::VPERMIL2: {
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
  SDValue MaskNode = N.getOperand(2);
  SDValue CtrlNode = N.getOperand(3);
  unsigned CtrlImm = CtrlOp->getZExtValue();
case X86ISD::VPPERM: {
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
  SDValue MaskNode = N.getOperand(2);
case X86ISD::VPERMV: {
  assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
  Ops.push_back(N.getOperand(1));
  SDValue MaskNode = N.getOperand(0);
case X86ISD::VPERMV3: {
  assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
  assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
  IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
  Ops.push_back(N.getOperand(0));
  Ops.push_back(N.getOperand(2));
  SDValue MaskNode = N.getOperand(1);
case X86ISD::COMPRESS: {
  SDValue PassThru = N.getOperand(1);
         "Illegal compression mask");
  for (unsigned I = 0; I != NumElems; ++I) {
  while (Mask.size() != NumElems) {
    Mask.push_back(NumElems + Mask.size());
  Ops.push_back(CmpVec);
  Ops.push_back(PassThru);
case X86ISD::EXPAND: {
  SDValue PassThru = N.getOperand(1);
         "Illegal expansion mask");
  unsigned ExpIndex = 0;
  for (unsigned I = 0; I != NumElems; ++I) {
    Mask.push_back(I + NumElems);
    Mask.push_back(ExpIndex++);
  Ops.push_back(ExpVec);
  Ops.push_back(PassThru);

if (!AllowSentinelZero && isAnyZero(Mask))
if (M >= (int)Mask.size())
Ops.push_back(N.getOperand(0));
if (!IsUnary || IsFakeUnary)
  Ops.push_back(N.getOperand(1));
int Size = Mask.size();
int ScalarSizeInBits = VectorSizeInBits / Size;
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0; i < Size; ++i) {
  if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
  if ((Size % V.getNumOperands()) == 0) {
    int Scale = Size / V->getNumOperands();
    APInt Val = Cst->getAPIntValue();
    Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
    APInt Val = Cst->getValueAPF().bitcastToAPInt();
    Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
  if ((V.getNumOperands() % Size) == 0) {
    int Scale = V->getNumOperands() / Size;
    bool AllUndef = true;
    bool AllZero = true;
    for (int j = 0; j < Scale; ++j) {
      SDValue Op = V.getOperand((M * Scale) + j);
      AllUndef &= Op.isUndef();

MVT VT = N.getSimpleValueType();
int Size = Mask.size();
       "Illegal split of shuffle value type");
APInt UndefSrcElts[2];
bool IsSrcConstant[2] = {
    SrcEltBits[0], true,
    SrcEltBits[1], true,
for (int i = 0; i < Size; ++i) {
  unsigned SrcIdx = M / Size;
      (Size % V.getValueType().getVectorNumElements()) == 0) {
    int Scale = Size / V.getValueType().getVectorNumElements();
    int Idx = M / Scale;
    int Idx = V.getConstantOperandVal(2);
    int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
    if (M < Idx || (Idx + NumSubElts) <= M)
  if (IsSrcConstant[SrcIdx]) {
    if (UndefSrcElts[SrcIdx][M])
    else if (SrcEltBits[SrcIdx][M] == 0)

       "Different mask size from vector size!");

                   const APInt &KnownUndef, const APInt &KnownZero,
                   bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
       KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
for (unsigned i = 0; i != NumElts; ++i) {
  else if (ResolveKnownZeros && KnownZero[i])

unsigned NumElts = Mask.size();
for (unsigned i = 0; i != NumElts; ++i) {

EVT CondVT = Cond.getValueType();
for (int i = 0; i != (int)NumElts; ++i) {
  if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
      (IsBLENDV && EltBits[i].isNonNegative()))

                   bool ResolveKnownElts);
                   bool ResolveKnownElts) {
MVT VT = N.getSimpleValueType();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
6524 case X86ISD::ANDNP: {
6530 bool IsAndN = (X86ISD::ANDNP == Opcode);
6531 uint64_t ZeroMask = IsAndN ? 255 : 0;
6538 assert(UndefElts.
isZero() &&
"Unexpected UNDEF element in AND/ANDNP mask");
6539 for (
int i = 0, e = (
int)EltBits.
size(); i != e; ++i) {
6540 const APInt &ByteBits = EltBits[i];
6541 if (ByteBits != 0 && ByteBits != 255)
6545 Ops.push_back(IsAndN ? N1 : N0);
6566 size_t MaskSize = std::max(SrcMask0.
size(), SrcMask1.
size());
6570 for (
int i = 0; i != (int)MaskSize; ++i) {
6580 Mask.push_back(i + MaskSize);
6581 else if (MaskSize == NumElts && !DemandedElts[i])
6586 Ops.push_back(
N.getOperand(0));
6587 Ops.push_back(
N.getOperand(1));
6592 unsigned NumSubElts =
N.getOperand(0).getValueType().getVectorNumElements();
6593 if (NumBitsPerElt == 64) {
6594 for (
unsigned I = 0,
E =
N.getNumOperands();
I !=
E; ++
I) {
6595 for (
unsigned M = 0; M != NumSubElts; ++M)
6596 Mask.push_back((
I * NumElts) + M);
6597 Ops.push_back(
N.getOperand(
I));
6606 EVT SubVT =
Sub.getValueType();
6608 uint64_t InsertIdx =
N.getConstantOperandVal(2);
6610 if (DemandedElts.
extractBits(NumSubElts, InsertIdx) == 0) {
6611 Mask.resize(NumElts);
6612 std::iota(Mask.begin(), Mask.end(), 0);
6618 if (
Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6620 Src.getOperand(0).isUndef() &&
6621 Src.getOperand(1).getValueType() == SubVT &&
6622 Src.getConstantOperandVal(2) == 0 &&
6623 (NumBitsPerElt == 64 || Src.getOperand(1) ==
Sub) &&
6625 Mask.resize(NumElts);
6626 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6627 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6628 Ops.push_back(Src.getOperand(1));
6633 if (InsertIdx != 0 && Src.isUndef() &&
6636 std::iota(Mask.begin() + InsertIdx, Mask.begin() + InsertIdx + NumSubElts,
6641 if (!
N->isOnlyUserOf(
Sub.getNode()))
6656 unsigned NumSubSrcSrcElts =
6658 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6659 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6660 "Subvector valuetype mismatch");
6661 InsertIdx *= (MaxElts / NumElts);
6662 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6663 NumSubElts *= (MaxElts / NumElts);
6664 bool SrcIsUndef = Src.isUndef();
6665 for (
int i = 0; i != (int)MaxElts; ++i)
6667 for (
int i = 0; i != (int)NumSubElts; ++i)
6668 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6671 Ops.push_back(SubSrcSrc);
6678 Depth + 1, ResolveKnownElts))
6688 if (SubMask.
size() != NumSubElts) {
6689 assert(((SubMask.
size() % NumSubElts) == 0 ||
6690 (NumSubElts % SubMask.
size()) == 0) &&
6691 "Illegal submask scale");
6692 if ((NumSubElts % SubMask.
size()) == 0) {
6693 int Scale = NumSubElts / SubMask.
size();
6696 SubMask = ScaledSubMask;
6698 int Scale = SubMask.
size() / NumSubElts;
6699 NumSubElts = SubMask.
size();
6709 for (
int i = 0; i != (int)NumElts; ++i)
6711 for (
int i = 0; i != (int)NumSubElts; ++i) {
6714 int InputIdx = M / NumSubElts;
6715 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6717 Mask[i + InsertIdx] = M;
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
    // ...
    unsigned DstIdx = 0;
    // ...
        N.getConstantOperandAPInt(2).uge(NumElts))
      // ...
    DstIdx = N.getConstantOperandVal(2);
    // ...
    Ops.push_back(N.getOperand(0));
    for (unsigned i = 0; i != NumElts; ++i)
      // ...
    if ((MinBitsPerElt % 8) != 0)
      return false;
    // ...
    unsigned DstByte = DstIdx * NumBytesPerElt;
    // ...
    Ops.push_back(SrcVec);
    // ...
    Ops.push_back(SrcVec);
    Ops.push_back(N.getOperand(0));
    for (int i = 0; i != (int)NumSizeInBytes; ++i)
      Mask.push_back(NumSizeInBytes + i);
    // ...
    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
      // ...
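  // PACKSS/PACKUS concatenate the saturated narrowing of both inputs, lane
  // by lane: a v8i16 PACKUS produces sixteen i8 results, the low eight from
  // the first operand and the high eight from the second, repeated per
  // 128-bit lane. When the inputs are already in range the saturation is a
  // no-op and the node behaves as a fixed truncating shuffle, which is what
  // the case below recovers.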
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    // ... "Unexpected input value type");
    // ...
    APInt EltsLHS, EltsRHS;
    // ...
    bool Offset0 = false, Offset1 = false;
    if (Opcode == X86ISD::PACKSS) {
      // ...
    }
    // ...
    bool IsUnary = (N0 == N1);
    // ...
    if (Offset0 || Offset1) {
      // ...
        if ((Offset0 && isInRange(M, 0, NumElts)) ||
            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
          // ...
    }
  case X86ISD::BLENDV: {
    // ...
    Ops.push_back(N.getOperand(1));
    Ops.push_back(N.getOperand(2));
    // ...
  }
  case X86ISD::VTRUNC: {
    // ...
    EVT SrcVT = Src.getValueType();
    // ...
    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Mask.push_back(i * Scale);
    // Reject any element whose shift amount isn't a whole-byte multiple or
    // is out of range.
    for (unsigned I = 0; I != NumElts; ++I)
      if (DemandedElts[I] && !UndefElts[I] &&
          (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
        return false;
    // ...
    Ops.push_back(N.getOperand(0));
    // ...
    for (unsigned I = 0; I != NumElts; ++I) {
      if (!DemandedElts[I] || UndefElts[I])
        continue;
      // ...
      unsigned ByteShift = EltBits[I].getZExtValue() / 8;
      unsigned Lo = I * NumBytesPerElt;
      unsigned Hi = Lo + NumBytesPerElt;
      // ...
        std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
      // ...
        std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
                  /* ... */);
    }
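  // A shift by a multiple of 8 bits is a byte-level shuffle: shifting an
  // element left by ByteShift bytes moves byte j to position j + ByteShift
  // and zero-fills the vacated low bytes. The constant-amount VSHLI/VSRLI
  // case below builds exactly that mask for every element at once.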
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      // ...
    }
    // Only whole-byte bit shifts can be decoded as shuffles.
    if ((ShiftVal % 8) != 0)
      break;
    // ...
    Ops.push_back(N.getOperand(0));
    // ...
    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    // Reject any element whose rotation amount isn't a whole-byte multiple.
    for (unsigned I = 0; I != NumElts; ++I)
      if (DemandedElts[I] && !UndefElts[I] &&
          (EltBits[I].urem(NumBitsPerElt) % 8) != 0)
        return false;
    // ...
    Ops.push_back(N.getOperand(0));
    for (unsigned I = 0; I != NumElts; ++I) {
      if (!DemandedElts[I] || UndefElts[I]) {
        // ...
      }
      int Offset = EltBits[I].urem(NumBitsPerElt) / 8;
      // ...
      int BaseIdx = I * NumBytesPerElt;
      for (int J = 0; J != (int)NumBytesPerElt; ++J) {
        Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt));
        // ...
      }
    }
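  // A rotate by a byte multiple is likewise a pure byte permutation within
  // each element: byte J of element I comes from byte (Offset + J) modulo
  // the element width. The constant-amount VROTLI/VROTRI case below reuses
  // the same mask construction with a single rotation amount.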
  case X86ISD::VROTLI:
  case X86ISD::VROTRI: {
    // ...
    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
    if ((RotateVal % 8) != 0)
      return false;
    Ops.push_back(N.getOperand(0));
    int Offset = RotateVal / 8;
    // ...
    for (int i = 0; i != (int)NumElts; ++i) {
      int BaseIdx = i * NumBytesPerElt;
      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
        // ...
      }
    }
  case X86ISD::VBROADCAST: {
    // ...
    if (!Src.getSimpleValueType().isVector()) {
      // ...
          Src.getOperand(0).getValueType().getScalarType() !=
          /* ... */)
        // ...
      Src = Src.getOperand(0);
      // ...
    }
    // ...
    Mask.append(NumElts, 0);
    // ...
  }
  // ...
    EVT SrcVT = Src.getValueType();
    // ...
        (NumBitsPerSrcElt % 8) != 0)
      // ...
    APInt DemandedSrcElts = /* ... */;
    // ...
    assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
    unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
    for (unsigned I = 0; I != NumElts; ++I)
      Mask.append(Scale, I);
  EVT SrcVT = Src.getValueType();
  // ...
  int MaskWidth = Mask.size();
  // ...
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;
    // ...
      if ((lo <= M) && (M < hi))
        // ...
    // Drop this input if no mask element references it.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // ...
    }
    // If this input repeats an earlier one, remap its mask elements instead.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // ...
        M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      // ...
    }
    // ...
  }
  Inputs = std::move(UsedInputs);
                                   bool ResolveKnownElts) {
  // ...
  EVT VT = Op.getValueType();
  // ...
  if (ResolveKnownElts)
    // ...
                        ResolveKnownElts)) {
    // ...
}

// ...
                                   bool ResolveKnownElts) {
  APInt KnownUndef, KnownZero;
  // ... KnownZero, DAG, Depth, ResolveKnownElts);
}

// ...
                                   bool ResolveKnownElts = true) {
  EVT VT = Op.getValueType();
  // ...
  unsigned NumElts = Op.getValueType().getVectorNumElements();
  // ...
  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
         "Unknown broadcast load type");
  // ...
      Opcode, DL, Tys, Ops, MemVT,
      // ...
  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  // ...
    int Elt = SV->getMaskElt(Index);
    // ...
    int Elt = ShuffleMask[Index];
    // ...
    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    // ...
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
    // ...
    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      // ...
    EVT SubVT = Op.getOperand(0).getValueType();
    // ...
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    // ...
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    // ...
    EVT SrcVT = Src.getValueType();
    // ...
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    // ...
    return (Index == 0) ? Op.getOperand(0)
                        /* ... */;
    // ...
    return Op.getOperand(Index);
                                        const APInt &NonZeroMask,
                                        unsigned NumNonZero, unsigned NumZero,
                                        /* ... */) {
  MVT VT = Op.getSimpleValueType();
  // ...
  assert((/* ... */
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");
  // ...
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = NonZeroMask[i];
    // ...
      if (NumZero || 0 != i)
        // ...
      assert(0 == i && "Expected insertion into zero-index");
      // ...
  }
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     /* ... */) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    // ...
  for (unsigned I = 0; I != 4; ++I) {
    if (!NonZeroMask[I])
      continue;
    // ...
  }
  assert(V && "Failed to fold v16i8 vector to zero");
  V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
  // ...
  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;
    // ...
    if (ThisIsNonZero) {
      if (NumZero || NextIsNonZero)
        // ...
    }
    if (NextIsNonZero) {
      // ...
      if (i == 0 && NumZero)
        // ...
    }
    // ...
    if (i != 0 || NumZero)
      // ...
  }
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     /* ... */) {
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    // ...
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    MVT VT = Op.getSimpleValueType();
    // ...
  }
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    // ...
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");
  // ...
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    // ...
    if (!FirstNonZero.getNode()) {
      // ...
      FirstNonZeroIdx = i;
    }
  }
  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  // ...
  unsigned EltMaskIdx, EltIdx;
  // ...
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // ...
      Mask[EltIdx] = EltIdx + 4;
      continue;
    }
    Elt = Op->getOperand(EltIdx);
    // ...
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }
  // ...
  SDValue VZeroOrUndef = (Zeroable == Undefs)
                         /* ... */;
  // ...
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    // ...
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    // ...
  }
  assert(V1.getNode() && "Expected at least two non-zero elements!");
  // ...
  unsigned ZMask = Zeroable.to_ulong();
  // ...
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  // ...
      DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                  // ...
  MVT ShVT = MVT::v16i8;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  // ...
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  // ...
  SDValue Ptr = LD->getBasePtr();
  // ...
  EVT PVT = LD->getValueType(0);
  if (PVT != MVT::i32 && PVT != MVT::f32)
    // ...
  FI = FINode->getIndex();
  // ...
  SDValue Chain = LD->getChain();
  // ...
  if (!InferredAlign || *InferredAlign < RequiredAlign) {
    // ...
  }
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
  // ...
  int EltNo = (Offset - StartOffset) >> 2;
  // ...
                   LD->getPointerInfo().getWithOffset(StartOffset));
  // ...
  if (!BaseLd->isSimple())
    return false;
  // ...
  uint64_t Amt = AmtC->getZExtValue();
  // ...
  ByteOffset += Amt / 8;
  // ...
  if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
      /* ... */) {
    uint64_t Idx = IdxC->getZExtValue();
    ByteOffset += Idx * (SrcSizeInBits / 8);
    // ...
  }
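// EltsFromConsecutiveLoads below is the classic load-merging fold: if every
// demanded element of a build vector traces back (via findEltLoadSrc) to the
// same chain of simple loads at consecutive byte offsets, the whole vector
// can be materialized with one wide load from the first element's address.
// Zero-filled elements are tolerated either by masking the wide load or by
// using a smaller zero-extending "vzload", and a repeated subsequence of
// loads can instead become a broadcast of a single narrower load.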
                                        bool IsAfterLegalize,
                                        unsigned Depth = 0) {
  // ...
  unsigned NumElems = Elts.size();
  int LastLoadedElt = -1;
  // ...
  for (unsigned i = 0; i < NumElems; ++i) {
    // ...
    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();
    // ...
  }
  // ... "Incomplete element masks");
  if (UndefMask.popcount() == NumElems)
    // ...
  // ... "Register/Memory size mismatch");
  assert(LDBase && "Did not find base load for merging consecutive loads");
  // ...
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
  // ...
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();
  // ...
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    int Stride = EltIdx - FirstLoadedElt;
    // ...
    unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
    if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
        (BaseMemSizeInBits % BaseSizeInBits) == 0) {
      unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
      // ...
    }
  // ...
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    // ...
    if (!CheckConsecutiveLoad(LDBase, i)) {
      IsConsecutiveLoad = false;
      IsConsecutiveLoadWithZeros = false;
      // ...
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }
  // ... "Cannot merge volatile or atomic loads.");
  // ...
    for (auto *LD : Loads)
      // ...
  if (FirstLoadedElt == 0 &&
      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    // ...
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
    // ...
    return CreateLoad(VT, LDBase);
  }
  // ...
  if (!IsAfterLegalize && VT.isVector()) {
    // ...
    if ((NumMaskElts % NumElems) == 0) {
      unsigned Scale = NumMaskElts / NumElems;
      // ...
      for (unsigned i = 0; i < NumElems; ++i) {
        // ...
        int Offset = ZeroMask[i] ? NumMaskElts : 0;
        for (unsigned j = 0; j != Scale; ++j)
          ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
      }
      SDValue V = CreateLoad(VT, LDBase);
      // ...
    }
  }
  // ...
  unsigned HalfNumElems = NumElems / 2;
  // ...
                                  DAG, Subtarget, IsAfterLegalize, Depth + 1);
  // ...
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
       LoadSizeInBits == 64) &&
      /* ... */) {
    // ...
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
      // ...
    for (auto *LD : Loads)
      // ...
  }
  // ...
  for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
    unsigned RepeatSize = SubElems * BaseSizeInBits;
    unsigned ScalarSize = std::min(RepeatSize, 64u);
    if (!Subtarget.hasAVX2() && ScalarSize < 32)
      continue;
    // ...
    if (RepeatSize > ScalarSize && SubElems == 1)
      continue;
    // ...
    for (unsigned i = 0; i != NumElems && Match; ++i) {
      // ...
      if (RepeatedLoads[i % SubElems].isUndef())
        RepeatedLoads[i % SubElems] = Elt;
      else
        Match &= (RepeatedLoads[i % SubElems] == Elt);
    }
    // ...
    Match &= !RepeatedLoads.front().isUndef();
    Match &= !RepeatedLoads.back().isUndef();
    // ...
    if (RepeatSize > ScalarSize)
      // ... RepeatSize / ScalarSize);
    // ...
            RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
            /* ... */)) {
      SDValue Broadcast = RepeatLoad;
      if (RepeatSize > ScalarSize) {
        // ...
      } else {
        // ...
        Broadcast =
            DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
      }
      // ...
    }
  }
  // ...
          VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
    // ...
    std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
    // ...
  }
                                        bool IsAfterLegalize) {
  // ...
  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
    // ...
    if (ScalarSize == 16)
      // ...
    if (ScalarSize == 32)
      // ...
    assert(ScalarSize == 64 && "Unsupported floating point scalar size");
    // ...
  };
  // ...
  for (unsigned I = 0, E = Bits.size(); I != E; ++I)
    // ... : getConstantScalar(Bits[I]));
  // ...
  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
    // ...
    if (ScalarSize == 16)
      // ...
    if (ScalarSize == 32)
      // ...
    assert(ScalarSize == 64 && "Unsupported floating point scalar size");
    // ...
  };
  // ...
  if (ScalarSize == SplatBitSize)
    return getConstantScalar(SplatValue);
  // ...
  unsigned NumElm = SplatBitSize / ScalarSize;
  // ...
  for (unsigned I = 0; I != NumElm; ++I) {
    // ...
    ConstantVec.push_back(getConstantScalar(Val));
  }
  for (auto *U : N->users()) {
    unsigned Opc = U->getOpcode();
    // ...
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return true;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return true;
    // ...
    if (N->hasOneUse()) {
      // ...
      if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
        // ...
    }
  }
  // ...
  unsigned SizeInBits = V.getValueSizeInBits();
  if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
      (SizeInBits >= 128 && Subtarget.hasVLX())) {
    if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
        V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
      // ...
    }
  }
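// lowerBuildVectorAsBroadcast handles splat-like build vectors. Roughly in
// order of preference below: VBROADCASTM from a mask register when the splat
// is a bit-test pattern (AVX512CD), a broadcast of a constant-pool scalar
// when all lanes are constant (smaller than a full vector constant), a
// VBROADCAST_LOAD when the scalar comes from memory, and a plain register
// VBROADCAST otherwise.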
8120 "Unsupported vector type for broadcast.");
8127 assert((NumElts % Sequence.size()) == 0 &&
"Sequence doesn't fit.");
8128 if (Sequence.size() == 1)
8138 if (!Sequence.empty() && Subtarget.hasCDI()) {
8140 unsigned SeqLen = Sequence.size();
8141 bool UpperZeroOrUndef =
8146 if (UpperZeroOrUndef && ((Op0.getOpcode() ==
ISD::BITCAST) ||
8151 : Op0.getOperand(0).getOperand(0);
8154 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||
8155 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) {
8161 SDValue Bcst = DAG.
getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8169 unsigned NumUndefElts = UndefElements.
count();
8170 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8172 unsigned SplatBitSize;
8185 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8186 (SplatBitSize < 32 && Subtarget.
hasAVX2())) {
8203 if (SplatBitSize > 64) {
8215 Ops, VVT, MPI, Alignment,
8225 if (!Ld || NumElts - NumUndefElts != 1)
8228 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8232 bool ConstSplatVal =
8260 if (ConstSplatVal && (Subtarget.
hasAVX2() || OptForSize)) {
8268 if (ScalarSize == 32 ||
8269 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
8270 (CVT == MVT::f16 && Subtarget.
hasAVX2()) ||
8271 (OptForSize && (ScalarSize == 64 || Subtarget.
hasAVX2()))) {
8274 C = CI->getConstantIntValue();
8276 C = CF->getConstantFPValue();
8278 assert(
C &&
"Invalid constant type");
8295 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8296 return DAG.
getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8306 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8307 (Subtarget.hasVLX() && ScalarSize == 64)) {
8310 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8313 LN->getMemoryVT(), LN->getMemOperand());
8321 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8324 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8327 LN->getMemoryVT(), LN->getMemOperand());
8332 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8333 return DAG.
getNode(X86ISD::VBROADCAST, dl, VT, Ld);
      ExtractedFromVec = ShuffleVec;
  // ...
  MVT VT = Op.getSimpleValueType();
  // ...
  unsigned NumElems = Op.getNumOperands();
  // ...
  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();
    // ...
    if (InsertIndices.size() > 1)
      // ...
    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
    // ...
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      // ...
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // ...
    }
    if (ExtractedFromVec == VecIn1)
      // ...
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }
  // ...
  for (unsigned Idx : InsertIndices)
    // ...
  MVT VT = Op.getSimpleValueType();
  MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
  // ...
  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
    // ...
  MVT VT = Op.getSimpleValueType();
  // ... "Unexpected type in LowerBUILD_VECTORvXi1!");
  // ...
  bool IsSplat = true;
  bool HasConstElts = false;
  // ...
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    // ...
      Immediate |= (InC->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    // ...
    else if (In != Op.getOperand(SplatIdx))
      // ...
  }
  // ...
  assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
  // ...
  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
    // ...
  }
  // ...
  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
    // ...
  }
  // ...
  for (unsigned InsertIdx : NonConstIdx) {
    // ... Op.getOperand(InsertIdx),
    // ...
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
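// Horizontal-op matching: HADD/HSUB (and FHADD/FHSUB) compute each output
// element from a pair of adjacent input elements, taking pairs from the
// first operand for the low half of each 128-bit lane and from the second
// operand for the high half. isHorizontalBinOpPart below verifies that a
// slice of a build vector is exactly such a pairwise op over extracts with
// the expected indices (0,1), (2,3), ...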
                                   unsigned BaseIdx, unsigned LastIdx,
                                   /* ... */) {
  EVT VT = N->getValueType(0);
  // ...
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  // ... "Invalid Vector in input!");
  // ...
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  // ...
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    // ...
    if (Op->isUndef()) {
      // ...
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }
    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
    // ...
    if (i * 2 < NumElts) {
      // ...
    } else {
      // ...
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }
    // ...
    if (I0 == ExpectedVExtractIdx)
      // ...
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // ...
    }
    // ...
    ExpectedVExtractIdx += 2;
  }
                                  unsigned X86Opcode, bool Mode,
                                  bool isUndefLO, bool isUndefHI) {
  // ... "Invalid nodes in input!");
  // ...
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  // ...
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
    // ...
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
                             unsigned &NumExtracts, bool &IsSubAdd,
                             bool &HasAllowContract) {
  // ...
  HasAllowContract = NumElts != 0;
  // ...
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    // ...
    unsigned Opcode = Op.getOpcode();
    // ...
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false;
    Opc[i % 2] = Opcode;
    // ...
    HasAllowContract &= Op->getFlags().hasAllowContract();
    // ...
  }
                       unsigned ExpectedUses,
                       bool AllowSubAddOrAddSubContract) {
  // ...
      (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
  // ...
  unsigned NumExtracts;
  // ...
  bool HasAllowContract;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
                        /* ... */))
    // ...
      HasAllowContract)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    // ...
  }
  // ...
    Mask.push_back(I + E + 1);
  // ...
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // ...
      GenericOpcode = Op.getOpcode();
      switch (GenericOpcode) {
      case ISD::ADD:  HOpcode = X86ISD::HADD;  break;
      case ISD::SUB:  HOpcode = X86ISD::HSUB;  break;
      case ISD::FADD: HOpcode = X86ISD::FHADD; break;
      case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
      default: return false;
      }
      // ...
      if (j < NumEltsIn64Bits) {
        // ...
      }
      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      // ...
      unsigned ExpectedIndex =
          i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;
      // ...
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        // ...
    }
  }
  for (unsigned i = 0; i != NumElts; ++i)
    // ...
  unsigned HalfNumElts = NumElts / 2;
  // ...
  return DAG.getNode(HOpcode, DL, VT, V0, V1);
  // ...
  unsigned NumNonUndefs =
      /* ... */;
  if (NumNonUndefs < 2)
    // ...
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
    // ...
  }
  // ...
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
    // ...
  for (unsigned i = Half, e = NumElts; i != e; ++i)
    // ...
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // ...
    bool CanFold = true;
    // ...
      X86Opcode = X86ISD::HADD;
    // ...
      X86Opcode = X86ISD::HSUB;
    // ...
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      // ...
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    // ...
  }
  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    // ...
      X86Opcode = X86ISD::HADD;
    // ...
      X86Opcode = X86ISD::HSUB;
    // ...
      X86Opcode = X86ISD::FHADD;
    // ...
      X86Opcode = X86ISD::FHSUB;
    // ...
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      // ...
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    // ... isUndefLO, isUndefHI);
  }
  MVT VT = Op->getSimpleValueType(0);
  // ...
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      // ...
  bool IsShift = false;
  // ...
  if (Op->getSplatValue())
    // ...
  bool RHSAllConst = true;
  // ...
    if (Op1.getValueSizeInBits() != ElemSize)
      // ...
  if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
    // ...
        return V.getValueSizeInBits() == ElemSize;
      // ... "Element size mismatch");
  // ...
  if (!LHS && !RHS && !RHSAllConst)
    // ...
  if (VT != MVT::v4f64)
    // ...
    UniqueOps.insert(Op);
  // ...
  if (UniqueOps.size() != 2u)
    // ...
  UniqueOps.erase(Op0);
  SDValue Op1 = *UniqueOps.begin();
  // ...
  for (auto I = 0u; I < NumElems; ++I) {
    // ...
    Mask[I] = Op == Op0 ? I : I + NumElems;
  }
  // ...
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    // ...
  unsigned WideBits = 2 * EltBits;
  // ...
  for (unsigned I = 0; I != NumElts; I += 2) {
    // ...
        X.getValueType().bitsGE(WideSVT)) {
      if (X.getValueType().bitsGT(WideSVT))
        // ...
    }
  }
  assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
  MVT VT = Op.getSimpleValueType();
  // ...
  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
    // ...
  // ... "Illegal variable permute mask size");
  // ... SDLoc(IndicesVec), SizeInBits);
  // ... IndicesVT, IndicesVec);
  // ... Subtarget, DAG, SDLoc(IndicesVec));
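  // Index scaling for variable permutes: when the permute instruction
  // operates on narrower elements than the requested shuffle (e.g. using
  // PSHUFB's byte indices to emulate a wider-element permute), each index i
  // must become Scale consecutive indices i*Scale+0 .. i*Scale+Scale-1. The
  // loop below builds two splat constants so this is done with one multiply
  // and one add over the whole index vector:
  //   Indices * IndexScale + IndexOffset
  // e.g. for Scale = 4 on byte indices, index 3 -> {12, 13, 14, 15}.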
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits);
    IndexOffset |= i << (i * NumDstBits);
  }
  // ...
  unsigned Opcode = 0;
  // ...
    Opcode = X86ISD::PSHUFB;
  // ...
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    // ...
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    // ...
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    // ...
    if (Subtarget.hasAVX()) {
      // ...
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
      // ...
    }
    // ...
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      // ...
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      // ...
    }
    // ...
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // ...
      IndicesVec = ScaleIndices(IndicesVec, 2);
      // ...
          MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
          DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    // ...
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // ... {0, 1, 2, 3, 0, 1, 2, 3});
      // ... {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        // ... VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
        // ...
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          // ...
    }
    // ...
    if (!Subtarget.hasVLX()) {
      // ...
      SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                              /* ... */);
      IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                  DAG, SDLoc(IndicesVec));
      // ...
    }
    Opcode = X86ISD::VPERMV;
    // ...
    } else if (Subtarget.hasAVX()) {
      // ...
      if (Subtarget.hasXOP())
        // ... VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
        // ...
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          // ...
    }
    // ...
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    // ...
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    // ...
    Opcode = X86ISD::VPERMV;
  // ...
  // ... "Illegal variable permute shuffle type");
  // ...
  IndicesVec = ScaleIndices(IndicesVec, Scale);
  // ...
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
  // ...
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  auto PeekThroughFreeze = [](SDValue N) {
    // ...
      return N->getOperand(0);
    // ...
  };
  // ...
  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
    SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
    // ...
      SrcVec = Op.getOperand(0);
    else if (SrcVec != Op.getOperand(0))
      // ...
    SDValue ExtractedIndex = Op->getOperand(1);
    // ...
      ExtractedIndex = ExtractedIndex.getOperand(0);
    // ...
    else if (IndicesVec != ExtractedIndex.getOperand(0))
      // ...
    if (!PermIdx || PermIdx->getAPIntValue() != Idx)
      // ...
  }
  // ...
  MVT VT = V.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  // ...
  MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
  // ...
      (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
    // ...
  bool IsAllConstants = true;
  bool OneUseFrozenUndefs = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    // ...
      OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
      FrozenUndefMask.setBit(i);
    // ...
      IsAllConstants = false;
    // ...
  }
  // All-undef (possibly frozen) build vectors fold to undef.
  if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
    // ...
  if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
    // ...
  if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
      NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
    SmallVector<int, 16> BlendMask(NumElems, -1);
    // ...
    for (unsigned i = 0; i < NumElems; ++i) {
      // ...
      if (!FrozenUndefMask[i])
        Elts[i] = Op.getOperand(i);
      else
        BlendMask[i] += NumElems;
      // ...
    }
    // ...
  }
  // ...
  unsigned UpperElems = NumElems / 2;
  APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
  unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
  if (NumUpperUndefsOrZeros >= UpperElems) {
    // ...
        NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
      UpperElems = NumElems - (NumElems / 4);
    // ...
    bool UndefUpper = UndefMask.countl_one() >= UpperElems;
    // ...
    return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
  }
  // ...
    return HorizontalOp;
  // ...
  unsigned NumZero = ZeroMask.popcount();
  unsigned NumNonZero = NonZeroMask.popcount();
  // ...
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      FrozenUndefMask.isZero() &&
      /* ... */) {
    // ...
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    // ...
    for (unsigned i = 0; i != NumElems; ++i) {
      // ...
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      // ...
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      // ...
    }
    // ... "Expected one variable element in this vector");
    // ...
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    // ...
    if (InsertC < NumEltsInLow128Bits)
      // ...
    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
    SmallVector<int, 8> ShuffleMask;
    // ...
    for (unsigned i = 0; i != NumElts; ++i)
      ShuffleMask.push_back(i == InsertC ? NumElts : i);
    // ...
  }
  // ...
  if (NumNonZero == 1) {
    // ...
    if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
        EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
        (EltVT == MVT::i16 && Subtarget.hasFP16())) {
      // ... "Expected an SSE value type!");
      // ...
    }
    // ...
    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
      // ...
    }
    // ...
    if (NumElems == 2 && Idx == 1 &&
        /* ... */)
      // ...                    VT, Op.getOperand(1)),
      //                    NumBits / 2, DAG, *this, dl);
    // ...
    if (EVTBits == 32) {
      // ...
    }
  }
  // ...
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // ...
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        // ...
    }
    // ...
  }
  // ...
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    // ...
      for (unsigned i = 2; i != NumElems; ++i)
        if (Ops[i % 2] != Op.getOperand(i))
          return false;
    // ...
    if (CanSplat(Op, NumElems, Ops)) {
      // ...
    }
  }
  // ...
  //                 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
  // ...
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // ... Op.getOperand(Idx));
      // ...
    }
    // ...
  }
  // ...
  if (EVTBits == 8 && NumElems == 16)
    // ... NumZero, DAG, Subtarget))
      // ...
  if (EltVT == MVT::i16 && NumElems == 8)
    // ... NumZero, DAG, Subtarget))
      // ...
  if (EVTBits == 32 && NumElems == 4)
    // ...
  if (NumElems == 4 && NumZero > 0) {
    // ...
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !NonZeroMask[i];
      // ...
    }
    // ...
    for (unsigned i = 0; i < 2; ++i) {
      // ...
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]);
      // ...
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
      // ...
    }
    // ...
        static_cast<int>(Reverse2 ? NumElems + 1 : NumElems),
        static_cast<int>(Reverse2 ? NumElems : NumElems + 1)
    // ...
  }
  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
  // ...
  if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
    // ...
    if (!Op.getOperand(0).isUndef())
      // ...
    for (unsigned i = 1; i < NumElems; ++i) {
      if (Op.getOperand(i).isUndef())
        continue;
      // ...
    }
    // ...
  }
  // ...
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!Op.getOperand(i).isUndef())
      // ...
  }
  // ...
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // ...
    SmallVector<int, 16> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      // ...
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i);
    // ...
    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      // ...
  }
  MVT ResVT = Op.getSimpleValueType();
  // ... "Value type must be 256-/512-bit wide");
  // ...
  unsigned NumOperands = Op.getNumOperands();
  unsigned NumFreezeUndef = 0;
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    // ...
    assert(i < sizeof(NonZeros) * CHAR_BIT);
    NonZeros |= 1 << i;
    // ...
  }
  // ...
  if (NumNonZero > 2) {
    // ... Ops.slice(0, NumOperands / 2));
    // ... Ops.slice(NumOperands / 2));
    // ...
  }
  // ...
          U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
  // ...
  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  // ...
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1 << i)) == 0)
      continue;
    // ...
  }
  // ...
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();
  // ... "Unexpected number of operands in CONCAT_VECTORS");
  // ...
  for (unsigned i = 0; i != NumOperands; ++i) {
    // ...
    assert(i < sizeof(NonZeros) * CHAR_BIT);
    // ...
  }
  // ...
  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
      Log2_64(NonZeros) != NumOperands - 1) {
    unsigned Idx = Log2_64(NonZeros);
    // ...
    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
                     // ...
  }
  // ...
  unsigned Idx = Log2_64(NonZeros);
  // ...
  if (NumOperands > 2) {
    // ... Ops.slice(0, NumOperands / 2));
    // ... Ops.slice(NumOperands / 2));
    // ...
  }
  // ...
  MVT VT = Op.getSimpleValueType();
  // ...
  // ... (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
                                 int Idx, int ExpectedIdx) {
  assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
         ExpectedIdx < MaskSize && "Out of range element index");
  if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
    return false;
  // ...
  EVT VT = Op.getValueType();
  // ...
  if (Idx == ExpectedIdx && Op == ExpectedOp)
    return true;
  // ...
  switch (Op.getOpcode()) {
  // ...
    return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
  // ...
    EVT SrcVT = Src.getValueType();
    // ...
    return (Idx % Scale) == (ExpectedIdx % Scale) &&
           /* ... */ Idx / Scale, ExpectedIdx / Scale);
  // ...
    for (unsigned I = 0; I != Scale; ++I)
      // ... (ExpectedIdx * Scale) + I))
        // ...
  // ...
    return Op == ExpectedOp &&
           SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
  // ...
  case X86ISD::VBROADCAST:
  case X86ISD::VBROADCAST_LOAD:
    return Op == ExpectedOp;
  case X86ISD::SUBV_BROADCAST_LOAD:
    if (Op == ExpectedOp) {
      // ...
      unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
      return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
    }
    break;
  case X86ISD::VPERMI: {
    if (Op == ExpectedOp) {
      // ... Mask[ExpectedIdx]);
    }
    break;
  }
  // ...
  case X86ISD::FHADD:
  case X86ISD::FHSUB:
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
    // ...
    if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
      // ...
      int NumEltsPerLane = NumElts / NumLanes;
      int NumHalfEltsPerLane = NumEltsPerLane / 2;
      bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
      bool SameElt =
          (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
      return SameLane && SameElt;
    }
    break;
  }
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  // ...
                                      unsigned ScalarSizeInBits,
                                      /* ... */) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  // ...
                                         unsigned ScalarSizeInBits,
                                         /* ... */) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int NumElts = Mask.size();
  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  int NumLanes = NumElts / NumEltsPerLane;
  if (NumLanes > 1) {
    for (int i = 0; i != NumLanes; ++i) {
      // ...
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[(i * NumEltsPerLane) + j];
        // ...
        int Lane = (M % NumElts) / NumEltsPerLane;
        if (SrcLane >= 0 && SrcLane != Lane)
          return true;
        // ...
      }
    }
  }
  // ...
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    // ...
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;
    // ...
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      return false;
  }
  // ...
                                        unsigned EltSizeInBits,
                                        /* ... */) {
  int LaneSize = LaneSizeInBits / EltSizeInBits;
  // ...
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    // ...
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;
    // ...
    int LaneM = Mask[i] / Size;
    int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
    // ...
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      return false;
  }
                                          Mask, RepeatedMask);
  // ...
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  // ...
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
      // ...
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      // ...
    }
  }
  // ...
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  // ... "Illegal target shuffle mask");
  // ...
  if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
             !V1.getValueType().isVector()))
    // ...
  if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
             !V2.getValueType().isVector()))
    // ...
  for (int i = 0; i < Size; ++i) {
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    // ...
    if (ExpectedIdx < 0)
      // ...
    SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
    // ...
    int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
    APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
    ZeroMask.setBit(BitIdx);
    // ...
    if (MaskIdx >= 0) {
      // ...
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      // ...
    }
  }
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    // ...
  return IsUnpackwdMask;
  // ...
  for (unsigned i = 0; i != 4; ++i) {
    // ...
  }
  // ...
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  unsigned HalfSize = Mask.size() / 2;
  for (unsigned i = 0; i != HalfSize; ++i) {
    if (Mask[i] != Mask[i + HalfSize])
      return false;
  }
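// SHUFPS/PSHUFD-style immediates pack a 4-element selector into one byte,
// two bits per output element, element 0 in the lowest bits. For example the
// reversal mask {3,2,1,0} encodes as 3 | (2<<2) | (1<<4) | (0<<6) = 0x1B.
// getV4X86ShuffleImm below also canonicalizes undef lanes: a splat-like mask
// replicates the first defined element, otherwise each undef keeps its
// identity slot, avoiding false dependencies on undefined lanes.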
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
  // ...
  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
  // ...
  int FirstElt = Mask[FirstIndex];
  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
  // ...
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  // ...
  assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
         "Unexpected SHUFPD mask size");
  assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
         "Unexpected SHUFPD mask elements");
  // ...
  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
  assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
         "All undef shuffle mask");
  // ...
  int FirstElt = Mask[FirstIndex];
  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
      count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
    // ...
    for (unsigned I = 0, E = Mask.size(); I != E; ++I)
      Imm |= FirstElt << I;
    // ...
  }
  // ...
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
                                      bool &IsZeroSideLeft) {
  int NextElement = -1;
  // ...
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // ...
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    // ...
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
      // ...
    }
    if (NextElement != Mask[i])
      return false;
    // ...
  }
  // ...
                                      unsigned Depth = 0);
  // ...
  int Size = Mask.size();
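  // PSHUFB is a per-byte gather: destination byte i takes source byte
  // PSHUFBMask[i] from within the same 128-bit lane, and a mask byte with
  // its high bit set (0x80) zeroes that destination byte. The loop below
  // scales the element-level shuffle mask to byte granularity and rejects
  // any element that would have to cross a 128-bit lane boundary.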
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    // ...
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      // ...
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      // ...
    }
    // ...
    if (V && V != SrcV)
      // ...
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      // ...
    M = M * NumEltBytes + (i % NumEltBytes);
    // ...
  }
  assert(V && "Failed to find a source input");
  // ...
  MVT SrcVT = Mask.getSimpleValueType();
  // ...
  assert(MaskVT.bitsLE(SrcVT) && "Unexpected mask size!");
  // ...
  if (SrcVT == MVT::i64 && Subtarget.is32Bit()) {
    assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
    assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // ...
  }
  // ...
                                    const APInt &Zeroable,
                                    /* ... */) {
  bool IsLeftZeroSide = true;
  // ...
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  // ...
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  // ... Subtarget, DAG, DL);
  // ...
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
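// Unpack matching: UNPCKL/UNPCKH interleave the low (resp. high) halves of
// each 128-bit lane of the two sources, so for v4i32 UNPCKL produces
// {A0, B0, A1, B1}. matchShuffleWithUNPCK checks the candidate mask against
// these reference patterns, including the unary forms where one input is
// reused, undef, or zero (where a MOVQ-style zero extension applies).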
                                 unsigned &UnpackOpcode, bool IsUnary,
                                 /* ... */) {
  // ...
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    // ...
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");
  // ...
                            (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    // ...
  }
  // ...
                            (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    // ...
  }
  // ...
  if (IsUnary && (Zero1 || Zero2)) {
    // ...
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        /* ... */)
      // ...
    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];
      // ...
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          /* ... */)
        // ...
      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }
    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      // ...
    }
  }
  // ...
  UnpackOpcode = X86ISD::UNPCKL;
  // ...
  UnpackOpcode = X86ISD::UNPCKH;
  // ...
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  // ...
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
  // ...
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
  // ...
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
  // ...
  unsigned UnpackOpcode;
  // ...
    UnpackOpcode = X86ISD::UNPCKL;
  // ...
    UnpackOpcode = X86ISD::UNPCKH;
  // ...
                           DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  // ...
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
  unsigned NumElts = Mask.size();
  // ...
  unsigned MaxScale = 64 / EltSizeInBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;
    unsigned NumSrcElts = NumElts / Scale;
    // ...
    unsigned UpperElts = NumElts - NumSrcElts;
    // ...
    if ((NumSrcElts * EltSizeInBits) >= 128) {
      // ...
    }
    // ...
  }
  // ...
  MVT SrcVT = Src.getSimpleValueType();
  // ...
  if (NumSrcElts == NumDstElts)
    // ...
  if (NumSrcElts > NumDstElts) {
    // ...
  }
  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
    // ...
  }
  // ...
  if (DstVT != TruncVT)
    // ...
                              const APInt &Zeroable,
                              /* ... */) {
  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
  // ...
  unsigned MaxScale = 64 / EltSizeInBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned SrcEltBits = EltSizeInBits * Scale;
    unsigned NumSrcElts = NumElts / Scale;
    unsigned UpperElts = NumElts - NumSrcElts;
    // ...
        Src.getScalarValueSizeInBits() == SrcEltBits) {
      Src = Src.getOperand(0);
    } else if (Subtarget.hasVLX()) {
      // ...
    }
    // ...
    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
      continue;
    // ...
  }
  // ...
                                  const APInt &Zeroable,
                                  /* ... */) {
  // ... "Unexpected VTRUNC type");
  // ...
  unsigned MaxScale = 64 / EltSizeInBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;
    // ...
    unsigned NumHalfSrcElts = NumElts / Scale;
    unsigned NumSrcElts = 2 * NumHalfSrcElts;
    // ...
    unsigned UpperElts = NumElts - NumSrcElts;
    if (UpperElts > 0 &&
        /* ... */)
      // ...
  }
  // ...
      X86ISD::VSRLI, DL, SrcVT, Src,
      // ...
                                     bool IsSingleInput) {
  // ...
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  // ... "We should only be called with masks with a power-of-2 size!");
  // ...
  int Offset = MatchEven ? 0 : 1;
  // ...
  bool ViableForN[3] = {true, true, true};
  // ...
  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // ...
    bool IsAnyViable = false;
    for (unsigned j = 0; j != std::size(ViableForN); ++j)
      if (ViableForN[j]) {
        // ...
          IsAnyViable = true;
        // ...
          ViableForN[j] = false;
      }
    // ...
  }
  for (unsigned j = 0; j != std::size(ViableForN); ++j)
    // ...
                                 unsigned MaxStages = 1) {
  // ...
  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
         "Illegal maximum compaction");
  // ...
    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
    unsigned NumPackedBits = NumSrcBits - BitSize;
    // ...
    unsigned NumBits2 = N2.getScalarValueSizeInBits();
    // ...
    if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
        (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
      return false;
    if (Subtarget.hasSSE41() || BitSize == 8) {
      // ...
      PackOpcode = X86ISD::PACKUS;
      // ...
    }
    // ...
    if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
         /* ... */) &&
        (N2.isUndef() || IsZero2 || IsAllOnes2 ||
         /* ... */)) {
      PackOpcode = X86ISD::PACKSS;
      // ...
    }
    // ...
  };
  // ...
  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
    // ...
    if (MatchPACK(V1, V2, PackVT))
      // ...
    if (MatchPACK(V1, V1, PackVT))
      // ...
  }
  // ...
  unsigned PackOpcode;
  // ...
  unsigned MaxStages = Log2_32(64 / EltBits);
  // ... Subtarget, MaxStages))
  // ...
  unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
  // ...
  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
    // ...
  unsigned MaxPackBits = 16;
  if (CurrentEltBits > 16 &&
      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
    // ...
  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    unsigned NumSrcElts = SizeBits / SrcEltBits;
    // ...
    CurrentEltBits /= 2;
  }
  // ... "Failed to lower compaction shuffle");
                                   const APInt &Zeroable,
                                   /* ... */) {
  // ...
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    // ...
  }
  // ...
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    // ...
    if (Mask[i] % Size != i)
      return SDValue();
    // ...
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue();
    // ...
  }
  // ...
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue();
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }
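// Blend matching: a shuffle is a blend when every output element comes from
// the same position of one of the two inputs, so it can be encoded as a
// per-element (or per-lane) select. For example on v4f32 the mask
// {0, 5, 2, 7} becomes BLENDPS with immediate 0b1010: bit i set means "take
// element i from V2". matchShuffleAsBlend below also folds zeroable elements
// by forcing the corresponding input to an all-zeros vector.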
                                const APInt &Zeroable, bool &ForceV1Zero,
                                bool &ForceV2Zero, uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      /* ... */;
  bool V2IsZeroOrUndef =
      /* ... */;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
  // ...
  int NumElts = Mask.size();
  // ...
  int NumEltsPerLane = NumElts / NumLanes;
  assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
  // ...
  bool ForceWholeLaneMasks =
      /* ... */;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // ...
    bool LaneV1InUse = false;
    bool LaneV2InUse = false;
    for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
      int Elt = (Lane * NumEltsPerLane) + LaneElt;
      // ...
      if (M == Elt || (0 <= M && M < NumElts &&
                       /* ... */)) {
        // ...
        LaneV1InUse = true;
        continue;
      }
      if (M == (Elt + NumElts) ||
          /* ... */) {
        LaneBlendMask |= 1ull << LaneElt;
        Mask[Elt] = Elt + NumElts;
        LaneV2InUse = true;
        continue;
      }
      if (Zeroable[Elt]) {
        if (V1IsZeroOrUndef) {
          ForceV1Zero = true;
          // ...
          LaneV1InUse = true;
          continue;
        }
        if (V2IsZeroOrUndef) {
          ForceV2Zero = true;
          LaneBlendMask |= 1ull << LaneElt;
          Mask[Elt] = Elt + NumElts;
          LaneV2InUse = true;
          continue;
        }
      }
      return false;
    }
    // If we only used V2 then splat the lane blend mask to avoid any
    // demanded elements issues.
    if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
      LaneBlendMask = (1ull << NumEltsPerLane) - 1;
    BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
  }
  return true;
                                  const APInt &Zeroable,
                                  /* ... */) {
  // ...
  bool ForceV1Zero = false, ForceV2Zero = false;
  // ...
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    // ...
    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
    // ...
    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       // ...
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
    // ...
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      // ...
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         // ...
    // ...
    uint64_t LoMask = BlendMask & 0xFF;
    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
      // ...
          MVT::v16i16, DL, Lo, Hi,
          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
    }
    // ...
  }
  // ...
    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
    // ...
    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
    // ...
    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      // ...
    }
    // ...
    if (Subtarget.hasVLX())
      // ...
  // ...
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    for (int j = 0; j < Scale; ++j)
      // ...
  // ...
                                             bool ImmBlends = false) {
  // ...
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    // ...
    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
    // ...
    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      // ...
    PermuteMask[i] = Mask[i] % Size;
  }
  int NumElts = Mask.size();
  // ...
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;
  // ...
  bool MatchLo = true, MatchHi = true;
  // ...
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    // ...
    if (M < NumElts && (Op.isUndef() || Op == V1))
      // ...
    else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
      // ...
    }
    // ...
    bool MatchLoAnyLane = false, MatchHiAnyLane = false;
    for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      // ...
      if (MatchLoAnyLane || MatchHiAnyLane) {
        assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
               "Failed to match UNPCKLO/UNPCKHI");
        break;
      }
    }
    MatchLo &= MatchLoAnyLane;
    MatchHi &= MatchHiAnyLane;
    if (!MatchLo && !MatchHi)
      return SDValue();
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
  // ...
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    // ...
    bool IsFirstOp = M < NumElts;
    // ...
        NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
    if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
      PermuteMask[Elt] = BaseMaskElt;
    else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
      PermuteMask[Elt] = BaseMaskElt + 1;
    assert(PermuteMask[Elt] != -1 &&
           "Input mask element is defined but failed to assign permute mask");
  }
  // ...
  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  int Size = Mask.size();
  assert(Mask.size() >= 2 && "Single element masks are invalid.");
  // ...
  bool UnpackLo = NumLoInputs >= NumHiInputs;
  auto TryUnpack = [&](int ScalarSize, int Scale) {
    // ...
    for (int i = 0; i < Size; ++i) {
      // ...
      int UnpackIdx = i / Scale;
      // ...
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();
      // ...
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          /* ... */;
    }
    // ...
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };
  // ...
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;
  // ...
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
    // ...
    for (int i = 0; i < Size; ++i) {
      // ...
      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
      // ...
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
      // ...
    }
    // ...
        DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
                    // ...
  }
  int NumEltsPerLane = NumElts / NumLanes;
  // ...
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      int M = Mask[Lane + Elt];
      // ...
        Blend1 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range1.first = std::min(Range1.first, M);
        Range1.second = std::max(Range1.second, M);
      // ...
        Blend2 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range2.first = std::min(Range2.first, M);
        Range2.second = std::max(Range2.second, M);
      // ...
    }
  }
  // ...
  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
    return SDValue();
  // ...
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        int M = Mask[Lane + Elt];
        // ...
          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
        // ...
          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
      }
    }
  // ...
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  // ...
  size_t NumUndefs = 0;
  std::optional<int> UniqueElt;
  for (int Elt : Mask) {
    // ...
    if (UniqueElt.has_value() && UniqueElt.value() != Elt)
      return false;
    // ...
  }
  // Allow masks with at most half undef elements and a single unique
  // defined element.
  return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
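// lowerShuffleAsDecomposedShuffleMerge splits a two-input shuffle into a
// per-input permute followed by a blend: V1 and V2 are each shuffled into
// the positions their elements ultimately occupy, then combined with a cheap
// select on the final mask. This is profitable when no direct match exists,
// since blends and unary permutes are individually cheap on most subtargets.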
  int NumElts = Mask.size();
  // ...
  int NumEltsPerLane = NumElts / NumLanes;
  // ...
  bool IsAlternating = true;
  bool V1Zero = true, V2Zero = true;
  // ...
  for (int i = 0; i < NumElts; ++i) {
    // ...
    if (M >= 0 && M < NumElts) {
      // ...
      V1Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 0;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
      V2Zero &= Zeroable[i];
      IsAlternating &= (i & 1) == 1;
    }
  }
  // ...
  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
                                         /* ... */](SDValue &Input,
                                                    /* ... */) {
    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
                                 /* ... */))
      // ...
    // ... "Expected to demand only the 0'th element.");
    // ...
      int &InputMaskElt = I.value();
      if (InputMaskElt >= 0)
        InputMaskElt = I.index();
    // ...
  };
  // ...
  canonicalizeBroadcastableInput(V1, V1Mask);
  canonicalizeBroadcastableInput(V2, V2Mask);
  // ...
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    // ...
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    // ...
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    // ...
  V1Mask.assign(NumElts, -1);
  V2Mask.assign(NumElts, -1);
  FinalMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; i += NumEltsPerLane)
    for (int j = 0; j != NumEltsPerLane; ++j) {
      int M = Mask[i + j];
      if (M >= 0 && M < NumElts) {
        V1Mask[i + (j / 2)] = M;
        FinalMask[i + j] = i + (j / 2);
      } else if (M >= NumElts) {
        V2Mask[i + (j / 2)] = M - NumElts;
        FinalMask[i + j] = i + (j / 2) + NumElts;
      }
    }
  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
  // ...
  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
  int MaxSubElts = 64 / EltSizeInBits;
  unsigned RotateAmt, NumSubElts;
  // ... MaxSubElts, NumSubElts, RotateAmt))
  // ...
  unsigned NumElts = Mask.size();
  // ...
  if (!IsLegal && Subtarget.hasSSE3())
    // ...
  if ((RotateAmt % 16) == 0)
    // ...
  unsigned ShlAmt = RotateAmt;
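// Element/byte rotation: a two-input rotation treats the concatenation of
// the sources as one double-width vector and slides a window across it,
// which is exactly PALIGNR on SSSE3 (per 16-byte lane) and VALIGND/VALIGNQ
// on AVX512. Without SSSE3 the same result is assembled from two byte shifts
// and an OR: PSLLDQ of one input by (16 - ByteRotation) combined with PSRLDQ
// of the other by ByteRotation. matchShuffleAsElementRotate below finds the
// rotation amount by requiring every defined mask element to advance by one
// from a common start index, with the low part taken from one source and
// the high part from the other.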
  int NumElts = Mask.size();
  for (int i = 0; i < NumElts; ++i) {
           "Unexpected mask index.");
    int StartIdx = i - (M % NumElts);
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
    SDValue MaskV = M < NumElts ? V1 : V2;
    else if (TargetV != MaskV)
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
  if (ByteRotation <= 0)
         "512-bit PALIGNR requires BWI instructions");
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;
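// The VALIGN path below handles masks whose defined region is a contiguous
// run of elements from one source, with zeroable elements only at the ends.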
                                     const APInt &Zeroable,
         "Only 32-bit and 64-bit elements are supported!");
         "VLX required for 128/256-bit vectors");
  unsigned NumElts = Mask.size();
  assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
  if (!ZeroLo && !ZeroHi)
    SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
    int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
    return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
    SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
    int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
    return DAG.getNode(X86ISD::VALIGN, DL, VT,
                                           const APInt &Zeroable,
  if (!ZeroLo && !ZeroHi)
  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);
  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
  } else if (ZeroHi == 0) {
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
  } else if (!Subtarget.hasSSSE3()) {
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
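// Shift matching below: for growing scales, try every shift amount and
// direction, accepting one whose displaced elements are all zeroable
// (CheckZeros) and whose remaining elements line up (MatchShift).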
                          int MaskOffset, const APInt &Zeroable,
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
    Scale = ByteShift ? Scale / 2 : Scale;
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
                                   const APInt &Zeroable,
  int Size = Mask.size();
                               Mask, 0, Zeroable, Subtarget);
  if (ShiftAmt < 0) {
                                 Mask, Size, Zeroable, Subtarget);
  if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
         "Illegal integer vector type");
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
  int Size = Mask.size();
  int HalfSize = Size / 2;
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
  assert(Len > 0 && "Zeroable shuffle mask");
  for (int i = 0; i != Len; ++i) {
    if (i > M || M >= HalfSize)
    if (Idx < 0 || (Src == V && Idx == (M - i))) {
  if (!Src || Idx < 0)
  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  int Size = Mask.size();
  int HalfSize = Size / 2;
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      int Len = Hi - Idx;
      } else if ((!Base || (Base == V1)) &&
      } else if ((!Base || (Base == V2)) &&
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
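// Zero/any-extension lowering: given a scale and offset, emit the cheapest
// extension sequence available (offset shuffles, PSHUFD/PSHUFLW/PSHUFHW for
// any-extends, SSE4A EXTRQI, PSHUFB, or repeated unpacks as a fallback).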
    unsigned ExtOpc, SDValue InputV,
  assert(Scale > 1 && "Need a scale to extend.");
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
         "Extension offset must be in the first lane or start an upper lane.");
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  auto ShuffleOffset = [&](SDValue V) {
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
  if (AnyExt && EltBits == 32) {
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    int LoIdx = Offset * EltBits;
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
    int HiIdx = (Offset + 1) * EltBits;
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if ((i % Scale == 0 && SafeOffset(Idx))) {
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
    int AlignToUnpack = Offset % (NumElements / Scale);
    if (AlignToUnpack) {
      for (int i = AlignToUnpack; i < NumElements; ++i)
        ShMask[i - AlignToUnpack] = i;
      Offset -= AlignToUnpack;
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
  } while (Scale > 1);
  int NumLanes = Bits / 128;
  int NumEltsPerLane = NumElements / NumLanes;
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
  bool AnyExt = true;
  for (int i = 0; i < NumElements; ++i) {
    if (i % Scale != 0) {
    SDValue V = M < NumElements ? V1 : V2;
    M = M % NumElements;
      Offset = M - (i / Scale);
    } else if (InputV != V)
            (Offset % NumEltsPerLane) == 0))
    if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
    if ((M % NumElements) != (Offset + (i / Scale)))
  if (Offset != 0 && Matches < 2)
                                               InputV, Mask, Subtarget, DAG);
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
  MVT VT = V.getSimpleValueType();
  MVT NewVT = V.getSimpleValueType();
  return V.hasOneUse() &&
template <typename T>
  T EltVT = VT.getScalarType();
  return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
         (EltVT == MVT::f16 && !Subtarget.hasFP16());
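// Element-insertion lowering: when all but one lane of V1 is zeroable, the
// shuffle can become a scalar insertion (MOVSH/MOVSS/MOVSD or VZEXT_MOVL),
// optionally followed by a byte shift when the insertion index is nonzero.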
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
  if (!IsV1Zeroable) {
    V1Mask[V2Index] = -1;
  if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
    if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
    if (!IsV1Zeroable) {
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
    if (!IsV1Zeroable) {
      assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    unsigned MovOpc = 0;
    if (EltVT == MVT::f16)
      MovOpc = X86ISD::MOVSH;
    else if (EltVT == MVT::f32)
      MovOpc = X86ISD::MOVSS;
    else if (EltVT == MVT::f64)
      MovOpc = X86ISD::MOVSD;
    return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
    V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (V2Index != 0) {
      V2Shuffle[V2Index] = 0;
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13443 "We can only lower integer broadcasts with AVX2!");
13449 assert(V0VT.
isVector() &&
"Unexpected non-vector vector-sized value!");
13459 if (V0EltSize <= EltSize)
13462 assert(((V0EltSize % EltSize) == 0) &&
13463 "Scalar type sizes must all be powers of 2 on x86!");
13466 const unsigned Scale = V0EltSize / EltSize;
13467 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13479 if (
const int OffsetIdx = BroadcastIdx % Scale)
13483 return DAG.
getNode(X86ISD::VBROADCAST,
DL, VT,
13493 assert(Mask.size() == 4 &&
"Unsupported mask size!");
13494 assert(Mask[0] >= -1 && Mask[0] < 8 &&
"Out of bound mask element!");
13495 assert(Mask[1] >= -1 && Mask[1] < 8 &&
"Out of bound mask element!");
13496 assert(Mask[2] >= -1 && Mask[2] < 8 &&
"Out of bound mask element!");
13497 assert(Mask[3] >= -1 && Mask[3] < 8 &&
"Out of bound mask element!");
13501 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13503 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13516 int Size = Mask.size();
13517 for (
int i = 0; i <
Size; ++i)
13518 if (Mask[i] >= 0 && Mask[i] /
Size ==
Input && Mask[i] %
Size != i)
13528 int BroadcastableElement = 0) {
13530 int Size = Mask.size();
13531 for (
int i = 0; i <
Size; ++i)
13532 if (Mask[i] >= 0 && Mask[i] /
Size ==
Input &&
13533 Mask[i] %
Size != BroadcastableElement)
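// Broadcast lowering: walk through bitcasts, concats, extracts and inserts
// to find the scalar or memory operand feeding the broadcast index, then
// prefer VBROADCAST/VBROADCAST_LOAD (or MOVDDUP for v2f64 pre-AVX2).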
13547 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13567 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13569 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13575 if (NumElts == 4 &&
13580 NewMask.
append(NumElts, -1);
13600 if (!((Subtarget.
hasSSE3() && VT == MVT::v2f64) ||
13601 (Subtarget.
hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13608 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.
hasAVX2())
13610 : X86ISD::VBROADCAST;
13611 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.
hasAVX2();
13615 if (BroadcastIdx < 0) {
13622 assert(BroadcastIdx < (
int)Mask.size() &&
"We only expect to be called with "
13623 "a sorted mask where the broadcast "
13625 int NumActiveElts =
count_if(Mask, [](
int M) {
return M >= 0; });
13631 int BitOffset = BroadcastIdx * NumEltBits;
13634 switch (V.getOpcode()) {
13636 V = V.getOperand(0);
13640 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13641 int OpIdx = BitOffset / OpBitWidth;
13642 V = V.getOperand(
OpIdx);
13643 BitOffset %= OpBitWidth;
13648 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13649 unsigned Idx = V.getConstantOperandVal(1);
13650 unsigned BeginOffset = Idx * EltBitWidth;
13651 BitOffset += BeginOffset;
13652 V = V.getOperand(0);
13658 int Idx = (int)V.getConstantOperandVal(2);
13659 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13660 int BeginOffset = Idx * EltBitWidth;
13661 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13662 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13663 BitOffset -= BeginOffset;
13673 assert((BitOffset % NumEltBits) == 0 &&
"Illegal bit-offset");
13674 BroadcastIdx = BitOffset / NumEltBits;
13677 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13686 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13687 return TruncBroadcast;
13693 V = V.getOperand(BroadcastIdx);
13706 SDValue BaseAddr = Ld->getBasePtr();
13709 assert((
int)(
Offset * 8) == BitOffset &&
"Unexpected bit-offset");
13716 if (Opcode == X86ISD::VBROADCAST) {
13720 X86ISD::VBROADCAST_LOAD,
DL, Tys,
Ops, SVT,
13726 assert(SVT == MVT::f64 &&
"Unexpected VT!");
13727 V = DAG.
getLoad(SVT,
DL, Ld->getChain(), NewAddr,
13731 }
else if (!BroadcastFromReg) {
13734 }
else if (BitOffset != 0) {
13742 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13747 if (BitOffset < 128 && NumActiveElts > 1 &&
13748 V.getScalarValueSizeInBits() == NumEltBits) {
13749 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13750 "Unexpected bit-offset");
13752 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13757 if ((BitOffset % 128) != 0)
13760 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13761 "Unexpected bit-offset");
13762 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13763 "Unexpected vector size");
13764 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13770 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13772 if (Subtarget.
hasAVX()) {
13773 V = DAG.
getNode(X86ISD::VBROADCAST,
DL, MVT::v2f64, V);
13780 if (!V.getValueType().isVector()) {
13781 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13782 "Unexpected scalar size");
13791 if (V.getValueSizeInBits() > 128)
13796 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
                          unsigned &InsertPSMask,
                          const APInt &Zeroable,
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  unsigned ZMask = 0;
  int VADstIndex = -1;
  int VBDstIndex = -1;
  bool VAUsedInPlace = false;
  for (int i = 0; i < 4; ++i) {
    if (i == CandidateMask[i]) {
      VAUsedInPlace = true;
    if (VADstIndex >= 0 || VBDstIndex >= 0)
    if (CandidateMask[i] < 4) {
  if (VADstIndex < 0 && VBDstIndex < 0)
  unsigned VBSrcIndex = 0;
  if (VADstIndex >= 0) {
    VBSrcIndex = CandidateMask[VADstIndex];
    VBDstIndex = VADstIndex;
    VBSrcIndex = CandidateMask[VBDstIndex] - 4;
  if (!VAUsedInPlace)
  InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  if (matchAsInsertPS(V1, V2, Mask))
  if (matchAsInsertPS(V2, V1, CommutedMask))
  unsigned InsertPSMask = 0;
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
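// 128-bit v2f64 lowering: single-input cases map onto VPERMILPI/SHUFP
// immediates; two-input cases try insertion, blends and MOVSD before the
// final SHUFP with the appropriate immediate.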
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
                                     Mask, Subtarget, DAG))
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
    if (Subtarget.hasAVX()) {
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
        X86ISD::SHUFP, DL, MVT::v2f64,
  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
        X86ISD::MOVSD, DL, MVT::v2f64, V2,
                                          Zeroable, Subtarget, DAG))
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
                                     Mask, Subtarget, DAG))
    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                            Zeroable, Subtarget, DAG))
  if (Subtarget.hasVLX())
                                          Zeroable, Subtarget, DAG))
  if (IsBlendSupported)
                                              Zeroable, Subtarget, DAG);
  SDValue LowV = V1, HighV = V2;
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
    int V2AdjIndex = V2Index ^ 1;
    if (Mask[V2AdjIndex] < 0) {
      NewMask[V2Index] -= 4;
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
      NewMask[V1Index] = 2;
      NewMask[V2Index] = 0;
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
  } else if (NumV2Elements == 3) {
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
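// v4f32 lowering: single-input masks get MOVSLDUP/MOVSHDUP, VPERMILPI or
// MOVLHPS/MOVHLPS special cases before falling back to the SHUFPS
// selection above.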
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                            Zeroable, Subtarget, DAG))
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
  if (NumV2Elements == 0) {
                                                    Mask, Subtarget, DAG))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    if (Subtarget.hasAVX()) {
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
  if (NumV2Elements == 1 && Mask[0] >= 4)
                                            Zeroable, Subtarget, DAG))
    if (!MatchesShufPS || Zeroable == 0x3 || Zeroable == 0xC)
    if (!MatchesShufPS)
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                            Zeroable, Subtarget, DAG))
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
  if (Subtarget.preferLowerShuffleAsShift()) {
                                  Subtarget, DAG, true))
    if (NumV2Elements == 0)
  if (NumV2Elements == 0) {
    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
                                                      Mask, Subtarget, DAG))
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
      Mask = UnpackLoMask;
      Mask = UnpackHiMask;
    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
  if (NumV2Elements == 1)
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                            Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))
  if (Subtarget.hasVLX())
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
  if (IsBlendSupported)
                                              Zeroable, Subtarget, DAG);
                                                Mask, Subtarget, DAG))
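// General single-input v8i16 lowering: classify each input word by which
// half it starts in and which half it must end in (LToL, LToH, HToL, HToH),
// then realize the shuffle with PSHUFLW/PSHUFHW/PSHUFD combinations.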
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    for (int i = 0; i != 4; ++i)
      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  int NumHToL = LoInputs.size() - NumLToL;
  int NumHToH = HiInputs.size() - NumLToH;
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
    int PSHUFDMask[4] = {-1, -1, -1, -1};
    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
    for (int DWord = 0; DWord != 4; ++DWord) {
      int M0 = Mask[2 * DWord + 0];
      int M1 = Mask[2 * DWord + 1];
      if (M0 < 0 && M1 < 0)
      bool Match = false;
      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
        auto &DWordPair = DWordPairs[j];
          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
          PSHUFDMask[DWord] = DOffset + j;
        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
    if (DWordPairs.size() <= 2) {
      DWordPairs.resize(2, std::make_pair(-1, -1));
      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                              DWordPairs[1].first, DWordPairs[1].second};
        std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
        PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
        PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
      if ((NumHToL + NumHToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
      if ((NumLToL + NumLToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
                          int AOffset, int BOffset) {
           "Must call this with A having 3 or 1 inputs from the A half.");
           "Must call this with B having 1 or 3 inputs from the B half.");
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
    bool ThreeAInputs = AToAInputs.size() == 3;
    int ADWord = 0, BDWord = 0;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;
    OneInputDWord = (OneInput / 2) ^ 1;
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
      int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
          int FixIdx = PinnedIdx ^ 1;
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
            else if (M >= 0 && M == FixFreeIdx)
        if (NumFlippedBToBInputs != 0) {
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};
  auto fixInPlaceInputs =
    if (InPlaceInputs.empty())
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
    if (IncomingInputs.empty()) {
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
  auto moveInputsToRightHalf = [&PSHUFDMask](
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    if (IncomingInputs.empty())
    if (ExistingInputs.empty()) {
      for (int Input : IncomingInputs) {
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                 "Previous placement doesn't match!");
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        IncomingInputs[0] = InputFixed;
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;
          InputsFixed[1] = InputsFixed[0] ^ 1;
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;
        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
          M = FreeDWord * 2 + Input % 2;
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
         "Failed to lift all the high half inputs to the low mask!");
  assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
         "Failed to lift all the low half inputs to the high mask!");
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    for (int &M : HiMask)
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
         "Lane crossing shuffle masks not supported");
  int Size = Mask.size();
  int Scale = NumBytes / Size;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    const int ZeroMask = 0x80;
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    if (Zeroable[i / Scale])
      V1Idx = V2Idx = ZeroMask;
    V1InUse |= (ZeroMask != V1Idx);
    V2InUse |= (ZeroMask != V2Idx);
  if (V1InUse && V2InUse)
  V = V1InUse ? V1 : V2;
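// v8i16 entry point: try blends, rotates, shifts and unpacks first, then
// fall back to word packing (PACKUS/PACKSS) when every other element is
// dropped, and finally to the PSHUFB-based blend above.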
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
                                            Zeroable, Subtarget, DAG))
  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
  if (NumV2Inputs == 0) {
                                  Subtarget, DAG, false))
                                                    Mask, Subtarget, DAG))
         "All single-input shuffles should be canonicalized to be V1-input "
  if (Subtarget.hasSSE4A())
  if (NumV2Inputs == 1)
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                            Zeroable, Subtarget, DAG))
                                              Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))
  if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
      !Subtarget.hasVLX()) {
    unsigned PackOpc = 0;
    if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
      V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
      PackOpc = X86ISD::PACKUS;
    } else if (Subtarget.hasSSE41()) {
      for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
      PackOpc = X86ISD::PACKUS;
    } else if (!Subtarget.hasSSSE3()) {
      V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
      V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
      V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
      V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
      PackOpc = X86ISD::PACKSS;
    if (NumEvenDrops == 2) {
      Result = DAG.getBitcast(MVT::v4i32, Result);
      Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
  if (NumOddDrops == 1) {
    bool HasSSE41 = Subtarget.hasSSE41();
    V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
    V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
    return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
                       MVT::v8i16, V1, V2);
                                                Mask, Subtarget, DAG))
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
                                        Zeroable, DAG, V1InUse, V2InUse);
                                              Zeroable, Subtarget, DAG);
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
  if (Subtarget.hasFP16()) {
    if (NumV2Elements == 0) {
                                                      Mask, Subtarget, DAG))
    if (NumV2Elements == 1 && Mask[0] >= 8)
            DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  MVT ShuffleVT = VT;
    for (int &M : AdjustedMask)
      M += (Scale - 1) * NumElts;
    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
  if (VT != ShuffleVT)
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
                                            Zeroable, Subtarget, DAG))
  if (Subtarget.hasSSE4A())
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
  if (NumV2Elements == 0) {
                                                      Mask, Subtarget, DAG))
    for (int i = 0; i < 16; i += 2)
      if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          while (j < je && PreDupI16Shuffle[j] >= 0)
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
          DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
      bool EvenInUse = false, OddInUse = false;
      for (int i = 0; i < 16; i += 2) {
        EvenInUse |= (Mask[i + 0] >= 0);
        OddInUse |= (Mask[i + 1] >= 0);
        if (EvenInUse && OddInUse)
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
          DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    if (SDValue V = tryToWidenViaDuplication())
                                          Zeroable, Subtarget, DAG))
                                              Zeroable, Subtarget, DAG))
  bool IsSingleInput = V2.isUndef();
  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
    bool V1InUse = false;
    bool V2InUse = false;
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
    if (V1InUse && V2InUse) {
                                                Zeroable, Subtarget, DAG))
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      if (Subtarget.hasVBMI())
      if (Subtarget.hasXOP()) {
        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
  if (NumV2Elements == 1)
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
  if (NumEvenDrops) {
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
    if (!IsSingleInput)
                         IsSingleInput ? V1 : V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
  if (NumOddDrops == 1) {
    V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
    if (!IsSingleInput)
      V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
    return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
                       IsSingleInput ? V1 : V2);
  if (NumV2Elements > 0)
                                              Zeroable, Subtarget, DAG);
  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    VHiHalf = DAG.getUNDEF(MVT::v8i16);
    for (int &M : LoBlendMask)
    for (int &M : HiBlendMask)
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
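// 256-bit and wider shuffles that cannot be expressed in one instruction
// are split below into two half-width shuffles plus a blend; the half masks
// are checked first so the split stays "simple" when required.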
                                  const APInt &Zeroable,
  if (VT == MVT::v8bf16) {
         "Only for 256-bit or wider vector shuffles!");
  if (VT == MVT::v8f32) {
  int SplitNumElements = NumElements / 2;
  auto SplitVector = [&](SDValue V) {
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);
  auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
                                   bool &UseHiV1, bool &UseLoV2,
    UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
      } else if (M >= 0) {
        if (M >= SplitNumElements)
  auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
    return !(UseHiV1 || UseHiV2);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        V1BlendMask[i] = M;
    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
    assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
    if (!UseLoV2 && !UseHiV2)
    if (!UseLoV1 && !UseHiV1)
    if (UseLoV1 && UseHiV1) {
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    if (UseLoV2 && UseHiV2) {
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
  if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
                                          const APInt &Zeroable,
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
                          "shuffles as it could then recurse on itself.");
  int Size = Mask.size();
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if ((M - Size) != V2BroadcastIdx &&
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx &&
  if (DoBothBroadcast())
  int LaneSize = Size / LaneCount;
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
  if (SplatOrSplitV1 && SplatOrSplitV2)
  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  int SHUFPDMask[4] = {-1, -1, -1, -1};
  for (int i = 0; i != 4; ++i) {
    int LaneBase = i & ~1;
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPDMask[i] = M & 1;
  int NumEltsPerLane = NumElts / NumLanes;
  auto getSublanePermute = [&](int NumSublanes) -> SDValue {
    int NumSublanesPerLane = NumSublanes / NumLanes;
    int NumEltsPerSublane = NumElts / NumSublanes;
    for (int i = 0; i != NumElts; ++i) {
      int SrcSublane = M / NumEltsPerSublane;
      int DstLane = i / NumEltsPerLane;
      bool Found = false;
      int DstSubStart = DstLane * NumSublanesPerLane;
      int DstSubEnd = DstSubStart + NumSublanesPerLane;
      for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
        if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
        CrossLaneMaskLarge[DstSublane] = SrcSublane;
        int DstSublaneOffset = DstSublane * NumEltsPerSublane;
        InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
        DemandedCrossLane.setBit(InLaneMask[i]);
    if (!CanUseSublanes) {
      int NumIdentityLanes = 0;
      bool OnlyShuffleLowestLane = true;
      for (int i = 0; i != NumLanes; ++i) {
        int LaneOffset = i * NumEltsPerLane;
                         i * NumEltsPerLane))
          NumIdentityLanes++;
        else if (CrossLaneMask[LaneOffset] != 0)
          OnlyShuffleLowestLane = false;
      if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
    for (int i = 0; i != NumElts; ++i)
      if (!DemandedCrossLane[i])
    if (CrossLaneMask == Mask || InLaneMask == Mask)
  if (SDValue V = getSublanePermute(NumLanes))
  if (!CanUseSublanes)
  if (SDValue V = getSublanePermute(NumLanes * 2))
  if (!Subtarget.hasFastVariableCrossLaneShuffle())
  return getSublanePermute(NumLanes * 4);
  int Size = Mask.size();
  InLaneMask.assign(Mask.begin(), Mask.end());
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  int Size = Mask.size();
  int LaneSize = Size / 2;
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneCrossing[0] && LaneCrossing[1];
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneUsed[0] && LaneUsed[1];
         "This last part of this routine only works on single input shuffles");
         "In-lane shuffle mask expected");
                                 const APInt &Zeroable,
                                            VT, MemVT, Ld, Ofs, DAG))
  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;
  if (WidenedMask[0] == 0 && IsHighZero) {
  if (!IsLowZero && !IsHighZero) {
    if (Subtarget.hasVLX()) {
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
  assert((WidenedMask[0] >= 0 || IsLowZero) &&
         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
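// Lane permute + repeated mask: find, per 128-bit lane, at most two source
// lanes whose in-lane mask matches a single repeated pattern, permute the
// lanes into place, and then apply the repeated mask once.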
  int NumElts = Mask.size();
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      int LaneSrc = M / NumLaneElts;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
      Srcs[Src] = LaneSrc;
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];
    assert(M1.size() == M2.size() && "Unexpected mask size");
    for (int i = 0, e = M1.size(); i != e; ++i)
      if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
    assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
    for (int i = 0, e = MergedMask.size(); i != e; ++i) {
      assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
             "Unexpected mask element");
    if (MatchMasks(InLaneMask, RepeatMask)) {
      MergeMasks(InLaneMask, RepeatMask);
    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
    if (MatchMasks(InLaneMask, RepeatMask)) {
      MergeMasks(InLaneMask, RepeatMask);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    if (LaneSrcs[Lane][0] >= 0)
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;
      if (RepeatMask[i] < NumElts) {
        if (RepeatMask[i] != M % NumLaneElts)
        LaneSrcs[Lane][0] = M / NumLaneElts;
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
        LaneSrcs[Lane][1] = M / NumLaneElts;
    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
  for (int i = 0; i != NumElts; ++i) {
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
                              int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");
  if (UndefLower == UndefUpper)
  unsigned HalfNumElts = HalfMask.size();
  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + MaskIndexOffset];
    int HalfIdx = M / HalfNumElts;
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
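// When one half of the result is entirely undef, the shuffle can be done
// at half width: extract the two source halves, shuffle them, and insert
// the result into the defined half of a widened vector.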
                                     int HalfIdx2, bool UndefLower,
  auto getHalfVector = [&](int HalfIdx) {
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
  SDValue Half1 = getHalfVector(HalfIdx1);
  SDValue Half2 = getHalfVector(HalfIdx2);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
         "Expected 256-bit or 512-bit vector");
         "Completely undef shuffle mask should have been simplified already");
  int HalfIdx1, HalfIdx2;
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
  unsigned NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  unsigned NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
  if (NumUpperHalves == 0)
  if (NumUpperHalves == 1) {
    if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
         Subtarget.hasFastVariableCrossLaneShuffle()))
    if (EltWidth == 64 && V2.isUndef())
    if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
  assert(NumUpperHalves == 2 && "Half vector count went wrong");
  if (NumUpperHalves == 0) {
    if (Subtarget.hasAVX2() && EltWidth == 64)
  int NumLaneElts = NumElts / NumLanes;
  for (unsigned BroadcastSize : {16, 32, 64}) {
    for (int i = 0; i != NumElts; i += NumBroadcastElts)
      for (int j = 0; j != NumBroadcastElts; ++j) {
        int M = Mask[i + j];
        int &R = RepeatMask[j];
        if (0 != ((M % NumElts) / NumLaneElts))
        if (0 <= R && R != M)
    if (!FindRepeatingBroadcastMask(RepeatMask))
    for (int i = 0; i != NumElts; i += NumBroadcastElts)
      for (int j = 0; j != NumBroadcastElts; ++j)
        BroadcastMask[i + j] = j;
    if (BroadcastMask == Mask)
  auto ShuffleSubLanes = [&](int SubLaneScale) {
    int NumSubLanes = NumLanes * SubLaneScale;
    int NumSubLaneElts = NumLaneElts / SubLaneScale;
    int TopSrcSubLane = -1;
    for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
        int Lane = (M % NumElts) / NumLaneElts;
        if ((0 <= SrcLane) && (SrcLane != Lane))
        int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
        SubLaneMask[Elt] = LocalM;
      for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
        for (int i = 0; i != NumSubLaneElts; ++i) {
          if (M1[i] < 0 || M2[i] < 0)
          if (M1[i] != M2[i])
        auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
        if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
        for (int i = 0; i != NumSubLaneElts; ++i) {
          int M = SubLaneMask[i];
          assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
                 "Unexpected mask element");
          RepeatedSubLaneMask[i] = M;
        int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
        TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
        Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
      if (Dst2SrcSubLanes[DstSubLane] < 0)
    assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
           "Unexpected source lane");
    for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
      int Lane = SubLane / SubLaneScale;
      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = RepeatedSubLaneMask[Elt];
        int Idx = (SubLane * NumSubLaneElts) + Elt;
        RepeatedMask[Idx] = M + (Lane * NumLaneElts);
    for (int i = 0; i != NumElts; i += NumSubLaneElts) {
      int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
      if (SrcSubLane < 0)
      for (int j = 0; j != NumSubLaneElts; ++j)
        SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
    if (RepeatedMask == Mask || SubLaneMask == Mask)
  int MinSubLaneScale = 1, MaxSubLaneScale = 1;
    MinSubLaneScale = 2;
        (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
  if (Subtarget.hasBWI() && VT == MVT::v64i8)
    MinSubLaneScale = MaxSubLaneScale = 4;
  for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
    if (SDValue Shuffle = ShuffleSubLanes(Scale))
                                  bool &ForceV1Zero, bool &ForceV2Zero,
                                  const APInt &Zeroable) {
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");
         "Illegal shuffle mask");
  bool ZeroLane[2] = { true, true };
  for (int i = 0; i < NumElts; ++i)
    ZeroLane[i & 1] &= Zeroable[i];
  bool IsSHUFPD = true;
  bool IsCommutable = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      IsCommutable = false;
    SHUFPDMask[i] = Mask[i] % 2;
  if (!IsSHUFPD && !IsCommutable)
  if (!IsSHUFPD && IsCommutable)
  ForceV1Zero = ZeroLane[0];
  ForceV2Zero = ZeroLane[1];
                                      const APInt &Zeroable,
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");
  unsigned Immediate = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                                           const APInt &Zeroable,
  assert(VT == MVT::v32i8 && "Unexpected type!");
  if (Zeroable.countl_one() < (Mask.size() - 8))
  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
                           { 0,  1,  2,  3, 16, 17, 18, 19,
                             4,  5,  6,  7, 20, 21, 22, 23 });
  if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
  auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
    size_t Size = Mask.size();
    assert(Size % 2 == 0 && "Expected even mask size");
    for (unsigned I = 0; I < Size; I += 2) {
      if (Mask[I] != (int)(Begin0 + I / 2) ||
          Mask[I + 1] != (int)(Begin1 + I / 2))
  size_t FirstQtr = NumElts / 2;
  size_t ThirdQtr = NumElts + NumElts / 2;
  bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
  bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
  if (!IsFirstHalf && !IsSecondHalf)
  if (Shuffles.size() != 2)
  if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
      IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
    FirstHalf = Shuffles[0];
    SecondHalf = Shuffles[1];
  } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
             IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
    FirstHalf = Shuffles[1];
    SecondHalf = Shuffles[0];
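// Per-type 256-bit lowering routines follow; each tries the generic helpers
// above (insertion, blends, rotates, lane permutes) in a cost-ordered
// sequence before its type-specific fallbacks.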
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                                    Mask, Subtarget, DAG))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
                                               Mask, DAG, Subtarget))
                                            Zeroable, Subtarget, DAG))
                                          Zeroable, Subtarget, DAG))
       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
  if (V1IsInPlace || V2IsInPlace)
                                                Zeroable, Subtarget, DAG);
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
  if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
  if (Subtarget.hasVLX())
                                              Zeroable, Subtarget, DAG))
                                              Zeroable, Subtarget, DAG);
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
  // ...
  if (Subtarget.preferLowerShuffleAsShift())
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower-latency instructions.
    SmallVector<int, 2> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      SmallVector<int, 4> PSHUFDMask;
      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  if (Subtarget.hasVLX()) {
    // ... (VLX-only rotate/align attempts using Zeroable)
  }
  // ...
  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (V1IsInPlace || V2IsInPlace)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return V;
  // ...
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle, but only if neither input is already in place.
  if (!V1IsInPlace && !V2IsInPlace)
    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return V;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG);
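// v8f32 (256-bit) shuffle lowering: masks repeated in each 128-bit lane map
// to MOVSLDUP/MOVSHDUP/VPERMILPS immediates; other single-input masks use a
// variable-mask VPERMILPS, or a lane-crossing VPERMPS with AVX2.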
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  // ...
  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ... (unpack/SHUFPS attempts on the repeated mask)
  }
  // ...
  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes, use a variable mask.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
    // ...
  }
  // ...
  if (Subtarget.hasVLX()) {
    // ... (VLX-only lowering attempt using Zeroable)
  }
  // ... (generic blend/split fallback using Zeroable)
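// v8i32 (256-bit, AVX2) shuffle lowering: shifts and bit rotates first when
// profitable, PSHUFD for in-lane repeated masks, VPERMD for lane-crossing
// single inputs, and a SHUFPS reuse via a float bitcast where the mask fits.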
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
  // ...
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    if (NumV2Elements == 0)
      if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask,
                                                   Subtarget, DAG))
        return Rotate;
  }
  // ...
  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ...
  }
  // ...
  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask,
                                                 Subtarget, DAG))
      return Rotate;

  if (Subtarget.hasVLX()) {
    // ... (VLX-only align/expand attempts using Zeroable)
  }
  // ...
  // If the shuffle patterns aren't repeated but it's a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }
  // ...
  // When the repeated mask fits SHUFPS, bitcast to float and reuse the v8f32
  // lowering.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    return DAG.getBitcast(MVT::v8i32,
                          lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                                 CastV1, CastV2, DAG));
  }
  // ...
  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG);
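// v16i16 (256-bit, AVX2) shuffle lowering: zext and shift forms are strictly
// cheaper when they match; single-input masks repeated per 128-bit lane are
// handed to the general v8i16 lowering; BWI enables VPERMW as a fallback.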
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
  // ...
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  // ...
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG,
                                          /*BitwiseOnly*/ false))
    return Shift;
  // ...
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;
  // ...
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v16i16, V1, V2,
                                                      Mask, DAG, Subtarget))
    return V;
  // ...
  if (V2.isUndef()) {
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask))
      // As this is a single-input shuffle, the repeated mask should be a
      // strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
  }
  // ...
  if (Subtarget.hasBWI()) {
    // ... (BWI-only VPERMW attempt)
  }
  // ... (lane-permute and generic fallbacks)
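// v32i8 (256-bit, AVX2) shuffle lowering: zext forms first, then per-lane
// permutes; VBMI enables a direct VPERMB, and with VLX a truncating
// VTRUNC+unpack form can absorb masks whose tail is provably zero.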
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
  // ...
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  // ...
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
                                                      Mask, DAG, Subtarget))
    return V;
  // ...
  if (Subtarget.hasVBMI()) {
    // ... (VBMI-only VPERMB attempt)
  }
  // ...
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
                                                  Mask, Zeroable, DAG))
      return V;
  // ... (PSHUFB blend / split fallbacks)
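// Generic 256-bit shuffle dispatch: canonicalize single-element insertions
// and element types without native support, then hand off to the
// type-specific routines above.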
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
  // ...
  if (ElementBits < 32) {
    // No floating-point element type fits; handle these via the integer
    // paths.
    // ...
  }
  // ...
  if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
    // ... (bitcast to v16i16, shuffle, and bitcast back)
  }
  // ...
17838 "Unexpected element type size for 128bit shuffle.");
17848 assert(Widened128Mask.
size() == 4 &&
"Shuffle widening mismatch");
17851 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17852 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17853 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17864 bool OnlyUsesV1 =
isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17866 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17876 bool IsInsert =
true;
17878 for (
int i = 0; i < 4; ++i) {
17879 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17880 if (Widened128Mask[i] < 0)
17884 if (Widened128Mask[i] < 4) {
17885 if (Widened128Mask[i] != i) {
17891 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17898 if (IsInsert && V2Index >= 0) {
17911 Widened128Mask.
clear();
17917 int PermMask[4] = {-1, -1, -1, -1};
17919 for (
int i = 0; i < 4; ++i) {
17920 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17921 if (Widened128Mask[i] < 0)
17924 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17931 PermMask[i] = Widened128Mask[i] % 4;
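// v8f64 (512-bit) shuffle lowering: single-input masks prefer MOVDDUP /
// VPERMILPD immediates (VPERMPD for 256-bit-lane repeats), then the 128-bit
// lane permute above, then SHUFPD.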
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;
  // ...
  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;
  // ... (blend / PERMV fallbacks)
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ... (unpack/blend attempts on the repeated mask, using Zeroable)
  }
  // ...
  // ... (integer-domain attempt on the v16i32 bitcast, using Zeroable)
  // ...
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use the variable mask to VPERMILPS.
  if (V2.isUndef() &&
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }
  // ... (VEXPAND / PERMV fallbacks using Zeroable)
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (Subtarget.preferLowerShuffleAsShift())
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower-latency instructions.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      SmallVector<int, 4> PSHUFDMask;
      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;
  // ... (shift/rotate/unpack attempts using Zeroable)
  if (Subtarget.hasBWI()) {
    // ... (BWI-only byte-rotate attempt)
  }
  // ... (blend / PERMV fallbacks using Zeroable)
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
  // ...
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  // ...
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    if (NumV2Elements == 0)
      if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
                                                   Subtarget, DAG))
        return Rotate;
  }
  // ...
  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ...
  }
  // ...
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG,
                                          /*BitwiseOnly*/ false))
    return Shift;

  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0) {
    // ... (element-insertion attempt using Zeroable)
  }
  // ...
  if (Subtarget.hasBWI()) {
    // ... (BWI-only byte-rotate attempt)
  }
  // ...
  // When the repeated mask fits SHUFPS, bitcast to float and reuse the
  // v16f32 lowering.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    return DAG.getBitcast(MVT::v16i32,
                          lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                                 CastV1, CastV2, DAG));
  }
  // ...
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
    return V;
  // ... (blend / VEXPAND fallbacks using Zeroable)
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
  // ...
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  // ...
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG,
                                          /*BitwiseOnly*/ false))
    return Shift;
  // ...
  if (V2.isUndef()) {
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask))
      // As this is a single-input shuffle, the repeated mask should be a
      // strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputShuffle(
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
  }
  // ... (blend / element-insertion attempts using Zeroable)
  // ... (PERMV fallback)
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
  // ...
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  // ... (bit-blend / byte-rotate attempts using Zeroable)
  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle, but only without VBMI, where a single VPERMB is cheaper.
  if (!Subtarget.hasVBMI()) {
    // ... (lane-merge attempts)
  }
  // ... (element-insertion attempt using Zeroable)
  if (Subtarget.hasVBMI()) {
    // ... (VBMI-only VPERMB attempt)
  }
  // ...
  // Otherwise use a PSHUFB-based blend, tracking which inputs end up used.
  bool V1InUse, V2InUse;
  SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, DAG, V1InUse,
                                                V2InUse);
  // ...
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget,
                                 DAG);
  // ...
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");
  // ...
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
  // ...
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
    // Without BWI, split v32i16/v64i8 shuffles into two 256-bit halves.
    // ...
  }

  if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
    if (!Subtarget.hasBWI()) {
      // ... (split into 256-bit halves)
    }
    // ... (bitcast to v32i16 and recurse)
  }
  // ... (dispatch to the per-type routines above)
  int NumElts = Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;
    // The first non-undef element fixes the shift amount; all others must
    // shift by the same amount.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      if (ShiftAmt <= 0)
        return SDValue();
    }
    if (ShiftAmt != M - i)
      return SDValue();
  }
  assert(ShiftAmt >= 0 && "All undef?");
  // ... (KSHIFTR the widened mask vector and extract the result)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
                                    int MaskOffset, const APInt &Zeroable) {
  int Size = Mask.size();

  auto CheckZeros = [&](int Shift, bool Left) {
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
        return false;
    return true;
  };

  auto MatchShift = [&](int Shift, bool Left) {
    unsigned Pos = Left ? Shift : 0;
    unsigned Low = Left ? 0 : Shift;
    unsigned Len = Size - Shift;
    return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
  };

  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
        return Shift;
      }

  return -1;
}
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");
  // ...
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
  // ...
  // Try to recognize shuffles that are just padding a subvector with zeros.
  int SubvecElts = 0;
  int Src = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0) {
      // Grab the source from the first valid mask. All subsequent elements
      // need to match this source.
      if (Src < 0)
        Src = Mask[i] / NumElts;
      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
        break;
    }
    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");
  // ...
  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
    assert(Src >= 0 && "Expected a source!");
    // ... (extract the subvector and insert it into a zero vector)
  }
  // ...
  // Try to match KSHIFTs.
  unsigned Offset = 0;
  for (SDValue V : { V1, V2 }) {
    unsigned Opcode;
    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
    if (ShiftAmt >= 0) {
      // ...
      MVT WideVT = Res.getSimpleValueType();
      // Widened right shifts need two shifts to ensure we shift in zeroes.
      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
        int WideElts = WideVT.getVectorNumElements();
        // Shift left to put the original vector in the MSBs of the new size.
        Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
                          DAG.getTargetConstant(WideElts - NumElts, DL,
                                                MVT::i8));
        // Increase the shift amount to account for the left shift.
        ShiftAmt += WideElts - NumElts;
      }
      Res = DAG.getNode(Opcode, DL, WideVT, Res,
                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
      // ...
    }
    Offset += NumElts;
  }
  // ...
  // Check whether the mask only selects in-place elements or zeros.
  bool IsBlendWithZero = all_of(enumerate(Mask), [&](const auto &M) {
    return Zeroable[M.index()] || (M.value() == (int)M.index());
  });
  if (IsBlendWithZero) {
    const unsigned Width = std::max<unsigned>(NumElts, 8u);
    APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
    // ...
  }
  // ...
  // Extend the vXi1 shuffle to a type we can shuffle, then truncate back.
  MVT ExtVT;
  switch (VT.SimpleTy) {
  // ...
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  // ...
  case MVT::v32i1:
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = MVT::v32i16;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }
  // ...
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32))) {
    // ...
  }
  // ...
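// Shuffle-commute heuristics: prefer masks that take more elements from V1,
// breaking ties by low-half usage, then by the sum of indices, then by the
// count of odd positions, so later matching only considers one orientation.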
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
  if (!V.getValueType().isSimple())
    return false;

  MVT VT = V.getSimpleValueType().getScalarType();
  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
    return false;

  // Widening sub-512-bit i16/i8 vectors just to use AVX512BW mask operations
  // is not profitable.
  if ((VT == MVT::i16 || VT == MVT::i8) &&
      V.getSimpleValueType().getSizeInBits() < 512)
    return false;

  auto HasMaskOperation = [&](SDValue V) {
    switch (V->getOpcode()) {
    default:
      return false;
    // ... (whitelisted opcodes fall through)
    }
    // ...
    if (!V->hasOneUse())
      return false;

    return true;
  };

  if (HasMaskOperation(V))
    return true;
  // ...
  MVT VT = Op.getSimpleValueType();
  // ...
  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);
  // ...
  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef &&
      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
    SmallVector<int, 8> NewMask(OrigMask);
    for (int &M : NewMask)
      if (M >= NumElements)
        M = -1;
    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
  (void)MaskUpperLimit;
  assert(all_of(OrigMask,
                [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. Directly lower these as a buildvector of zeros.
  APInt KnownUndef, KnownZero;
  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
  APInt Zeroable = KnownUndef | KnownZero;
  // ...
  // Try to collapse shuffles into using a vector type with fewer elements
  // but wider element types.
  int NewNumElts = NumElements / 2;
  // ...
  // Modify the new Mask to take all zeros from the all-zero vector.
  // Choose indices that are blend-friendly.
  bool UsedZeroVector = false;
  assert(is_contained(WidenedMask, SM_SentinelZero) &&
         "V2's non-undef elements are used?!");
  for (int i = 0; i != NewNumElts; ++i)
    if (WidenedMask[i] == SM_SentinelZero) {
      WidenedMask[i] = i + NewNumElts;
      UsedZeroVector = true;
    }
  // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
  // some elements to be undef.
  if (UsedZeroVector)
    V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
  // ...
  assert(NumElements == (int)Mask.size() &&
         "canonicalizeShuffleMaskWithHorizOp "
         "shouldn't alter the shuffle mask size");

  // Canonicalize undef-heavy constant operands so the undefined elements
  // don't block later folds.
  auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
    // ...
    if (Undefs.any() /* ... */) {
      // ... (rebuild the constant with defined values)
    }
    return V;
  };
  V1 = CanonicalizeConstant(V1);
  V2 = CanonicalizeConstant(V2);
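// VECTOR_COMPRESS lowering: AVX512 only. Sub-512-bit vectors with 32/64-bit
// elements are widened to 512 bits to reach the native VPCOMPRESS patterns;
// small-element vectors are widened element-wise instead.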
19011 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
19026 if (NumVecBits != 128 && NumVecBits != 256)
19029 if (NumElementBits == 32 || NumElementBits == 64) {
19030 unsigned NumLargeElements = 512 / NumElementBits;
19038 Subtarget, DAG,
DL);
19042 Subtarget, DAG,
DL);
19050 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
19051 VecVT == MVT::v16i16) {
19056 Passthru = Passthru.
isUndef()
  MVT VT = Op.getSimpleValueType();
  // ...
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
  if (CondEltSize == 1)
    return SDValue();
  // ...
  // Only SSE4.1 has the variable blend instructions this path relies on.
  if (!Subtarget.hasSSE41())
    return SDValue();
  // ...
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return SDValue();
  // ...
  // If the condition element size doesn't match the value element size,
  // extend or truncate the condition to match.
  if (CondEltSize != EltSize) {
    // ...
  }
  // ...
  // Pre-AVX2 256-bit vectors with small elements have no native VSELECT and,
  // without XOP, must be split or emulated.
  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
      !Subtarget.hasXOP()) {
    // ...
    if (FreeCond && (FreeLHS || FreeRHS)) {
      // ...
    }
  }
  // ...
  if (Subtarget.hasAVX2()) {
    // ...
  }
  // ...
  switch (VT.SimpleTy) {
  // ...
  case MVT::v16f16: {
    // ... (bitcast to v16i16 and reuse the integer blend path)
  }
  // ...
  }
  MVT VT = Op.getSimpleValueType();
  // ...
  SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
                                Op.getOperand(1));
  // ...
  if (VT == MVT::f32) {
    // f32 extracts are only worthwhile here when the single user is a store
    // or an i32 bitcast; otherwise leave it to the default expansion.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->user_begin();
    if ((User->getOpcode() != ISD::STORE /* ... */) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    // ...
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // ... (shuffle the element into lane 0 and extract it)
  }
  // ...
  MVT EltVT = Op.getSimpleValueType();
  assert((EltVT == MVT::i1) &&
         "Unexpected vector type in ExtractBitFromMaskVector");
  // ...
  if (NumElts == 1) {
    // ... (v1i1: extract the scalar bit directly)
  }
  // ...
  unsigned IdxVal = IdxC->getZExtValue();
  // ... (KSHIFTR the bit into position 0 and extract it)

// Helper to find all the extracted elements of a vector.
static APInt getExtractedDemandedElts(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  APInt DemandedElts = APInt::getZero(NumElts);
  for (SDNode *User : N->users()) {
    switch (User->getOpcode()) {
    case X86ISD::PEXTRB:
    case X86ISD::PEXTRW:
    case ISD::EXTRACT_VECTOR_ELT:
      if (!isa<ConstantSDNode>(User->getOperand(1))) {
        DemandedElts.setAllBits();
        return DemandedElts;
      }
      DemandedElts.setBit(User->getConstantOperandVal(1));
      break;
    case ISD::BITCAST: {
      if (!User->getValueType(0).isSimple() ||
          !User->getValueType(0).isVector()) {
        DemandedElts.setAllBits();
        return DemandedElts;
      }
      // ... (scale the demanded elements through the bitcast)
      break;
    }
    default:
      DemandedElts.setAllBits();
      return DemandedElts;
    }
  }
  return DemandedElts;
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  // ...
  unsigned IdxVal = IdxC->getZExtValue();
  // ...
  // Modulo the index down into the relevant 128-bit chunk.
  IdxVal &= ElemsPerChunk - 1;
  // ...
  MVT VT = Op.getSimpleValueType();

  if (VT == MVT::i16) {
    // ...
    if (Subtarget.hasFP16()) {
      // ...
    }
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
                                  Op.getOperand(1));
    // ...
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  if (VT == MVT::i8) {
    // Narrow the byte extraction: pull out the containing dword (or word)
    // and shift the wanted byte down.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
      // ...
      int ShiftVal = (IdxVal % 4) * 8;
      // ...
    }

    int WordIdx = IdxVal / 2;
    if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
      // ...
      int ShiftVal = (IdxVal % 2) * 8;
      // ...
    }
    // ...
  }
  // ...
  Mask[0] = static_cast<int>(IdxVal);
  // ... (shuffle the element to lane 0 and extract)
  int Mask[2] = { 1, -1 };
  // ... (move the high half down for the upper-element case)
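// INSERT_VECTOR_ELT lowering: vXi1 goes through the mask-register path,
// constant zero/all-ones inserts become OR/blend tricks, and everything else
// is matched to BLENDI / PINSRW / PINSRB / INSERTPS style nodes below.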
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  // ...
  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);
  // ...
  if (EltVT == MVT::bf16) {
    // ... (bitcast to the equivalent integer type and recurse)
  }
  // ...
  // Variable insertion indices need either AVX512BW, 32-bit+ elements with
  // AVX512, or f32/f64 with SSE4.1; otherwise fall back.
  if (!(Subtarget.hasBWI() ||
        (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
        (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
    return SDValue();
  // ...
  // Compare an iota vector against a splat of the index and select the new
  // scalar into the matching lane.
  for (unsigned I = 0; I != NumElts; ++I) {
    // ... (append constant I to Indices)
  }
  return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
                         ISD::CondCode::SETEQ);
  // ...
  if (N2C->getAPIntValue().uge(NumElts))
    return SDValue();
  uint64_t IdxVal = N2C->getZExtValue();
  // ...
  if (IsZeroElt || IsAllOnesElt) {
    // Lower insertion of all-ones elements as an OR with a constant vector
    // when no blend instruction is available.
    if (IsAllOnesElt &&
        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
      // ... (materialize ZeroCst/OnesCst scalars)
      CstVectorElts[IdxVal] = OnesCst;
      // ... (OR N0 with the built constant vector)
    }
    // ...
    if (Subtarget.hasSSE41() /* ... */) {
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      // ... (shuffle N0 against the zero/all-ones vector)
    }
  }
  // ...
  // Insertions into the lowest element of float/int vectors can use a blend.
  if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
      (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
    // ...
    return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                       DAG.getTargetConstant(1, dl, MVT::i8));
  }
  // ...
  unsigned NumEltsIn128 = 128 / EltSizeInBits;
  assert(isPowerOf2_32(NumEltsIn128) &&
         "Vectors will always have power-of-two number of elements.");

  // If we are not inserting into the low 128-bit vector chunk,
  // then prefer the broadcast+blend sequence.
  if (IdxVal >= NumEltsIn128 &&
      ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
       (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
        X86::mayFoldLoad(N1, Subtarget)))) {
    // ...
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    // ... (broadcast N1 and blend with N0)
  }
  // ...
  // Otherwise insert within the 128-bit subvector holding IdxVal.
  unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
  // ...
  if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
      EltVT == MVT::f16 || EltVT == MVT::i64) {
    // ... (direct insert-element path)
  }
  if (EltVT == MVT::i16 || EltVT == MVT::i8) {
    // ... (widen the scalar to i32 for PINSRW/PINSRB)
  }
  // ...
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }
    // ...
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // ... (lowest-element case: a simple blend)
      return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
                         DAG.getTargetConstant(1, dl, MVT::i8));
      // ... (otherwise a full INSERTPS with destination/zero mask)
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }
    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
      // ... (PINSRD/PINSRQ path)
    }
  }
  // ...
  MVT XTy = X.getSimpleValueType();
  // ...
  // Scalar f16 without native FP16 support must be promoted first.
  if (!Subtarget.hasFP16()) {
    // ... (promote to f32)
  }
  // ...
  // Scalar f16/f32/f64: splat into a 128-bit vector and use SCALEF.
  MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
                            128 / X.getSimpleValueType().getSizeInBits());
  // ...
  return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
  // ...
  if (Subtarget.hasFP16()) {
    if (Subtarget.hasVLX())
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    // ... (widen to 512 bits, SCALEF, then extract)
  }
  // ...
  MVT IntVT = X.getSimpleValueType().changeTypeToInteger();
  // ...
  if (Subtarget.hasFP16()) {
    // ...
    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
  }
  // ...
  MVT OpVT = Op.getSimpleValueType();
  // ...
  assert(OpVT.is128BitVector() && OpVT.isInteger() &&
         "Expected an SSE type!");
  // ...
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16())) {
    // ... (move the scalar directly into the vector register)
  }
  // ...
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
  // ...
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");
  // ...
  uint64_t IdxVal = Op.getConstantOperandVal(1);
  // ... (KSHIFTR the mask register and extract the subvector)
unsigned X86TargetLowering::getGlobalWrapperKind(
    const GlobalValue *GV, const unsigned char OpFlags) const {
  // References that cannot be RIP-relative use a normal wrapper.
  // ...
  return X86ISD::Wrapper;
  // ...
  if (Subtarget.isPICStyleRIPRel() &&
      /* ... */)
    return X86ISD::WrapperRIP;
  // ...
  // GOTPCREL references must always use RIP.
  // ...
  return X86ISD::WrapperRIP;
  // ...
  return X86ISD::Wrapper;
}

SDValue X86TargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  // ...
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
  // ...
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // ...
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  // ...
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
  EVT PtrVT = Op.getValueType();
  // ...
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // ...
}

SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}

SDValue X86TargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
  // ...
  EVT PtrVT = Op.getValueType();
  // ...
  Result = DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT,
                       Result);
  // ...
}
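// Shared lowering for global addresses and external symbols. ForCall callers
// can skip the wrapper node entirely when no load, PIC base register, or
// offset is required.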
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall,
                                                 bool *IsImpCall) const {
  // Unpack the global address or external symbol.
  SDLoc dl(Op);
  const GlobalValue *GV = nullptr;
  int64_t Offset = 0;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = G->getGlobal();
    Offset = G->getOffset();
  } else {
    const auto *ES = cast<ExternalSymbolSDNode>(Op);
    ExternalSym = ES->getSymbol();
  }

  // Calculate some flags for address lowering.
  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  bool NeedsLoad = isGlobalStubReference(OpFlags);
  // ...
  EVT PtrVT = Op.getValueType();
  SDValue Result;
  if (GV) {
    // Create a target global address if this is a global. If possible, fold
    // the offset into the target global address.
    int64_t GlobalOffset = 0;
    // ...
  } else {
    // ... (target external symbol)
  }

  // For calls, avoid the wrapper if it won't be needed.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;

  if (Mod.getModuleFlag("import-call-optimization")) {
    assert(ForCall && "Should only enable import call optimization if we are "
                      "lowering a call");
    // ...
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
  // ... (apply PIC base, stub load, and residual offset)
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain,
                          GlobalAddressSDNode *GA, const EVT PtrVT,
                          unsigned ReturnReg, unsigned char OperandFlags,
                          bool LoadGlobalBaseReg = false,
                          bool LocalDynamic = false) {
  // ...
  if (LocalDynamic && UseTLSDESC) {
    // Reuse the TLSDESC base address already produced earlier in the DAG.
    // ...
    assert(/* ... && */ "Unexpected TLSDESC DAG");
    // ...
    auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
    assert(CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
           "Unexpected TLSDESC DAG");
    Ret = SDValue(CopyFromRegOp, 0);
  }
  // ...
  unsigned CallType = UseTLSDESC     ? X86ISD::TLSDESC
                      : LocalDynamic ? X86ISD::TLSBASEADDR
                                     : X86ISD::TLSADDR;
  // ...
  if (LoadGlobalBaseReg) {
    // ... (feed the GOT base register into the call via glue)
    Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
  } else {
    Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
  }
  // ... (read the result back out of ReturnReg)
// Lower ISD::GlobalTLSAddress using the "general dynamic" model.
// ...
                                               bool Is64Bit, bool Is64BitLP64) {
  // ...
  unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
  // ... (emit the TLSADDR pseudo and read the result register)
}

// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
// ...
  unsigned WrapperKind = X86ISD::Wrapper;
  // ...

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// ...
  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64; one exception is
  // the initial-exec model on 64-bit targets.
  unsigned WrapperKind = X86ISD::Wrapper;
  // ...
  WrapperKind = X86ISD::WrapperRIP;
  // ...

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // ...
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = Op.getValueType();
  // ...
  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit()) {
        if (Subtarget.isTarget64BitLP64()) {
          // ... (LP64 general-dynamic path)
        }
        // ... (ILP32 general-dynamic path)
      }
      // ... (32-bit general-dynamic path)
    case TLSModel::LocalDynamic:
      // ... (local-dynamic, passing Subtarget.isTarget64BitLP64())
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      // ... (exec models, passing PositionIndependent)
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = 0;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base register.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32) {
      // ...
      WrapperKind = X86ISD::Wrapper;
    } else {
      // ...
      WrapperKind = X86ISD::WrapperRIP;
    }
    // ...
    // Call the TLS helper whose address is stashed in the TLV record.
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    // ...
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    // ...
    // The address of the thread-local variable comes back in RAX/EAX.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isOSWindows()) {
    // Windows TLS: walk the TEB's ThreadLocalStoragePointer and _tls_index to
    // reach this module's TLS block.
    // ...
    SDValue TlsArray = Subtarget.is64Bit()
                           ? /* ... (64-bit TEB offset) */
                           : (Subtarget.isTargetWindowsGNU()
                                  ? /* ... */ : /* ... */);
    // ...
    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (/* ... (module has TLS index zero) */) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
      // ...
      res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
    }
    // ... (add the symbol's secrel offset)
  }
  // ...
  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
    // ...
  }
  // ...

// Try to use a packed vector operation to handle i64 int-to-fp on 32-bit
// targets when AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  // Using 256-bit to ensure result is 128-bits for f32 case.
  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
  // ...
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
  if (IsStrict) {
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
                                 {Op.getOperand(0), InVec});
    // ... (extract element 0 and merge the chain)
  }
  // ...
}

static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl,
                                 SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
    return SDValue();

  assert(Subtarget.hasFP16() && "Expected FP16");

  // Pack the i64 into a vector, do the operation and extract.
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
                               {Op.getOperand(0), InVec});
  // ... (extract element 0 and merge the chain)
  case ISD::SINT_TO_FP:
    // CVTDQ2PS or (V)CVTDQ2PD
    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
      return false;
    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

  case ISD::UINT_TO_FP:
    // VCVTUDQ2PS or VCVTUDQ2PD
    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
      return false;
    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
  // ...
  // Only extracts from the low 128-bit subvector can reuse the vector cast.
  if (FromVT != Vec128VT)
    return SDValue();
  // ...
  MVT SrcVT = X.getSimpleValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
    return SDValue();

  // See if we have 128-bit vector cast instructions for this type of cast.
  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
      (IntVT != MVT::i32 && IntVT != MVT::i64))
    return SDValue();
  // ...
  // We need target-specific opcodes when the cast narrows or widens.
  unsigned ToIntOpcode =
      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
  unsigned ToFPOpcode =
      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

  unsigned Width = 128;
  // i64 (and unsigned) conversions need AVX512DQ; with VLX they can stay
  // 128-bit, otherwise widen to 512 bits.
  if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
    // ...
  } else if (IsUnsigned || IntVT == MVT::i64) {
    // ...
    Width = 512;
  }
  // ...
  MVT VecSrcVT, VecIntVT, VecVT;
  unsigned NumElts;
  unsigned SrcElts, VTElts;
  if (Width == 512) {
    NumElts = std::min(Width / IntSize, Width / SrcSize);
    // ...
  } else {
    NumElts = Width / IntSize;
    SrcElts = Width / SrcSize;
    VTElts = Width / VTSize;
  }
  // ... (build the widened vectors and perform the round trip)
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);

  if (Subtarget.hasDQI()) {
    assert(!Subtarget.hasVLX() && "Unexpected features");

    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");

    // With AVX512DQ, but not VLX, we need to widen to get a 512-bit result.
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;

    // ... (pad Src out to v8i64; strict ops pad with zeroes to avoid
    //      spurious exceptions)
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), Src});
      Chain = Res.getValue(1);
    } else {
      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
    }
    // ... (extract the low subvector and merge the chain)
  }

  // Unsigned v4i64 -> v4f32: convert each element as signed and fix up.
  if (VT != MVT::v4f32 || IsSigned)
    return SDValue();
  // ...
  SmallVector<SDValue, 4> SignCvts(4);
  SmallVector<SDValue, 4> Chains(4);
  for (int i = 0; i != 4; ++i) {
    // ... (extract element i into Elt)
    if (IsStrict) {
      SignCvts[i] =
          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
                      {Op.getOperand(0), Elt});
      Chains[i] = SignCvts[i].getValue(1);
    } else {
      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
    }
  }
  SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, DL, SignCvts);
  // ...
  SDValue Slow, Chain;
  if (IsStrict) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    // Doubling the signed result recovers the unsigned value for inputs with
    // the sign bit set.
    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
                       {Chain, SignCvt, SignCvt});
    Chain = Slow.getValue(1);
  } else {
    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
  }
  // ... (select between the fast and slow results per element)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
                                 SelectionDAG &DAG) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
  MVT VT = Op.getSimpleValueType();
  MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
  SDValue Rnd = DAG.getIntPtrConstant(0, dl);

  // Convert at the wider f32 precision, then round to the requested type.
  if (IsStrict)
    return DAG.getNode(
        ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
        {Chain,
         DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
         Rnd});
  return DAG.getNode(ISD::FP_ROUND, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}

static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
                              const X86Subtarget &Subtarget) {
  if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
    if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
      return true;
    if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
      return true;
    if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
      return true;
    if (VT == MVT::v16i32)
      return true;
    if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
      return true;
    if (VT == MVT::v8i64 && Subtarget.hasDQI())
      return true;
    if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
        (VT == MVT::v2i64 || VT == MVT::v4i64))
      return true;
  }
  return false;
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  // ...
  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);
  // ...
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      // ... (widen the v2i32 source to v4i32)
      if (IsStrict)
        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                           {Chain, /* ... widened source ... */});
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         /* ... widened source ... */);
    }
    // ...
    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
      return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");
  // ...
  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && UseSSEReg)
    return Op;
  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
    return Op;
  // ...
  // SSE has no i16 conversion, so promote.
  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
    // ... (sign-extend to i32 and recurse)
  }
  // ...
  if (VT == MVT::f128 || !Subtarget.hasX87())
    return SDValue();

  SDValue ValueToStore = Src;
  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
    // Bitcasting to f64 allows a single 64-bit store from an SSE register,
    // avoiding the penalty of two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  // ... (spill to a stack slot and FILD from it)
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
  // ...
  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
  std::pair<SDValue, SDValue> Tmp =
      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
  // ... (return Tmp, merging the chain for strict ops)
  // Build the FILD: to SSE destinations we go through an f80 intermediate.
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
  if (useSSE)
    Tys = DAG.getVTList(MVT::f80, MVT::Other);
  else
    Tys = DAG.getVTList(DstVT, MVT::Other);

  SDValue FILDOps[] = {Chain, Pointer};
  SDValue Result =
      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
                              Alignment, MachineMemOperand::MOLoad);
  Chain = Result.getValue(1);

  if (useSSE) {
    // ... (store the f80 result back to a stack slot)
    SDValue FSTOps[] = {Chain, Result, StackSlot};
    // ...
    // Reload it as the final SSE value.
    Result = DAG.getLoad(
        DstVT, DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
    Chain = Result.getValue(1);
  }

  return { Result, Chain };
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  bool IsOptimizingSize = DAG.shouldOptForSize();
  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");

  // Unpack the i64 against the bit patterns of 2^52 and 2^84, subtract the
  // same constants, and horizontally add the two halves.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  // ...
  CV1.push_back(
      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                        APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                        APInt(64, 0x4530000000000000ULL))));
  // ... (load the constants and unpack the i64 against CV0)
  SDValue CLod1 = DAG.getLoad(
      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  // ...
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;
  if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    // ... (shuffle the high half down and FADD)
  }
  // ... (extract element 0 as the f64 result)
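// 32-bit unsigned integer to f64 without native unsigned conversions: OR the
// value into the low mantissa bits of the double constant 2^52
// (0x4330000000000000) and subtract that same constant; the difference is
// exactly the original integer. An f32 result needs one final rounding step.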
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  // ... (zero-extend the i32 into an XMM register, OR in the exponent bits
  //      of 2^52, and prepare the matching Bias constant)
  if (Op.getNode()->isStrictFPOpcode()) {
    SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
                              {Chain, Or, Bias});
    if (Op.getValueType() == Sub.getValueType())
      return Sub;
    // Handle final rounding.
    std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
  }
  // ... (non-strict FSUB and final rounding)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
  // ...
  if (Subtarget.hasAVX512()) {
    if (!Subtarget.hasVLX()) {
      // ... (pad the input and widen the operation to a legal 512-bit form)
      SDValue Res = DAG.getNode(Op.getOpcode(), DL, {MVT::v4f64, MVT::Other},
                                {Op.getOperand(0), N0});
      // ... (extract the low v2f64)
    }
    // Legalize to a v4i32 operand type.
    // ...
    if (IsStrict)
      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
                         {Op.getOperand(0), N0});
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
  }

  // Zero extend to 2i64 and OR with the bit pattern of 2^52; subtracting the
  // same bias in floating point recovers the exact integer values.
  // ...
  if (IsStrict)
    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
                       {Op.getOperand(0), Or, VBias});
  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
  MVT VecIntVT = V.getSimpleValueType();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  if (Subtarget.hasAVX512()) {
    // With AVX512, but not VLX, widen to a legal 512-bit operation.
    assert(!Subtarget.hasVLX() && "Unexpected features");
    MVT VT = Op->getSimpleValueType(0);

    // v8i32->v8f64 is legal with AVX512 so just return it.
    if (VT == MVT::v8f64)
      return Op;

    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
            VT == MVT::v8f16) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
    MVT WideIntVT = MVT::v16i32;
    if (VT == MVT::v4f64) {
      WideVT = MVT::v8f64;
      WideIntVT = MVT::v8i32;
    }
    // ... (pad V out to WideIntVT)
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), V});
      // ...
    }
    // ... (extract the original width)
  }

  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
      Op->getSimpleValueType(0) == MVT::v4f64) {
    // AVX: zero-extend each i32 into an i64 lane, OR in the broadcast bit
    // pattern of 2^52, and subtract the bias.
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
    // ...
    SDValue VBias = DAG.getMemIntrinsicNode(
        X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
        Align(8), MachineMemOperand::MOLoad);
    // ...
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
                         {Op.getOperand(0), Or, VBias});
    return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
  }

  // SSE path: blend each 16-bit half of every lane into the mantissa of a
  // float with a known exponent, then recombine with an FSUB/FADD pair.
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to
  // v4f64, abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();
  // ...
  MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
  // ...
  Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                    VecCstLowBitcast,
                    DAG.getTargetConstant(0xaa, DL, MVT::i8));
  // ...
  High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                     VecCstHighBitcast,
                     DAG.getTargetConstant(0xaa, DL, MVT::i8));
  // ...
  if (IsStrict) {
    SDValue FHigh =
        DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
                    {Op.getOperand(0), HighBitcast, VecCstFSub});
    return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
                       {FHigh.getValue(1), LowBitcast, FHigh});
  }
  // ... (non-strict FSUB/FADD recombination)
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  // ...

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDLoc dl(Op);
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  // ...
  if (DstVT == MVT::f128)
    return SDValue();
  // ...
  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);
  // ...
  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal using
    // VCVTUSI2SS/SD; same for i64 in 64-bit mode.
    return Op;
  }
  // ...
  // The i32 transform isn't correct for 0 when rounding to negative
  // infinity (it produces -0.0), so disable it under strictfp.
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
      /* ...!IsStrict... */)
    return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
      /* ...!IsStrict... */)
    return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
      (DstVT == MVT::f32 || DstVT == MVT::f64))
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  // ...
  Align SlotAlign(8);
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
  if (SrcVT == MVT::i32) {
    // Store the i32 plus a zero high half and FILD the combined i64.
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    // ...
    std::pair<SDValue, SDValue> Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
    // ...
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Src;
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 allows a single 64-bit store from an SSE register.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store =
      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  // For i64 sources we must add 2^64 (the f80 'fudge' constant below) after
  // the FILD whenever the sign bit of the input was set.
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  // ...
  APInt FF(64, 0x5F80000000000000ULL);
  // ...
  if (IsStrict) {
    unsigned Opc = ISD::STRICT_FADD;
    // Windows needs the precision control changed to 80 bits around this add.
    if (Subtarget.isOSWindows() && DstVT == MVT::f32)
      Opc = X86ISD::STRICT_FP80_ADD;

    Add = DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
    // ...
    if (DstVT == MVT::f80) {
      // ...
    }
    // ...
  } else {
    unsigned Opc = ISD::FADD;
    // Windows needs the precision control changed to 80 bits around this add.
    if (Subtarget.isOSWindows() && DstVT == MVT::f32)
      Opc = X86ISD::FP80_ADD;
    // ...
  }
  // ... (round the f80 result to DstVT)
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, SDValue &Chain) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
  EVT TheVT = Value.getValueType();
  // ...
  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 is promoted before reaching here; f128 never uses this path.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup to handle
  // values above the maximum signed i64. A FIST is always used for the
  // 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST; the low
    // 32 bits of the FIST result hold the correct uint32 value.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  unsigned MemSize = DstTy.getStoreSize();
  // ...
  if (UnsignedFixup) {
    // Compare the input against 2^63: smaller values convert directly,
    // larger ones subtract 2^63 first and OR the sign bit back afterwards.
    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    // ...
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");
    // ...
    Chain = Cmp.getValue(1);
    // ...
    if (IsStrict) {
      Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other},
                          {Chain, Value, FltOfs});
      Chain = Value.getValue(1);
    }
    // ...
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // ...
  }
  // ...
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  // ...
  assert(FLDSize <= MemSize && "Stack slot not big enough");
  // ...
  Chain = Value.getValue(1);
  // ... (emit FIST_TO_INT_IN_MEM and reload the integer result)
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();

  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");