70#define DEBUG_TYPE "x86-isel"
73 "x86-experimental-pref-innermost-loop-alignment",
cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
81 "x86-br-merging-base-cost",
cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
92 "x86-br-merging-ccmp-bias",
cl::init(6),
93 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that the target "
94 "supports conditional compare instructions."),
99 cl::desc(
"Replace narrow shifts with wider shifts."),
103 "x86-br-merging-likely-bias",
cl::init(0),
104 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that it is likely "
105 "that all conditionals will be executed. For example for merging "
106 "the conditionals (a == b && c > d), if its known that a == b is "
107 "likely, then it is likely that if the conditionals are split "
108 "both sides will be executed, so it may be desirable to increase "
109 "the instruction cost threshold. Set to -1 to never merge likely "
114 "x86-br-merging-unlikely-bias",
cl::init(-1),
116 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
117 "that all conditionals will be executed. For example for merging "
118 "the conditionals (a == b && c > d), if its known that a == b is "
119 "unlikely, then it is unlikely that if the conditionals are split "
120 "both sides will be executed, so it may be desirable to decrease "
121 "the instruction cost threshold. Set to -1 to never merge unlikely "
126 "mul-constant-optimization",
cl::init(
true),
127 cl::desc(
"Replace 'mul x, Const' with more effective instructions like "
134 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
151 if (Subtarget.isAtom())
153 else if (Subtarget.is64Bit())
162 if (Subtarget.hasSlowDivide32())
164 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
168 if (Subtarget.canUseCMPXCHG16B())
170 else if (Subtarget.canUseCMPXCHG8B())
183 if (Subtarget.is64Bit())
200 for (
auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
206 if (Subtarget.canUseCMOV()) {
209 if (Subtarget.is64Bit())
218 if (Subtarget.is64Bit())
226 if (Subtarget.is64Bit())
237 if (Subtarget.is64Bit())
241 if (!Subtarget.useSoftFloat()) {
305 if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
311 if (Subtarget.hasSSE2()) {
314 for (
MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 if (Subtarget.is64Bit()) {
323 if (Subtarget.hasAVX10_2()) {
324 for (
MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v32i8}) {
332 for (
MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
337 if (Subtarget.is64Bit()) {
348 if (!Subtarget.hasSSE2()) {
351 if (Subtarget.is64Bit()) {
356 }
else if (!Subtarget.is64Bit())
369 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
380 for (
auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
385 if (Subtarget.is64Bit())
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
414 if (!Subtarget.hasBMI()) {
417 if (Subtarget.is64Bit()) {
423 if (Subtarget.hasLZCNT()) {
429 for (
auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
430 if (VT == MVT::i64 && !Subtarget.is64Bit())
444 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ?
Custom :
Expand);
451 for (
auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
456 for (
MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
469 if (Subtarget.is64Bit())
471 if (Subtarget.hasPOPCNT()) {
485 if (!Subtarget.hasMOVBE())
489 for (
auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 for (
auto VT : { MVT::i32, MVT::i64 }) {
517 if (VT == MVT::i64 && !Subtarget.is64Bit())
528 for (
auto VT : { MVT::i32, MVT::i64 }) {
529 if (VT == MVT::i64 && !Subtarget.is64Bit())
536 if (Subtarget.hasSSEPrefetch())
542 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
552 if (!Subtarget.is64Bit())
555 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
562 if (Subtarget.canUseCMPXCHG16B())
566 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
567 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
580 if (Subtarget.isTargetPS())
588 bool Is64Bit = Subtarget.is64Bit();
643 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
647 : &X86::FR16RegClass);
649 : &X86::FR32RegClass);
651 : &X86::FR64RegClass);
659 for (
auto VT : { MVT::f32, MVT::f64 }) {
680 setF16Action(MVT::f16,
Promote);
736 }
else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
737 (UseX87 || Is64Bit)) {
775 for (
auto VT : { MVT::f32, MVT::f64 }) {
788 if (UseX87 && (
getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
789 addLegalFPImmediate(
APFloat(+0.0f));
790 addLegalFPImmediate(
APFloat(+1.0f));
791 addLegalFPImmediate(
APFloat(-0.0f));
792 addLegalFPImmediate(
APFloat(-1.0f));
794 addLegalFPImmediate(
APFloat(+0.0f));
799 addLegalFPImmediate(
APFloat(+0.0));
800 addLegalFPImmediate(
APFloat(+1.0));
801 addLegalFPImmediate(
APFloat(-0.0));
802 addLegalFPImmediate(
APFloat(-1.0));
804 addLegalFPImmediate(
APFloat(+0.0));
835 addLegalFPImmediate(TmpFlt);
837 addLegalFPImmediate(TmpFlt);
843 addLegalFPImmediate(TmpFlt2);
845 addLegalFPImmediate(TmpFlt2);
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
896 : &X86::VR128RegClass);
973 for (
auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1065 : &X86::VR128RegClass);
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1095 : &X86::VR128RegClass);
1100 : &X86::VR128RegClass);
1102 : &X86::VR128RegClass);
1104 : &X86::VR128RegClass);
1106 : &X86::VR128RegClass);
1108 : &X86::VR128RegClass);
1110 for (
auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1151 if (Subtarget.hasPCLMUL()) {
1152 for (
auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1161 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174 for (
auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::i64}) {
1197 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1217 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1225 for (
auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1230 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1236 setF16Action(MVT::v8f16,
Expand);
1261 for (
auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1307 if (!Subtarget.hasAVX512())
1335 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1339 if (VT == MVT::v2i64)
continue;
1353 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1359 for (
auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1367 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1372 for (
auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1385 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1386 for (
MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1426 for (
auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1441 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1453 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1458 for (
MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1459 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1465 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1469 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1470 bool HasInt256 = Subtarget.hasInt256();
1473 : &X86::VR256RegClass);
1475 : &X86::VR256RegClass);
1477 : &X86::VR256RegClass);
1479 : &X86::VR256RegClass);
1481 : &X86::VR256RegClass);
1483 : &X86::VR256RegClass);
1485 : &X86::VR256RegClass);
1487 for (
auto VT : { MVT::v8f32, MVT::v4f64 }) {
1552 if (!Subtarget.hasAVX512())
1557 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1570 if (VT == MVT::v4i64)
continue;
1591 for (
auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1602 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1621 if (Subtarget.hasAnyFMA()) {
1622 for (
auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1623 MVT::v2f64, MVT::v4f64 }) {
1629 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1670 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1678 for (
auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1700 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1701 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1708 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1709 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1714 for (
MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1715 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1726 setF16Action(MVT::v16f16,
Expand);
1736 if (Subtarget.hasPCLMUL()) {
1737 for (
auto VT : {MVT::v8i32, MVT::v4i64}) {
1750 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1751 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1755 if (Subtarget.hasGFNI()) {
1761 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1762 Subtarget.hasF16C()) {
1763 for (
MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1767 for (
MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1782 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1810 if (!Subtarget.hasDQI()) {
1823 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1829 for (
auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1832 for (
auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1845 for (
auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1848 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1849 for (
MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1858 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1859 bool HasBWI = Subtarget.hasBWI();
1879 for (
MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1895 if (Subtarget.hasDQI())
1913 for (
MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1920 for (
MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1957 if (!Subtarget.hasVLX()) {
1958 for (
auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1959 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1985 for (
auto VT : { MVT::v16f32, MVT::v8f64 }) {
2002 for (
auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2029 for (
auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2060 for (
auto VT : { MVT::v16i32, MVT::v8i64 }) {
2071 for (
auto VT : { MVT::v64i8, MVT::v32i16 }) {
2092 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2098 if (Subtarget.hasDQI())
2101 if (Subtarget.hasCDI()) {
2103 for (
auto VT : { MVT::v16i32, MVT::v8i64} ) {
2108 if (Subtarget.hasVPOPCNTDQ()) {
2109 for (
auto VT : { MVT::v16i32, MVT::v8i64 })
2117 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2118 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2121 for (
auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2122 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2133 setF16Action(MVT::v32f16,
Expand);
2142 for (
auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2149 for (
auto VT : { MVT::v64i8, MVT::v32i16 }) {
2158 if (Subtarget.hasVBMI2()) {
2159 for (
auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2169 if (Subtarget.hasPCLMUL()) {
2170 for (
auto VT : {MVT::v16i32, MVT::v8i64}) {
2180 if (Subtarget.hasGFNI()) {
2186 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2187 for (
auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2202 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2203 for (
MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2204 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2205 MVT::v16f32, MVT::v8f64})
2214 if (Subtarget.hasDQI()) {
2219 "Unexpected operation action!");
2227 for (
auto VT : { MVT::v2i64, MVT::v4i64 }) {
2235 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2244 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2245 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2248 if (Subtarget.hasDQI()) {
2259 if (Subtarget.hasCDI()) {
2260 for (
auto VT : {MVT::i256, MVT::i512}) {
2261 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2268 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2273 if (Subtarget.hasVPOPCNTDQ()) {
2274 for (
auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
2282 for (
MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2283 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2284 MVT::v16i16, MVT::v8i8})
2289 for (
MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2293 if (Subtarget.hasVLX())
2294 for (
MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2295 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2299 if (Subtarget.hasVBMI2())
2300 for (
MVT VT : {MVT::v32i16, MVT::v64i8})
2304 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2305 for (
MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2311 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2315 for (
auto VT : { MVT::v32i1, MVT::v64i1 }) {
2328 for (
auto VT : { MVT::v16i1, MVT::v32i1 })
2336 for (
auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2337 MVT::v16f16, MVT::v8f16}) {
2346 if (Subtarget.hasBITALG()) {
2347 for (
auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2352 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2353 auto setGroup = [&] (
MVT VT) {
2423 if (Subtarget.useAVX512Regs()) {
2424 setGroup(MVT::v32f16);
2475 if (Subtarget.hasVLX()) {
2476 setGroup(MVT::v8f16);
2477 setGroup(MVT::v16f16);
2528 if (!Subtarget.useSoftFloat() &&
2529 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2531 : &X86::VR128RegClass);
2532 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2533 : &X86::VR256RegClass);
2539 for (
auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2540 setF16Action(VT,
Expand);
2541 if (!Subtarget.hasBF16())
2558 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2559 Subtarget.useAVX512Regs()) {
2561 setF16Action(MVT::v32bf16,
Expand);
2572 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2584 for (
auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2597 for (
auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2603 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2616 if (Subtarget.hasBWI()) {
2621 if (Subtarget.hasFP16()) {
2653 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2661 if (!Subtarget.is64Bit()) {
2671 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2672 if (VT == MVT::i64 && !Subtarget.is64Bit())
2694 if (Subtarget.isTargetWin64()) {
2713 if (Subtarget.is32Bit() &&
2714 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2742 if (Subtarget.isOSWindows()) {
2859 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2864 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2870 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2877 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2878 !Subtarget.hasBWI())
2905 bool AssumeSingleUse,
bool IgnoreAlignment) {
2906 if (!AssumeSingleUse && !
Op.hasOneUse())
2913 if (!IgnoreAlignment && !Subtarget.
hasAVX() &&
2914 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2915 Ld->getAlign() <
Align(16))
2926 bool AssumeSingleUse) {
2927 assert(Subtarget.
hasAVX() &&
"Expected AVX for broadcast from memory");
2934 return !Ld->isVolatile() ||
2939 if (!
Op.hasOneUse())
2952 if (
Op.hasOneUse()) {
2953 unsigned Opcode =
Op.getNode()->user_begin()->getOpcode();
2967 EVT VT =
Op.getValueType();
2968 unsigned Opcode =
Op.getOpcode();
2969 if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
3006 default:
return false;
3007 case X86ISD::BLENDI:
3008 case X86ISD::PSHUFB:
3009 case X86ISD::PSHUFD:
3010 case X86ISD::PSHUFHW:
3011 case X86ISD::PSHUFLW:
3013 case X86ISD::INSERTPS:
3014 case X86ISD::EXTRQI:
3015 case X86ISD::INSERTQI:
3016 case X86ISD::VALIGN:
3017 case X86ISD::PALIGNR:
3018 case X86ISD::VSHLDQ:
3019 case X86ISD::VSRLDQ:
3020 case X86ISD::MOVLHPS:
3021 case X86ISD::MOVHLPS:
3022 case X86ISD::MOVSHDUP:
3023 case X86ISD::MOVSLDUP:
3024 case X86ISD::MOVDDUP:
3028 case X86ISD::UNPCKL:
3029 case X86ISD::UNPCKH:
3030 case X86ISD::VBROADCAST:
3031 case X86ISD::VPERMILPI:
3032 case X86ISD::VPERMILPV:
3033 case X86ISD::VPERM2X128:
3034 case X86ISD::SHUF128:
3035 case X86ISD::VPERMIL2:
3036 case X86ISD::VPERMI:
3037 case X86ISD::VPPERM:
3038 case X86ISD::VPERMV:
3039 case X86ISD::VPERMV3:
3040 case X86ISD::VZEXT_MOVL:
3041 case X86ISD::COMPRESS:
3042 case X86ISD::EXPAND:
3049 default:
return false;
3051 case X86ISD::PSHUFB:
3052 case X86ISD::VPERMILPV:
3053 case X86ISD::VPERMIL2:
3054 case X86ISD::VPPERM:
3055 case X86ISD::VPERMV:
3056 case X86ISD::VPERMV3:
3070 int ReturnAddrIndex = FuncInfo->
getRAIndex();
3072 if (ReturnAddrIndex == 0) {
3074 unsigned SlotSize = RegInfo->getSlotSize();
3085 bool HasSymbolicDisplacement) {
3092 if (!HasSymbolicDisplacement)
3110 return Offset < 16 * 1024 * 1024;
3134 switch (SetCCOpcode) {
3159 if (SetCCOpcode ==
ISD::SETGT && RHSC->isAllOnes()) {
3164 if (SetCCOpcode ==
ISD::SETLT && RHSC->isZero()) {
3168 if (SetCCOpcode ==
ISD::SETGE && RHSC->isZero()) {
3172 if (SetCCOpcode ==
ISD::SETLT && RHSC->isOne()) {
3187 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3191 switch (SetCCOpcode) {
3207 switch (SetCCOpcode) {
3268 case Intrinsic::x86_aesenc128kl:
3269 case Intrinsic::x86_aesdec128kl:
3271 Info.ptrVal =
I.getArgOperand(1);
3273 Info.align =
Align(1);
3277 case Intrinsic::x86_aesenc256kl:
3278 case Intrinsic::x86_aesdec256kl:
3280 Info.ptrVal =
I.getArgOperand(1);
3282 Info.align =
Align(1);
3286 case Intrinsic::x86_aesencwide128kl:
3287 case Intrinsic::x86_aesdecwide128kl:
3289 Info.ptrVal =
I.getArgOperand(0);
3291 Info.align =
Align(1);
3295 case Intrinsic::x86_aesencwide256kl:
3296 case Intrinsic::x86_aesdecwide256kl:
3298 Info.ptrVal =
I.getArgOperand(0);
3300 Info.align =
Align(1);
3304 case Intrinsic::x86_cmpccxadd32:
3305 case Intrinsic::x86_cmpccxadd64:
3306 case Intrinsic::x86_atomic_bts:
3307 case Intrinsic::x86_atomic_btc:
3308 case Intrinsic::x86_atomic_btr: {
3310 Info.ptrVal =
I.getArgOperand(0);
3311 unsigned Size =
I.getType()->getScalarSizeInBits();
3319 case Intrinsic::x86_atomic_bts_rm:
3320 case Intrinsic::x86_atomic_btc_rm:
3321 case Intrinsic::x86_atomic_btr_rm: {
3323 Info.ptrVal =
I.getArgOperand(0);
3324 unsigned Size =
I.getArgOperand(1)->getType()->getScalarSizeInBits();
3332 case Intrinsic::x86_aadd32:
3333 case Intrinsic::x86_aadd64:
3334 case Intrinsic::x86_aand32:
3335 case Intrinsic::x86_aand64:
3336 case Intrinsic::x86_aor32:
3337 case Intrinsic::x86_aor64:
3338 case Intrinsic::x86_axor32:
3339 case Intrinsic::x86_axor64:
3340 case Intrinsic::x86_atomic_add_cc:
3341 case Intrinsic::x86_atomic_sub_cc:
3342 case Intrinsic::x86_atomic_or_cc:
3343 case Intrinsic::x86_atomic_and_cc:
3344 case Intrinsic::x86_atomic_xor_cc: {
3346 Info.ptrVal =
I.getArgOperand(0);
3347 unsigned Size =
I.getArgOperand(1)->getType()->getScalarSizeInBits();
3359 switch (IntrData->
Type) {
3364 Info.ptrVal =
I.getArgOperand(0);
3370 ScalarVT = MVT::i16;
3372 ScalarVT = MVT::i32;
3375 Info.align =
Align(1);
3383 Info.ptrVal =
nullptr;
3389 Info.align =
Align(1);
3396 Info.ptrVal =
nullptr;
3402 Info.align =
Align(1);
3416 bool ForCodeSize)
const {
3417 for (
const APFloat &FPImm : LegalFPImmediates)
3418 if (Imm.bitwiseIsEqual(FPImm))
3425 std::optional<unsigned> ByteOffset)
const {
3428 auto PeekThroughOneUserBitcasts = [](
const SDNode *
N) {
3430 N = *
N->user_begin();
3437 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3445 EVT VT = Load->getValueType(0);
3447 !
SDValue(Load, 0).hasOneUse()) {
3448 bool FullWidthUse =
false;
3449 bool AllExtractStores =
true;
3452 if (
Use.getResNo() != 0)
3460 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3461 return Inner->getOpcode() == ISD::STORE;
3465 AllExtractStores =
false;
3472 FullWidthUse =
true;
3475 if (AllExtractStores)
3492 assert(Ty->isIntegerTy());
3494 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3495 if (BitSize == 0 || BitSize > 64)
3506 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3512 if (VT.
isVector() && Subtarget.hasAVX512())
3532 unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.
countr_zero();
3534 if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
3554 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3558 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3559 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3563 unsigned Index)
const {
3570 return Index == 0 ||
3607 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3608 Subtarget.hasBitScanPassThrough() ||
3609 (!Ty->isVectorTy() &&
3610 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3616 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3617 Subtarget.hasBitScanPassThrough();
3624 return !Subtarget.hasSSE2() || VT == MVT::f80;
3628 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3629 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3635 if (!Subtarget.hasAVX512() && !LoadVT.
isVector() && BitcastVT.
isVector() &&
3639 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3665 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3670 if (MemVT.
getSizeInBits() > Subtarget.getPreferVectorWidth())
3677 return Subtarget.hasFastLZCNT();
3688 return Y.getValueType().isScalarInteger();
3692 EVT VT =
Y.getValueType();
3695 if (!Subtarget.hasBMI())
3699 if (VT != MVT::i32 && VT != MVT::i64)
3708 if (VT == MVT::v4i32)
3711 return Subtarget.hasSSE2();
3715 return X.getValueType().isScalarInteger();
3721 unsigned OldShiftOpcode,
unsigned NewShiftOpcode,
3725 X, XC, CC,
Y, OldShiftOpcode, NewShiftOpcode, DAG))
3728 if (
X.getValueType().isScalarInteger())
3735 if (Subtarget.hasAVX2())
3742 EVT VT,
unsigned ShiftOpc,
bool MayTransformRotate,
3743 const APInt &ShiftOrRotateAmt,
const std::optional<APInt> &AndMask)
const {
3747 bool PreferRotate =
false;
3751 PreferRotate = Subtarget.hasAVX512() && (VT.
getScalarType() == MVT::i32 ||
3756 PreferRotate = Subtarget.hasBMI2();
3757 if (!PreferRotate) {
3760 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3765 assert(AndMask.has_value() &&
"Null andmask when querying about shift+and");
3767 if (PreferRotate && MayTransformRotate)
3801 if (PreferRotate || !MayTransformRotate || VT.
isVector())
3811 const Value *Rhs)
const {
3815 if (BaseCost >= 0 && Subtarget.hasCCMP())
3818 if (BaseCost >= 0 &&
Opc == Instruction::And &&
3827 if (BaseCost >= 0 && !Subtarget.hasCCMP() &&
Opc == Instruction::Or &&
3830 return {-1, -1, -1};
3843 N->getOperand(0).getOpcode() ==
ISD::SRL) ||
3845 N->getOperand(0).getOpcode() ==
ISD::SHL)) &&
3846 "Expected shift-shift mask");
3848 EVT VT =
N->getValueType(0);
3849 if ((Subtarget.hasFastVectorShiftMasks() && VT.
isVector()) ||
3850 (Subtarget.hasFastScalarShiftMasks() && !VT.
isVector())) {
3854 return N->getOperand(1) ==
N->getOperand(0).getOperand(1);
3860 EVT VT =
Y.getValueType();
3866 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3874 !Subtarget.isOSWindows())
3924 [CmpVal](
int M) { return isUndefOrEqual(M, CmpVal); });
3940 unsigned NumElts = Mask.size();
3946 unsigned NumElts = Mask.size();
3952 return (Val >=
Low && Val <
Hi);
3995 unsigned NumElts = Mask.size();
4006 unsigned Size,
int Low,
int Step = 1) {
4007 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
4019 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
4035 unsigned NumElts = Mask.size();
4054 WidenedMask.
assign(Mask.size() / 2, 0);
4055 for (
int i = 0,
Size = Mask.size(); i <
Size; i += 2) {
4057 int M1 = Mask[i + 1];
4068 WidenedMask[i / 2] =
M1 / 2;
4072 WidenedMask[i / 2] =
M0 / 2;
4089 WidenedMask[i / 2] =
M0 / 2;
4096 assert(WidenedMask.
size() == Mask.size() / 2 &&
4097 "Incorrect size of mask after widening the elements!");
4103 const APInt &Zeroable,
4110 assert(!Zeroable.
isZero() &&
"V2's non-undef elements are used?!");
4111 for (
int i = 0,
Size = Mask.size(); i !=
Size; ++i)
4127 unsigned NumSrcElts = Mask.size();
4128 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4129 "Illegal shuffle scale factor");
4132 if (NumDstElts >= NumSrcElts) {
4133 int Scale = NumDstElts / NumSrcElts;
4141 while (ScaledMask.
size() > NumDstElts) {
4145 ScaledMask = std::move(WidenedMask);
4162 unsigned SrcSizeInBits,
unsigned DstSizeInBits) {
4163 assert(DstMask.
empty() &&
"Expected an empty shuffle mas");
4164 assert((DstSizeInBits % SrcSizeInBits) == 0 &&
"Illegal shuffle scale");
4165 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4166 unsigned NumSrcElts = SrcMask.
size();
4168 for (
int &M : DstMask) {
4171 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4185 const SDLoc &dl,
bool IsMask =
false) {
4190 MVT ConstVecVT = VT;
4199 for (
unsigned i = 0; i < NumElts; ++i) {
4200 bool IsUndef = Values[i] < 0 && IsMask;
4203 Ops.push_back(OpNode);
4217 "Unequal constant and undef arrays");
4221 MVT ConstVecVT = VT;
4231 for (
unsigned i = 0, e = Bits.size(); i != e; ++i) {
4236 const APInt &V = Bits[i];
4240 Ops.push_back(DAG.
getConstant(V.extractBits(32, 32), dl, EltVT));
4261 "Unexpected vector type");
4275 "Unexpected vector type");
4289 LHS.getValueType() !=
RHS.getValueType() ||
4290 LHS.getOperand(0) !=
RHS.getOperand(0))
4294 if (Src.getValueSizeInBits() != (
LHS.getValueSizeInBits() * 2))
4297 unsigned NumElts =
LHS.getValueType().getVectorNumElements();
4298 if ((
LHS.getConstantOperandAPInt(1) == 0 &&
4299 RHS.getConstantOperandAPInt(1) == NumElts) ||
4300 (AllowCommute &&
RHS.getConstantOperandAPInt(1) == 0 &&
4301 LHS.getConstantOperandAPInt(1) == NumElts))
4308 const SDLoc &dl,
unsigned vectorWidth) {
4311 unsigned ResultNumElts =
4316 "Illegal subvector extraction");
4319 unsigned ElemsPerChunk = vectorWidth / ElVT.
getSizeInBits();
4324 IdxVal &= ~(ElemsPerChunk - 1);
4329 Vec->
ops().slice(IdxVal, ElemsPerChunk));
4350 "Unexpected vector size!");
4363 unsigned vectorWidth) {
4364 assert((vectorWidth == 128 || vectorWidth == 256) &&
4365 "Unsupported vector width");
4377 IdxVal &= ~(ElemsPerChunk - 1);
4401 "Unsupported vector widening type");
4421 const SDLoc &dl,
unsigned WideSizeInBits) {
4424 "Unsupported vector widening type");
4428 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4436 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4437 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4447 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4455 assert(
Ops.empty() &&
"Expected an empty ops vector");
4458 Ops.append(
N->op_begin(),
N->op_end());
4465 const APInt &Idx =
N->getConstantOperandAPInt(2);
4466 EVT VT = Src.getValueType();
4467 EVT SubVT =
Sub.getValueType();
4471 if (Idx == 0 && Src.isUndef()) {
4479 Src.getOperand(1).getValueType() == SubVT &&
4503 if (Src.isUndef()) {
4513 EVT VT =
N->getValueType(0);
4515 uint64_t Idx =
N->getConstantOperandVal(1);
4522 (VT.
getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4523 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4524 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4525 unsigned NumSubs = VT.
getSizeInBits() / SrcOps[0].getValueSizeInBits();
4526 Ops.append(SrcOps.
begin() + SubIdx, SrcOps.
begin() + SubIdx + NumSubs);
4531 assert(
Ops.empty() &&
"Expected an empty ops vector");
4543 unsigned NumSubOps = SubOps.
size();
4544 unsigned HalfNumSubOps = NumSubOps / 2;
4545 assert((NumSubOps % 2) == 0 &&
"Unexpected number of subvectors");
4565 EVT VT =
Op.getValueType();
4568 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4569 "Can't split odd sized vector");
4573 assert((SubOps.
size() % 2) == 0 &&
"Can't split odd sized vector concat");
4574 unsigned HalfOps = SubOps.
size() / 2;
4580 return std::make_pair(
Lo,
Hi);
4587 return std::make_pair(
Lo,
Lo);
4590 return std::make_pair(
Lo,
Hi);
4595 unsigned NumOps =
Op.getNumOperands();
4596 EVT VT =
Op.getValueType();
4601 for (
unsigned I = 0;
I !=
NumOps; ++
I) {
4603 if (!
SrcOp.getValueType().isVector()) {
4613 DAG.
getNode(
Op.getOpcode(), dl, LoVT, LoOps),
4614 DAG.
getNode(
Op.getOpcode(), dl, HiVT, HiOps));
4623 [[maybe_unused]]
EVT VT =
Op.getValueType();
4624 assert((
Op.getOperand(0).getValueType().is256BitVector() ||
4625 Op.getOperand(0).getValueType().is512BitVector()) &&
4627 assert(
Op.getOperand(0).getValueType().getVectorNumElements() ==
4638 [[maybe_unused]]
EVT VT =
Op.getValueType();
4639 assert(
Op.getOperand(0).getValueType() == VT &&
4640 Op.getOperand(1).getValueType() == VT &&
"Unexpected VTs!");
4652template <
typename F>
4655 F Builder,
bool CheckBWI =
true,
4656 bool AllowAVX512 =
true) {
4657 assert(Subtarget.
hasSSE2() &&
"Target assumed to support at least SSE2");
4658 unsigned NumSubs = 1;
4659 if (AllowAVX512 && ((CheckBWI && Subtarget.
useBWIRegs()) ||
4665 }
else if (Subtarget.
hasAVX2()) {
4678 return Builder(DAG,
DL,
Ops);
4681 for (
unsigned i = 0; i != NumSubs; ++i) {
4684 EVT OpVT =
Op.getValueType();
4705 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4708 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4715 APInt SplatValue, SplatUndef;
4716 unsigned SplatBitSize;
4718 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4719 HasAnyUndefs, OpEltSizeInBits) &&
4720 !HasAnyUndefs && SplatValue.
getBitWidth() == OpEltSizeInBits)
4735 MVT OpVT =
Op.getSimpleValueType();
4739 assert(OpVT == VT &&
"Vector type mismatch");
4741 if (
SDValue BroadcastOp = MakeBroadcastOp(
Op, OpVT, DstVT)) {
4767 unsigned IdxVal =
Op.getConstantOperandVal(2);
4773 if (IdxVal == 0 && Vec.
isUndef())
4776 MVT OpVT =
Op.getSimpleValueType();
4795 assert(IdxVal + SubVecNumElems <= NumElems &&
4797 "Unexpected index value in INSERT_SUBVECTOR");
4806 Vec = DAG.
getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4807 Vec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4817 Undef, SubVec, ZeroIdx);
4820 assert(IdxVal != 0 &&
"Unexpected index");
4821 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4827 assert(IdxVal != 0 &&
"Unexpected index");
4830 [](
SDValue V) { return V.isUndef(); })) {
4831 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4835 unsigned ShiftLeft = NumElems - SubVecNumElems;
4836 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4837 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4839 if (ShiftRight != 0)
4840 SubVec = DAG.
getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4847 if (IdxVal + SubVecNumElems == NumElems) {
4848 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4850 if (SubVecNumElems * 2 == NumElems) {
4860 Undef, Vec, ZeroIdx);
4863 Vec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4864 Vec = DAG.
getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4877 unsigned ShiftLeft = NumElems - SubVecNumElems;
4878 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4881 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4887 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4889 SubVec = DAG.
getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4898 SubVec = DAG.
getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4900 SubVec = DAG.
getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4904 unsigned LowShift = NumElems - IdxVal;
4911 unsigned HighShift = IdxVal + SubVecNumElems;
4942 "Expected a 128/256/512-bit vector type");
4954 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
4958 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
4962 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
4973 (
Opc == X86ISD::VSHLI ||
Opc == X86ISD::VSRLI ||
Opc == X86ISD::VSRAI) &&
4974 "Unknown target vector shift-by-constant node");
4986 if (ShiftAmt >= EltSizeInBits) {
4987 if (
Opc == X86ISD::VSRAI)
4988 ShiftAmt = EltSizeInBits - 1;
5028 "Illegal vector splat index");
5031 if (ShAmtIdx != 0) {
5050 bool IsMasked =
false;
5058 ShAmt = DAG.
getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
5071 {ShAmt.getOperand(1), Mask}))) {
5087 if (AmtVT == MVT::v4i32 && (ShAmt.
getOpcode() == X86ISD::VBROADCAST ||
5088 ShAmt.
getOpcode() == X86ISD::VBROADCAST_LOAD)) {
5089 ShAmt = DAG.
getNode(X86ISD::VZEXT_MOVL,
SDLoc(ShAmt), MVT::v4i32, ShAmt);
5097 ShAmt = DAG.
getNode(X86ISD::VSHLDQ,
SDLoc(ShAmt), MVT::v16i8, ShAmt,
5099 ShAmt = DAG.
getNode(X86ISD::VSRLDQ,
SDLoc(ShAmt), MVT::v16i8, ShAmt,
5118 EVT InVT = In.getValueType();
5143 "Expected VTs to be the same size!");
5147 InVT = In.getValueType();
5165 bool Lo,
bool Unary) {
5167 "Illegal vector type to unpack");
5168 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
5171 for (
int i = 0; i < NumElts; ++i) {
5172 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5173 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5174 Pos += (Unary ? 0 : NumElts * (i % 2));
5175 Pos += (
Lo ? 0 : NumEltsInLane / 2);
5176 Mask.push_back(Pos);
5186 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
5188 for (
int i = 0; i < NumElts; ++i) {
5190 Pos += (
Lo ? 0 : NumElts / 2);
5191 Mask.push_back(Pos);
5201 for (
int I = 0, NumElts = Mask.size();
I != NumElts; ++
I) {
5205 SDValue V = (M < NumElts) ? V1 : V2;
5208 Ops[
I] = V.getOperand(M % NumElts);
5237 bool PackHiHalf =
false) {
5238 MVT OpVT =
LHS.getSimpleValueType();
5240 bool UsePackUS = Subtarget.
hasSSE41() || EltSizeInBits == 8;
5241 assert(OpVT ==
RHS.getSimpleValueType() &&
5244 "Unexpected PACK operand types");
5245 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
5246 "Unexpected PACK result type");
5249 if (EltSizeInBits == 32) {
5251 int Offset = PackHiHalf ? 1 : 0;
5253 for (
int I = 0;
I != NumElts;
I += 4) {
5311 for (
int i = 0; i != NumElems; ++i)
5313 MaskVec[i] = (i == Idx) ? NumElems : i;
5318 if (Ptr.
getOpcode() == X86ISD::Wrapper ||
5345 assert(LD &&
"Unexpected null LoadSDNode");
5353 EVT CondVT =
Cond.getValueType();
5354 return N->getOpcode() ==
ISD::VSELECT && Subtarget.hasAVX512() &&
5363 bool AllowWholeUndefs =
true,
5364 bool AllowPartialUndefs =
false) {
5365 assert(EltBits.
empty() &&
"Expected an empty EltBits vector");
5369 EVT VT =
Op.getValueType();
5371 unsigned NumElts = SizeInBits / EltSizeInBits;
5374 if ((SizeInBits % EltSizeInBits) != 0)
5380 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5381 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5382 "Constant bit sizes don't match");
5385 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5390 if (NumSrcElts == NumElts) {
5391 UndefElts = UndefSrcElts;
5392 EltBits.
assign(SrcEltBits.begin(), SrcEltBits.end());
5397 APInt UndefBits(SizeInBits, 0);
5398 APInt MaskBits(SizeInBits, 0);
5400 for (
unsigned i = 0; i != NumSrcElts; ++i) {
5401 unsigned BitOffset = i * SrcEltSizeInBits;
5402 if (UndefSrcElts[i])
5403 UndefBits.
setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5404 MaskBits.
insertBits(SrcEltBits[i], BitOffset);
5408 UndefElts =
APInt(NumElts, 0);
5411 for (
unsigned i = 0; i != NumElts; ++i) {
5412 unsigned BitOffset = i * EltSizeInBits;
5417 if (!AllowWholeUndefs)
5425 if (UndefEltBits.
getBoolValue() && !AllowPartialUndefs)
5428 EltBits[i] = MaskBits.
extractBits(EltSizeInBits, BitOffset);
5435 unsigned UndefBitIndex) {
5439 Undefs.setBit(UndefBitIndex);
5449 CFP->getValueAPF().bitcastToAPInt());
5453 Type *Ty = CDS->getType();
5455 Type *EltTy = CDS->getElementType();
5459 if (!IsInteger && !IsFP)
5462 for (
unsigned I = 0,
E = CDS->getNumElements();
I !=
E; ++
I)
5464 Mask.insertBits(CDS->getElementAsAPInt(
I),
I * EltBits);
5466 Mask.insertBits(CDS->getElementAsAPFloat(
I).bitcastToAPInt(),
5477 return CastBitData(UndefSrcElts, SrcEltBits);
5484 return CastBitData(UndefSrcElts, SrcEltBits);
5488 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5490 return CastBitData(UndefSrcElts, SrcEltBits);
5498 if (BV->getConstantRawBits(
true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5500 for (
unsigned I = 0,
E = SrcEltBits.
size();
I !=
E; ++
I)
5503 return CastBitData(UndefSrcElts, SrcEltBits);
5511 if (!CstTy->
isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5515 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5516 if ((SizeInBits % SrcEltSizeInBits) != 0)
5519 APInt UndefSrcElts(NumSrcElts, 0);
5521 for (
unsigned i = 0; i != NumSrcElts; ++i)
5526 return CastBitData(UndefSrcElts, SrcEltBits);
5530 if (
Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5536 SDValue Ptr = MemIntr->getBasePtr();
5539 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5541 APInt UndefSrcElts(NumSrcElts, 0);
5543 if (CollectConstantBits(
C, SrcEltBits[0], UndefSrcElts, 0)) {
5544 if (UndefSrcElts[0])
5545 UndefSrcElts.
setBits(0, NumSrcElts);
5546 if (SrcEltBits[0].
getBitWidth() != SrcEltSizeInBits)
5547 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5548 SrcEltBits.
append(NumSrcElts - 1, SrcEltBits[0]);
5549 return CastBitData(UndefSrcElts, SrcEltBits);
5555 if (
Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5557 SDValue Ptr = MemIntr->getBasePtr();
5563 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5564 if (!CstTy->
isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5565 (SizeInBits % SubVecSizeInBits) != 0)
5568 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5569 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5570 APInt UndefSubElts(NumSubElts, 0);
5572 APInt(CstEltSizeInBits, 0));
5573 for (
unsigned i = 0; i != NumSubElts; ++i) {
5577 for (
unsigned j = 1; j != NumSubVecs; ++j)
5578 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5582 return CastBitData(UndefSubElts, SubEltBits);
5587 if (
Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5591 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5593 APInt UndefSrcElts(NumSrcElts, 0);
5595 const APInt &
C =
Op.getOperand(0).getConstantOperandAPInt(0);
5596 SrcEltBits.
push_back(
C.zextOrTrunc(SrcEltSizeInBits));
5597 SrcEltBits.
append(NumSrcElts - 1,
APInt(SrcEltSizeInBits, 0));
5598 return CastBitData(UndefSrcElts, SrcEltBits);
5606 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5608 APInt UndefSrcElts, UndefSubElts;
5611 UndefSubElts, EltSubBits,
5612 AllowWholeUndefs && AllowUndefs,
5613 AllowPartialUndefs && AllowUndefs) &&
5615 UndefSrcElts, EltSrcBits,
5616 AllowWholeUndefs && AllowUndefs,
5617 AllowPartialUndefs && AllowUndefs)) {
5618 unsigned BaseIdx =
Op.getConstantOperandVal(2);
5619 UndefSrcElts.
insertBits(UndefSubElts, BaseIdx);
5620 for (
unsigned i = 0, e = EltSubBits.
size(); i != e; ++i)
5621 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5622 return CastBitData(UndefSrcElts, EltSrcBits);
5629 EltBits, AllowWholeUndefs,
5630 AllowPartialUndefs)) {
5631 EVT SrcVT =
Op.getOperand(0).getValueType();
5632 unsigned NumSrcElts = SrcVT.
getSizeInBits() / EltSizeInBits;
5635 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5638 (BaseOfs % EltSizeInBits) == 0 &&
"Bad subvector index");
5640 UndefElts = UndefElts.
extractBits(NumSubElts, BaseIdx);
5641 if ((BaseIdx + NumSubElts) != NumSrcElts)
5642 EltBits.
erase(EltBits.
begin() + BaseIdx + NumSubElts, EltBits.
end());
5655 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5659 APInt UndefElts0, UndefElts1;
5663 UndefElts0, EltBits0, AllowWholeUndefs,
5664 AllowPartialUndefs))
5668 UndefElts1, EltBits1, AllowWholeUndefs,
5669 AllowPartialUndefs))
5673 for (
int i = 0; i != (int)NumElts; ++i) {
5678 }
else if (M < (
int)NumElts) {
5683 if (UndefElts1[M - NumElts])
5685 EltBits.
push_back(EltBits1[M - NumElts]);
5700 Op,
Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5701 true, AllowPartialUndefs)) {
5702 int SplatIndex = -1;
5703 for (
int i = 0, e = EltBits.
size(); i != e; ++i) {
5706 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5712 if (0 <= SplatIndex) {
5713 SplatVal = EltBits[SplatIndex];
5726 case ::llvm::RoundingMode::TowardPositive:
return X86::rmUpward;
5737 unsigned MaskEltSizeInBits,
5748 for (
const APInt &Elt : EltBits)
5763 bool IsPow2OrUndef =
true;
5764 for (
unsigned I = 0,
E = EltBits.
size();
I !=
E; ++
I)
5765 IsPow2OrUndef &= UndefElts[
I] || EltBits[
I].isPowerOf2();
5766 return IsPow2OrUndef;
5773 EVT VT = V.getValueType();
5779 return V.getOperand(0);
5783 (
isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5785 Not = DAG.
getBitcast(V.getOperand(0).getValueType(), Not);
5792 if (V.getOpcode() == X86ISD::PCMPGT &&
5795 V.getOperand(0).hasOneUse()) {
5799 V.getScalarValueSizeInBits(), UndefElts,
5803 bool MinSigned =
false;
5804 for (
APInt &Elt : EltBits) {
5805 MinSigned |= Elt.isMinSignedValue();
5810 MVT VT = V.getSimpleValueType();
5811 return DAG.
getNode(X86ISD::PCMPGT,
DL, VT, V.getOperand(1),
5820 for (
SDValue &CatOp : CatOps) {
5824 CatOp = DAG.
getBitcast(CatOp.getValueType(), NotCat);
5831 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5846 bool Unary,
unsigned NumStages = 1) {
5847 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
5851 unsigned Offset = Unary ? 0 : NumElts;
5852 unsigned Repetitions = 1u << (NumStages - 1);
5854 assert((NumEltsPerLane >> NumStages) > 0 &&
"Illegal packing compaction");
5856 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5857 for (
unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5858 for (
unsigned Elt = 0; Elt != NumEltsPerLane; Elt +=
Increment)
5859 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5860 for (
unsigned Elt = 0; Elt != NumEltsPerLane; Elt +=
Increment)
5861 Mask.push_back(Elt + (Lane * NumEltsPerLane) +
Offset);
5871 int NumInnerElts = NumElts / 2;
5872 int NumEltsPerLane = NumElts / NumLanes;
5873 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5879 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
5880 for (
int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5881 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5882 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5883 if (DemandedElts[OuterIdx])
5884 DemandedLHS.
setBit(InnerIdx);
5885 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5886 DemandedRHS.
setBit(InnerIdx);
5895 DemandedLHS, DemandedRHS);
5896 DemandedLHS |= DemandedLHS << 1;
5897 DemandedRHS |= DemandedRHS << 1;
5913 MVT VT =
N.getSimpleValueType();
5920 assert(Mask.empty() &&
"getTargetShuffleMask expects an empty Mask vector");
5921 assert(
Ops.empty() &&
"getTargetShuffleMask expects an empty Ops vector");
5924 bool IsFakeUnary =
false;
5925 switch (
N.getOpcode()) {
5926 case X86ISD::BLENDI:
5927 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5928 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5929 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5931 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5934 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5935 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5936 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5938 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5940 case X86ISD::INSERTPS:
5941 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5942 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5943 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5945 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5947 case X86ISD::EXTRQI:
5948 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5951 int BitLen =
N.getConstantOperandVal(1);
5952 int BitIdx =
N.getConstantOperandVal(2);
5957 case X86ISD::INSERTQI:
5958 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5959 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5962 int BitLen =
N.getConstantOperandVal(2);
5963 int BitIdx =
N.getConstantOperandVal(3);
5965 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5968 case X86ISD::UNPCKH:
5969 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5970 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5972 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5974 case X86ISD::UNPCKL:
5975 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5976 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5978 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5980 case X86ISD::MOVHLPS:
5981 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5982 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5984 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5986 case X86ISD::MOVLHPS:
5987 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5988 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5990 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5992 case X86ISD::VALIGN:
5994 "Only 32-bit and 64-bit elements are supported!");
5995 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5996 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5997 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5999 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6000 Ops.push_back(
N.getOperand(1));
6001 Ops.push_back(
N.getOperand(0));
6003 case X86ISD::PALIGNR:
6005 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6006 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6007 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6009 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6010 Ops.push_back(
N.getOperand(1));
6011 Ops.push_back(
N.getOperand(0));
6013 case X86ISD::VSHLDQ:
6015 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6016 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6020 case X86ISD::VSRLDQ:
6022 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6023 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6027 case X86ISD::PSHUFD:
6028 case X86ISD::VPERMILPI:
6029 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6030 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6034 case X86ISD::PSHUFHW:
6035 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6036 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6040 case X86ISD::PSHUFLW:
6041 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6042 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6046 case X86ISD::VZEXT_MOVL:
6047 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6051 case X86ISD::VBROADCAST:
6055 if (
N.getOperand(0).getValueType() == VT) {
6061 case X86ISD::VPERMILPV: {
6062 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6064 SDValue MaskNode =
N.getOperand(1);
6072 case X86ISD::PSHUFB: {
6074 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6075 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6077 SDValue MaskNode =
N.getOperand(1);
6084 case X86ISD::VPERMI:
6085 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6086 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6093 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6094 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6097 case X86ISD::VPERM2X128:
6098 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6099 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6100 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6102 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6104 case X86ISD::SHUF128:
6105 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6106 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6107 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
6109 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6111 case X86ISD::MOVSLDUP:
6112 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6116 case X86ISD::MOVSHDUP:
6117 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6121 case X86ISD::MOVDDUP:
6122 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6126 case X86ISD::VPERMIL2: {
6127 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6128 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6129 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6130 SDValue MaskNode =
N.getOperand(2);
6131 SDValue CtrlNode =
N.getOperand(3);
6133 unsigned CtrlImm = CtrlOp->getZExtValue();
6143 case X86ISD::VPPERM: {
6144 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6145 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6146 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
6147 SDValue MaskNode =
N.getOperand(2);
6154 case X86ISD::VPERMV: {
6155 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
6158 Ops.push_back(
N.getOperand(1));
6159 SDValue MaskNode =
N.getOperand(0);
6167 case X86ISD::VPERMV3: {
6168 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
6169 assert(
N.getOperand(2).getValueType() == VT &&
"Unexpected value type");
6170 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(2);
6172 Ops.push_back(
N.getOperand(0));
6173 Ops.push_back(
N.getOperand(2));
6174 SDValue MaskNode =
N.getOperand(1);
6182 case X86ISD::COMPRESS: {
6184 SDValue PassThru =
N.getOperand(1);
6191 "Illegal compression mask");
6192 for (
unsigned I = 0;
I != NumElems; ++
I) {
6196 while (Mask.size() != NumElems) {
6197 Mask.push_back(NumElems + Mask.size());
6199 Ops.push_back(CmpVec);
6200 Ops.push_back(PassThru);
6203 case X86ISD::EXPAND: {
6205 SDValue PassThru =
N.getOperand(1);
6212 "Illegal expansion mask");
6213 unsigned ExpIndex = 0;
6214 for (
unsigned I = 0;
I != NumElems; ++
I) {
6216 Mask.push_back(
I + NumElems);
6218 Mask.push_back(ExpIndex++);
6220 Ops.push_back(ExpVec);
6221 Ops.push_back(PassThru);
6233 if (!AllowSentinelZero &&
isAnyZero(Mask))
6241 if (M >= (
int)Mask.size())
6247 Ops.push_back(
N.getOperand(0));
6248 if (!IsUnary || IsFakeUnary)
6249 Ops.push_back(
N.getOperand(1));
6274 int Size = Mask.size();
6284 int ScalarSizeInBits = VectorSizeInBits /
Size;
6285 assert(!(VectorSizeInBits % ScalarSizeInBits) &&
"Illegal shuffle mask size");
6287 for (
int i = 0; i <
Size; ++i) {
6294 if ((M >= 0 && M <
Size && V1IsZero) || (M >=
Size && V2IsZero)) {
6309 if ((
Size % V.getNumOperands()) == 0) {
6310 int Scale =
Size / V->getNumOperands();
6317 APInt Val = Cst->getAPIntValue();
6318 Val = Val.
extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6322 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6323 Val = Val.
extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6332 if ((V.getNumOperands() %
Size) == 0) {
6333 int Scale = V->getNumOperands() /
Size;
6334 bool AllUndef =
true;
6335 bool AllZero =
true;
6336 for (
int j = 0; j < Scale; ++j) {
6337 SDValue Op = V.getOperand((M * Scale) + j);
6338 AllUndef &=
Op.isUndef();
6361 MVT VT =
N.getSimpleValueType();
6365 int Size = Mask.size();
6374 "Illegal split of shuffle value type");
6378 APInt UndefSrcElts[2];
6380 bool IsSrcConstant[2] = {
6382 SrcEltBits[0],
true,
6385 SrcEltBits[1],
true,
6388 for (
int i = 0; i <
Size; ++i) {
6402 unsigned SrcIdx = M /
Size;
6417 (
Size % V.getValueType().getVectorNumElements()) == 0) {
6418 int Scale =
Size / V.getValueType().getVectorNumElements();
6419 int Idx = M / Scale;
6433 int Idx = V.getConstantOperandVal(2);
6434 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6435 if (M < Idx || (Idx + NumSubElts) <= M)
6442 if (IsSrcConstant[SrcIdx]) {
6443 if (UndefSrcElts[SrcIdx][M])
6445 else if (SrcEltBits[SrcIdx][M] == 0)
6451 "Different mask size from vector size!");
6457 const APInt &KnownUndef,
6458 const APInt &KnownZero,
6459 bool ResolveKnownZeros=
true) {
6460 unsigned NumElts = Mask.size();
6462 KnownZero.
getBitWidth() == NumElts &&
"Shuffle mask size mismatch");
6464 for (
unsigned i = 0; i != NumElts; ++i) {
6467 else if (ResolveKnownZeros && KnownZero[i])
6476 unsigned NumElts = Mask.size();
6479 for (
unsigned i = 0; i != NumElts; ++i) {
6491 EVT CondVT =
Cond.getValueType();
6504 for (
int i = 0; i != (int)NumElts; ++i) {
6509 if (UndefElts[i] || (!IsBLENDV && EltBits[i].
isZero()) ||
6510 (IsBLENDV && EltBits[i].isNonNegative()))
6522 bool ResolveKnownElts);
6532 bool ResolveKnownElts) {
6536 MVT VT =
N.getSimpleValueType();
6540 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6543 unsigned NumSizeInBytes = NumSizeInBits / 8;
6544 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6546 unsigned Opcode =
N.getOpcode();
6552 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6553 Ops.push_back(
N.getOperand(0));
6554 Ops.push_back(
N.getOperand(1));
6560 case X86ISD::ANDNP: {
6566 bool IsAndN = (X86ISD::ANDNP == Opcode);
6567 uint64_t ZeroMask = IsAndN ? 255 : 0;
6574 assert(UndefElts.
isZero() &&
"Unexpected UNDEF element in AND/ANDNP mask");
6575 for (
int i = 0, e = (
int)EltBits.
size(); i != e; ++i) {
6576 const APInt &ByteBits = EltBits[i];
6577 if (ByteBits != 0 && ByteBits != 255)
6581 Ops.push_back(IsAndN ? N1 : N0);
6602 size_t MaskSize = std::max(SrcMask0.
size(), SrcMask1.
size());
6606 for (
int i = 0; i != (int)MaskSize; ++i) {
6616 Mask.push_back(i + MaskSize);
6617 else if (MaskSize == NumElts && !DemandedElts[i])
6622 Ops.push_back(
N.getOperand(0));
6623 Ops.push_back(
N.getOperand(1));
6628 unsigned NumSubElts =
N.getOperand(0).getValueType().getVectorNumElements();
6629 if (NumBitsPerElt == 64) {
6630 for (
unsigned I = 0,
E =
N.getNumOperands();
I !=
E; ++
I) {
6631 for (
unsigned M = 0; M != NumSubElts; ++M)
6632 Mask.push_back((
I * NumElts) + M);
6633 Ops.push_back(
N.getOperand(
I));
6642 EVT SubVT =
Sub.getValueType();
6644 uint64_t InsertIdx =
N.getConstantOperandVal(2);
6646 if (DemandedElts.
extractBits(NumSubElts, InsertIdx) == 0) {
6647 Mask.resize(NumElts);
6648 std::iota(Mask.begin(), Mask.end(), 0);
6654 if (
Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6656 Src.getOperand(0).isUndef() &&
6657 Src.getOperand(1).getValueType() == SubVT &&
6658 Src.getConstantOperandVal(2) == 0 &&
6659 (NumBitsPerElt == 64 || Src.getOperand(1) ==
Sub) &&
6661 Mask.resize(NumElts);
6662 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6663 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6664 Ops.push_back(Src.getOperand(1));
6669 if (InsertIdx != 0 && Src.isUndef() &&
6672 std::iota(Mask.begin() + InsertIdx, Mask.begin() + InsertIdx + NumSubElts,
6677 if (!
N->isOnlyUserOf(
Sub.getNode()))
6692 unsigned NumSubSrcSrcElts =
6694 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6695 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6696 "Subvector valuetype mismatch");
6697 InsertIdx *= (MaxElts / NumElts);
6698 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6699 NumSubElts *= (MaxElts / NumElts);
6700 bool SrcIsUndef = Src.isUndef();
6701 for (
int i = 0; i != (int)MaxElts; ++i)
6703 for (
int i = 0; i != (int)NumSubElts; ++i)
6704 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6707 Ops.push_back(SubSrcSrc);
6714 Depth + 1, ResolveKnownElts))
6724 if (SubMask.
size() != NumSubElts) {
6725 assert(((SubMask.
size() % NumSubElts) == 0 ||
6726 (NumSubElts % SubMask.
size()) == 0) &&
6727 "Illegal submask scale");
6728 if ((NumSubElts % SubMask.
size()) == 0) {
6729 int Scale = NumSubElts / SubMask.
size();
6732 SubMask = ScaledSubMask;
6734 int Scale = SubMask.
size() / NumSubElts;
6735 NumSubElts = SubMask.
size();
6745 for (
int i = 0; i != (int)NumElts; ++i)
6747 for (
int i = 0; i != (int)NumSubElts; ++i) {
6750 int InputIdx = M / NumSubElts;
6751 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6753 Mask[i + InsertIdx] = M;
6757 case X86ISD::PINSRB:
6758 case X86ISD::PINSRW:
6765 unsigned DstIdx = 0;
6769 N.getConstantOperandAPInt(2).uge(NumElts))
6771 DstIdx =
N.getConstantOperandVal(2);
6775 Ops.push_back(
N.getOperand(0));
6776 for (
unsigned i = 0; i != NumElts; ++i)
6796 if ((MinBitsPerElt % 8) != 0)
6816 unsigned DstByte = DstIdx * NumBytesPerElt;
6822 Ops.push_back(SrcVec);
6825 Ops.push_back(SrcVec);
6826 Ops.push_back(
N.getOperand(0));
6827 for (
int i = 0; i != (int)NumSizeInBytes; ++i)
6828 Mask.push_back(NumSizeInBytes + i);
6831 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6832 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6833 for (
unsigned i = 0; i != MinBytesPerElts; ++i)
6834 Mask[DstByte + i] = SrcByte + i;
6835 for (
unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6839 case X86ISD::PACKSS:
6840 case X86ISD::PACKUS: {
6845 "Unexpected input value type");
6847 APInt EltsLHS, EltsRHS;
6852 bool Offset0 =
false, Offset1 =
false;
6853 if (Opcode == X86ISD::PACKSS) {
6881 bool IsUnary = (N0 == N1);
6889 if (Offset0 || Offset1) {
6891 if ((Offset0 &&
isInRange(M, 0, NumElts)) ||
6892 (Offset1 &&
isInRange(M, NumElts, 2 * NumElts)))
6898 case X86ISD::BLENDV: {
6901 Ops.push_back(
N.getOperand(1));
6902 Ops.push_back(
N.getOperand(2));
6907 case X86ISD::VTRUNC: {
6909 EVT SrcVT = Src.getValueType();
6914 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6915 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 &&
"Illegal truncation");
6916 for (
unsigned i = 0; i != NumSrcElts; ++i)
6917 Mask.push_back(i * Scale);
6933 for (
unsigned I = 0;
I != NumElts; ++
I)
6934 if (DemandedElts[
I] && !UndefElts[
I] &&
6935 (EltBits[
I].urem(8) != 0 || EltBits[
I].uge(NumBitsPerElt)))
6939 Ops.push_back(
N.getOperand(0));
6941 for (
unsigned I = 0;
I != NumElts; ++
I) {
6942 if (!DemandedElts[
I] || UndefElts[
I])
6944 unsigned ByteShift = EltBits[
I].getZExtValue() / 8;
6945 unsigned Lo =
I * NumBytesPerElt;
6946 unsigned Hi =
Lo + NumBytesPerElt;
6950 std::iota(Mask.begin() +
Lo + ByteShift, Mask.begin() +
Hi,
Lo);
6952 std::iota(Mask.begin() +
Lo, Mask.begin() +
Hi - ByteShift,
6958 case X86ISD::VSRLI: {
6959 uint64_t ShiftVal =
N.getConstantOperandVal(1);
6961 if (NumBitsPerElt <= ShiftVal) {
6967 if ((ShiftVal % 8) != 0)
6971 Ops.push_back(
N.getOperand(0));
6976 if (X86ISD::VSHLI == Opcode) {
6977 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6978 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6979 Mask[i + j] = i + j - ByteShift;
6981 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6982 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6983 Mask[i + j - ByteShift] = i + j;
6998 for (
unsigned I = 0;
I != NumElts; ++
I)
6999 if (DemandedElts[
I] && !UndefElts[
I] &&
7000 (EltBits[
I].urem(NumBitsPerElt) % 8) != 0)
7003 Ops.push_back(
N.getOperand(0));
7004 for (
unsigned I = 0;
I != NumElts; ++
I) {
7005 if (!DemandedElts[
I] || UndefElts[
I]) {
7009 int Offset = EltBits[
I].urem(NumBitsPerElt) / 8;
7011 int BaseIdx =
I * NumBytesPerElt;
7012 for (
int J = 0; J != (int)NumBytesPerElt; ++J) {
7013 Mask.push_back(BaseIdx + ((
Offset + J) % NumBytesPerElt));
7018 case X86ISD::VROTLI:
7019 case X86ISD::VROTRI: {
7021 uint64_t RotateVal =
N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7022 if ((RotateVal % 8) != 0)
7024 Ops.push_back(
N.getOperand(0));
7025 int Offset = RotateVal / 8;
7027 for (
int i = 0; i != (int)NumElts; ++i) {
7028 int BaseIdx = i * NumBytesPerElt;
7029 for (
int j = 0; j != (int)NumBytesPerElt; ++j) {
7030 Mask.push_back(BaseIdx + ((
Offset + j) % NumBytesPerElt));
7035 case X86ISD::VBROADCAST: {
7037 if (!Src.getSimpleValueType().isVector()) {
7040 Src.getOperand(0).getValueType().getScalarType() !=
7043 Src = Src.getOperand(0);
7046 Mask.append(NumElts, 0);
7051 EVT SrcVT = Src.getValueType();
7056 (NumBitsPerSrcElt % 8) != 0)
7060 APInt DemandedSrcElts =
7065 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 &&
"Unexpected extension");
7066 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
7067 for (
unsigned I = 0;
I != NumElts; ++
I)
7068 Mask.append(Scale,
I);
7077 EVT SrcVT = Src.getValueType();
7099 int MaskWidth = Mask.size();
7101 for (
int i = 0, e = Inputs.
size(); i < e; ++i) {
7102 int lo = UsedInputs.
size() * MaskWidth;
7103 int hi = lo + MaskWidth;
7108 if ((lo <= M) && (M < hi))
7112 if (
none_of(Mask, [lo, hi](
int i) {
return (lo <= i) && (i < hi); })) {
7120 bool IsRepeat =
false;
7121 for (
int j = 0, ue = UsedInputs.
size(); j != ue; ++j) {
7126 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7135 Inputs = std::move(UsedInputs);
7146 bool ResolveKnownElts) {
7150 EVT VT =
Op.getValueType();
7155 if (ResolveKnownElts)
7160 ResolveKnownElts)) {
7171 bool ResolveKnownElts) {
7172 APInt KnownUndef, KnownZero;
7174 KnownZero, DAG,
Depth, ResolveKnownElts);
7180 bool ResolveKnownElts =
true) {
7181 EVT VT =
Op.getValueType();
7185 unsigned NumElts =
Op.getValueType().getVectorNumElements();
7195 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
7196 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
7197 "Unknown broadcast load type");
7208 Opcode,
DL, Tys,
Ops, MemVT,
7222 EVT VT =
Op.getValueType();
7223 unsigned Opcode =
Op.getOpcode();
7228 int Elt = SV->getMaskElt(Index);
7247 int Elt = ShuffleMask[Index];
7254 assert(0 <= Elt && Elt < (2 * NumElems) &&
"Shuffle index out of range");
7263 uint64_t SubIdx =
Op.getConstantOperandVal(2);
7264 unsigned NumSubElts =
Sub.getValueType().getVectorNumElements();
7266 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7273 EVT SubVT =
Op.getOperand(0).getValueType();
7275 uint64_t SubIdx = Index / NumSubElts;
7276 uint64_t SubElt = Index % NumSubElts;
7283 uint64_t SrcIdx =
Op.getConstantOperandVal(1);
7290 EVT SrcVT = Src.getValueType();
7302 if (
Op.getConstantOperandAPInt(2) == Index)
7303 return Op.getOperand(1);
7308 return (Index == 0) ?
Op.getOperand(0)
7312 return Op.getOperand(Index);
7319 const APInt &NonZeroMask,
7320 unsigned NumNonZero,
unsigned NumZero,
7323 MVT VT =
Op.getSimpleValueType();
7326 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.
hasSSE41())) &&
7327 "Illegal vector insertion");
7332 for (
unsigned i = 0; i < NumElts; ++i) {
7333 bool IsNonZero = NonZeroMask[i];
7342 if (NumZero || 0 != i)
7345 assert(0 == i &&
"Expected insertion into zero-index");
7361 const APInt &NonZeroMask,
7362 unsigned NumNonZero,
unsigned NumZero,
7365 if (NumNonZero > 8 && !Subtarget.
hasSSE41())
7379 for (
unsigned I = 0;
I != 4; ++
I) {
7380 if (!NonZeroMask[
I])
7388 assert(V &&
"Failed to fold v16i8 vector to zero");
7390 V = DAG.
getNode(X86ISD::VZEXT_MOVL,
DL, MVT::v4i32, V);
7393 for (
unsigned i = V ? 4 : 0; i < 16; i += 2) {
7394 bool ThisIsNonZero = NonZeroMask[i];
7395 bool NextIsNonZero = NonZeroMask[i + 1];
7396 if (!ThisIsNonZero && !NextIsNonZero)
7400 if (ThisIsNonZero) {
7401 if (NumZero || NextIsNonZero)
7407 if (NextIsNonZero) {
7409 if (i == 0 && NumZero)
7425 if (i != 0 || NumZero)
7443 const APInt &NonZeroMask,
7444 unsigned NumNonZero,
unsigned NumZero,
7447 if (NumNonZero > 4 && !Subtarget.
hasSSE41())
7463 if (Subtarget.
hasSSE3() && !Subtarget.hasXOP() &&
7464 Op.getOperand(0) ==
Op.getOperand(2) &&
7465 Op.getOperand(1) ==
Op.getOperand(3) &&
7466 Op.getOperand(0) !=
Op.getOperand(1)) {
7467 MVT VT =
Op.getSimpleValueType();
7479 std::bitset<4> Zeroable, Undefs;
7480 for (
int i = 0; i < 4; ++i) {
7485 assert(Zeroable.size() - Zeroable.count() > 1 &&
7486 "We expect at least two non-zero elements!");
7491 unsigned FirstNonZeroIdx;
7492 for (
unsigned i = 0; i < 4; ++i) {
7503 if (!FirstNonZero.
getNode()) {
7505 FirstNonZeroIdx = i;
7509 assert(FirstNonZero.
getNode() &&
"Unexpected build vector of all zeros!");
7515 unsigned EltMaskIdx, EltIdx;
7517 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7518 if (Zeroable[EltIdx]) {
7520 Mask[EltIdx] = EltIdx+4;
7524 Elt =
Op->getOperand(EltIdx);
7527 if (Elt.
getOperand(0) != V1 || EltMaskIdx != EltIdx)
7529 Mask[EltIdx] = EltIdx;
7534 SDValue VZeroOrUndef = (Zeroable == Undefs)
7547 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7550 bool CanFold =
true;
7551 for (
unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7565 assert(V1.
getNode() &&
"Expected at least two non-zero elements!");
7572 unsigned ZMask = Zeroable.to_ulong();
7574 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7575 assert((InsertPSMask & ~0xFFu) == 0 &&
"Invalid mask!");
7577 DAG.
getNode(X86ISD::INSERTPS,
DL, MVT::v4f32, V1, V2,
7587 MVT ShVT = MVT::v16i8;
7588 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7590 assert(NumBits % 8 == 0 &&
"Only support byte sized shifts");
7602 SDValue Ptr = LD->getBasePtr();
7605 EVT PVT = LD->getValueType(0);
7606 if (PVT != MVT::i32 && PVT != MVT::f32)
7612 FI = FINode->getIndex();
7626 SDValue Chain = LD->getChain();
7630 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7647 int64_t StartOffset =
Offset & ~int64_t(RequiredAlign.
value() - 1);
7654 int EltNo = (
Offset - StartOffset) >> 2;
7659 LD->getPointerInfo().getWithOffset(StartOffset));
7673 if (!BaseLd->isSimple())
7687 uint64_t Amt = AmtC->getZExtValue();
7689 ByteOffset += Amt / 8;
7699 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7701 uint64_t Idx = IdxC->getZExtValue();
7702 ByteOffset += Idx * (SrcSizeInBits / 8);
7720 bool IsAfterLegalize,
7721 unsigned Depth = 0) {
7727 unsigned NumElems = Elts.
size();
7729 int LastLoadedElt = -1;
7739 for (
unsigned i = 0; i < NumElems; ++i) {
7758 if (!
findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7760 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7761 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7769 "Incomplete element masks");
7772 if (UndefMask.
popcount() == NumElems)
7783 "Register/Memory size mismatch");
7785 assert(LDBase &&
"Did not find base load for merging consecutive loads");
7787 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7788 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7789 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7790 assert((BaseSizeInBits % 8) == 0 &&
"Sub-byte element loads detected");
7793 if (ByteOffsets[FirstLoadedElt] != 0)
7800 int64_t ByteOffset = ByteOffsets[EltIdx];
7801 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7802 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7803 return (0 <= BaseIdx && BaseIdx < (
int)NumElems && LoadMask[BaseIdx] &&
7804 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7806 int Stride = EltIdx - FirstLoadedElt;
7812 unsigned BaseMemSizeInBits =
Base->getMemoryVT().getSizeInBits();
7813 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7814 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7815 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7825 bool IsConsecutiveLoad =
true;
7826 bool IsConsecutiveLoadWithZeros =
true;
7827 for (
int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7829 if (!CheckConsecutiveLoad(LDBase, i)) {
7830 IsConsecutiveLoad =
false;
7831 IsConsecutiveLoadWithZeros =
false;
7834 }
else if (ZeroMask[i]) {
7835 IsConsecutiveLoad =
false;
7842 "Cannot merge volatile or atomic loads.");
7846 for (
auto *LD : Loads)
7861 if (FirstLoadedElt == 0 &&
7862 (NumLoadedElts == (
int)NumElems || IsDereferenceable) &&
7863 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7874 return DAG.
getBitcast(VT, Elts[FirstLoadedElt]);
7877 return CreateLoad(VT, LDBase);
7881 if (!IsAfterLegalize && VT.
isVector()) {
7883 if ((NumMaskElts % NumElems) == 0) {
7884 unsigned Scale = NumMaskElts / NumElems;
7886 for (
unsigned i = 0; i < NumElems; ++i) {
7889 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7890 for (
unsigned j = 0; j != Scale; ++j)
7891 ClearMask[(i * Scale) + j] = (i * Scale) + j +
Offset;
7893 SDValue V = CreateLoad(VT, LDBase);
7903 unsigned HalfNumElems = NumElems / 2;
7909 DAG, Subtarget, IsAfterLegalize,
Depth + 1);
7917 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7918 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7919 LoadSizeInBits == 64) &&
7926 if (!Subtarget.
hasSSE2() && VT == MVT::v4f32)
7934 for (
auto *LD : Loads)
7945 for (
unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7946 unsigned RepeatSize = SubElems * BaseSizeInBits;
7947 unsigned ScalarSize = std::min(RepeatSize, 64u);
7948 if (!Subtarget.
hasAVX2() && ScalarSize < 32)
7953 if (RepeatSize > ScalarSize && SubElems == 1)
7958 for (
unsigned i = 0; i != NumElems && Match; ++i) {
7962 if (RepeatedLoads[i % SubElems].
isUndef())
7963 RepeatedLoads[i % SubElems] = Elt;
7965 Match &= (RepeatedLoads[i % SubElems] == Elt);
7969 Match &= !RepeatedLoads.
front().isUndef();
7970 Match &= !RepeatedLoads.
back().isUndef();
7978 if (RepeatSize > ScalarSize)
7980 RepeatSize / ScalarSize);
7986 RepeatVT, RepeatedLoads,
DL, DAG, Subtarget, IsAfterLegalize,
7988 SDValue Broadcast = RepeatLoad;
7989 if (RepeatSize > ScalarSize) {
8000 DAG.
getNode(X86ISD::VBROADCAST,
DL, BroadcastVT, RepeatLoad);
8015 VT, ReverseElts,
DL, DAG, Subtarget, IsAfterLegalize,
Depth + 1)) {
8017 std::iota(ReverseMask.
rbegin(), ReverseMask.
rend(), 0);
8031 bool IsAfterLegalize) {
8050 auto getConstantScalar = [&](
const APInt &Val) ->
Constant * {
8052 if (ScalarSize == 16)
8054 if (ScalarSize == 32)
8056 assert(ScalarSize == 64 &&
"Unsupported floating point scalar size");
8063 for (
unsigned I = 0,
E = Bits.size();
I !=
E; ++
I)
8065 : getConstantScalar(Bits[
I]));
8074 auto getConstantScalar = [&](
const APInt &Val) ->
Constant * {
8076 if (ScalarSize == 16)
8078 if (ScalarSize == 32)
8080 assert(ScalarSize == 64 &&
"Unsupported floating point scalar size");
8086 if (ScalarSize == SplatBitSize)
8087 return getConstantScalar(SplatValue);
8089 unsigned NumElm = SplatBitSize / ScalarSize;
8091 for (
unsigned I = 0;
I != NumElm; ++
I) {
8093 ConstantVec.
push_back(getConstantScalar(Val));
8099 for (
auto *U :
N->users()) {
8100 unsigned Opc = U->getOpcode();
8102 if (
Opc == X86ISD::VPERMV && U->getOperand(0).getNode() ==
N)
8104 if (
Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() ==
N)
8110 if (
N->hasOneUse()) {
8113 if (
Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() !=
N)
8124 unsigned SizeInBits = V.getValueSizeInBits();
8125 if ((SizeInBits == 512 && Subtarget.
hasAVX512()) ||
8126 (SizeInBits >= 128 && Subtarget.hasVLX())) {
8127 if (V.hasOneUse() && V->user_begin()->getOpcode() ==
ISD::VSELECT &&
8128 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
8156 "Unsupported vector type for broadcast.");
8163 assert((NumElts % Sequence.size()) == 0 &&
"Sequence doesn't fit.");
8164 if (Sequence.size() == 1)
8174 if (!Sequence.empty() && Subtarget.hasCDI()) {
8176 unsigned SeqLen = Sequence.size();
8177 bool UpperZeroOrUndef =
8182 if (UpperZeroOrUndef && ((Op0.getOpcode() ==
ISD::BITCAST) ||
8187 : Op0.getOperand(0).getOperand(0);
8190 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||
8191 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) {
8197 SDValue Bcst = DAG.
getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8205 unsigned NumUndefElts = UndefElements.
count();
8206 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8208 unsigned SplatBitSize;
8221 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8222 (SplatBitSize < 32 && Subtarget.
hasAVX2())) {
8239 if (SplatBitSize > 64) {
8251 Ops, VVT, MPI, Alignment,
8261 if (!Ld || NumElts - NumUndefElts != 1)
8264 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8268 bool ConstSplatVal =
8296 if (ConstSplatVal && (Subtarget.
hasAVX2() || OptForSize)) {
8304 if (ScalarSize == 32 ||
8305 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
8306 (CVT == MVT::f16 && Subtarget.
hasAVX2()) ||
8307 (OptForSize && (ScalarSize == 64 || Subtarget.
hasAVX2()))) {
8310 C = CI->getConstantIntValue();
8312 C = CF->getConstantFPValue();
8314 assert(
C &&
"Invalid constant type");
8331 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8332 return DAG.
getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8342 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8343 (Subtarget.hasVLX() && ScalarSize == 64)) {
8346 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8349 LN->getMemoryVT(), LN->getMemOperand());
8357 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8360 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8363 LN->getMemoryVT(), LN->getMemOperand());
8368 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8369 return DAG.
getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8404 ExtractedFromVec = ShuffleVec;
8412 MVT VT =
Op.getSimpleValueType();
8419 unsigned NumElems =
Op.getNumOperands();
8425 for (
unsigned i = 0; i != NumElems; ++i) {
8426 unsigned Opc =
Op.getOperand(i).getOpcode();
8433 if (InsertIndices.
size() > 1)
8440 SDValue ExtractedFromVec =
Op.getOperand(i).getOperand(0);
8441 SDValue ExtIdx =
Op.getOperand(i).getOperand(1);
8453 VecIn1 = ExtractedFromVec;
8454 else if (VecIn1 != ExtractedFromVec) {
8456 VecIn2 = ExtractedFromVec;
8457 else if (VecIn2 != ExtractedFromVec)
8462 if (ExtractedFromVec == VecIn1)
8464 else if (ExtractedFromVec == VecIn2)
8465 Mask[i] = Idx + NumElems;
8474 for (
unsigned Idx : InsertIndices)
8484 MVT VT =
Op.getSimpleValueType();
8485 MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
8488 for (
unsigned I = 0,
E =
Op.getNumOperands();
I !=
E; ++
I)
8499 MVT VT =
Op.getSimpleValueType();
8501 "Unexpected type in LowerBUILD_VECTORvXi1!");
8508 bool IsSplat =
true;
8509 bool HasConstElts =
false;
8511 for (
unsigned idx = 0, e =
Op.getNumOperands(); idx < e; ++idx) {
8516 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8517 HasConstElts =
true;
8523 else if (In !=
Op.getOperand(SplatIdx))
8534 assert(
Cond.getValueType() == MVT::i8 &&
"Unexpected VT!");
8540 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8559 MVT OpVT =
Op.getOperand(0).getSimpleValueType();
8560 if (NonConstIdx.
size() > 1 && OpVT == MVT::i8) {
8563 MVT WideSVT = Subtarget.hasBWI() ? MVT::i8 : MVT::i32;
8565 WideSVT = ByteVT == MVT::v4i8 ? MVT::i32 : MVT::i64;
8566 ByteVT = MVT::v16i8;
8587 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8604 for (
unsigned InsertIdx : NonConstIdx) {
8606 Op.getOperand(InsertIdx),
8614 case X86ISD::PACKSS:
8615 case X86ISD::PACKUS:
8649 unsigned BaseIdx,
unsigned LastIdx,
8651 EVT VT =
N->getValueType(0);
8653 assert(BaseIdx * 2 <= LastIdx &&
"Invalid Indices in input!");
8655 "Invalid Vector in input!");
8658 bool CanFold =
true;
8659 unsigned ExpectedVExtractIdx = BaseIdx;
8660 unsigned NumElts = LastIdx - BaseIdx;
8665 for (
unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8669 if (
Op->isUndef()) {
8671 if (i * 2 == NumElts)
8672 ExpectedVExtractIdx = BaseIdx;
8673 ExpectedVExtractIdx += 2;
8677 CanFold =
Op->getOpcode() == Opcode &&
Op->hasOneUse();
8698 if (i * 2 < NumElts) {
8710 if (i * 2 == NumElts)
8711 ExpectedVExtractIdx = BaseIdx;
8715 if (I0 == ExpectedVExtractIdx)
8717 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8724 ExpectedVExtractIdx += 2;
8763 unsigned X86Opcode,
bool Mode,
8764 bool isUndefLO,
bool isUndefHI) {
8767 "Invalid nodes in input!");
8781 if (!isUndefLO && !V0->
isUndef())
8782 LO = DAG.
getNode(X86Opcode,
DL, NewVT, V0_LO, V0_HI);
8783 if (!isUndefHI && !V1->
isUndef())
8784 HI = DAG.
getNode(X86Opcode,
DL, NewVT, V1_LO, V1_HI);
8788 LO = DAG.
getNode(X86Opcode,
DL, NewVT, V0_LO, V1_LO);
8791 HI = DAG.
getNode(X86Opcode,
DL, NewVT, V0_HI, V1_HI);
8805 unsigned &NumExtracts,
bool &IsSubAdd,
8806 bool &HasAllowContract) {
8818 HasAllowContract = NumElts != 0;
8824 unsigned Opc[2] = {0, 0};
8825 for (
unsigned i = 0, e = NumElts; i != e; ++i) {
8829 unsigned Opcode =
Op.getOpcode();
8849 if (
Opc[i % 2] != 0 &&
Opc[i % 2] != Opcode)
8851 Opc[i % 2] = Opcode;
8877 HasAllowContract &=
Op->getFlags().hasAllowContract();
8923 unsigned ExpectedUses,
8924 bool AllowSubAddOrAddSubContract) {
8934 (AllowSubAddOrAddSubContract && Opnd0->
getFlags().hasAllowContract());
8953 unsigned NumExtracts;
8955 bool HasAllowContract;
8956 if (!
isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8965 HasAllowContract)) {
8966 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8980 Mask.push_back(
I +
E + 1);
8987 return DAG.
getNode(X86ISD::ADDSUB,
DL, VT, Opnd0, Opnd1);
9004 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9005 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9006 for (
unsigned i = 0; i != Num128BitChunks; ++i) {
9007 for (
unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9019 GenericOpcode =
Op.getOpcode();
9020 switch (GenericOpcode) {
9022 case ISD::ADD: HOpcode = X86ISD::HADD;
break;
9023 case ISD::SUB: HOpcode = X86ISD::HSUB;
break;
9024 case ISD::FADD: HOpcode = X86ISD::FHADD;
break;
9025 case ISD::FSUB: HOpcode = X86ISD::FHSUB;
break;
9026 default:
return false;
9042 if (j < NumEltsIn64Bits) {
9050 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9057 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9058 (j % NumEltsIn64Bits) * 2;
9059 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9068 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9099 for (
unsigned i = 0; i != NumElts; ++i)
9104 unsigned HalfNumElts = NumElts / 2;
9113 return DAG.
getNode(HOpcode,
DL, VT, V0, V1);
9121 unsigned NumNonUndefs =
9123 if (NumNonUndefs < 2)
9130 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.
hasSSE3()) ||
9131 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.
hasSSSE3()) ||
9132 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.
hasAVX()) ||
9133 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.
hasAVX2())) {
9146 unsigned Half = NumElts / 2;
9147 unsigned NumUndefsLO = 0;
9148 unsigned NumUndefsHI = 0;
9149 for (
unsigned i = 0, e = Half; i != e; ++i)
9153 for (
unsigned i = Half, e = NumElts; i != e; ++i)
9158 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9161 bool CanFold =
true;
9168 X86Opcode = X86ISD::HADD;
9175 X86Opcode = X86ISD::HSUB;
9182 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9191 bool isUndefLO = NumUndefsLO == Half;
9192 bool isUndefHI = NumUndefsHI == Half;
9198 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9199 VT == MVT::v16i16) {
9203 X86Opcode = X86ISD::HADD;
9206 X86Opcode = X86ISD::HSUB;
9209 X86Opcode = X86ISD::FHADD;
9212 X86Opcode = X86ISD::FHSUB;
9218 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9223 bool isUndefLO = NumUndefsLO == Half;
9224 bool isUndefHI = NumUndefsHI == Half;
9226 isUndefLO, isUndefHI);
9244 MVT VT =
Op->getSimpleValueType(0);
9251 unsigned Opcode =
Op->getOperand(0).getOpcode();
9252 for (
unsigned i = 1; i < NumElems; ++i)
9253 if (Opcode !=
Op->getOperand(i).getOpcode())
9257 bool IsShift =
false;
9271 if (
Op->getSplatValue())
9279 bool RHSAllConst =
true;
9298 if (Op1.getValueSizeInBits() != ElemSize)
9304 if (
any_of(RHSElts, [&](
SDValue V) {
return RHSElts[0] != V; }))
9309 return V.getValueSizeInBits() == ElemSize;
9311 "Element size mismatch");
9319 if (!
LHS && !
RHS && !RHSAllConst)
9345 if (VT != MVT::v4f64)
9353 UniqueOps.insert(
Op);
9357 if (UniqueOps.size() != 2u)
9361 UniqueOps.erase(Op0);
9362 SDValue Op1 = *UniqueOps.begin();
9369 for (
auto I = 0u;
I < NumElems; ++
I) {
9371 Mask[
I] =
Op == Op0 ?
I :
I + NumElems;
9392 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9395 unsigned WideBits = 2 * EltBits;
9402 for (
unsigned I = 0;
I != NumElts;
I += 2) {
9418 X.getValueType().bitsGE(WideSVT)) {
9419 if (
X.getValueType().bitsGT(WideSVT))
9428 assert(WideOps.
size() == (NumElts / 2) &&
"Failed to widen build vector");
9438 MVT VT =
Op.getSimpleValueType();
9448 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9470 "Illegal variable permute mask size");
9478 SDLoc(IndicesVec), SizeInBits);
9482 IndicesVT, IndicesVec);
9494 Subtarget, DAG,
SDLoc(IndicesVec));
9519 for (
uint64_t i = 0; i != Scale; ++i) {
9520 IndexScale |= Scale << (i * NumDstBits);
9521 IndexOffset |= i << (i * NumDstBits);
9531 unsigned Opcode = 0;
9537 Opcode = X86ISD::PSHUFB;
9540 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9541 Opcode = X86ISD::VPERMV;
9543 Opcode = X86ISD::PSHUFB;
9544 ShuffleVT = MVT::v16i8;
9549 if (Subtarget.
hasAVX()) {
9550 Opcode = X86ISD::VPERMILPV;
9551 ShuffleVT = MVT::v4f32;
9553 Opcode = X86ISD::PSHUFB;
9554 ShuffleVT = MVT::v16i8;
9559 if (Subtarget.
hasAVX()) {
9562 Opcode = X86ISD::VPERMILPV;
9563 ShuffleVT = MVT::v2f64;
9575 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9576 Opcode = X86ISD::VPERMV;
9577 else if (Subtarget.hasXOP()) {
9584 DAG.
getNode(X86ISD::VPPERM,
DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9585 DAG.
getNode(X86ISD::VPPERM,
DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9586 }
else if (Subtarget.
hasAVX()) {
9609 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9610 Opcode = X86ISD::VPERMV;
9611 else if (Subtarget.
hasAVX()) {
9613 IndicesVec = ScaleIndices(IndicesVec, 2);
9616 MVT::v32i8, DAG.
getBitcast(MVT::v32i8, SrcVec),
9617 DAG.
getBitcast(MVT::v32i8, IndicesVec),
DL, DAG, Subtarget));
9623 Opcode = X86ISD::VPERMV;
9624 else if (Subtarget.
hasAVX()) {
9627 {0, 1, 2, 3, 0, 1, 2, 3});
9629 {4, 5, 6, 7, 4, 5, 6, 7});
9630 if (Subtarget.hasXOP())
9632 VT, DAG.
getNode(X86ISD::VPERMIL2,
DL, MVT::v8f32, LoLo, HiHi,
9638 DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v8f32, HiHi, IndicesVec),
9639 DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v8f32, LoLo, IndicesVec),
9647 if (!Subtarget.hasVLX()) {
9649 SrcVec =
widenSubVector(WidenSrcVT, SrcVec,
false, Subtarget, DAG,
9651 IndicesVec =
widenSubVector(MVT::v8i64, IndicesVec,
false, Subtarget,
9652 DAG,
SDLoc(IndicesVec));
9657 Opcode = X86ISD::VPERMV;
9658 }
else if (Subtarget.
hasAVX()) {
9666 if (Subtarget.hasXOP())
9668 VT, DAG.
getNode(X86ISD::VPERMIL2,
DL, MVT::v4f64, LoLo, HiHi,
9674 DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v4f64, HiHi, IndicesVec),
9675 DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v4f64, LoLo, IndicesVec),
9681 if (Subtarget.hasVBMI())
9682 Opcode = X86ISD::VPERMV;
9685 if (Subtarget.hasBWI())
9686 Opcode = X86ISD::VPERMV;
9693 Opcode = X86ISD::VPERMV;
9701 "Illegal variable permute shuffle type");
9705 IndicesVec = ScaleIndices(IndicesVec, Scale);
9708 IndicesVec = DAG.
getBitcast(ShuffleIdxVT, IndicesVec);
9711 SDValue Res = Opcode == X86ISD::VPERMV
9712 ? DAG.
getNode(Opcode,
DL, ShuffleVT, IndicesVec, SrcVec)
9713 : DAG.
getNode(Opcode,
DL, ShuffleVT, SrcVec, IndicesVec);
9737 for (
unsigned Idx = 0,
E = V.getNumOperands(); Idx !=
E; ++Idx) {
9746 SrcVec =
Op.getOperand(0);
9747 else if (SrcVec !=
Op.getOperand(0))
9749 SDValue ExtractedIndex =
Op->getOperand(1);
9753 ExtractedIndex = ExtractedIndex.
getOperand(0);
9762 else if (IndicesVec != ExtractedIndex.
getOperand(0))
9766 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9770 MVT VT = V.getSimpleValueType();
9778 MVT VT =
Op.getSimpleValueType();
9780 MVT OpEltVT =
Op.getOperand(0).getSimpleValueType();
9788 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9799 bool IsAllConstants =
true;
9800 bool OneUseFrozenUndefs =
true;
9801 SmallSet<SDValue, 8> Values;
9802 unsigned NumConstants = NumElems;
9803 for (
unsigned i = 0; i < NumElems; ++i) {
9810 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->
hasOneUse();
9811 FrozenUndefMask.
setBit(i);
9816 IsAllConstants =
false;
9831 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9835 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9843 if (
unsigned NumFrozenUndefElts = FrozenUndefMask.
popcount();
9844 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9845 SmallVector<int, 16> BlendMask(NumElems, -1);
9847 for (
unsigned i = 0; i < NumElems; ++i) {
9853 if (!FrozenUndefMask[i])
9854 Elts[i] =
Op.getOperand(i);
9856 BlendMask[i] += NumElems;
9871 unsigned UpperElems = NumElems / 2;
9872 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9873 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.
countl_one();
9874 if (NumUpperUndefsOrZeros >= UpperElems) {
9876 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9877 UpperElems = NumElems - (NumElems / 4);
9879 bool UndefUpper = UndefMask.
countl_one() >= UpperElems;
9883 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9890 return HorizontalOp;
9900 unsigned NumZero = ZeroMask.
popcount();
9901 unsigned NumNonZero = NonZeroMask.
popcount();
9909 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9910 FrozenUndefMask.
isZero() &&
9917 Type *EltType =
Op.getValueType().getScalarType().getTypeForEVT(
Context);
9921 for (
unsigned i = 0; i != NumElems; ++i) {
9924 ConstVecOps[i] = ConstantInt::get(
Context,
C->getAPIntValue());
9926 ConstVecOps[i] = ConstantFP::get(
Context,
C->getValueAPF());
9929 "Expected one variable element in this vector");
9943 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9949 if (InsertC < NumEltsInLow128Bits)
9955 assert(Subtarget.hasAVX() &&
"Must have AVX with >16-byte vector");
9956 SmallVector<int, 8> ShuffleMask;
9958 for (
unsigned i = 0; i != NumElts; ++i)
9959 ShuffleMask.
push_back(i == InsertC ? NumElts : i);
9965 if (NumNonZero == 1) {
9977 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9978 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9979 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9982 "Expected an SSE value type!");
9991 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10001 if (NumElems == 2 && Idx == 1 &&
10007 VT,
Op.getOperand(1)),
10008 NumBits/2, DAG, *
this, dl);
10011 if (IsAllConstants)
10019 if (EVTBits == 32) {
10026 if (Values.
size() == 1) {
10027 if (EVTBits == 32) {
10034 if (
Op.getNode()->isOnlyUserOf(Item.
getNode()))
10042 if (IsAllConstants)
10059 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.
size() == 2) {
10064 for (
unsigned i = 2; i != NumElems; ++i)
10065 if (
Ops[i % 2] !=
Op.getOperand(i))
10069 if (CanSplat(
Op, NumElems,
Ops)) {
10091 HVT, dl,
Op->ops().slice(NumElems / 2, NumElems /2));
10098 if (EVTBits == 64) {
10099 if (NumNonZero == 1) {
10103 Op.getOperand(Idx));
10110 if (EVTBits == 8 && NumElems == 16)
10112 NumZero, DAG, Subtarget))
10115 if (EltVT == MVT::i16 && NumElems == 8)
10117 NumZero, DAG, Subtarget))
10121 if (EVTBits == 32 && NumElems == 4)
10126 if (NumElems == 4 && NumZero > 0) {
10128 for (
unsigned i = 0; i < 4; ++i) {
10129 bool isZero = !NonZeroMask[i];
10136 for (
unsigned i = 0; i < 2; ++i) {
10143 Ops[i] = getMOVL(DAG, dl, VT,
Ops[i*2+1],
Ops[i*2]);
10146 Ops[i] = getMOVL(DAG, dl, VT,
Ops[i*2],
Ops[i*2+1]);
10159 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10160 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10165 assert(Values.
size() > 1 &&
"Expected non-undef and non-splat vector");
10172 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10174 if (!
Op.getOperand(0).isUndef())
10179 for (
unsigned i = 1; i < NumElems; ++i) {
10180 if (
Op.getOperand(i).isUndef())
continue;
10191 for (
unsigned i = 0; i < NumElems; ++i) {
10192 if (!
Op.getOperand(i).isUndef())
10202 for (
unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10204 SmallVector<int, 16>
Mask;
10205 for(
unsigned i = 0; i != Scale; ++i)
10207 for (
unsigned i = 0; i != Scale; ++i)
10208 Mask.push_back(NumElems+i);
10211 for (
unsigned i = 0, e = NumElems / (2 * Scale); i !=
e; ++i)
10223 MVT ResVT =
Op.getSimpleValueType();
10225 "Value type must be 256-/512-bit wide");
10227 unsigned NumOperands =
Op.getNumOperands();
10228 unsigned NumFreezeUndef = 0;
10229 unsigned NumZero = 0;
10230 unsigned NumNonZero = 0;
10231 unsigned NonZeros = 0;
10233 for (
unsigned i = 0; i != NumOperands; ++i) {
10249 assert(i <
sizeof(NonZeros) * CHAR_BIT);
10250 NonZeros |= 1 << i;
10256 if (NumNonZero > 2) {
10260 Ops.slice(0, NumOperands/2));
10262 Ops.slice(NumOperands/2));
10274 U,
getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
10276 MVT SubVT =
Op.getOperand(0).getSimpleValueType();
10278 for (
unsigned i = 0; i != NumOperands; ++i) {
10279 if ((NonZeros & (1 << i)) == 0)
10296 MVT ResVT =
Op.getSimpleValueType();
10297 unsigned NumOperands =
Op.getNumOperands();
10299 "Unexpected number of operands in CONCAT_VECTORS");
10303 for (
unsigned i = 0; i != NumOperands; ++i) {
10307 assert(i <
sizeof(NonZeros) * CHAR_BIT);
10319 if (
isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10320 Log2_64(NonZeros) != NumOperands - 1) {
10321 unsigned Idx =
Log2_64(NonZeros);
10326 Op = DAG.
getNode(X86ISD::KSHIFTL, dl, ShiftVT,
Op,
10337 unsigned Idx =
Log2_64(NonZeros);
10344 if (NumOperands > 2) {
10348 Ops.slice(0, NumOperands / 2));
10350 Ops.slice(NumOperands / 2));
10370 MVT VT =
Op.getSimpleValueType();
10379 (
Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
10397 int Idx,
int ExpectedIdx) {
10398 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10399 ExpectedIdx < MaskSize &&
"Out of range element index");
10400 if (!
Op || !ExpectedOp ||
Op.getOpcode() != ExpectedOp.
getOpcode())
10403 EVT VT =
Op.getValueType();
10413 if (Idx == ExpectedIdx &&
Op == ExpectedOp)
10416 switch (
Op.getOpcode()) {
10420 return Op.getOperand(Idx) == ExpectedOp.
getOperand(ExpectedIdx);
10423 EVT SrcVT = Src.getValueType();
10427 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10429 Idx / Scale, ExpectedIdx / Scale);
10433 for (
unsigned I = 0;
I != Scale; ++
I)
10436 (ExpectedIdx * Scale) +
I))
10445 return Op == ExpectedOp &&
10446 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10448 case X86ISD::VBROADCAST:
10449 case X86ISD::VBROADCAST_LOAD:
10450 return Op == ExpectedOp;
10451 case X86ISD::SUBV_BROADCAST_LOAD:
10452 if (
Op == ExpectedOp) {
10454 unsigned NumMemElts =
MemOp->getMemoryVT().getVectorNumElements();
10455 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10458 case X86ISD::VPERMI: {
10459 if (
Op == ExpectedOp) {
10464 Mask[ExpectedIdx]);
10470 case X86ISD::FHADD:
10471 case X86ISD::FHSUB:
10472 case X86ISD::PACKSS:
10473 case X86ISD::PACKUS:
10476 if (
Op == ExpectedOp &&
Op.getOperand(0) ==
Op.getOperand(1)) {
10479 int NumEltsPerLane = NumElts / NumLanes;
10480 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10481 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10483 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10484 return SameLane && SameElt;
10500 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
10501 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
10502 if (Mask[i] >= 0 && Mask[i] != i)
10514 unsigned ScalarSizeInBits,
10516 assert(LaneSizeInBits && ScalarSizeInBits &&
10517 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10518 "Illegal shuffle lane size");
10519 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10520 int Size = Mask.size();
10521 for (
int i = 0; i <
Size; ++i)
10522 if (Mask[i] >= 0 && (Mask[i] %
Size) / LaneSize != i / LaneSize)
10537 unsigned ScalarSizeInBits,
10539 assert(LaneSizeInBits && ScalarSizeInBits &&
10540 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10541 "Illegal shuffle lane size");
10542 int NumElts = Mask.size();
10543 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10544 int NumLanes = NumElts / NumEltsPerLane;
10545 if (NumLanes > 1) {
10546 for (
int i = 0; i != NumLanes; ++i) {
10548 for (
int j = 0; j != NumEltsPerLane; ++j) {
10549 int M = Mask[(i * NumEltsPerLane) + j];
10552 int Lane = (M % NumElts) / NumEltsPerLane;
10553 if (SrcLane >= 0 && SrcLane != Lane)
10577 RepeatedMask.
assign(LaneSize, -1);
10578 int Size = Mask.size();
10579 for (
int i = 0; i <
Size; ++i) {
10583 if ((Mask[i] %
Size) / LaneSize != i / LaneSize)
10589 int LocalM = Mask[i] <
Size ? Mask[i] % LaneSize
10590 : Mask[i] % LaneSize + LaneSize;
10591 if (RepeatedMask[i % LaneSize] < 0)
10593 RepeatedMask[i % LaneSize] = LocalM;
10594 else if (RepeatedMask[i % LaneSize] != LocalM)
10624 unsigned EltSizeInBits,
10627 int LaneSize = LaneSizeInBits / EltSizeInBits;
10629 int Size = Mask.size();
10630 for (
int i = 0; i <
Size; ++i) {
10640 if ((Mask[i] %
Size) / LaneSize != i / LaneSize)
10646 int LaneM = Mask[i] /
Size;
10647 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10650 RepeatedMask[i % LaneSize] = LocalM;
10651 else if (RepeatedMask[i % LaneSize] != LocalM)
10664 Mask, RepeatedMask);
10680 int Size = Mask.size();
10681 if (
Size != (
int)ExpectedMask.
size())
10684 for (
int i = 0; i <
Size; ++i) {
10685 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
10686 int MaskIdx = Mask[i];
10687 int ExpectedIdx = ExpectedMask[i];
10688 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10690 SDValue ExpectedV = ExpectedIdx <
Size ? V1 : V2;
10691 MaskIdx = MaskIdx <
Size ? MaskIdx : (MaskIdx -
Size);
10692 ExpectedIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
10714 int Size = Mask.size();
10715 if (
Size != (
int)ExpectedMask.
size())
10722 "Illegal target shuffle mask");
10729 if (V1 && (V1.getValueSizeInBits() != VT.
getSizeInBits() ||
10730 !V1.getValueType().isVector()))
10732 if (V2 && (V2.getValueSizeInBits() != VT.
getSizeInBits() ||
10733 !V2.getValueType().isVector()))
10739 for (
int i = 0; i <
Size; ++i) {
10740 int MaskIdx = Mask[i];
10741 int ExpectedIdx = ExpectedMask[i];
10745 if (ExpectedIdx < 0)
10751 SDValue ExpectedV = ExpectedIdx <
Size ? V1 : V2;
10754 int BitIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
10755 APInt &ZeroMask = ExpectedIdx <
Size ? ZeroV1 : ZeroV2;
10756 ZeroMask.
setBit(BitIdx);
10760 if (MaskIdx >= 0) {
10762 SDValue ExpectedV = ExpectedIdx <
Size ? V1 : V2;
10763 MaskIdx = MaskIdx <
Size ? MaskIdx : (MaskIdx -
Size);
10764 ExpectedIdx = ExpectedIdx <
Size ? ExpectedIdx : (ExpectedIdx -
Size);
10778 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10789 return IsUnpackwdMask;
10803 for (
unsigned i = 0; i != 4; ++i) {
10818 assert(Mask.size() % 2 == 0 &&
"Expecting even number of elements in mask");
10819 unsigned HalfSize = Mask.size() / 2;
10820 for (
unsigned i = 0; i != HalfSize; ++i) {
10821 if (Mask[i] != Mask[i + HalfSize])
10836 assert(Mask.size() == 4 &&
"Only 4-lane shuffle masks");
10837 assert(Mask[0] >= -1 && Mask[0] < 4 &&
"Out of bound mask element!");
10838 assert(Mask[1] >= -1 && Mask[1] < 4 &&
"Out of bound mask element!");
10839 assert(Mask[2] >= -1 && Mask[2] < 4 &&
"Out of bound mask element!");
10840 assert(Mask[3] >= -1 && Mask[3] < 4 &&
"Out of bound mask element!");
10844 int FirstIndex =
find_if(Mask, [](
int M) {
return M >= 0; }) - Mask.begin();
10845 assert(0 <= FirstIndex && FirstIndex < 4 &&
"All undef shuffle mask");
10847 int FirstElt = Mask[FirstIndex];
10848 if (
all_of(Mask, [FirstElt](
int M) {
return M < 0 || M == FirstElt; }))
10849 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10852 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10853 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10854 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10855 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10867 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10868 "Unexpected SHUFPD mask size");
10869 assert(
all_of(Mask, [](
int M) {
return -1 <= M && M <= 1; }) &&
10870 "Unexpected SHUFPD mask elements");
10874 int FirstIndex =
find_if(Mask, [](
int M) {
return M >= 0; }) - Mask.begin();
10875 assert(0 <= FirstIndex && FirstIndex < (
int)Mask.size() &&
10876 "All undef shuffle mask");
10878 int FirstElt = Mask[FirstIndex];
10879 if (
all_of(Mask, [FirstElt](
int M) {
return M < 0 || M == FirstElt; }) &&
10880 count_if(Mask, [FirstElt](
int M) {
return M == FirstElt; }) > 1) {
10882 for (
unsigned I = 0,
E = Mask.size();
I !=
E; ++
I)
10883 Imm |= FirstElt <<
I;
10890 for (
unsigned I = 0,
E = Mask.size();
I !=
E; ++
I)
10891 Imm |= (Mask[
I] < 0 ? (
I & 1) : Mask[
I]) <<
I;
10910 bool &IsZeroSideLeft) {
10911 int NextElement = -1;
10913 for (
int i = 0, e = Mask.size(); i < e; i++) {
10915 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
10921 if (NextElement < 0) {
10922 NextElement = Mask[i] != 0 ?
VectorType.getVectorNumElements() : 0;
10923 IsZeroSideLeft = NextElement != 0;
10926 if (NextElement != Mask[i])
10936 unsigned Depth = 0);
10944 int Size = Mask.size();
10958 for (
int i = 0; i < NumBytes; ++i) {
10959 int M = Mask[i / NumEltBytes];
10961 PSHUFBMask[i] = DAG.
getUNDEF(MVT::i8);
10964 if (Zeroable[i / NumEltBytes]) {
10965 PSHUFBMask[i] = ZeroMask;
10971 if (V && V != SrcV)
10977 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10981 M = M * NumEltBytes + (i % NumEltBytes);
10984 assert(V &&
"Failed to find a source input");
10997 MVT SrcVT = Mask.getSimpleValueType();
10999 assert(MaskVT.
bitsLE(SrcVT) &&
"Unexpected mask size!");
11013 if (SrcVT == MVT::i64 && Subtarget.is32Bit()) {
11014 assert(MaskVT == MVT::v64i1 &&
"Expected v64i1 mask!");
11015 assert(Subtarget.hasBWI() &&
"Expected AVX512BW target!");
11035 const APInt &Zeroable,
11038 bool IsLeftZeroSide =
true;
11042 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11047 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11048 "Unexpected number of vector elements");
11050 Subtarget, DAG,
DL);
11052 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11053 return DAG.
getNode(X86ISD::EXPAND,
DL, VT, ExpandedVector, ZeroVector, VMask);
11057 unsigned &UnpackOpcode,
bool IsUnary,
11063 bool Undef1 =
true, Undef2 =
true, Zero1 =
true, Zero2 =
true;
11064 for (
int i = 0; i != NumElts; i += 2) {
11065 int M1 = TargetMask[i + 0];
11066 int M2 = TargetMask[i + 1];
11072 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11073 "Zeroable shuffle detected");
11079 (IsUnary ? V1 : V2))) {
11080 UnpackOpcode = X86ISD::UNPCKL;
11081 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
11082 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
11088 (IsUnary ? V1 : V2))) {
11089 UnpackOpcode = X86ISD::UNPCKH;
11090 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
11091 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
11096 if (IsUnary && (Zero1 || Zero2)) {
11098 if ((Subtarget.
hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11102 bool MatchLo =
true, MatchHi =
true;
11103 for (
int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11104 int M = TargetMask[i];
11107 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11111 MatchLo &= (M == Unpckl[i]);
11112 MatchHi &= (M == Unpckh[i]);
11115 if (MatchLo || MatchHi) {
11116 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11127 UnpackOpcode = X86ISD::UNPCKL;
11134 UnpackOpcode = X86ISD::UNPCKH;
11151 return DAG.
getNode(X86ISD::UNPCKL,
DL, VT, V1, V2);
11156 return DAG.
getNode(X86ISD::UNPCKH,
DL, VT, V1, V2);
11161 return DAG.
getNode(X86ISD::UNPCKL,
DL, VT, V2, V1);
11165 return DAG.
getNode(X86ISD::UNPCKH,
DL, VT, V2, V1);
11179 unsigned UnpackOpcode;
11181 UnpackOpcode = X86ISD::UNPCKL;
11183 UnpackOpcode = X86ISD::UNPCKH;
11191 DAG.
getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11193 return DAG.
getNode(UnpackOpcode,
DL, VT, V1, V1);
11204 unsigned NumElts = Mask.size();
11206 unsigned MaxScale = 64 / EltSizeInBits;
11208 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11209 unsigned SrcEltBits = EltSizeInBits * Scale;
11210 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11212 unsigned NumSrcElts = NumElts / Scale;
11215 unsigned UpperElts = NumElts - NumSrcElts;
11221 if ((NumSrcElts * EltSizeInBits) >= 128) {
11239 MVT SrcVT = Src.getSimpleValueType();
11249 if (NumSrcElts == NumDstElts)
11252 if (NumSrcElts > NumDstElts) {
11258 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11275 if (DstVT != TruncVT)
11299 const APInt &Zeroable,
11302 assert((VT == MVT::v16i8 || VT == MVT::v8i16) &&
"Unexpected VTRUNC type");
11308 unsigned MaxScale = 64 / EltSizeInBits;
11309 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11310 unsigned SrcEltBits = EltSizeInBits * Scale;
11311 unsigned NumSrcElts = NumElts / Scale;
11312 unsigned UpperElts = NumElts - NumSrcElts;
11321 Src.getScalarValueSizeInBits() == SrcEltBits) {
11322 Src = Src.getOperand(0);
11323 }
else if (Subtarget.hasVLX()) {
11336 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
11349 const APInt &Zeroable,
11353 "Unexpected VTRUNC type");
11360 unsigned MaxScale = 64 / EltSizeInBits;
11361 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11363 unsigned SrcEltBits = EltSizeInBits * Scale;
11364 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11369 unsigned NumHalfSrcElts = NumElts / Scale;
11370 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11377 unsigned UpperElts = NumElts - NumSrcElts;
11378 if (UpperElts > 0 &&
11400 Offset * EltSizeInBits, DAG);
11435 bool IsSingleInput) {
11438 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11440 "We should only be called with masks with a power-of-2 size!");
11443 int Offset = MatchEven ? 0 : 1;
11448 bool ViableForN[3] = {
true,
true,
true};
11450 for (
int i = 0, e = Mask.size(); i < e; ++i) {
11456 bool IsAnyViable =
false;
11457 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
11458 if (ViableForN[j]) {
11463 IsAnyViable =
true;
11465 ViableForN[j] =
false;
11472 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
11488 unsigned MaxStages = 1) {
11491 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11492 "Illegal maximum compaction");
11495 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11496 unsigned NumPackedBits = NumSrcBits - BitSize;
11500 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11503 if ((!N1.
isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11504 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11506 if (Subtarget.
hasSSE41() || BitSize == 8) {
11513 PackOpcode = X86ISD::PACKUS;
11519 if ((N1.
isUndef() || IsZero1 || IsAllOnes1 ||
11521 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11526 PackOpcode = X86ISD::PACKSS;
11533 for (
unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11541 if (MatchPACK(V1, V2, PackVT))
11548 if (MatchPACK(V1, V1, PackVT))
11560 unsigned PackOpcode;
11563 unsigned MaxStages =
Log2_32(64 / EltBits);
11565 Subtarget, MaxStages))
11569 unsigned NumStages =
Log2_32(CurrentEltBits / EltBits);
11572 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11577 unsigned MaxPackBits = 16;
11578 if (CurrentEltBits > 16 &&
11579 (PackOpcode == X86ISD::PACKSS || Subtarget.
hasSSE41()))
11584 for (
unsigned i = 0; i != NumStages; ++i) {
11585 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11586 unsigned NumSrcElts = SizeBits / SrcEltBits;
11594 CurrentEltBits /= 2;
11597 "Failed to lower compaction shuffle");
11607 const APInt &Zeroable,
11614 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11632 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
11635 if (Mask[i] %
Size != i)
11638 V = Mask[i] <
Size ? V1 : V2;
11639 else if (V != (Mask[i] <
Size ? V1 : V2))
11667 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
11668 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i +
Size)
11670 MaskOps.push_back(Mask[i] <
Size ?
AllOnes : Zero);
11684 const APInt &Zeroable,
bool &ForceV1Zero,
11685 bool &ForceV2Zero,
uint64_t &BlendMask) {
11686 bool V1IsZeroOrUndef =
11688 bool V2IsZeroOrUndef =
11692 ForceV1Zero =
false, ForceV2Zero =
false;
11693 assert(Mask.size() <= 64 &&
"Shuffle mask too big for blend mask");
11695 int NumElts = Mask.size();
11697 int NumEltsPerLane = NumElts / NumLanes;
11698 assert((NumLanes * NumEltsPerLane) == NumElts &&
"Value type mismatch");
11702 bool ForceWholeLaneMasks =
11707 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
11709 bool LaneV1InUse =
false;
11710 bool LaneV2InUse =
false;
11712 for (
int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11713 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11717 if (M == Elt || (0 <= M && M < NumElts &&
11720 LaneV1InUse =
true;
11723 if (M == (Elt + NumElts) ||
11726 LaneBlendMask |= 1ull << LaneElt;
11727 Mask[Elt] = Elt + NumElts;
11728 LaneV2InUse =
true;
11731 if (Zeroable[Elt]) {
11732 if (V1IsZeroOrUndef) {
11733 ForceV1Zero =
true;
11735 LaneV1InUse =
true;
11738 if (V2IsZeroOrUndef) {
11739 ForceV2Zero =
true;
11740 LaneBlendMask |= 1ull << LaneElt;
11741 Mask[Elt] = Elt + NumElts;
11742 LaneV2InUse =
true;
11752 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11753 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11755 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11768 const APInt &Zeroable,
11772 bool ForceV1Zero =
false, ForceV2Zero =
false;
11789 assert(Subtarget.
hasAVX2() &&
"256-bit integer blends require AVX2!");
11793 assert(Subtarget.
hasAVX() &&
"256-bit float blends require AVX!");
11800 assert(Subtarget.
hasSSE41() &&
"128-bit blends require SSE41!");
11801 return DAG.
getNode(X86ISD::BLENDI,
DL, VT, V1, V2,
11803 case MVT::v16i16: {
11804 assert(Subtarget.
hasAVX2() &&
"v16i16 blends require AVX2!");
11808 assert(RepeatedMask.
size() == 8 &&
"Repeated mask size doesn't match!");
11810 for (
int i = 0; i < 8; ++i)
11811 if (RepeatedMask[i] >= 8)
11812 BlendMask |= 1ull << i;
11813 return DAG.
getNode(X86ISD::BLENDI,
DL, MVT::v16i16, V1, V2,
11819 uint64_t LoMask = BlendMask & 0xFF;
11820 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11821 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11827 MVT::v16i16,
DL,
Lo,
Hi,
11828 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11833 assert(Subtarget.
hasAVX2() &&
"256-bit byte-blends require AVX2!");
11836 assert(Subtarget.
hasSSE41() &&
"128-bit byte-blends require SSE41!");
11843 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11850 if (Subtarget.hasVLX())
11883 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
11884 for (
int j = 0; j < Scale; ++j)
11931 bool ImmBlends =
false) {
11937 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
11941 assert(Mask[i] <
Size * 2 &&
"Shuffle input is out of bounds.");
11943 if (BlendMask[Mask[i] %
Size] < 0)
11944 BlendMask[Mask[i] %
Size] = Mask[i];
11945 else if (BlendMask[Mask[i] %
Size] != Mask[i])
11948 PermuteMask[i] = Mask[i] %
Size;
11970 int NumElts = Mask.size();
11972 int NumLaneElts = NumElts / NumLanes;
11973 int NumHalfLaneElts = NumLaneElts / 2;
11975 bool MatchLo =
true, MatchHi =
true;
11979 for (
int Elt = 0; Elt != NumElts; ++Elt) {
11987 if (M < NumElts && (
Op.isUndef() ||
Op == V1))
11989 else if (NumElts <= M && (
Op.isUndef() ||
Op == V2)) {
11995 bool MatchLoAnyLane =
false, MatchHiAnyLane =
false;
11996 for (
int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11997 int Lo = Lane, Mid = Lane + NumHalfLaneElts,
Hi = Lane + NumLaneElts;
12000 if (MatchLoAnyLane || MatchHiAnyLane) {
12001 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
12002 "Failed to match UNPCKLO/UNPCKHI");
12006 MatchLo &= MatchLoAnyLane;
12007 MatchHi &= MatchHiAnyLane;
12008 if (!MatchLo && !MatchHi)
12011 assert((MatchLo ^ MatchHi) &&
"Failed to match UNPCKLO/UNPCKHI");
12017 for (
int Elt = 0; Elt != NumElts; ++Elt) {
12024 bool IsFirstOp = M < NumElts;
12026 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
12027 if ((IsFirstOp && V1 ==
Ops[0]) || (!IsFirstOp && V2 ==
Ops[0]))
12028 PermuteMask[Elt] = BaseMaskElt;
12029 else if ((IsFirstOp && V1 ==
Ops[1]) || (!IsFirstOp && V2 ==
Ops[1]))
12030 PermuteMask[Elt] = BaseMaskElt + 1;
12031 assert(PermuteMask[Elt] != -1 &&
12032 "Input mask element is defined but failed to assign permute mask");
12035 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12054 int Size = Mask.size();
12055 assert(Mask.size() >= 2 &&
"Single element masks are invalid.");
12066 bool UnpackLo = NumLoInputs >= NumHiInputs;
12068 auto TryUnpack = [&](
int ScalarSize,
int Scale) {
12072 for (
int i = 0; i <
Size; ++i) {
12077 int UnpackIdx = i / Scale;
12081 if ((UnpackIdx % 2 == 0) != (Mask[i] <
Size))
12087 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 :
Size / 2)] =
12109 VT, DAG.
getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
DL,
12110 UnpackVT, V1, V2));
12116 for (
int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12117 if (
SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12128 if (NumLoInputs == 0 || NumHiInputs == 0) {
12129 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12130 "We have to have *some* inputs!");
12131 int HalfOffset = NumLoInputs == 0 ?
Size / 2 : 0;
12139 for (
int i = 0; i <
Size; ++i) {
12143 assert(Mask[i] %
Size >= HalfOffset &&
"Found input from wrong half!");
12146 2 * ((Mask[i] %
Size) - HalfOffset) + (Mask[i] <
Size ? 0 : 1);
12150 DAG.
getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT,
12175 int NumEltsPerLane = NumElts / NumLanes;
12178 bool Blend1 =
true;
12179 bool Blend2 =
true;
12180 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12181 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12182 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12183 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12184 int M = Mask[Lane + Elt];
12188 Blend1 &= (M == (Lane + Elt));
12189 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
12190 M = M % NumEltsPerLane;
12191 Range1.first = std::min(Range1.first, M);
12192 Range1.second = std::max(Range1.second, M);
12195 Blend2 &= (M == (Lane + Elt));
12196 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
12197 M = M % NumEltsPerLane;
12198 Range2.first = std::min(Range2.first, M);
12199 Range2.second = std::max(Range2.second, M);
12207 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12208 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12222 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12223 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12224 int M = Mask[Lane + Elt];
12228 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12230 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12237 if (Range2.second < Range1.first)
12238 return RotateAndPermute(V1, V2, Range1.first, 0);
12239 if (Range1.second < Range2.first)
12240 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12254 size_t NumUndefs = 0;
12255 std::optional<int> UniqueElt;
12256 for (
int Elt : Mask) {
12261 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
12267 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
12280 int NumElts = Mask.size();
12282 int NumEltsPerLane = NumElts / NumLanes;
12286 bool IsAlternating =
true;
12287 bool V1Zero =
true, V2Zero =
true;
12291 for (
int i = 0; i < NumElts; ++i) {
12293 if (M >= 0 && M < NumElts) {
12296 V1Zero &= Zeroable[i];
12297 IsAlternating &= (i & 1) == 0;
12298 }
else if (M >= NumElts) {
12299 V2Mask[i] = M - NumElts;
12300 FinalMask[i] = i + NumElts;
12301 V2Zero &= Zeroable[i];
12302 IsAlternating &= (i & 1) == 1;
12309 auto canonicalizeBroadcastableInput = [
DL, VT, &Subtarget,
12312 unsigned EltSizeInBits =
Input.getScalarValueSizeInBits();
12313 if (!Subtarget.
hasAVX2() && (!Subtarget.
hasAVX() || EltSizeInBits < 32 ||
12319 "Expected to demand only the 0'th element.");
12322 int &InputMaskElt =
I.value();
12323 if (InputMaskElt >= 0)
12324 InputMaskElt =
I.index();
12334 canonicalizeBroadcastableInput(V1, V1Mask);
12335 canonicalizeBroadcastableInput(V2, V2Mask);
12349 DL, VT, V1, V2, Mask, Subtarget, DAG))
12370 DL, VT, V1, V2, Mask, Subtarget, DAG))
12380 DL, VT, V1, V2, Mask, Subtarget, DAG))
12389 V1Mask.
assign(NumElts, -1);
12390 V2Mask.
assign(NumElts, -1);
12391 FinalMask.
assign(NumElts, -1);
12392 for (
int i = 0; i != NumElts; i += NumEltsPerLane)
12393 for (
int j = 0; j != NumEltsPerLane; ++j) {
12394 int M = Mask[i + j];
12395 if (M >= 0 && M < NumElts) {
12396 V1Mask[i + (j / 2)] = M;
12397 FinalMask[i + j] = i + (j / 2);
12398 }
else if (M >= NumElts) {
12399 V2Mask[i + (j / 2)] = M - NumElts;
12400 FinalMask[i + j] = i + (j / 2) + NumElts;
12414 assert(EltSizeInBits < 64 &&
"Can't rotate 64-bit integers");
12417 int MinSubElts = Subtarget.
hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12418 int MaxSubElts = 64 / EltSizeInBits;
12419 unsigned RotateAmt, NumSubElts;
12421 MaxSubElts, NumSubElts, RotateAmt))
12423 unsigned NumElts = Mask.size();
12438 if (!IsLegal && Subtarget.
hasSSE3())
12451 if ((RotateAmt % 16) == 0)
12453 unsigned ShlAmt = RotateAmt;
12474 int NumElts = Mask.size();
12485 for (
int i = 0; i < NumElts; ++i) {
12488 "Unexpected mask index.");
12493 int StartIdx = i - (M % NumElts);
12501 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12504 Rotation = CandidateRotation;
12505 else if (Rotation != CandidateRotation)
12510 SDValue MaskV = M < NumElts ? V1 : V2;
12521 else if (TargetV != MaskV)
12528 assert(Rotation != 0 &&
"Failed to locate a viable rotation!");
12529 assert((
Lo ||
Hi) &&
"Failed to find a rotated input vector!");
12574 int NumElts = RepeatedMask.
size();
12575 int Scale = 16 / NumElts;
12576 return Rotation * Scale;
12587 if (ByteRotation <= 0)
12599 "512-bit PALIGNR requires BWI instructions");
12606 "Rotate-based lowering only supports 128-bit lowering!");
12607 assert(Mask.size() <= 16 &&
12608 "Can shuffle at most 16 bytes in a 128-bit vector!");
12609 assert(ByteVT == MVT::v16i8 &&
12610 "SSE2 rotate lowering only needed for v16i8!");
12613 int LoByteShift = 16 - ByteRotation;
12614 int HiByteShift = ByteRotation;
12638 const APInt &Zeroable,
12642 "Only 32-bit and 64-bit elements are supported!");
12646 &&
"VLX required for 128/256-bit vectors");
12658 unsigned NumElts = Mask.size();
12661 assert((ZeroLo + ZeroHi) < NumElts &&
"Zeroable shuffle detected");
12662 if (!ZeroLo && !ZeroHi)
12666 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12667 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12669 return DAG.
getNode(X86ISD::VALIGN,
DL, VT, Src,
12675 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12676 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12678 return DAG.
getNode(X86ISD::VALIGN,
DL, VT,
12689 const APInt &Zeroable,
12699 if (!ZeroLo && !ZeroHi)
12702 unsigned NumElts = Mask.size();
12703 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12713 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12722 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12723 Res = DAG.
getNode(X86ISD::VSHLDQ,
DL, MVT::v16i8, Res,
12725 Res = DAG.
getNode(X86ISD::VSRLDQ,
DL, MVT::v16i8, Res,
12727 }
else if (ZeroHi == 0) {
12728 unsigned Shift = Mask[ZeroLo] % NumElts;
12729 Res = DAG.
getNode(X86ISD::VSRLDQ,
DL, MVT::v16i8, Res,
12731 Res = DAG.
getNode(X86ISD::VSHLDQ,
DL, MVT::v16i8, Res,
12733 }
else if (!Subtarget.
hasSSSE3()) {
12737 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12738 Res = DAG.
getNode(X86ISD::VSHLDQ,
DL, MVT::v16i8, Res,
12740 Shift += Mask[ZeroLo] % NumElts;
12741 Res = DAG.
getNode(X86ISD::VSRLDQ,
DL, MVT::v16i8, Res,
12743 Res = DAG.
getNode(X86ISD::VSHLDQ,
DL, MVT::v16i8, Res,
12776 int MaskOffset,
const APInt &Zeroable,
12778 int Size = Mask.size();
12779 unsigned SizeInBits =
Size * ScalarSizeInBits;
12781 auto CheckZeros = [&](
int Shift,
int Scale,
bool Left) {
12782 for (
int i = 0; i <
Size; i += Scale)
12783 for (
int j = 0; j < Shift; ++j)
12784 if (!Zeroable[i + j + (
Left ? 0 : (Scale - Shift))])
12790 auto MatchShift = [&](
int Shift,
int Scale,
bool Left) {
12791 for (
int i = 0; i !=
Size; i += Scale) {
12792 unsigned Pos =
Left ? i + Shift : i;
12793 unsigned Low =
Left ? i : i + Shift;
12794 unsigned Len = Scale - Shift;
12799 int ShiftEltBits = ScalarSizeInBits * Scale;
12800 bool ByteShift = ShiftEltBits > 64;
12801 Opcode =
Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12802 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12803 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12807 Scale = ByteShift ? Scale / 2 : Scale;
12822 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12823 for (
int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12824 for (
int Shift = 1; Shift != Scale; ++Shift)
12825 for (
bool Left : {
true,
false})
12826 if (CheckZeros(Shift, Scale,
Left)) {
12827 int ShiftAmt = MatchShift(Shift, Scale,
Left);
12838 const APInt &Zeroable,
12841 int Size = Mask.size();
12850 Mask, 0, Zeroable, Subtarget);
12853 if (ShiftAmt < 0) {
12855 Mask,
Size, Zeroable, Subtarget);
12862 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12866 "Illegal integer vector type");
12868 V = DAG.
getNode(Opcode,
DL, ShiftVT, V,
12878 int Size = Mask.size();
12879 int HalfSize =
Size / 2;
12889 int Len = HalfSize;
12890 for (; Len > 0; --Len)
12891 if (!Zeroable[Len - 1])
12893 assert(Len > 0 &&
"Zeroable shuffle mask");
12898 for (
int i = 0; i != Len; ++i) {
12907 if (i > M || M >= HalfSize)
12910 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12918 if (!Src || Idx < 0)
12921 assert((Idx + Len) <= HalfSize &&
"Illegal extraction mask");
12934 int Size = Mask.size();
12935 int HalfSize =
Size / 2;
12942 for (
int Idx = 0; Idx != HalfSize; ++Idx) {
12958 for (
int Hi = Idx + 1;
Hi <= HalfSize; ++
Hi) {
12960 int Len =
Hi - Idx;
12974 }
else if ((!
Base || (
Base == V1)) &&
12977 }
else if ((!
Base || (
Base == V2)) &&
13002 return DAG.
getNode(X86ISD::EXTRQI,
DL, VT, V1,
13026 unsigned ExtOpc,
SDValue InputV,
13030 assert(Scale > 1 &&
"Need a scale to extend.");
13034 int NumEltsPerLane = 128 / EltBits;
13035 int OffsetLane =
Offset / NumEltsPerLane;
13036 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13037 "Only 8, 16, and 32 bit elements can be extended.");
13038 assert(Scale * EltBits <= 64 &&
"Cannot zero extend past 64 bits.");
13039 assert(0 <=
Offset &&
"Extension offset must be positive.");
13041 "Extension offset must be in the first lane or start an upper lane.");
13044 auto SafeOffset = [&](
int Idx) {
13045 return OffsetLane == (Idx / NumEltsPerLane);
13049 auto ShuffleOffset = [&](
SDValue V) {
13054 for (
int i = 0; i * Scale < NumElements; ++i) {
13055 int SrcIdx = i +
Offset;
13056 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13069 NumElements / Scale);
13071 InputV = ShuffleOffset(InputV);
13086 if (AnyExt && EltBits == 32) {
13090 VT, DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v4i32,
13094 if (AnyExt && EltBits == 16 && Scale > 2) {
13095 int PSHUFDMask[4] = {
Offset / 2, -1,
13097 InputV = DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v4i32,
13100 int PSHUFWMask[4] = {1, -1, -1, -1};
13101 unsigned OddEvenOp = (
Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13103 VT, DAG.
getNode(OddEvenOp,
DL, MVT::v8i16,
13110 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13111 assert(NumElements == (
int)Mask.size() &&
"Unexpected shuffle mask size!");
13114 int LoIdx =
Offset * EltBits;
13116 MVT::v2i64, DAG.
getNode(X86ISD::EXTRQI,
DL, VT, InputV,
13123 int HiIdx = (
Offset + 1) * EltBits;
13125 MVT::v2i64, DAG.
getNode(X86ISD::EXTRQI,
DL, VT, InputV,
13135 if (Scale > 4 && EltBits == 8 && Subtarget.
hasSSSE3()) {
13136 assert(NumElements == 16 &&
"Unexpected byte vector width!");
13138 for (
int i = 0; i < 16; ++i) {
13139 int Idx =
Offset + (i / Scale);
13140 if ((i % Scale == 0 && SafeOffset(Idx))) {
13147 InputV = DAG.
getBitcast(MVT::v16i8, InputV);
13149 VT, DAG.
getNode(X86ISD::PSHUFB,
DL, MVT::v16i8, InputV,
13155 int AlignToUnpack =
Offset % (NumElements / Scale);
13156 if (AlignToUnpack) {
13158 for (
int i = AlignToUnpack; i < NumElements; ++i)
13159 ShMask[i - AlignToUnpack] = i;
13161 Offset -= AlignToUnpack;
13166 unsigned UnpackLoHi = X86ISD::UNPCKL;
13167 if (
Offset >= (NumElements / 2)) {
13168 UnpackLoHi = X86ISD::UNPCKH;
13169 Offset -= (NumElements / 2);
13176 InputV = DAG.
getNode(UnpackLoHi,
DL, InputVT, InputV, Ext);
13180 }
while (Scale > 1);
13201 int NumLanes = Bits / 128;
13203 int NumEltsPerLane = NumElements / NumLanes;
13205 "Exceeds 32-bit integer zero extension limit");
13206 assert((
int)Mask.size() == NumElements &&
"Unexpected shuffle mask size");
13212 bool AnyExt =
true;
13215 for (
int i = 0; i < NumElements; ++i) {
13219 if (i % Scale != 0) {
13231 SDValue V = M < NumElements ? V1 : V2;
13232 M = M % NumElements;
13235 Offset = M - (i / Scale);
13236 }
else if (InputV != V)
13243 (
Offset % NumEltsPerLane) == 0))
13248 if (
Offset && (
Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13251 if ((M % NumElements) != (
Offset + (i / Scale)))
13264 if (
Offset != 0 && Matches < 2)
13269 InputV, Mask, Subtarget, DAG);
13273 assert(Bits % 64 == 0 &&
13274 "The number of bits in a vector must be divisible by 64 on x86!");
13275 int NumExtElements = Bits / 64;
13279 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13280 assert(NumElements % NumExtElements == 0 &&
13281 "The input vector size must be divisible by the extended size.");
13292 auto CanZExtLowHalf = [&]() {
13293 for (
int i = NumElements / 2; i != NumElements; ++i)
13303 if (
SDValue V = CanZExtLowHalf()) {
13305 V = DAG.
getNode(X86ISD::VZEXT_MOVL,
DL, MVT::v2i64, V);
13318 MVT VT = V.getSimpleValueType();
13324 MVT NewVT = V.getSimpleValueType();
13345 return V.hasOneUse() &&
13349template<
typename T>
13351 T EltVT = VT.getScalarType();
13352 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
13353 (EltVT == MVT::f16 && !Subtarget.hasFP16());
13356template<
typename T>
13358 T EltVT = VT.getScalarType();
13359 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
13379 find_if(Mask, [&Mask](
int M) {
return M >= (int)Mask.size(); }) -
13382 bool IsV1Zeroable =
true;
13383 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
13384 if (i != V2Index && !Zeroable[i]) {
13385 IsV1Zeroable =
false;
13390 if (!IsV1Zeroable) {
13392 V1Mask[V2Index] = -1;
13407 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13411 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
13420 if (!IsV1Zeroable) {
13431 }
else if (Mask[V2Index] != (
int)Mask.size() || EltVT == MVT::i8 ||
13432 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
13438 if (!IsV1Zeroable) {
13441 assert(VT == ExtVT &&
"Cannot change extended type when non-zeroable!");
13448 unsigned MovOpc = 0;
13449 if (EltVT == MVT::f16)
13450 MovOpc = X86ISD::MOVSH;
13451 else if (EltVT == MVT::f32)
13452 MovOpc = X86ISD::MOVSS;
13453 else if (EltVT == MVT::f64)
13454 MovOpc = X86ISD::MOVSD;
13457 return DAG.
getNode(MovOpc,
DL, ExtVT, V1, V2);
13464 V2 = DAG.
getNode(X86ISD::VZEXT_MOVL,
DL, ExtVT, V2);
13468 if (V2Index != 0) {
13475 V2Shuffle[V2Index] = 0;
13480 X86ISD::VSHLDQ,
DL, MVT::v16i8, V2,
13497 "We can only lower integer broadcasts with AVX2!");
13503 assert(V0VT.
isVector() &&
"Unexpected non-vector vector-sized value!");
13513 if (V0EltSize <= EltSize)
13516 assert(((V0EltSize % EltSize) == 0) &&
13517 "Scalar type sizes must all be powers of 2 on x86!");
13520 const unsigned Scale = V0EltSize / EltSize;
13521 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13533 if (
const int OffsetIdx = BroadcastIdx % Scale)
13537 return DAG.
getNode(X86ISD::VBROADCAST,
DL, VT,
13547 assert(Mask.size() == 4 &&
"Unsupported mask size!");
13548 assert(Mask[0] >= -1 && Mask[0] < 8 &&
"Out of bound mask element!");
13549 assert(Mask[1] >= -1 && Mask[1] < 8 &&
"Out of bound mask element!");
13550 assert(Mask[2] >= -1 && Mask[2] < 8 &&
"Out of bound mask element!");
13551 assert(Mask[3] >= -1 && Mask[3] < 8 &&
"Out of bound mask element!");
13555 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13557 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13570 int Size = Mask.size();
13571 for (
int i = 0; i <
Size; ++i)
13572 if (Mask[i] >= 0 && Mask[i] /
Size ==
Input && Mask[i] %
Size != i)
13582 int BroadcastableElement = 0) {
13584 int Size = Mask.size();
13585 for (
int i = 0; i <
Size; ++i)
13586 if (Mask[i] >= 0 && Mask[i] /
Size ==
Input &&
13587 Mask[i] %
Size != BroadcastableElement)
13601 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13621 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13623 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13629 if (NumElts == 4 &&
13634 NewMask.
append(NumElts, -1);
13654 if (!((Subtarget.
hasSSE3() && VT == MVT::v2f64) ||
13655 (Subtarget.
hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13662 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.
hasAVX2())
13664 : X86ISD::VBROADCAST;
13665 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.
hasAVX2();
13669 if (BroadcastIdx < 0) {
13676 assert(BroadcastIdx < (
int)Mask.size() &&
"We only expect to be called with "
13677 "a sorted mask where the broadcast "
13679 int NumActiveElts =
count_if(Mask, [](
int M) {
return M >= 0; });
13685 int BitOffset = BroadcastIdx * NumEltBits;
13688 switch (V.getOpcode()) {
13690 V = V.getOperand(0);
13694 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13695 int OpIdx = BitOffset / OpBitWidth;
13696 V = V.getOperand(
OpIdx);
13697 BitOffset %= OpBitWidth;
13702 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13703 unsigned Idx = V.getConstantOperandVal(1);
13704 unsigned BeginOffset = Idx * EltBitWidth;
13705 BitOffset += BeginOffset;
13706 V = V.getOperand(0);
13712 int Idx = (int)V.getConstantOperandVal(2);
13713 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13714 int BeginOffset = Idx * EltBitWidth;
13715 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13716 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13717 BitOffset -= BeginOffset;
13727 assert((BitOffset % NumEltBits) == 0 &&
"Illegal bit-offset");
13728 BroadcastIdx = BitOffset / NumEltBits;
13731 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13740 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13741 return TruncBroadcast;
13747 V = V.getOperand(BroadcastIdx);
13760 SDValue BaseAddr = Ld->getBasePtr();
13763 assert((
int)(
Offset * 8) == BitOffset &&
"Unexpected bit-offset");
13770 if (Opcode == X86ISD::VBROADCAST) {
13774 X86ISD::VBROADCAST_LOAD,
DL, Tys,
Ops, SVT,
13780 assert(SVT == MVT::f64 &&
"Unexpected VT!");
13781 V = DAG.
getLoad(SVT,
DL, Ld->getChain(), NewAddr,
13785 }
else if (!BroadcastFromReg) {
13788 }
else if (BitOffset != 0) {
13796 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13801 if (BitOffset < 128 && NumActiveElts > 1 &&
13802 V.getScalarValueSizeInBits() == NumEltBits) {
13803 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13804 "Unexpected bit-offset");
13806 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13811 if ((BitOffset % 128) != 0)
13814 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13815 "Unexpected bit-offset");
13816 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13817 "Unexpected vector size");
13818 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13824 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13826 if (Subtarget.
hasAVX()) {
13827 V = DAG.
getNode(X86ISD::VBROADCAST,
DL, MVT::v2f64, V);
13834 if (!V.getValueType().isVector()) {
13835 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13836 "Unexpected scalar size");
13845 if (V.getValueSizeInBits() > 128)
13850 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13862 unsigned &InsertPSMask,
13863 const APInt &Zeroable,
13867 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
13874 unsigned ZMask = 0;
13875 int VADstIndex = -1;
13876 int VBDstIndex = -1;
13877 bool VAUsedInPlace =
false;
13879 for (
int i = 0; i < 4; ++i) {
13887 if (i == CandidateMask[i]) {
13888 VAUsedInPlace =
true;
13893 if (VADstIndex >= 0 || VBDstIndex >= 0)
13896 if (CandidateMask[i] < 4) {
13906 if (VADstIndex < 0 && VBDstIndex < 0)
13911 unsigned VBSrcIndex = 0;
13912 if (VADstIndex >= 0) {
13915 VBSrcIndex = CandidateMask[VADstIndex];
13916 VBDstIndex = VADstIndex;
13919 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13924 if (!VAUsedInPlace)
13932 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933 assert((InsertPSMask & ~0xFFu) == 0 &&
"Invalid mask!");
13937 if (matchAsInsertPS(V1, V2, Mask))
13943 if (matchAsInsertPS(V2, V1, CommutedMask))
13956 unsigned InsertPSMask = 0;
13961 return DAG.
getNode(X86ISD::INSERTPS,
DL, MVT::v4f32, V1, V2,
13978 assert(Mask.size() == 2 &&
"Unexpected mask size for v2 shuffle!");
13983 Mask, Subtarget, DAG))
13988 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13990 if (Subtarget.
hasAVX()) {
13993 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v2f64, V1,
13998 X86ISD::SHUFP,
DL, MVT::v2f64,
14003 assert(Mask[0] >= 0 &&
"No undef lanes in multi-input v2 shuffles!");
14004 assert(Mask[1] >= 0 &&
"No undef lanes in multi-input v2 shuffles!");
14005 assert(Mask[0] < 2 &&
"We sort V1 to be the first input.");
14006 assert(Mask[1] >= 2 &&
"We sort V2 to be the second input.");
14015 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14019 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14020 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14022 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14033 X86ISD::MOVSD,
DL, MVT::v2f64, V2,
14038 Zeroable, Subtarget, DAG))
14045 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14046 return DAG.
getNode(X86ISD::SHUFP,
DL, MVT::v2f64, V1, V2,
14062 assert(Mask.size() == 2 &&
"Unexpected mask size for v2 shuffle!");
14067 Mask, Subtarget, DAG))
14074 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14075 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14076 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14077 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14080 DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v4i32, V1,
14083 assert(Mask[0] != -1 &&
"No undef lanes in multi-input v2 shuffles!");
14084 assert(Mask[1] != -1 &&
"No undef lanes in multi-input v2 shuffles!");
14085 assert(Mask[0] < 2 &&
"We sort V1 to be the first input.");
14086 assert(Mask[1] >= 2 &&
"We sort V2 to be the second input.");
14101 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14105 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14107 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14112 bool IsBlendSupported = Subtarget.
hasSSE41();
14113 if (IsBlendSupported)
14115 Zeroable, Subtarget, DAG))
14125 if (Subtarget.hasVLX())
14127 Zeroable, Subtarget, DAG))
14137 if (IsBlendSupported)
14139 Zeroable, Subtarget, DAG);
14159 SDValue LowV = V1, HighV = V2;
14161 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
14163 if (NumV2Elements == 1) {
14164 int V2Index =
find_if(Mask, [](
int M) {
return M >= 4; }) - Mask.begin();
14168 int V2AdjIndex = V2Index ^ 1;
14170 if (Mask[V2AdjIndex] < 0) {
14176 NewMask[V2Index] -= 4;
14180 int V1Index = V2AdjIndex;
14181 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14182 V2 = DAG.
getNode(X86ISD::SHUFP,
DL, VT, V2, V1,
14193 NewMask[V1Index] = 2;
14194 NewMask[V2Index] = 0;
14196 }
else if (NumV2Elements == 2) {
14197 if (Mask[0] < 4 && Mask[1] < 4) {
14202 }
else if (Mask[2] < 4 && Mask[3] < 4) {
14217 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14218 Mask[2] < 4 ? Mask[2] : Mask[3],
14219 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14220 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14221 V1 = DAG.
getNode(X86ISD::SHUFP,
DL, VT, V1, V2,
14227 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14228 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14229 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14230 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14232 }
else if (NumV2Elements == 3) {
14239 return DAG.
getNode(X86ISD::SHUFP,
DL, VT, LowV, HighV,
14254 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
14258 Zeroable, Subtarget, DAG))
14261 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
14263 if (NumV2Elements == 0) {
14266 Mask, Subtarget, DAG))
14272 return DAG.
getNode(X86ISD::MOVSLDUP,
DL, MVT::v4f32, V1);
14274 return DAG.
getNode(X86ISD::MOVSHDUP,
DL, MVT::v4f32, V1);
14277 if (Subtarget.
hasAVX()) {
14280 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v4f32, V1,
14288 return DAG.
getNode(X86ISD::MOVLHPS,
DL, MVT::v4f32, V1, V1);
14290 return DAG.
getNode(X86ISD::MOVHLPS,
DL, MVT::v4f32, V1, V1);
14295 return DAG.
getNode(X86ISD::SHUFP,
DL, MVT::v4f32, V1, V1,
14301 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
14315 if (NumV2Elements == 1 && Mask[0] >= 4)
14317 Zeroable, Subtarget, DAG))
14324 if (!MatchesShufPS || Zeroable == 0x3 || Zeroable == 0xC)
14328 if (!MatchesShufPS)
14338 return DAG.
getNode(X86ISD::MOVLHPS,
DL, MVT::v4f32, V1, V2);
14340 return DAG.
getNode(X86ISD::MOVHLPS,
DL, MVT::v4f32, V2, V1);
14361 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
14367 Zeroable, Subtarget, DAG))
14370 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
14373 if (Subtarget.preferLowerShuffleAsShift()) {
14376 Subtarget, DAG,
true))
14378 if (NumV2Elements == 0)
14384 if (NumV2Elements == 0) {
14386 if (
count_if(Mask, [](
int M) {
return M >= 0 && M < 4; }) > 1) {
14388 Mask, Subtarget, DAG))
14397 const int UnpackLoMask[] = {0, 0, 1, 1};
14398 const int UnpackHiMask[] = {2, 2, 3, 3};
14401 Mask = UnpackLoMask;
14403 Mask = UnpackHiMask;
14406 return DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v4i32, V1,
14421 if (NumV2Elements == 1)
14423 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14428 bool IsBlendSupported = Subtarget.
hasSSE41();
14429 if (IsBlendSupported)
14431 Zeroable, Subtarget, DAG))
14435 Zeroable, Subtarget, DAG))
14445 if (Subtarget.hasVLX())
14446 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14447 Zeroable, Subtarget, DAG))
14450 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14461 if (IsBlendSupported)
14463 Zeroable, Subtarget, DAG);
14467 Mask, Subtarget, DAG))
14504 assert(Mask.size() == 8 &&
"Shuffle mask length doesn't match!");
14511 return DAG.
getNode(X86ISD::PSHUFLW,
DL, VT, V,
14516 for (
int i = 0; i != 4; ++i)
14517 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14518 return DAG.
getNode(X86ISD::PSHUFHW,
DL, VT, V,
14523 copy_if(LoMask, std::back_inserter(LoInputs), [](
int M) {
return M >= 0; });
14527 copy_if(HiMask, std::back_inserter(HiInputs), [](
int M) {
return M >= 0; });
14531 int NumHToL = LoInputs.
size() - NumLToL;
14533 int NumHToH = HiInputs.
size() - NumLToH;
14547 V = DAG.
getNode(X86ISD::PSHUFD,
DL, PSHUFDVT, V,
14552 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14553 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14555 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14558 for (
int DWord = 0; DWord != 4; ++DWord) {
14559 int M0 = Mask[2 * DWord + 0];
14560 int M1 = Mask[2 * DWord + 1];
14563 if (
M0 < 0 &&
M1 < 0)
14566 bool Match =
false;
14567 for (
int j = 0, e = DWordPairs.
size(); j < e; ++j) {
14568 auto &DWordPair = DWordPairs[j];
14571 DWordPair.first = (
M0 >= 0 ?
M0 : DWordPair.first);
14572 DWordPair.second = (
M1 >= 0 ?
M1 : DWordPair.second);
14573 PSHUFDMask[DWord] = DOffset + j;
14579 PSHUFDMask[DWord] = DOffset + DWordPairs.
size();
14584 if (DWordPairs.
size() <= 2) {
14585 DWordPairs.
resize(2, std::make_pair(-1, -1));
14586 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14587 DWordPairs[1].first, DWordPairs[1].second};
14592 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14593 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14594 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14596 if ((NumHToL + NumHToH) == 0)
14597 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14598 if ((NumLToL + NumLToH) == 0)
14599 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14635 int AOffset,
int BOffset) {
14637 "Must call this with A having 3 or 1 inputs from the A half.");
14639 "Must call this with B having 1 or 3 inputs from the B half.");
14641 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14643 bool ThreeAInputs = AToAInputs.
size() == 3;
14649 int ADWord = 0, BDWord = 0;
14650 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14651 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14652 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14653 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14654 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14655 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14656 int TripleNonInputIdx =
14657 TripleInputSum - std::accumulate(TripleInputs.
begin(), TripleInputs.
end(), 0);
14658 TripleDWord = TripleNonInputIdx / 2;
14662 OneInputDWord = (OneInput / 2) ^ 1;
14669 if (BToBInputs.
size() == 2 && AToBInputs.
size() == 2) {
14674 int NumFlippedAToBInputs =
llvm::count(AToBInputs, 2 * ADWord) +
14676 int NumFlippedBToBInputs =
llvm::count(BToBInputs, 2 * BDWord) +
14678 if ((NumFlippedAToBInputs == 1 &&
14679 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14680 (NumFlippedBToBInputs == 1 &&
14681 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14686 auto FixFlippedInputs = [&V, &
DL, &Mask, &DAG](
int PinnedIdx,
int DWord,
14688 int FixIdx = PinnedIdx ^ 1;
14689 bool IsFixIdxInput =
is_contained(Inputs, PinnedIdx ^ 1);
14693 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14694 bool IsFixFreeIdxInput =
is_contained(Inputs, FixFreeIdx);
14695 if (IsFixIdxInput == IsFixFreeIdxInput)
14698 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14699 "We need to be changing the number of flipped inputs!");
14700 int PSHUFHalfMask[] = {0, 1, 2, 3};
14701 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14703 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW,
DL,
14707 for (
int &M : Mask)
14708 if (M >= 0 && M == FixIdx)
14710 else if (M >= 0 && M == FixFreeIdx)
14713 if (NumFlippedBToBInputs != 0) {
14715 BToAInputs.
size() == 3 ? TripleNonInputIdx : OneInput;
14716 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14718 assert(NumFlippedAToBInputs != 0 &&
"Impossible given predicates!");
14719 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14720 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14725 int PSHUFDMask[] = {0, 1, 2, 3};
14726 PSHUFDMask[ADWord] = BDWord;
14727 PSHUFDMask[BDWord] = ADWord;
14734 for (
int &M : Mask)
14735 if (M >= 0 && M/2 == ADWord)
14736 M = 2 * BDWord + M % 2;
14737 else if (M >= 0 && M/2 == BDWord)
14738 M = 2 * ADWord + M % 2;
14744 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14745 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14746 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14747 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14754 int PSHUFLMask[4] = {-1, -1, -1, -1};
14755 int PSHUFHMask[4] = {-1, -1, -1, -1};
14756 int PSHUFDMask[4] = {-1, -1, -1, -1};
14761 auto fixInPlaceInputs =
14765 if (InPlaceInputs.
empty())
14767 if (InPlaceInputs.
size() == 1) {
14768 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14769 InPlaceInputs[0] - HalfOffset;
14770 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14773 if (IncomingInputs.
empty()) {
14775 for (
int Input : InPlaceInputs) {
14776 SourceHalfMask[
Input - HalfOffset] =
Input - HalfOffset;
14782 assert(InPlaceInputs.
size() == 2 &&
"Cannot handle 3 or 4 inputs!");
14783 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14784 InPlaceInputs[0] - HalfOffset;
14787 int AdjIndex = InPlaceInputs[0] ^ 1;
14788 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14790 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14792 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14793 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14799 auto moveInputsToRightHalf = [&PSHUFDMask](
14804 auto isWordClobbered = [](
ArrayRef<int> SourceHalfMask,
int Word) {
14805 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14807 auto isDWordClobbered = [&isWordClobbered](
ArrayRef<int> SourceHalfMask,
14809 int LowWord = Word & ~1;
14810 int HighWord = Word | 1;
14811 return isWordClobbered(SourceHalfMask, LowWord) ||
14812 isWordClobbered(SourceHalfMask, HighWord);
14815 if (IncomingInputs.
empty())
14818 if (ExistingInputs.
empty()) {
14820 for (
int Input : IncomingInputs) {
14823 if (isWordClobbered(SourceHalfMask,
Input - SourceOffset)) {
14824 if (SourceHalfMask[SourceHalfMask[
Input - SourceOffset]] < 0) {
14825 SourceHalfMask[SourceHalfMask[
Input - SourceOffset]] =
14826 Input - SourceOffset;
14828 for (
int &M : HalfMask)
14829 if (M == SourceHalfMask[
Input - SourceOffset] + SourceOffset)
14831 else if (M ==
Input)
14832 M = SourceHalfMask[
Input - SourceOffset] + SourceOffset;
14834 assert(SourceHalfMask[SourceHalfMask[
Input - SourceOffset]] ==
14835 Input - SourceOffset &&
14836 "Previous placement doesn't match!");
14841 Input = SourceHalfMask[
Input - SourceOffset] + SourceOffset;
14845 if (PSHUFDMask[(
Input - SourceOffset + DestOffset) / 2] < 0)
14846 PSHUFDMask[(
Input - SourceOffset + DestOffset) / 2] =
Input / 2;
14848 assert(PSHUFDMask[(
Input - SourceOffset + DestOffset) / 2] ==
14850 "Previous placement doesn't match!");
14856 for (
int &M : HalfMask)
14857 if (M >= SourceOffset && M < SourceOffset + 4) {
14858 M = M - SourceOffset + DestOffset;
14859 assert(M >= 0 &&
"This should never wrap below zero!");
14867 if (IncomingInputs.
size() == 1) {
14868 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14869 int InputFixed =
find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14871 SourceHalfMask[InputFixed - SourceOffset] =
14872 IncomingInputs[0] - SourceOffset;
14874 IncomingInputs[0] = InputFixed;
14876 }
else if (IncomingInputs.
size() == 2) {
14877 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14878 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14882 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14883 IncomingInputs[1] - SourceOffset};
14888 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14889 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14890 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14891 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14892 InputsFixed[1] = InputsFixed[0] ^ 1;
14893 }
else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14894 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14895 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14896 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14897 InputsFixed[0] = InputsFixed[1] ^ 1;
14898 }
else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14899 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14903 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14904 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14905 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14906 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14912 for (
int i = 0; i < 4; ++i)
14913 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14914 "We can't handle any clobbers here!");
14915 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14916 "Cannot have adjacent inputs here!");
14918 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14919 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14923 for (
int &M : FinalSourceHalfMask)
14924 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14925 M = InputsFixed[1] + SourceOffset;
14926 else if (M == InputsFixed[1] + SourceOffset)
14927 M = (InputsFixed[0] ^ 1) + SourceOffset;
14929 InputsFixed[1] = InputsFixed[0] ^ 1;
14933 for (
int &M : HalfMask)
14934 if (M == IncomingInputs[0])
14935 M = InputsFixed[0] + SourceOffset;
14936 else if (M == IncomingInputs[1])
14937 M = InputsFixed[1] + SourceOffset;
14939 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14940 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14947 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14948 assert(PSHUFDMask[FreeDWord] < 0 &&
"DWord not free");
14949 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14950 for (
int &M : HalfMask)
14951 for (
int Input : IncomingInputs)
14953 M = FreeDWord * 2 +
Input % 2;
14955 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14957 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14963 V = DAG.
getNode(X86ISD::PSHUFLW,
DL, VT, V,
14966 V = DAG.
getNode(X86ISD::PSHUFHW,
DL, VT, V,
14977 "Failed to lift all the high half inputs to the low mask!");
14978 assert(
none_of(HiMask, [](
int M) {
return M >= 0 && M < 4; }) &&
14979 "Failed to lift all the low half inputs to the high mask!");
14983 V = DAG.
getNode(X86ISD::PSHUFLW,
DL, VT, V,
14987 for (
int &M : HiMask)
14991 V = DAG.
getNode(X86ISD::PSHUFHW,
DL, VT, V,
15003 "Lane crossing shuffle masks not supported");
15006 int Size = Mask.size();
15007 int Scale = NumBytes /
Size;
15014 for (
int i = 0; i < NumBytes; ++i) {
15015 int M = Mask[i / Scale];
15019 const int ZeroMask = 0x80;
15020 int V1Idx = M <
Size ? M * Scale + i % Scale : ZeroMask;
15021 int V2Idx = M <
Size ? ZeroMask : (M -
Size) * Scale + i % Scale;
15022 if (Zeroable[i / Scale])
15023 V1Idx = V2Idx = ZeroMask;
15027 V1InUse |= (ZeroMask != V1Idx);
15028 V2InUse |= (ZeroMask != V2Idx);
15041 if (V1InUse && V2InUse)
15044 V = V1InUse ? V1 : V2;
15068 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
15073 Zeroable, Subtarget, DAG))
15081 int NumV2Inputs =
count_if(Mask, [](
int M) {
return M >= 8; });
15083 if (NumV2Inputs == 0) {
15087 Subtarget, DAG,
false))
15092 Mask, Subtarget, DAG))
15121 "All single-input shuffles should be canonicalized to be V1-input "
15131 if (Subtarget.hasSSE4A())
15137 if (NumV2Inputs == 1)
15139 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15144 bool IsBlendSupported = Subtarget.
hasSSE41();
15145 if (IsBlendSupported)
15147 Zeroable, Subtarget, DAG))
15151 Zeroable, Subtarget, DAG))
15179 Zeroable, Subtarget, DAG))
15184 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.
hasSSE41())) &&
15185 !Subtarget.hasVLX()) {
15187 unsigned PackOpc = 0;
15188 if (NumEvenDrops == 2 && Subtarget.
hasAVX2() &&
15192 V1V2 = DAG.
getNode(X86ISD::BLENDI,
DL, MVT::v16i16, V1V2,
15198 PackOpc = X86ISD::PACKUS;
15199 }
else if (Subtarget.
hasSSE41()) {
15202 for (
unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15210 PackOpc = X86ISD::PACKUS;
15211 }
else if (!Subtarget.
hasSSSE3()) {
15215 V1 = DAG.
getNode(X86ISD::VSHLI,
DL, MVT::v4i32, V1, ShAmt);
15216 V2 = DAG.
getNode(X86ISD::VSHLI,
DL, MVT::v4i32, V2, ShAmt);
15217 V1 = DAG.
getNode(X86ISD::VSRAI,
DL, MVT::v4i32, V1, ShAmt);
15218 V2 = DAG.
getNode(X86ISD::VSRAI,
DL, MVT::v4i32, V2, ShAmt);
15219 PackOpc = X86ISD::PACKSS;
15224 if (NumEvenDrops == 2) {
15225 Result = DAG.
getBitcast(MVT::v4i32, Result);
15226 Result = DAG.
getNode(PackOpc,
DL, MVT::v8i16, Result, Result);
15234 if (NumOddDrops == 1) {
15235 bool HasSSE41 = Subtarget.
hasSSE41();
15236 V1 = DAG.
getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI,
DL, MVT::v4i32,
15239 V2 = DAG.
getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI,
DL, MVT::v4i32,
15242 return DAG.
getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS,
DL,
15243 MVT::v8i16, V1, V2);
15248 Mask, Subtarget, DAG))
15253 if (!IsBlendSupported && Subtarget.
hasSSSE3()) {
15254 bool V1InUse, V2InUse;
15256 Zeroable, DAG, V1InUse, V2InUse);
15262 Zeroable, Subtarget, DAG);
15272 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
15273 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 8; });
15275 if (Subtarget.hasFP16()) {
15276 if (NumV2Elements == 0) {
15279 Mask, Subtarget, DAG))
15282 if (NumV2Elements == 1 && Mask[0] >= 8)
15284 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15311 MVT ShuffleVT = VT;
15321 for (
int &M : AdjustedMask)
15323 M += (Scale - 1) * NumElts;
15332 Result = DAG.
getNode(X86ISD::VPERMV,
DL, ShuffleVT, MaskNode, V1);
15334 Result = DAG.
getNode(X86ISD::VPERMV3,
DL, ShuffleVT, V1, MaskNode, V2);
15336 if (VT != ShuffleVT)
15355 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
15375 Zeroable, Subtarget, DAG))
15388 if (Subtarget.hasSSE4A())
15393 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 16; });
15396 if (NumV2Elements == 0) {
15399 Mask, Subtarget, DAG))
15419 for (
int i = 0; i < 16; i += 2)
15420 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15425 auto tryToWidenViaDuplication = [&]() ->
SDValue {
15426 if (!canWidenViaDuplication(Mask))
15429 copy_if(Mask, std::back_inserter(LoInputs),
15430 [](
int M) {
return M >= 0 && M < 8; });
15434 copy_if(Mask, std::back_inserter(HiInputs), [](
int M) {
return M >= 8; });
15438 bool TargetLo = LoInputs.
size() >= HiInputs.
size();
15439 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15440 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15442 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15444 for (
int I : InPlaceInputs) {
15445 PreDupI16Shuffle[
I/2] =
I/2;
15448 int j = TargetLo ? 0 : 4, je = j + 4;
15449 for (
int i = 0, ie = MovingInputs.
size(); i < ie; ++i) {
15452 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15455 while (j < je && PreDupI16Shuffle[j] >= 0)
15463 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15467 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15472 DAG.
getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15475 bool EvenInUse =
false, OddInUse =
false;
15476 for (
int i = 0; i < 16; i += 2) {
15477 EvenInUse |= (Mask[i + 0] >= 0);
15478 OddInUse |= (Mask[i + 1] >= 0);
15479 if (EvenInUse && OddInUse)
15482 V1 = DAG.
getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
DL,
15483 MVT::v16i8, EvenInUse ? V1 : DAG.
getUNDEF(MVT::v16i8),
15484 OddInUse ? V1 : DAG.
getUNDEF(MVT::v16i8));
15486 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15487 for (
int i = 0; i < 16; ++i)
15488 if (Mask[i] >= 0) {
15489 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15490 assert(MappedMask < 8 &&
"Invalid v8 shuffle mask!");
15491 if (PostDupI16Shuffle[i / 2] < 0)
15492 PostDupI16Shuffle[i / 2] = MappedMask;
15494 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15495 "Conflicting entries in the original shuffle!");
15500 DAG.
getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15502 if (
SDValue V = tryToWidenViaDuplication())
15507 Zeroable, Subtarget, DAG))
15516 Zeroable, Subtarget, DAG))
15520 bool IsSingleInput = V2.
isUndef();
15539 if (Subtarget.
hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15540 bool V1InUse =
false;
15541 bool V2InUse =
false;
15544 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15549 if (V1InUse && V2InUse) {
15552 Zeroable, Subtarget, DAG))
15564 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15568 if (Subtarget.hasVBMI())
15573 if (Subtarget.hasXOP()) {
15575 return DAG.
getNode(X86ISD::VPPERM,
DL, MVT::v16i8, V1, V2, MaskNode);
15581 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15589 if (NumV2Elements == 1)
15591 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15604 if (NumEvenDrops) {
15610 assert(NumEvenDrops <= 3 &&
15611 "No support for dropping even elements more than 3 times.");
15613 for (
unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15618 if (!IsSingleInput)
15624 IsSingleInput ? V1 : V2);
15625 for (
int i = 1; i < NumEvenDrops; ++i) {
15626 Result = DAG.
getBitcast(MVT::v8i16, Result);
15627 Result = DAG.
getNode(X86ISD::PACKUS,
DL, MVT::v16i8, Result, Result);
15633 if (NumOddDrops == 1) {
15634 V1 = DAG.
getNode(X86ISD::VSRLI,
DL, MVT::v8i16,
15637 if (!IsSingleInput)
15638 V2 = DAG.
getNode(X86ISD::VSRLI,
DL, MVT::v8i16,
15641 return DAG.
getNode(X86ISD::PACKUS,
DL, MVT::v16i8, V1,
15642 IsSingleInput ? V1 : V2);
15646 if (NumV2Elements > 0)
15648 Zeroable, Subtarget, DAG);
15655 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15656 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15657 for (
int i = 0; i < 16; ++i)
15659 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15665 if (
none_of(LoBlendMask, [](
int M) {
return M >= 0 && M % 2 == 1; }) &&
15666 none_of(HiBlendMask, [](
int M) {
return M >= 0 && M % 2 == 1; })) {
15673 VHiHalf = DAG.
getUNDEF(MVT::v8i16);
15676 for (
int &M : LoBlendMask)
15679 for (
int &M : HiBlendMask)
15688 MVT::v8i16, DAG.
getNode(X86ISD::UNPCKL,
DL, MVT::v16i8, V, Zero));
15690 MVT::v8i16, DAG.
getNode(X86ISD::UNPCKH,
DL, MVT::v16i8, V, Zero));
15696 return DAG.
getNode(X86ISD::PACKUS,
DL, MVT::v16i8, LoV, HiV);
15705 const APInt &Zeroable,
15708 if (VT == MVT::v8bf16) {
15745 "Only for 256-bit or wider vector shuffles!");
15750 if (VT == MVT::v8f32) {
15764 int SplitNumElements = NumElements / 2;
15770 auto SplitVector = [&](
SDValue V) {
15773 return std::make_pair(DAG.
getBitcast(SplitVT, LoV),
15777 SDValue LoV1, HiV1, LoV2, HiV2;
15778 std::tie(LoV1, HiV1) = SplitVector(V1);
15779 std::tie(LoV2, HiV2) = SplitVector(V2);
15782 auto GetHalfBlendPiecesReq = [&](
const ArrayRef<int> &HalfMask,
bool &UseLoV1,
15783 bool &UseHiV1,
bool &UseLoV2,
15785 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 =
false;
15786 for (
int i = 0; i < SplitNumElements; ++i) {
15787 int M = HalfMask[i];
15788 if (M >= NumElements) {
15789 if (M >= NumElements + SplitNumElements)
15793 }
else if (M >= 0) {
15794 if (M >= SplitNumElements)
15802 auto CheckHalfBlendUsable = [&](
const ArrayRef<int> &HalfMask) ->
bool {
15806 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15807 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15809 return !(UseHiV1 || UseHiV2);
15816 for (
int i = 0; i < SplitNumElements; ++i) {
15817 int M = HalfMask[i];
15818 if (M >= NumElements) {
15819 V2BlendMask[i] = M - NumElements;
15820 BlendMask[i] = SplitNumElements + i;
15821 }
else if (M >= 0) {
15822 V1BlendMask[i] = M;
15827 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15828 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15833 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) &&
"Shuffle isn't simple");
15836 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15838 if (!UseLoV2 && !UseHiV2)
15840 if (!UseLoV1 && !UseHiV1)
15844 if (UseLoV1 && UseHiV1) {
15848 V1Blend = UseLoV1 ? LoV1 : HiV1;
15849 for (
int i = 0; i < SplitNumElements; ++i)
15850 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15851 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15853 if (UseLoV2 && UseHiV2) {
15857 V2Blend = UseLoV2 ? LoV2 : HiV2;
15858 for (
int i = 0; i < SplitNumElements; ++i)
15859 if (BlendMask[i] >= SplitNumElements)
15860 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15865 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15883 const APInt &Zeroable,
15886 assert(!V2.
isUndef() &&
"This routine must not be used to lower single-input "
15887 "shuffles as it could then recurse on itself.");
15888 int Size = Mask.size();
15893 auto DoBothBroadcast = [&] {
15894 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15897 if (V2BroadcastIdx < 0)
15898 V2BroadcastIdx = M -
Size;
15899 else if ((M -
Size) != V2BroadcastIdx &&
15902 }
else if (M >= 0) {
15903 if (V1BroadcastIdx < 0)
15904 V1BroadcastIdx = M;
15905 else if (M != V1BroadcastIdx &&
15911 if (DoBothBroadcast())
15919 int LaneSize =
Size / LaneCount;
15921 LaneInputs[0].
resize(LaneCount,
false);
15922 LaneInputs[1].
resize(LaneCount,
false);
15923 for (
int i = 0; i <
Size; ++i)
15925 LaneInputs[Mask[i] /
Size][(Mask[i] %
Size) / LaneSize] =
true;
15926 if (LaneInputs[0].
count() <= 1 && LaneInputs[1].
count() <= 1)
15939 if (SplatOrSplitV1 && SplatOrSplitV2)
15956 assert(VT == MVT::v4f64 &&
"Only for v4f64 shuffles");
15958 int LHSMask[4] = {-1, -1, -1, -1};
15959 int RHSMask[4] = {-1, -1, -1, -1};
15960 int SHUFPDMask[4] = {-1, -1, -1, -1};
15964 for (
int i = 0; i != 4; ++i) {
15968 int LaneBase = i & ~1;
15969 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15970 LaneMask[LaneBase + (M & 1)] = M;
15971 SHUFPDMask[i] = M & 1;
15993 int NumEltsPerLane = NumElts / NumLanes;
16001 auto getSublanePermute = [&](
int NumSublanes) ->
SDValue {
16002 int NumSublanesPerLane = NumSublanes / NumLanes;
16003 int NumEltsPerSublane = NumElts / NumSublanes;
16011 for (
int i = 0; i != NumElts; ++i) {
16016 int SrcSublane = M / NumEltsPerSublane;
16017 int DstLane = i / NumEltsPerLane;
16021 bool Found =
false;
16022 int DstSubStart = DstLane * NumSublanesPerLane;
16023 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16024 for (
int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16025 if (!
isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16029 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16030 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16031 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16032 DemandedCrossLane.
setBit(InLaneMask[i]);
16042 if (!CanUseSublanes) {
16047 int NumIdentityLanes = 0;
16048 bool OnlyShuffleLowestLane =
true;
16049 for (
int i = 0; i != NumLanes; ++i) {
16050 int LaneOffset = i * NumEltsPerLane;
16052 i * NumEltsPerLane))
16053 NumIdentityLanes++;
16054 else if (CrossLaneMask[LaneOffset] != 0)
16055 OnlyShuffleLowestLane =
false;
16057 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16063 for (
int i = 0; i != NumElts; ++i)
16064 if (!DemandedCrossLane[i])
16070 if (CrossLaneMask == Mask || InLaneMask == Mask)
16079 if (
SDValue V = getSublanePermute(NumLanes))
16083 if (!CanUseSublanes)
16087 if (
SDValue V = getSublanePermute(NumLanes * 2))
16092 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16095 return getSublanePermute(NumLanes * 4);
16101 int Size = Mask.size();
16102 InLaneMask.
assign(Mask.begin(), Mask.end());
16103 for (
int i = 0; i <
Size; ++i) {
16104 int &M = InLaneMask[i];
16107 if (((M %
Size) / LaneSize) != (i / LaneSize))
16108 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) +
Size;
16124 int Size = Mask.size();
16125 int LaneSize =
Size / 2;
16130 if (VT == MVT::v4f64 &&
16131 !
all_of(Mask, [LaneSize](
int M) {
return M < LaneSize; }))
16139 bool LaneCrossing[2] = {
false,
false};
16140 for (
int i = 0; i <
Size; ++i)
16141 if (Mask[i] >= 0 && ((Mask[i] %
Size) / LaneSize) != (i / LaneSize))
16142 LaneCrossing[(Mask[i] %
Size) / LaneSize] =
true;
16143 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16145 bool LaneUsed[2] = {
false,
false};
16146 for (
int i = 0; i <
Size; ++i)
16148 LaneUsed[(Mask[i] %
Size) / LaneSize] =
true;
16149 AllLanes = LaneUsed[0] && LaneUsed[1];
16154 "This last part of this routine only works on single input shuffles");
16160 "In-lane shuffle mask expected");
16180 const APInt &Zeroable,
16193 VT, MemVT, Ld, Ofs, DAG))
16208 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16209 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16212 if (WidenedMask[0] == 0 && IsHighZero) {
16232 if (!IsLowZero && !IsHighZero) {
16251 if (Subtarget.hasVLX()) {
16252 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16253 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16254 ((WidenedMask[1] % 2) << 1);
16255 return DAG.
getNode(X86ISD::SHUF128,
DL, VT, V1, V2,
16274 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16275 (WidenedMask[1] >= 0 || IsHighZero) &&
"Undef half?");
16277 unsigned PermMask = 0;
16278 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16279 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16282 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16284 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16287 return DAG.
getNode(X86ISD::VPERM2X128,
DL, VT, V1, V2,
16305 int NumElts = Mask.size();
16313 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
16314 int Srcs[2] = {-1, -1};
16316 for (
int i = 0; i != NumLaneElts; ++i) {
16317 int M = Mask[(Lane * NumLaneElts) + i];
16324 int LaneSrc = M / NumLaneElts;
16326 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16328 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16333 Srcs[Src] = LaneSrc;
16334 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16341 LaneSrcs[Lane][0] = Srcs[0];
16342 LaneSrcs[Lane][1] = Srcs[1];
16345 assert(
M1.size() == M2.size() &&
"Unexpected mask size");
16346 for (
int i = 0, e =
M1.size(); i != e; ++i)
16347 if (
M1[i] >= 0 && M2[i] >= 0 &&
M1[i] != M2[i])
16353 assert(Mask.size() == MergedMask.size() &&
"Unexpected mask size");
16354 for (
int i = 0, e = MergedMask.size(); i != e; ++i) {
16358 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16359 "Unexpected mask element");
16364 if (MatchMasks(InLaneMask, RepeatMask)) {
16366 MergeMasks(InLaneMask, RepeatMask);
16371 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16374 if (MatchMasks(InLaneMask, RepeatMask)) {
16376 MergeMasks(InLaneMask, RepeatMask);
16385 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
16387 if (LaneSrcs[Lane][0] >= 0)
16390 for (
int i = 0; i != NumLaneElts; ++i) {
16391 int M = Mask[(Lane * NumLaneElts) + i];
16396 if (RepeatMask[i] < 0)
16397 RepeatMask[i] = M % NumLaneElts;
16399 if (RepeatMask[i] < NumElts) {
16400 if (RepeatMask[i] != M % NumLaneElts)
16402 LaneSrcs[Lane][0] = M / NumLaneElts;
16404 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16406 LaneSrcs[Lane][1] = M / NumLaneElts;
16410 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16415 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
16416 int Src = LaneSrcs[Lane][0];
16417 for (
int i = 0; i != NumLaneElts; ++i) {
16420 M = Src * NumLaneElts + i;
16421 NewMask[Lane * NumLaneElts + i] = M;
16432 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
16433 int Src = LaneSrcs[Lane][1];
16434 for (
int i = 0; i != NumLaneElts; ++i) {
16437 M = Src * NumLaneElts + i;
16438 NewMask[Lane * NumLaneElts + i] = M;
16449 for (
int i = 0; i != NumElts; ++i) {
16454 NewMask[i] = RepeatMask[i % NumLaneElts];
16455 if (NewMask[i] < 0)
16458 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16471 int &HalfIdx1,
int &HalfIdx2) {
16472 assert((Mask.size() == HalfMask.
size() * 2) &&
16473 "Expected input mask to be twice as long as output");
16478 if (UndefLower == UndefUpper)
16481 unsigned HalfNumElts = HalfMask.
size();
16482 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16485 for (
unsigned i = 0; i != HalfNumElts; ++i) {
16486 int M = Mask[i + MaskIndexOffset];
16494 int HalfIdx = M / HalfNumElts;
16497 int HalfElt = M % HalfNumElts;
16501 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16502 HalfMask[i] = HalfElt;
16503 HalfIdx1 = HalfIdx;
16506 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16507 HalfMask[i] = HalfElt + HalfNumElts;
16508 HalfIdx2 = HalfIdx;
16523 int HalfIdx2,
bool UndefLower,
16532 auto getHalfVector = [&](
int HalfIdx) {
16535 SDValue V = (HalfIdx < 2 ? V1 : V2);
16536 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16542 SDValue Half1 = getHalfVector(HalfIdx1);
16543 SDValue Half2 = getHalfVector(HalfIdx2);
16553 unsigned Offset = UndefLower ? HalfNumElts : 0;
16566 "Expected 256-bit or 512-bit vector");
16573 "Completely undef shuffle mask should have been simplified already");
16597 int HalfIdx1, HalfIdx2;
16602 assert(HalfMask.
size() == HalfNumElts &&
"Unexpected shuffle mask length");
16605 unsigned NumLowerHalves =
16606 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16607 unsigned NumUpperHalves =
16608 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16609 assert(NumLowerHalves + NumUpperHalves <= 2 &&
"Only 1 or 2 halves allowed");
16617 if (NumUpperHalves == 0)
16621 if (NumUpperHalves == 1) {
16625 if (EltWidth == 32 && NumLowerHalves && HalfVT.
is128BitVector() &&
16628 Subtarget.hasFastVariableCrossLaneShuffle()))
16634 if (EltWidth == 64 && V2.
isUndef())
16638 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16650 assert(NumUpperHalves == 2 &&
"Half vector count went wrong");
16655 if (NumUpperHalves == 0) {
16658 if (Subtarget.
hasAVX2() && EltWidth == 64)
16681 int NumLaneElts = NumElts / NumLanes;
16686 for (
unsigned BroadcastSize : {16, 32, 64}) {
16695 for (
int i = 0; i != NumElts; i += NumBroadcastElts)
16696 for (
int j = 0; j != NumBroadcastElts; ++j) {
16697 int M = Mask[i + j];
16700 int &R = RepeatMask[j];
16701 if (0 != ((M % NumElts) / NumLaneElts))
16703 if (0 <= R && R != M)
16711 if (!FindRepeatingBroadcastMask(RepeatMask))
16719 for (
int i = 0; i != NumElts; i += NumBroadcastElts)
16720 for (
int j = 0; j != NumBroadcastElts; ++j)
16721 BroadcastMask[i + j] = j;
16725 if (BroadcastMask == Mask)
16743 auto ShuffleSubLanes = [&](
int SubLaneScale) {
16744 int NumSubLanes = NumLanes * SubLaneScale;
16745 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16750 int TopSrcSubLane = -1;
16756 for (
int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16761 for (
int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16762 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16765 int Lane = (M % NumElts) / NumLaneElts;
16766 if ((0 <= SrcLane) && (SrcLane != Lane))
16769 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16770 SubLaneMask[Elt] = LocalM;
16778 for (
int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16780 for (
int i = 0; i != NumSubLaneElts; ++i) {
16781 if (
M1[i] < 0 || M2[i] < 0)
16783 if (
M1[i] != M2[i])
16789 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16790 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16794 for (
int i = 0; i != NumSubLaneElts; ++i) {
16795 int M = SubLaneMask[i];
16798 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16799 "Unexpected mask element");
16800 RepeatedSubLaneMask[i] = M;
16805 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16806 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16807 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16812 if (Dst2SrcSubLanes[DstSubLane] < 0)
16815 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16816 "Unexpected source lane");
16820 for (
int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16821 int Lane = SubLane / SubLaneScale;
16822 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16823 for (
int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16824 int M = RepeatedSubLaneMask[Elt];
16827 int Idx = (SubLane * NumSubLaneElts) + Elt;
16828 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16834 for (
int i = 0; i != NumElts; i += NumSubLaneElts) {
16835 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16836 if (SrcSubLane < 0)
16838 for (
int j = 0; j != NumSubLaneElts; ++j)
16839 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16844 if (RepeatedMask == Mask || SubLaneMask == Mask)
16858 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16861 MinSubLaneScale = 2;
16863 (!OnlyLowestElts && V2.
isUndef() && VT == MVT::v32i8) ? 4 : 2;
16865 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16866 MinSubLaneScale = MaxSubLaneScale = 4;
16868 for (
int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16869 if (
SDValue Shuffle = ShuffleSubLanes(Scale))
16876 bool &ForceV1Zero,
bool &ForceV2Zero,
16878 const APInt &Zeroable) {
16881 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16882 "Unexpected data type for VSHUFPD");
16884 "Illegal shuffle mask");
16886 bool ZeroLane[2] = {
true,
true };
16887 for (
int i = 0; i < NumElts; ++i)
16888 ZeroLane[i & 1] &= Zeroable[i];
16892 bool IsSHUFPD =
true;
16893 bool IsCommutable =
true;
16895 for (
int i = 0; i < NumElts; ++i) {
16900 int Val = (i & 6) + NumElts * (i & 1);
16901 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16902 if (Mask[i] < Val || Mask[i] > Val + 1)
16904 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16905 IsCommutable =
false;
16906 SHUFPDMask[i] = Mask[i] % 2;
16909 if (!IsSHUFPD && !IsCommutable)
16912 if (!IsSHUFPD && IsCommutable)
16915 ForceV1Zero = ZeroLane[0];
16916 ForceV2Zero = ZeroLane[1];
16923 const APInt &Zeroable,
16926 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16927 "Unexpected data type for VSHUFPD");
16929 unsigned Immediate = 0;
16930 bool ForceV1Zero =
false, ForceV2Zero =
false;
16941 return DAG.
getNode(X86ISD::SHUFP,
DL, VT, V1, V2,
16951 const APInt &Zeroable,
16953 assert(VT == MVT::v32i8 &&
"Unexpected type!");
16960 if (Zeroable.
countl_one() < (Mask.size() - 8))
16966 V1 = DAG.
getNode(X86ISD::VTRUNC,
DL, MVT::v16i8, V1);
16967 V2 = DAG.
getNode(X86ISD::VTRUNC,
DL, MVT::v16i8, V2);
16972 { 0, 1, 2, 3, 16, 17, 18, 19,
16973 4, 5, 6, 7, 20, 21, 22, 23 });
17000 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
17004 auto IsInterleavingPattern = [&](
ArrayRef<int> Mask,
unsigned Begin0,
17006 size_t Size = Mask.size();
17007 assert(
Size % 2 == 0 &&
"Expected even mask size");
17008 for (
unsigned I = 0;
I <
Size;
I += 2) {
17009 if (Mask[
I] != (
int)(Begin0 +
I / 2) ||
17010 Mask[
I + 1] != (
int)(Begin1 +
I / 2))
17017 size_t FirstQtr = NumElts / 2;
17018 size_t ThirdQtr = NumElts + NumElts / 2;
17019 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
17020 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
17021 if (!IsFirstHalf && !IsSecondHalf)
17031 if (Shuffles.
size() != 2)
17038 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
17039 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
17040 FirstHalf = Shuffles[0];
17041 SecondHalf = Shuffles[1];
17042 }
else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
17043 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
17044 FirstHalf = Shuffles[1];
17045 SecondHalf = Shuffles[0];
17075 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
17084 Mask, Subtarget, DAG))
17089 return DAG.
getNode(X86ISD::MOVDDUP,
DL, MVT::v4f64, V1);
17094 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17095 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17096 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v4f64, V1,
17102 return DAG.
getNode(X86ISD::VPERMI,
DL, MVT::v4f64, V1,
17108 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17113 Mask, DAG, Subtarget))
17122 Zeroable, Subtarget, DAG))
17130 Zeroable, Subtarget, DAG))
17143 !
all_of(Mask, [](
int M) {
return M < 2 || (4 <= M && M < 6); }) &&
17147 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
17152 if (V1IsInPlace || V2IsInPlace)
17154 Zeroable, Subtarget, DAG);
17159 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17166 if (!(Subtarget.
hasAVX2() && (V1IsInPlace || V2IsInPlace)))
17168 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17172 if (Subtarget.hasVLX())
17174 Zeroable, Subtarget, DAG))
17181 Zeroable, Subtarget, DAG);
17198 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
17199 assert(Subtarget.
hasAVX2() &&
"We can only lower v4i64 with AVX2!");
17206 Zeroable, Subtarget, DAG))
17215 if (Subtarget.preferLowerShuffleAsShift())
17218 Subtarget, DAG,
true))
17230 DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v8i32,
17237 return DAG.
getNode(X86ISD::VPERMI,
DL, MVT::v4i64, V1,
17248 if (Subtarget.hasVLX()) {
17250 Zeroable, Subtarget, DAG))
17254 Zeroable, Subtarget, DAG))
17272 if (V1IsInPlace || V2IsInPlace)
17274 Zeroable, Subtarget, DAG);
17279 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17291 if (!V1IsInPlace && !V2IsInPlace)
17293 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17298 Zeroable, Subtarget, DAG);
17311 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
17314 Zeroable, Subtarget, DAG))
17332 Zeroable, Subtarget, DAG))
17340 "Repeated masks must be half the mask width!");
17344 return DAG.
getNode(X86ISD::MOVSLDUP,
DL, MVT::v8f32, V1);
17346 return DAG.
getNode(X86ISD::MOVSHDUP,
DL, MVT::v8f32, V1);
17349 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v8f32, V1,
17364 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17372 return DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v8f32, V1, VPermMask);
17376 return DAG.
getNode(X86ISD::VPERMV,
DL, MVT::v8f32, VPermMask, V1);
17386 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17390 if (Subtarget.hasVLX())
17392 Zeroable, Subtarget, DAG))
17417 Zeroable, Subtarget, DAG);
17434 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
17435 assert(Subtarget.
hasAVX2() &&
"We can only lower v8i32 with AVX2!");
17437 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 8; });
17443 Zeroable, Subtarget, DAG))
17462 Zeroable, Subtarget, DAG))
17471 if (Subtarget.preferLowerShuffleAsShift()) {
17474 Subtarget, DAG,
true))
17476 if (NumV2Elements == 0)
17486 bool Is128BitLaneRepeatedShuffle =
17488 if (Is128BitLaneRepeatedShuffle) {
17489 assert(RepeatedMask.
size() == 4 &&
"Unexpected repeated mask size!");
17491 return DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v8i32, V1,
17505 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17511 if (Subtarget.hasVLX()) {
17513 Zeroable, Subtarget, DAG))
17517 Zeroable, Subtarget, DAG))
17529 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17541 return DAG.
getNode(X86ISD::VPERMV,
DL, MVT::v8i32, VPermMask, V1);
17551 CastV1, CastV2, DAG);
17558 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17563 Zeroable, Subtarget, DAG);
17576 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
17577 assert(Subtarget.
hasAVX2() &&
"We can only lower v16i16 with AVX2!");
17583 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17592 Zeroable, Subtarget, DAG))
17612 Subtarget, DAG,
false))
17623 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17641 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17654 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17659 Zeroable, Subtarget, DAG))
17663 if (Subtarget.hasBWI())
17669 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17674 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17699 assert(Mask.size() == 32 &&
"Unexpected mask size for v32 shuffle!");
17700 assert(Subtarget.
hasAVX2() &&
"We can only lower v32i8 with AVX2!");
17706 Zeroable, Subtarget, DAG))
17715 Zeroable, Subtarget, DAG))
17752 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17764 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17772 Zeroable, Subtarget, DAG))
17776 if (Subtarget.hasVBMI())
17782 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17787 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17793 if (Subtarget.hasVLX())
17795 Mask, Zeroable, DAG))
17822 int NumV2Elements =
count_if(Mask, [NumElts](
int M) {
return M >= NumElts; });
17824 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17826 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17842 if (ElementBits < 32) {
17860 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17892 "Unexpected element type size for 128bit shuffle.");
17902 assert(Widened128Mask.
size() == 4 &&
"Shuffle widening mismatch");
17905 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17906 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17907 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17918 bool OnlyUsesV1 =
isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17920 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17930 bool IsInsert =
true;
17932 for (
int i = 0; i < 4; ++i) {
17933 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17934 if (Widened128Mask[i] < 0)
17938 if (Widened128Mask[i] < 4) {
17939 if (Widened128Mask[i] != i) {
17945 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17952 if (IsInsert && V2Index >= 0) {
17965 Widened128Mask.
clear();
17971 int PermMask[4] = {-1, -1, -1, -1};
17973 for (
int i = 0; i < 4; ++i) {
17974 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17975 if (Widened128Mask[i] < 0)
17978 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17985 PermMask[i] = Widened128Mask[i] % 4;
17999 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
18003 if (
isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18004 return DAG.
getNode(X86ISD::MOVDDUP,
DL, MVT::v8f64, V1);
18009 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18010 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18011 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18012 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18013 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v8f64, V1,
18019 return DAG.
getNode(X86ISD::VPERMI,
DL, MVT::v8f64, V1,
18024 V2, Subtarget, DAG))
18032 Zeroable, Subtarget, DAG))
18040 Zeroable, Subtarget, DAG))
18053 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
18059 assert(RepeatedMask.
size() == 4 &&
"Unexpected repeated mask size!");
18063 return DAG.
getNode(X86ISD::MOVSLDUP,
DL, MVT::v16f32, V1);
18065 return DAG.
getNode(X86ISD::MOVSHDUP,
DL, MVT::v16f32, V1);
18068 return DAG.
getNode(X86ISD::VPERMILPI,
DL, MVT::v16f32, V1,
18076 Zeroable, Subtarget, DAG))
18084 Zeroable, Subtarget, DAG))
18088 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18094 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18102 return DAG.
getNode(X86ISD::VPERMILPV,
DL, MVT::v16f32, V1, VPermMask);
18107 Zeroable, Subtarget, DAG))
18120 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
18123 if (Subtarget.preferLowerShuffleAsShift())
18126 Subtarget, DAG,
true))
18139 DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v16i32,
18146 return DAG.
getNode(X86ISD::VPERMI,
DL, MVT::v8i64, V1,
18151 V2, Subtarget, DAG))
18162 Zeroable, Subtarget, DAG))
18166 if (Subtarget.hasBWI())
18180 Zeroable, Subtarget, DAG))
18193 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
18195 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 16; });
18201 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18205 if (Subtarget.preferLowerShuffleAsShift()) {
18208 Subtarget, DAG,
true))
18210 if (NumV2Elements == 0)
18220 bool Is128BitLaneRepeatedShuffle =
18222 if (Is128BitLaneRepeatedShuffle) {
18223 assert(RepeatedMask.
size() == 4 &&
"Unexpected repeated mask size!");
18225 return DAG.
getNode(X86ISD::PSHUFD,
DL, MVT::v16i32, V1,
18236 Subtarget, DAG,
false))
18239 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
18246 Zeroable, Subtarget, DAG))
18250 if (Subtarget.hasBWI())
18261 CastV1, CastV2, DAG);
18268 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18273 Zeroable, Subtarget, DAG))
18277 Zeroable, Subtarget, DAG))
18290 assert(Mask.size() == 32 &&
"Unexpected mask size for v32 shuffle!");
18291 assert(Subtarget.hasBWI() &&
"We can only lower v32i16 with AVX-512-BWI!");
18297 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18312 Subtarget, DAG,
false))
18332 RepeatedMask, Subtarget, DAG);
18337 Zeroable, Subtarget, DAG))
18341 Zeroable, Subtarget, DAG))
18347 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18360 assert(Mask.size() == 64 &&
"Unexpected mask size for v64 shuffle!");
18361 assert(Subtarget.hasBWI() &&
"We can only lower v64i8 with AVX-512-BWI!");
18367 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18398 Zeroable, Subtarget, DAG))
18402 Zeroable, Subtarget, DAG))
18409 if (!Subtarget.hasVBMI())
18411 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18415 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18419 Zeroable, Subtarget, DAG))
18426 Mask, Subtarget, DAG))
18431 if (Subtarget.hasVBMI())
18437 bool V1InUse, V2InUse;
18439 DAG, V1InUse, V2InUse);
18445 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18449 if (Subtarget.hasVBMI())
18463 const APInt &Zeroable,
18467 "Cannot lower 512-bit vectors w/ basic ISA!");
18471 int NumElts = Mask.size();
18472 int NumV2Elements =
count_if(Mask, [NumElts](
int M) {
return M >= NumElts; });
18474 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18476 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18489 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18501 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18502 if (!Subtarget.hasBWI())
18544 int NumElts = Mask.size();
18545 for (
int i = 0; i != NumElts; ++i) {
18548 "Unexpected mask index.");
18553 if (ShiftAmt < 0) {
18560 if (ShiftAmt != M - i)
18563 assert(ShiftAmt >= 0 &&
"All undef?");
18577 int MaskOffset,
const APInt &Zeroable) {
18578 int Size = Mask.size();
18580 auto CheckZeros = [&](
int Shift,
bool Left) {
18581 for (
int j = 0; j < Shift; ++j)
18582 if (!Zeroable[j + (
Left ? 0 : (
Size - Shift))])
18588 auto MatchShift = [&](
int Shift,
bool Left) {
18589 unsigned Pos =
Left ? Shift : 0;
18590 unsigned Low =
Left ? 0 : Shift;
18591 unsigned Len =
Size - Shift;
18595 for (
int Shift = 1; Shift !=
Size; ++Shift)
18596 for (
bool Left : {
true,
false})
18597 if (CheckZeros(Shift,
Left) && MatchShift(Shift,
Left)) {
18598 Opcode =
Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18612 const APInt &Zeroable,
18616 "Cannot lower 512-bit vectors w/o basic ISA!");
18618 int NumElts = Mask.size();
18619 int NumV2Elements =
count_if(Mask, [NumElts](
int M) {
return M >= NumElts; });
18622 int SubvecElts = 0;
18624 for (
int i = 0; i != NumElts; ++i) {
18625 if (Mask[i] >= 0) {
18629 Src = Mask[i] / NumElts;
18630 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18636 assert(SubvecElts != NumElts &&
"Identity shuffle?");
18643 if ((
int)Zeroable.
countl_one() >= (NumElts - SubvecElts)) {
18644 assert(Src >= 0 &&
"Expected a source!");
18664 if (ShiftAmt >= 0) {
18668 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18672 DAG.
getNode(X86ISD::KSHIFTL,
DL, WideVT, Res,
18675 ShiftAmt += WideElts - NumElts;
18678 Res = DAG.
getNode(Opcode,
DL, WideVT, Res,
18702 return Zeroable[M.index()] || (M.value() == (
int)M.index());
18704 if (IsBlendWithZero) {
18705 const unsigned Width = std::max<unsigned>(NumElts, 8u);
18708 APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
18723 ExtVT = MVT::v2i64;
18726 ExtVT = MVT::v4i32;
18731 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18741 assert(Subtarget.hasBWI() &&
"Expected AVX512BW support");
18749 ExtVT = MVT::v64i8;
18759 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18760 (Subtarget.hasDQI() && (NumElems < 32)))
18770 int NumElements = Mask.size();
18772 int NumV1Elements = 0, NumV2Elements = 0;
18776 else if (M < NumElements)
18784 if (NumV2Elements > NumV1Elements)
18787 assert(NumV1Elements > 0 &&
"No V1 indices");
18789 if (NumV2Elements == 0)
18797 if (NumV1Elements == NumV2Elements) {
18798 int LowV1Elements = 0, LowV2Elements = 0;
18799 for (
int M : Mask.slice(0, NumElements / 2))
18800 if (M >= NumElements)
18804 if (LowV2Elements > LowV1Elements)
18806 if (LowV2Elements == LowV1Elements) {
18807 int SumV1Indices = 0, SumV2Indices = 0;
18808 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
18809 if (Mask[i] >= NumElements)
18811 else if (Mask[i] >= 0)
18813 if (SumV2Indices < SumV1Indices)
18815 if (SumV2Indices == SumV1Indices) {
18816 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18817 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
18818 if (Mask[i] >= NumElements)
18819 NumV2OddIndices += i % 2;
18820 else if (Mask[i] >= 0)
18821 NumV1OddIndices += i % 2;
18822 if (NumV2OddIndices < NumV1OddIndices)
18836 if (!V.getValueType().isSimple())
18840 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18845 if ((VT == MVT::i16 || VT == MVT::i8) &&
18846 V.getSimpleValueType().getSizeInBits() < 512)
18849 auto HasMaskOperation = [&](
SDValue V) {
18852 switch (V->getOpcode()) {
18871 if (!V->hasOneUse())
18877 if (HasMaskOperation(V))
18902 MVT VT =
Op.getSimpleValueType();
18908 "Can't lower MMX shuffles");
18910 bool V1IsUndef = V1.
isUndef();
18911 bool V2IsUndef = V2.
isUndef();
18912 if (V1IsUndef && V2IsUndef)
18925 any_of(OrigMask, [NumElements](
int M) {
return M >= NumElements; })) {
18927 for (
int &M : NewMask)
18928 if (M >= NumElements)
18934 int MaskUpperLimit = OrigMask.
size() * (V2IsUndef ? 1 : 2);
18935 (void)MaskUpperLimit;
18937 [&](
int M) {
return -1 <= M && M < MaskUpperLimit; }) &&
18938 "Out of bounds shuffle index");
18943 APInt KnownUndef, KnownZero;
18946 APInt Zeroable = KnownUndef | KnownZero;
18972 int NewNumElts = NumElements / 2;
18980 bool UsedZeroVector =
false;
18982 "V2's non-undef elements are used?!");
18983 for (
int i = 0; i != NewNumElts; ++i)
18985 WidenedMask[i] = i + NewNumElts;
18986 UsedZeroVector =
true;
18990 if (UsedZeroVector)
19014 assert(NumElements == (
int)Mask.size() &&
19015 "canonicalizeShuffleMaskWithHorizOp "
19016 "shouldn't alter the shuffle mask size");
19022 auto CanonicalizeConstant = [VT, &
DL, &DAG](
SDValue V) {
19026 if (Undefs.
any() &&
19035 V1 = CanonicalizeConstant(V1);
19036 V2 = CanonicalizeConstant(V2);
19065 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
19080 if (NumVecBits != 128 && NumVecBits != 256)
19083 if (NumElementBits == 32 || NumElementBits == 64) {
19084 unsigned NumLargeElements = 512 / NumElementBits;
19092 Subtarget, DAG,
DL);
19096 Subtarget, DAG,
DL);
19104 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
19105 VecVT == MVT::v16i16) {
19110 Passthru = Passthru.
isUndef()
19129 MVT VT =
Op.getSimpleValueType();
19148 MVT VT =
Op.getSimpleValueType();
19170 MVT CondVT =
Cond.getSimpleValueType();
19171 unsigned CondEltSize =
Cond.getScalarValueSizeInBits();
19172 if (CondEltSize == 1)
19176 if (!Subtarget.hasSSE41())
19183 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19200 if (CondEltSize != EltSize) {
19216 if (EltSize < 32 && VT.
is256BitVector() && !Subtarget.hasAVX2() &&
19217 !Subtarget.hasXOP()) {
19223 if (FreeCond && (FreeLHS || FreeRHS))
19237 if (Subtarget.hasAVX2())
19245 case MVT::v16f16: {
19258 MVT VT =
Op.getSimpleValueType();
19277 SDValue Extract = DAG.
getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19282 if (VT == MVT::f32) {
19288 if (!
Op.hasOneUse())
19293 User->getValueType(0) != MVT::i32))
19300 if (VT == MVT::i32 || VT == MVT::i64)
19315 MVT EltVT =
Op.getSimpleValueType();
19318 "Unexpected vector type in ExtractBitFromMaskVector");
19326 if (NumElts == 1) {
19338 unsigned IdxVal = IdxC->getZExtValue();
19355 MVT VT =
N->getSimpleValueType(0);
19359 switch (
User->getOpcode()) {
19360 case X86ISD::PEXTRB:
19361 case X86ISD::PEXTRW:
19365 return DemandedElts;
19367 DemandedElts.
setBit(
User->getConstantOperandVal(1));
19370 if (!
User->getValueType(0).isSimple() ||
19371 !
User->getValueType(0).isVector()) {
19373 return DemandedElts;
19381 return DemandedElts;
19384 return DemandedElts;
19388X86TargetLowering::LowerEXTRACT_VECTOR_ELT(
SDValue Op,
19433 unsigned IdxVal = IdxC->getZExtValue();
19447 IdxVal &= ElemsPerChunk - 1;
19454 MVT VT =
Op.getSimpleValueType();
19456 if (VT == MVT::i16) {
19461 if (Subtarget.hasFP16())
19469 SDValue Extract = DAG.
getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19474 if (Subtarget.hasSSE41())
19481 if (VT == MVT::i8) {
19486 int DWordIdx = IdxVal / 4;
19487 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19491 int ShiftVal = (IdxVal % 4) * 8;
19498 int WordIdx = IdxVal / 2;
19499 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19503 int ShiftVal = (IdxVal % 2) * 8;
19517 Mask[0] =
static_cast<int>(IdxVal);
19533 int Mask[2] = { 1, -1 };
19571 MVT VT =
Op.getSimpleValueType();
19576 if (EltVT == MVT::i1)
19585 if (EltVT == MVT::bf16) {
19597 if (!(Subtarget.hasBWI() ||
19598 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19599 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
19612 for (
unsigned I = 0;
I != NumElts; ++
I)
19617 return DAG.
getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19621 if (N2C->getAPIntValue().uge(NumElts))
19623 uint64_t IdxVal = N2C->getZExtValue();
19628 if (IsZeroElt || IsAllOnesElt) {
19631 if (IsAllOnesElt &&
19632 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
19633 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
19637 CstVectorElts[IdxVal] = OnesCst;
19643 if (Subtarget.hasSSE41() &&
19645 SmallVector<int, 8> BlendMask;
19646 for (
unsigned i = 0; i != NumElts; ++i)
19647 BlendMask.
push_back(i == IdxVal ? i + NumElts : i);
19663 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19664 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19666 return DAG.
getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19671 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19673 "Vectors will always have power-of-two number of elements.");
19678 if (IdxVal >= NumEltsIn128 &&
19679 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19680 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19683 SmallVector<int, 8> BlendMask;
19684 for (
unsigned i = 0; i != NumElts; ++i)
19685 BlendMask.
push_back(i == IdxVal ? i + NumElts : i);
19694 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19706 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19707 EltVT == MVT::f16 || EltVT == MVT::i64) {
19714 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19725 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19727 if (VT == MVT::v8i16) {
19728 assert(Subtarget.hasSSE2() &&
"SSE2 required for PINSRW");
19729 Opc = X86ISD::PINSRW;
19731 assert(VT == MVT::v16i8 &&
"PINSRB requires v16i8 vector");
19732 assert(Subtarget.hasSSE41() &&
"SSE41 required for PINSRB");
19733 Opc = X86ISD::PINSRB;
19739 return DAG.
getNode(
Opc, dl, VT, N0, N1, N2);
19742 if (Subtarget.hasSSE41()) {
19743 if (EltVT == MVT::f32) {
19763 return DAG.
getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19768 return DAG.
getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19773 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19784 MVT XTy =
X.getSimpleValueType();
19791 if (!Subtarget.hasFP16())
19797 128 /
X.getSimpleValueType().getSizeInBits());
19813 return DAG.
getNode(X86ISD::SCALEF,
DL, XTy,
X, Exp);
19818 if (Subtarget.hasFP16()) {
19819 if (Subtarget.hasVLX()) {
19821 return DAG.
getNode(X86ISD::SCALEF,
DL, XTy,
X, Exp);
19827 X.getSimpleValueType().changeTypeToInteger());
19830 if (Subtarget.hasFP16()) {
19832 return DAG.
getNode(X86ISD::SCALEF,
DL, XTy,
X, Exp);
19849 MVT OpVT =
Op.getSimpleValueType();
19870 "Expected an SSE type!");
19874 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19887 assert(
Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19894 assert(
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19895 "Only vXi1 extract_subvectors need custom lowering");
19899 uint64_t IdxVal =
Op.getConstantOperandVal(1);
19916unsigned X86TargetLowering::getGlobalWrapperKind(
19917 const GlobalValue *GV,
const unsigned char OpFlags)
const {
19920 return X86ISD::Wrapper;
19923 if (Subtarget.isPICStyleRIPRel() &&
19926 return X86ISD::WrapperRIP;
19930 return X86ISD::WrapperRIP;
19932 return X86ISD::Wrapper;
19947 unsigned char OpFlag = Subtarget.classifyLocalReference(
nullptr);
19954 DAG.
getNode(getGlobalWrapperKind(
nullptr, OpFlag),
DL, PtrVT, Result);
19970 unsigned char OpFlag = Subtarget.classifyLocalReference(
nullptr);
19972 EVT PtrVT =
Op.getValueType();
19976 DAG.
getNode(getGlobalWrapperKind(
nullptr, OpFlag),
DL, PtrVT, Result);
19989 return LowerGlobalOrExternal(
Op, DAG,
false,
nullptr);
19995 unsigned char OpFlags =
19996 Subtarget.classifyBlockAddressReference();
20000 EVT PtrVT =
Op.getValueType();
20003 DAG.
getNode(getGlobalWrapperKind(
nullptr, OpFlags), dl, PtrVT, Result);
20018 bool *IsImpCall)
const {
20021 const GlobalValue *GV =
nullptr;
20023 const char *ExternalSym =
nullptr;
20025 GV =
G->getGlobal();
20029 ExternalSym = ES->getSymbol();
20034 unsigned char OpFlags;
20036 OpFlags = Subtarget.classifyGlobalFunctionReference(GV,
Mod);
20038 OpFlags = Subtarget.classifyGlobalReference(GV,
Mod);
20043 EVT PtrVT =
Op.getValueType();
20052 int64_t GlobalOffset = 0;
20065 if (ForCall && !NeedsLoad && !HasPICReg &&
Offset == 0)
20071 Mod.getModuleFlag(
"import-call-optimization")) {
20072 assert(ForCall &&
"Should only enable import call optimization if we are "
20073 "lowering a call");
20078 Result = DAG.
getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20103 return LowerGlobalOrExternal(
Op, DAG,
false,
nullptr);
20107 const EVT PtrVT,
unsigned ReturnReg,
20108 unsigned char OperandFlags,
20109 bool LoadGlobalBaseReg =
false,
20110 bool LocalDynamic =
false) {
20118 if (LocalDynamic && UseTLSDESC) {
20125 "Unexpected TLSDESC DAG");
20129 "Unexpected TLSDESC DAG");
20131 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
20133 "Unexpected TLSDESC DAG");
20134 Ret =
SDValue(CopyFromRegOp, 0);
20142 unsigned CallType = UseTLSDESC ? X86ISD::TLSDESC
20143 : LocalDynamic ? X86ISD::TLSBASEADDR
20147 if (LoadGlobalBaseReg) {
20153 Chain = DAG.
getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
20155 Chain = DAG.
getNode(CallType, dl, NodeTys, {Chain, TGA});
20203 bool Is64Bit,
bool Is64BitLP64) {
20213 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20228 unsigned WrapperKind = X86ISD::Wrapper;
20252 unsigned char OperandFlags = 0;
20255 unsigned WrapperKind = X86ISD::Wrapper;
20261 WrapperKind = X86ISD::WrapperRIP;
20301 const GlobalValue *GV = GA->
getGlobal();
20302 EVT PtrVT =
Op.getValueType();
20305 if (Subtarget.isTargetELF()) {
20309 if (Subtarget.is64Bit()) {
20310 if (Subtarget.isTarget64BitLP64())
20317 Subtarget.isTarget64BitLP64());
20321 PositionIndependent);
20326 if (Subtarget.isTargetDarwin()) {
20328 unsigned char OpFlag = 0;
20329 unsigned WrapperKind = 0;
20333 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20336 WrapperKind = X86ISD::Wrapper;
20339 WrapperKind = X86ISD::WrapperRIP;
20356 SDVTList NodeTys = DAG.
getVTList(MVT::Other, MVT::Glue);
20359 Chain = DAG.
getNode(X86ISD::TLSCALL,
DL, NodeTys, Args);
20368 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20372 if (Subtarget.isOSWindows()) {
20394 SDValue TlsArray = Subtarget.is64Bit()
20396 : (Subtarget.isTargetWindowsGNU()
20401 DAG.
getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20405 res = ThreadPointer;
20409 if (Subtarget.is64Bit())
20411 MachinePointerInfo(), MVT::i32);
20413 IDX = DAG.
getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20423 res = DAG.
getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20440 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20481 "Unexpected opcode!");
20482 bool IsStrict =
Op->isStrictFPOpcode();
20483 unsigned OpNo = IsStrict ? 1 : 0;
20485 MVT SrcVT = Src.getSimpleValueType();
20486 MVT VT =
Op.getSimpleValueType();
20488 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20489 (VT != MVT::f32 && VT != MVT::f64))
20495 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20502 {Op.getOperand(0), InVec});
20522 "Unexpected opcode!");
20523 bool IsStrict =
Op->isStrictFPOpcode();
20524 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
20525 MVT SrcVT = Src.getSimpleValueType();
20526 MVT VT =
Op.getSimpleValueType();
20528 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20533 assert(Subtarget.hasFP16() &&
"Expected FP16");
20537 SDValue CvtVec = DAG.
getNode(
Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20538 {Op.getOperand(0), InVec});
20556 if (!Subtarget.
hasSSE2() || FromVT != MVT::v4i32)
20559 return ToVT == MVT::v4f32 || (Subtarget.
hasAVX() && ToVT == MVT::v4f64);
20563 if (!Subtarget.
hasAVX512() || FromVT != MVT::v4i32)
20566 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20606 if (FromVT != Vec128VT)
20631 MVT SrcVT =
X.getSimpleValueType();
20632 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20637 if (!Subtarget.
hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20638 (IntVT != MVT::i32 && IntVT != MVT::i64))
20645 unsigned ToIntOpcode =
20647 unsigned ToFPOpcode =
20649 unsigned Width = 128;
20651 if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20660 if (IsUnsigned || IntVT == MVT::i64) {
20672 MVT VecSrcVT, VecIntVT, VecVT;
20674 unsigned SrcElts, VTElts;
20676 if (Width == 512) {
20677 NumElts = std::min(Width / IntSize, Width / SrcSize);
20681 NumElts = Width / IntSize;
20682 SrcElts = Width / SrcSize;
20683 VTElts = Width / VTSize;
20704 bool IsStrict =
Op->isStrictFPOpcode();
20705 MVT VT =
Op->getSimpleValueType(0);
20706 SDValue Src =
Op->getOperand(IsStrict ? 1 : 0);
20708 if (Subtarget.hasDQI()) {
20709 assert(!Subtarget.hasVLX() &&
"Unexpected features");
20711 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20712 Src.getSimpleValueType() == MVT::v4i64) &&
20713 "Unsupported custom type");
20716 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20718 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20728 Res = DAG.
getNode(
Op.getOpcode(),
DL, {WideVT, MVT::Other},
20729 {Op->getOperand(0), Src});
20732 Res = DAG.
getNode(
Op.getOpcode(),
DL, WideVT, Src);
20745 if (VT != MVT::v4f32 || IsSigned)
20757 for (
int i = 0; i != 4; ++i) {
20763 {
Op.getOperand(0), Elt});
20764 Chains[i] = SignCvts[i].getValue(1);
20775 {Chain, SignCvt, SignCvt});
20792 bool IsStrict =
Op->isStrictFPOpcode();
20793 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
20795 MVT VT =
Op.getSimpleValueType();
20803 DAG.
getNode(
Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20806 DAG.
getNode(
Op.getOpcode(), dl, NVT, Src), Rnd);
20811 if (FloatVT.
getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20812 if (VT == MVT::v4i32 && Subtarget.
hasSSE2() && IsSigned)
20814 if (VT == MVT::v8i32 && Subtarget.
hasAVX() && IsSigned)
20817 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20820 if (VT == MVT::v16i32)
20822 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20824 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20827 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20828 (VT == MVT::v2i64 || VT == MVT::v4i64))
20835 bool IsStrict =
Op->isStrictFPOpcode();
20836 unsigned OpNo = IsStrict ? 1 : 0;
20839 MVT SrcVT = Src.getSimpleValueType();
20840 MVT VT =
Op.getSimpleValueType();
20848 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20849 return LowerWin64_INT128_TO_FP(
Op, DAG);
20858 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20863 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20866 return DAG.
getNode(X86ISD::CVTSI2P, dl, VT,
20870 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20876 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20877 "Unknown SINT_TO_FP to lower!");
20883 if (SrcVT == MVT::i32 && UseSSEReg)
20885 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20894 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20903 if (VT == MVT::f128 || !Subtarget.hasX87())
20907 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20911 ValueToStore = DAG.
getBitcast(MVT::f64, ValueToStore);
20918 MachinePointerInfo MPI =
20921 Chain = DAG.
getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20922 std::pair<SDValue, SDValue> Tmp =
20923 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20938 Tys = DAG.
getVTList(MVT::f80, MVT::Other);
20940 Tys = DAG.
getVTList(DstVT, MVT::Other);
20942 SDValue FILDOps[] = {Chain, Pointer};
20946 Chain = Result.getValue(1);
20956 SDValue FSTOps[] = {Chain, Result, StackSlot};
20964 DstVT,
DL, Chain, StackSlot,
20966 Chain = Result.getValue(1);
20969 return { Result, Chain };
20978 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20979 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20989 assert(!
Op->isStrictFPOpcode() &&
"Expected non-strict uint_to_fp!");
21006 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21014 APInt(64, 0x4330000000000000ULL))));
21017 APInt(64, 0x4530000000000000ULL))));
21031 MVT::v2f64, dl, CLod0.
getValue(1), CPIdx1,
21040 Result = DAG.
getNode(X86ISD::FHADD, dl, MVT::v2f64,
Sub,
Sub);
21054 unsigned OpNo =
Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21076 if (
Op.getNode()->isStrictFPOpcode()) {
21081 {Chain,
Or, Bias});
21083 if (
Op.getValueType() ==
Sub.getValueType())
21088 Sub,
Sub.getValue(1), dl,
Op.getSimpleValueType());
21090 return DAG.
getMergeValues({ResultPair.first, ResultPair.second}, dl);
21104 if (
Op.getSimpleValueType() != MVT::v2f64)
21107 bool IsStrict =
Op->isStrictFPOpcode();
21109 SDValue N0 =
Op.getOperand(IsStrict ? 1 : 0);
21113 if (!Subtarget.hasVLX()) {
21121 {Op.getOperand(0), N0});
21132 return DAG.
getNode(X86ISD::STRICT_CVTUI2P,
DL, {MVT::v2f64, MVT::Other},
21133 {
Op.getOperand(0), N0});
21134 return DAG.
getNode(X86ISD::CVTUI2P,
DL, MVT::v2f64, N0);
21150 {
Op.getOperand(0),
Or, VBias});
21157 bool IsStrict =
Op->isStrictFPOpcode();
21158 SDValue V =
Op->getOperand(IsStrict ? 1 : 0);
21159 MVT VecIntVT = V.getSimpleValueType();
21160 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21161 "Unsupported custom type");
21165 assert(!Subtarget.hasVLX() &&
"Unexpected features");
21166 MVT VT =
Op->getSimpleValueType(0);
21169 if (VT == MVT::v8f64)
21172 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
21173 VT == MVT::v8f16) &&
21175 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
21176 MVT WideIntVT = MVT::v16i32;
21177 if (VT == MVT::v4f64) {
21178 WideVT = MVT::v8f64;
21179 WideIntVT = MVT::v8i32;
21191 {
Op->getOperand(0), V});
21205 if (Subtarget.
hasAVX() && VecIntVT == MVT::v4i32 &&
21206 Op->getSimpleValueType(0) == MVT::v4f64) {
21216 X86ISD::VBROADCAST_LOAD,
DL, Tys,
Ops, MVT::f64,
21226 {
Op.getOperand(0),
Or, VBias});
21242 bool Is128 = VecIntVT == MVT::v4i32;
21243 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21246 if (VecFloatVT !=
Op->getSimpleValueType(0))
21267 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21273 Low = DAG.
getNode(X86ISD::BLENDI,
DL, VecI16VT, VecBitcast,
21281 High = DAG.
getNode(X86ISD::BLENDI,
DL, VecI16VT, VecShiftBitcast,
21308 {
Op.getOperand(0), HighBitcast, VecCstFSub});
21310 {FHigh.
getValue(1), LowBitcast, FHigh});
21320 unsigned OpNo =
Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21340 bool IsStrict =
Op->isStrictFPOpcode();
21341 unsigned OpNo = IsStrict ? 1 : 0;
21345 MVT SrcVT = Src.getSimpleValueType();
21346 MVT DstVT =
Op->getSimpleValueType(0);
21350 if (DstVT == MVT::f128)
21364 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21365 return LowerWin64_INT128_TO_FP(
Op, DAG);
21371 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21378 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21393 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21398 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21401 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21402 (DstVT == MVT::f32 || DstVT == MVT::f64))
21408 Align SlotAlign(8);
21409 MachinePointerInfo MPI =
21411 if (SrcVT == MVT::i32) {
21414 SDValue Store1 = DAG.
getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21417 std::pair<SDValue, SDValue> Tmp =
21418 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21425 assert(SrcVT == MVT::i64 &&
"Unexpected type in UINT_TO_FP");
21431 ValueToStore = DAG.
getBitcast(MVT::f64, ValueToStore);
21434 DAG.
getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21438 SDVTList Tys = DAG.
getVTList(MVT::f80, MVT::Other);
21451 APInt FF(64, 0x5F80000000000000ULL);
21473 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21474 Opc = X86ISD::STRICT_FP80_ADD;
21477 DAG.
getNode(
Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21479 if (DstVT == MVT::f80)
21487 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21488 Opc = X86ISD::FP80_ADD;
21504 bool IsStrict =
Op->isStrictFPOpcode();
21507 EVT DstTy =
Op.getValueType();
21509 EVT TheVT =
Value.getValueType();
21512 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21521 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21525 if (!IsSigned && DstTy != MVT::i64) {
21528 assert(DstTy == MVT::i32 &&
"Unexpected FP_TO_UINT");
21532 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21533 DstTy.getSimpleVT() >= MVT::i16 &&
21534 "Unknown FP_TO_INT to lower!");
21539 unsigned MemSize = DstTy.getStoreSize();
21548 if (UnsignedFixup) {
21568 bool LosesInfo =
false;
21569 if (TheVT == MVT::f64)
21573 else if (TheVT == MVT::f80)
21578 "FP conversion should have been exact");
21588 Chain =
Cmp.getValue(1);