68#define DEBUG_TYPE "x86-isel"
71 "x86-experimental-pref-innermost-loop-alignment",
cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
79 "x86-br-merging-base-cost",
cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
90 "x86-br-merging-ccmp-bias",
cl::init(6),
91 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
97 cl::desc(
"Replace narrow shifts with wider shifts."),
101 "x86-br-merging-likely-bias",
cl::init(0),
102 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
112 "x86-br-merging-unlikely-bias",
cl::init(-1),
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
124 "mul-constant-optimization",
cl::init(
true),
125 cl::desc(
"Replace 'mul x, Const' with more effective instructions like "
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
160 if (Subtarget.hasSlowDivide32())
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
168 static const struct {
170 const char *
const Name;
180 for (
const auto &LC : LibraryCalls) {
201 if (Subtarget.is64Bit())
218 for (
auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
227 if (Subtarget.is64Bit())
236 if (Subtarget.is64Bit())
244 if (Subtarget.is64Bit())
255 if (Subtarget.is64Bit())
259 if (!Subtarget.useSoftFloat()) {
323 if (!Subtarget.is64Bit()) {
332 for (
MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
337 if (Subtarget.is64Bit()) {
343 if (Subtarget.hasAVX10_2()) {
346 for (
MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
351 if (Subtarget.hasAVX10_2_512()) {
355 if (Subtarget.is64Bit()) {
372 if (Subtarget.is64Bit()) {
377 }
else if (!Subtarget.is64Bit())
390 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
401 for (
auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
402 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
406 if (Subtarget.is64Bit())
417 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
435 if (!Subtarget.hasBMI()) {
438 if (Subtarget.is64Bit()) {
445 if (Subtarget.hasLZCNT()) {
451 for (
auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
452 if (VT == MVT::i64 && !Subtarget.is64Bit())
466 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ?
Custom :
Expand);
473 for (
auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
478 for (
MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
491 if (Subtarget.is64Bit())
493 if (Subtarget.hasPOPCNT()) {
507 if (!Subtarget.hasMOVBE())
511 for (
auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
517 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
518 if (VT == MVT::i64 && !Subtarget.is64Bit())
538 for (
auto VT : { MVT::i32, MVT::i64 }) {
539 if (VT == MVT::i64 && !Subtarget.is64Bit())
550 for (
auto VT : { MVT::i32, MVT::i64 }) {
551 if (VT == MVT::i64 && !Subtarget.is64Bit())
564 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
574 if (!Subtarget.is64Bit())
577 if (Subtarget.is64Bit() && Subtarget.
hasAVX()) {
610 bool Is64Bit = Subtarget.is64Bit();
664 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE2()) {
668 : &X86::FR16RegClass);
670 : &X86::FR32RegClass);
672 : &X86::FR64RegClass);
680 for (
auto VT : { MVT::f32, MVT::f64 }) {
701 setF16Action(MVT::f16,
Promote);
748 }
else if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE1() &&
749 (UseX87 || Is64Bit)) {
787 for (
auto VT : { MVT::f32, MVT::f64 }) {
800 if (UseX87 && (
getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
801 addLegalFPImmediate(
APFloat(+0.0f));
802 addLegalFPImmediate(
APFloat(+1.0f));
803 addLegalFPImmediate(
APFloat(-0.0f));
804 addLegalFPImmediate(
APFloat(-1.0f));
806 addLegalFPImmediate(
APFloat(+0.0f));
811 addLegalFPImmediate(
APFloat(+0.0));
812 addLegalFPImmediate(
APFloat(+1.0));
813 addLegalFPImmediate(
APFloat(-0.0));
814 addLegalFPImmediate(
APFloat(-1.0));
816 addLegalFPImmediate(
APFloat(+0.0));
847 addLegalFPImmediate(TmpFlt);
849 addLegalFPImmediate(TmpFlt);
855 addLegalFPImmediate(TmpFlt2);
857 addLegalFPImmediate(TmpFlt2);
905 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.
hasSSE1()) {
907 : &X86::VR128RegClass);
984 for (
auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
985 MVT::v4f32, MVT::v8f32, MVT::v16f32,
986 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
1069 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1074 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE1()) {
1076 : &X86::VR128RegClass);
1104 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE2()) {
1106 : &X86::VR128RegClass);
1111 : &X86::VR128RegClass);
1113 : &X86::VR128RegClass);
1115 : &X86::VR128RegClass);
1117 : &X86::VR128RegClass);
1119 : &X86::VR128RegClass);
1121 for (
auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1128 for (
auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1129 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1164 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1187 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1207 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1215 for (
auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1220 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1226 setF16Action(MVT::v8f16,
Expand);
1251 for (
auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1325 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1329 if (VT == MVT::v2i64)
continue;
1343 if (Subtarget.hasGFNI()) {
1350 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSSE3()) {
1355 for (
auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1367 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE41()) {
1368 for (
MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1408 for (
auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1423 if (Subtarget.is64Bit() && !Subtarget.
hasAVX512()) {
1435 if (!Subtarget.useSoftFloat() && Subtarget.
hasSSE42()) {
1439 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1440 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1441 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1451 if (!Subtarget.useSoftFloat() && Subtarget.
hasAVX()) {
1455 : &X86::VR256RegClass);
1457 : &X86::VR256RegClass);
1459 : &X86::VR256RegClass);
1461 : &X86::VR256RegClass);
1463 : &X86::VR256RegClass);
1465 : &X86::VR256RegClass);
1467 : &X86::VR256RegClass);
1469 for (
auto VT : { MVT::v8f32, MVT::v4f64 }) {
1533 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 if (VT == MVT::v4i64)
continue;
1560 for (
auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1571 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1591 for (
auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1592 MVT::v2f64, MVT::v4f64 }) {
1598 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1639 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1647 for (
auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1669 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1670 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1677 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1678 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1683 for (
MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1684 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1695 setF16Action(MVT::v16f16,
Expand);
1711 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1712 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1717 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1718 Subtarget.hasF16C()) {
1719 for (
MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1723 for (
MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1738 if (!Subtarget.useSoftFloat() && Subtarget.
hasAVX512()) {
1766 if (!Subtarget.hasDQI()) {
1779 for (
auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1785 for (
auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1788 for (
auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1801 for (
auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1804 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1805 for (
MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1814 if (!Subtarget.useSoftFloat() && Subtarget.
useAVX512Regs()) {
1815 bool HasBWI = Subtarget.hasBWI();
1835 for (
MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1851 if (Subtarget.hasDQI())
1854 for (
MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1861 for (
MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1898 if (!Subtarget.hasVLX()) {
1899 for (
auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1900 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1926 for (
auto VT : { MVT::v16f32, MVT::v8f64 }) {
1943 for (
auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1970 for (
auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1994 for (
auto VT : { MVT::v16i32, MVT::v8i64 }) {
2003 for (
auto VT : { MVT::v64i8, MVT::v32i16 }) {
2024 if (Subtarget.hasDQI()) {
2032 if (Subtarget.hasCDI()) {
2034 for (
auto VT : { MVT::v16i32, MVT::v8i64} ) {
2039 if (Subtarget.hasVPOPCNTDQ()) {
2040 for (
auto VT : { MVT::v16i32, MVT::v8i64 })
2047 for (
auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2048 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2051 for (
auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2052 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2063 setF16Action(MVT::v32f16,
Expand);
2072 for (
auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2079 for (
auto VT : { MVT::v64i8, MVT::v32i16 }) {
2088 if (Subtarget.hasVBMI2()) {
2089 for (
auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2103 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2104 for (
auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2114 if (!Subtarget.useSoftFloat() && Subtarget.
hasAVX512()) {
2122 if (Subtarget.hasDQI()) {
2127 "Unexpected operation action!");
2135 for (
auto VT : { MVT::v2i64, MVT::v4i64 }) {
2143 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2152 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2153 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2156 if (Subtarget.hasDQI()) {
2167 if (Subtarget.hasCDI()) {
2168 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2173 if (Subtarget.hasVPOPCNTDQ()) {
2174 for (
auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2181 for (
MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2182 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2183 MVT::v16i16, MVT::v8i8})
2188 for (
MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2192 if (Subtarget.hasVLX())
2193 for (
MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2194 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2198 if (Subtarget.hasVBMI2())
2199 for (
MVT VT : {MVT::v32i16, MVT::v64i8})
2203 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2204 for (
MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2210 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2214 for (
auto VT : { MVT::v32i1, MVT::v64i1 }) {
2227 for (
auto VT : { MVT::v16i1, MVT::v32i1 })
2235 for (
auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2244 if (Subtarget.hasBITALG()) {
2245 for (
auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2251 auto setGroup = [&] (
MVT VT) {
2320 setGroup(MVT::v32f16);
2364 if (Subtarget.hasVLX()) {
2365 setGroup(MVT::v8f16);
2366 setGroup(MVT::v16f16);
2419 if (!Subtarget.useSoftFloat() &&
2420 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2422 : &X86::VR128RegClass);
2424 : &X86::VR256RegClass);
2430 for (
auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2431 setF16Action(VT,
Expand);
2432 if (!Subtarget.hasBF16())
2449 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2452 setF16Action(MVT::v32bf16,
Expand);
2463 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2464 for (
auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2477 if (Subtarget.hasAVX10_2_512()) {
2490 for (
auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2496 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2509 if (Subtarget.hasBWI()) {
2514 if (Subtarget.hasFP16()) {
2546 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2554 if (!Subtarget.is64Bit()) {
2564 for (
auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2565 if (VT == MVT::i64 && !Subtarget.is64Bit())
2609 if (Subtarget.is32Bit() &&
2749 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2756 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.
hasAVX512() &&
2757 !Subtarget.hasBWI())
2782 bool AssumeSingleUse) {
2783 if (!AssumeSingleUse && !
Op.hasOneUse())
2789 auto *Ld = cast<LoadSDNode>(
Op.getNode());
2790 if (!Subtarget.
hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2791 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() <
Align(16))
2802 bool AssumeSingleUse) {
2803 assert(Subtarget.
hasAVX() &&
"Expected AVX for broadcast from memory");
2809 auto *Ld = cast<LoadSDNode>(
Op.getNode());
2810 return !Ld->isVolatile() ||
2819 if (
Op.hasOneUse()) {
2820 unsigned Opcode =
Op.getNode()->user_begin()->getOpcode();
2833 default:
return false;
2874 default:
return false;
2895 int ReturnAddrIndex = FuncInfo->
getRAIndex();
2897 if (ReturnAddrIndex == 0) {
2910 bool HasSymbolicDisplacement) {
2917 if (!HasSymbolicDisplacement)
2935 return Offset < 16 * 1024 * 1024;
2959 switch (SetCCOpcode) {
2984 if (SetCCOpcode ==
ISD::SETGT && RHSC->isAllOnes()) {
2989 if (SetCCOpcode ==
ISD::SETLT && RHSC->isZero()) {
2993 if (SetCCOpcode ==
ISD::SETGE && RHSC->isZero()) {
2997 if (SetCCOpcode ==
ISD::SETLT && RHSC->isOne()) {
3012 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3016 switch (SetCCOpcode) {
3032 switch (SetCCOpcode) {
3086 unsigned Intrinsic)
const {
3092 switch (Intrinsic) {
3093 case Intrinsic::x86_aesenc128kl:
3094 case Intrinsic::x86_aesdec128kl:
3096 Info.ptrVal =
I.getArgOperand(1);
3101 case Intrinsic::x86_aesenc256kl:
3102 case Intrinsic::x86_aesdec256kl:
3104 Info.ptrVal =
I.getArgOperand(1);
3109 case Intrinsic::x86_aesencwide128kl:
3110 case Intrinsic::x86_aesdecwide128kl:
3112 Info.ptrVal =
I.getArgOperand(0);
3117 case Intrinsic::x86_aesencwide256kl:
3118 case Intrinsic::x86_aesdecwide256kl:
3120 Info.ptrVal =
I.getArgOperand(0);
3125 case Intrinsic::x86_cmpccxadd32:
3126 case Intrinsic::x86_cmpccxadd64:
3127 case Intrinsic::x86_atomic_bts:
3128 case Intrinsic::x86_atomic_btc:
3129 case Intrinsic::x86_atomic_btr: {
3131 Info.ptrVal =
I.getArgOperand(0);
3132 unsigned Size =
I.getType()->getScalarSizeInBits();
3139 case Intrinsic::x86_atomic_bts_rm:
3140 case Intrinsic::x86_atomic_btc_rm:
3141 case Intrinsic::x86_atomic_btr_rm: {
3143 Info.ptrVal =
I.getArgOperand(0);
3144 unsigned Size =
I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 case Intrinsic::x86_aadd32:
3152 case Intrinsic::x86_aadd64:
3153 case Intrinsic::x86_aand32:
3154 case Intrinsic::x86_aand64:
3155 case Intrinsic::x86_aor32:
3156 case Intrinsic::x86_aor64:
3157 case Intrinsic::x86_axor32:
3158 case Intrinsic::x86_axor64:
3159 case Intrinsic::x86_atomic_add_cc:
3160 case Intrinsic::x86_atomic_sub_cc:
3161 case Intrinsic::x86_atomic_or_cc:
3162 case Intrinsic::x86_atomic_and_cc:
3163 case Intrinsic::x86_atomic_xor_cc: {
3165 Info.ptrVal =
I.getArgOperand(0);
3166 unsigned Size =
I.getArgOperand(1)->getType()->getScalarSizeInBits();
3177 switch (IntrData->
Type) {
3182 Info.ptrVal =
I.getArgOperand(0);
3188 ScalarVT = MVT::i16;
3190 ScalarVT = MVT::i32;
3200 Info.ptrVal =
nullptr;
3212 Info.ptrVal =
nullptr;
3233 bool ForCodeSize)
const {
3234 for (
const APFloat &FPImm : LegalFPImmediates)
3235 if (Imm.bitwiseIsEqual(FPImm))
3243 assert(cast<LoadSDNode>(Load)->
isSimple() &&
"illegal to narrow");
3247 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3249 if (
const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3255 EVT VT = Load->getValueType(0);
3259 if (
Use.getResNo() != 0)
3283 if (BitSize == 0 || BitSize > 64)
3330 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3334 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3335 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3339 unsigned Index)
const {
3381 return Subtarget.hasBMI() || Subtarget.
canUseCMOV() ||
3388 return Subtarget.hasLZCNT() || Subtarget.
canUseCMOV();
3395 return !Subtarget.
hasSSE2() || VT == MVT::f80;
3399 return (VT == MVT::f64 && Subtarget.
hasSSE2()) ||
3400 (VT == MVT::f32 && Subtarget.
hasSSE1()) || VT == MVT::f16;
3410 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3428 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3440 return Subtarget.hasFastLZCNT();
3449 EVT VT =
Y.getValueType();
3454 if (!Subtarget.hasBMI())
3458 if (VT != MVT::i32 && VT != MVT::i64)
3461 return !isa<ConstantSDNode>(
Y) || cast<ConstantSDNode>(
Y)->isOpaque();
3465 EVT VT =
Y.getValueType();
3475 if (VT == MVT::v4i32)
3482 return X.getValueType().isScalarInteger();
3488 unsigned OldShiftOpcode,
unsigned NewShiftOpcode,
3492 X, XC,
CC,
Y, OldShiftOpcode, NewShiftOpcode, DAG))
3495 if (
X.getValueType().isScalarInteger())
3509 EVT VT,
unsigned ShiftOpc,
bool MayTransformRotate,
3510 const APInt &ShiftOrRotateAmt,
const std::optional<APInt> &AndMask)
const {
3514 bool PreferRotate =
false;
3523 PreferRotate = Subtarget.hasBMI2();
3524 if (!PreferRotate) {
3527 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3532 assert(AndMask.has_value() &&
"Null andmask when querying about shift+and");
3534 if (PreferRotate && MayTransformRotate)
3568 if (PreferRotate || !MayTransformRotate || VT.
isVector())
3578 const Value *Rhs)
const {
3582 if (BaseCost >= 0 && Subtarget.hasCCMP())
3585 if (BaseCost >= 0 && Opc == Instruction::And &&
3600 N->getOperand(0).getOpcode() ==
ISD::SRL) ||
3602 N->getOperand(0).getOpcode() ==
ISD::SHL)) &&
3603 "Expected shift-shift mask");
3605 EVT VT =
N->getValueType(0);
3606 if ((Subtarget.hasFastVectorShiftMasks() && VT.
isVector()) ||
3607 (Subtarget.hasFastScalarShiftMasks() && !VT.
isVector())) {
3611 return N->getOperand(1) ==
N->getOperand(0).getOperand(1);
3617 EVT VT =
Y.getValueType();
3624 if (VT == MVT::i64 && !Subtarget.is64Bit())
3684 [CmpVal](
int M) { return isUndefOrEqual(M, CmpVal); });
3696 [](
int M) { return M == SM_SentinelUndef; });
3701 unsigned NumElts = Mask.size();
3707 unsigned NumElts = Mask.size();
3713 return (Val >=
Low && Val <
Hi);
3756 unsigned NumElts = Mask.size();
3767 unsigned Size,
int Low,
int Step = 1) {
3768 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
3780 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
3796 unsigned NumElts = Mask.size();
3815 WidenedMask.
assign(Mask.size() / 2, 0);
3816 for (
int i = 0,
Size = Mask.size(); i <
Size; i += 2) {
3818 int M1 = Mask[i + 1];
3829 WidenedMask[i / 2] =
M1 / 2;
3833 WidenedMask[i / 2] =
M0 / 2;
3850 WidenedMask[i / 2] =
M0 / 2;
3857 assert(WidenedMask.
size() == Mask.size() / 2 &&
3858 "Incorrect size of mask after widening the elements!");
3864 const APInt &Zeroable,
3871 assert(!Zeroable.
isZero() &&
"V2's non-undef elements are used?!");
3872 for (
int i = 0,
Size = Mask.size(); i !=
Size; ++i)
3888 unsigned NumSrcElts = Mask.size();
3889 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3890 "Illegal shuffle scale factor");
3893 if (NumDstElts >= NumSrcElts) {
3894 int Scale = NumDstElts / NumSrcElts;
3902 while (ScaledMask.
size() > NumDstElts) {
3906 ScaledMask = std::move(WidenedMask);
3928 const SDLoc &dl,
bool IsMask =
false) {
3933 MVT ConstVecVT = VT;
3942 for (
unsigned i = 0; i < NumElts; ++i) {
3943 bool IsUndef = Values[i] < 0 && IsMask;
3960 "Unequal constant and undef arrays");
3964 MVT ConstVecVT = VT;
3973 for (
unsigned i = 0, e = Bits.size(); i != e; ++i) {
3978 const APInt &V = Bits[i];
3983 }
else if (EltVT == MVT::f32) {
3986 }
else if (EltVT == MVT::f64) {
4009 "Unexpected vector type");
4023 "Unexpected vector type");
4037 LHS.getValueType() !=
RHS.getValueType() ||
4038 LHS.getOperand(0) !=
RHS.getOperand(0))
4042 if (Src.getValueSizeInBits() != (
LHS.getValueSizeInBits() * 2))
4045 unsigned NumElts =
LHS.getValueType().getVectorNumElements();
4046 if ((
LHS.getConstantOperandAPInt(1) == 0 &&
4047 RHS.getConstantOperandAPInt(1) == NumElts) ||
4048 (AllowCommute &&
RHS.getConstantOperandAPInt(1) == 0 &&
4049 LHS.getConstantOperandAPInt(1) == NumElts))
4056 const SDLoc &dl,
unsigned vectorWidth) {
4064 unsigned ElemsPerChunk = vectorWidth / ElVT.
getSizeInBits();
4069 IdxVal &= ~(ElemsPerChunk - 1);
4074 Vec->
ops().slice(IdxVal, ElemsPerChunk));
4108 unsigned vectorWidth) {
4109 assert((vectorWidth == 128 || vectorWidth == 256) &&
4110 "Unsupported vector width");
4116 EVT ResultVT = Result.getValueType();
4124 IdxVal &= ~(ElemsPerChunk - 1);
4150 "Unsupported vector widening type");
4171 const SDLoc &dl,
unsigned WideSizeInBits) {
4174 "Unsupported vector widening type");
4178 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4186 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4187 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4197 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4205 assert(Ops.
empty() &&
"Expected an empty ops vector");
4208 Ops.
append(
N->op_begin(),
N->op_end());
4215 const APInt &
Idx =
N->getConstantOperandAPInt(2);
4216 EVT VT = Src.getValueType();
4221 if (
Idx == 0 && Src.isUndef()) {
4229 Src.getOperand(1).getValueType() == SubVT &&
4253 if (Src.isUndef()) {
4273 unsigned NumSubOps = SubOps.
size();
4274 unsigned HalfNumSubOps = NumSubOps / 2;
4275 assert((NumSubOps % 2) == 0 &&
"Unexpected number of subvectors");
4281 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.
getContext());
4295 EVT VT =
Op.getValueType();
4298 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4299 "Can't split odd sized vector");
4305 return std::make_pair(
Lo,
Lo);
4308 return std::make_pair(
Lo,
Hi);
4314 EVT VT =
Op.getValueType();
4319 for (
unsigned I = 0;
I != NumOps; ++
I) {
4321 if (!
SrcOp.getValueType().isVector()) {
4331 DAG.
getNode(
Op.getOpcode(), dl, LoVT, LoOps),
4332 DAG.
getNode(
Op.getOpcode(), dl, HiVT, HiOps));
4341 [[maybe_unused]]
EVT VT =
Op.getValueType();
4342 assert((
Op.getOperand(0).getValueType().is256BitVector() ||
4343 Op.getOperand(0).getValueType().is512BitVector()) &&
4345 assert(
Op.getOperand(0).getValueType().getVectorNumElements() ==
4356 [[maybe_unused]]
EVT VT =
Op.getValueType();
4357 assert(
Op.getOperand(0).getValueType() == VT &&
4358 Op.getOperand(1).getValueType() == VT &&
"Unexpected VTs!");
4370template <
typename F>
4373 F Builder,
bool CheckBWI =
true) {
4374 assert(Subtarget.
hasSSE2() &&
"Target assumed to support at least SSE2");
4375 unsigned NumSubs = 1;
4382 }
else if (Subtarget.
hasAVX2()) {
4395 return Builder(DAG,
DL, Ops);
4398 for (
unsigned i = 0; i != NumSubs; ++i) {
4401 EVT OpVT =
Op.getValueType();
4425 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4432 APInt SplatValue, SplatUndef;
4433 unsigned SplatBitSize;
4435 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4436 HasAnyUndefs, OpEltSizeInBits) &&
4437 !HasAnyUndefs && SplatValue.
getBitWidth() == OpEltSizeInBits)
4452 MVT OpVT =
Op.getSimpleValueType();
4456 assert(OpVT == VT &&
"Vector type mismatch");
4458 if (
SDValue BroadcastOp = MakeBroadcastOp(
Op, OpVT, DstVT)) {
4484 unsigned IdxVal =
Op.getConstantOperandVal(2);
4490 if (IdxVal == 0 && Vec.
isUndef())
4493 MVT OpVT =
Op.getSimpleValueType();
4512 assert(IdxVal + SubVecNumElems <= NumElems &&
4514 "Unexpected index value in INSERT_SUBVECTOR");
4534 Undef, SubVec, ZeroIdx);
4537 assert(IdxVal != 0 &&
"Unexpected index");
4544 assert(IdxVal != 0 &&
"Unexpected index");
4547 [](
SDValue V) { return V.isUndef(); })) {
4552 unsigned ShiftLeft = NumElems - SubVecNumElems;
4553 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4556 if (ShiftRight != 0)
4564 if (IdxVal + SubVecNumElems == NumElems) {
4567 if (SubVecNumElems * 2 == NumElems) {
4577 Undef, Vec, ZeroIdx);
4594 unsigned ShiftLeft = NumElems - SubVecNumElems;
4595 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4598 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4621 unsigned LowShift = NumElems - IdxVal;
4628 unsigned HighShift = IdxVal + SubVecNumElems;
4659 "Expected a 128/256/512-bit vector type");
4667 EVT InVT = In.getValueType();
4671 "Unknown extension opcode");
4677 "Expected VTs to be the same size!");
4681 InVT = In.getValueType();
4699 bool Lo,
bool Unary) {
4701 "Illegal vector type to unpack");
4702 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
4705 for (
int i = 0; i < NumElts; ++i) {
4706 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4707 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4708 Pos += (Unary ? 0 : NumElts * (i % 2));
4709 Pos += (
Lo ? 0 : NumEltsInLane / 2);
4710 Mask.push_back(Pos);
4720 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
4722 for (
int i = 0; i < NumElts; ++i) {
4724 Pos += (
Lo ? 0 : NumElts / 2);
4725 Mask.push_back(Pos);
4735 for (
int I = 0, NumElts = Mask.size();
I != NumElts; ++
I) {
4739 SDValue V = (M < NumElts) ? V1 : V2;
4742 Ops[
I] = V.getOperand(M % NumElts);
4771 bool PackHiHalf =
false) {
4772 MVT OpVT =
LHS.getSimpleValueType();
4774 bool UsePackUS = Subtarget.
hasSSE41() || EltSizeInBits == 8;
4775 assert(OpVT ==
RHS.getSimpleValueType() &&
4778 "Unexpected PACK operand types");
4779 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4780 "Unexpected PACK result type");
4783 if (EltSizeInBits == 32) {
4785 int Offset = PackHiHalf ? 1 : 0;
4787 for (
int I = 0;
I != NumElts;
I += 4) {
4840 MVT VT = V2.getSimpleValueType();
4845 for (
int i = 0; i != NumElems; ++i)
4847 MaskVec[i] = (i ==
Idx) ? NumElems : i;
4855 return dyn_cast<ConstantPoolSDNode>(
Ptr);
4879 assert(LD &&
"Unexpected null LoadSDNode");
4887 bool AllowWholeUndefs =
true,
4888 bool AllowPartialUndefs =
false) {
4889 assert(EltBits.
empty() &&
"Expected an empty EltBits vector");
4893 EVT VT =
Op.getValueType();
4895 assert((SizeInBits % EltSizeInBits) == 0 &&
"Can't split constant!");
4896 unsigned NumElts = SizeInBits / EltSizeInBits;
4901 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4902 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4903 "Constant bit sizes don't match");
4906 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4911 if (NumSrcElts == NumElts) {
4912 UndefElts = UndefSrcElts;
4913 EltBits.
assign(SrcEltBits.begin(), SrcEltBits.end());
4918 APInt UndefBits(SizeInBits, 0);
4919 APInt MaskBits(SizeInBits, 0);
4921 for (
unsigned i = 0; i != NumSrcElts; ++i) {
4922 unsigned BitOffset = i * SrcEltSizeInBits;
4923 if (UndefSrcElts[i])
4924 UndefBits.
setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4925 MaskBits.
insertBits(SrcEltBits[i], BitOffset);
4929 UndefElts =
APInt(NumElts, 0);
4932 for (
unsigned i = 0; i != NumElts; ++i) {
4933 unsigned BitOffset = i * EltSizeInBits;
4938 if (!AllowWholeUndefs)
4946 if (UndefEltBits.
getBoolValue() && !AllowPartialUndefs)
4949 EltBits[i] = MaskBits.
extractBits(EltSizeInBits, BitOffset);
4956 unsigned UndefBitIndex) {
4959 if (isa<UndefValue>(Cst)) {
4960 Undefs.
setBit(UndefBitIndex);
4963 if (
auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4964 Mask = CInt->getValue();
4967 if (
auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4968 Mask = CFP->getValueAPF().bitcastToAPInt();
4971 if (
auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4972 Type *Ty = CDS->getType();
4974 Type *EltTy = CDS->getElementType();
4978 if (!IsInteger && !IsFP)
4981 for (
unsigned I = 0, E = CDS->getNumElements();
I != E; ++
I)
4983 Mask.insertBits(CDS->getElementAsAPInt(
I),
I * EltBits);
4985 Mask.insertBits(CDS->getElementAsAPFloat(
I).bitcastToAPInt(),
4996 return CastBitData(UndefSrcElts, SrcEltBits);
5000 if (
auto *Cst = dyn_cast<ConstantSDNode>(
Op)) {
5003 return CastBitData(UndefSrcElts, SrcEltBits);
5005 if (
auto *Cst = dyn_cast<ConstantFPSDNode>(
Op)) {
5007 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5009 return CastBitData(UndefSrcElts, SrcEltBits);
5013 if (
auto *BV = dyn_cast<BuildVectorSDNode>(
Op)) {
5017 if (BV->getConstantRawBits(
true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5019 for (
unsigned I = 0, E = SrcEltBits.
size();
I != E; ++
I)
5022 return CastBitData(UndefSrcElts, SrcEltBits);
5030 if (!CstTy->
isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5034 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5035 if ((SizeInBits % SrcEltSizeInBits) != 0)
5038 APInt UndefSrcElts(NumSrcElts, 0);
5040 for (
unsigned i = 0; i != NumSrcElts; ++i)
5045 return CastBitData(UndefSrcElts, SrcEltBits);
5051 auto *MemIntr = cast<MemIntrinsicSDNode>(
Op);
5058 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5060 APInt UndefSrcElts(NumSrcElts, 0);
5062 if (CollectConstantBits(
C, SrcEltBits[0], UndefSrcElts, 0)) {
5063 if (UndefSrcElts[0])
5064 UndefSrcElts.
setBits(0, NumSrcElts);
5065 if (SrcEltBits[0].
getBitWidth() != SrcEltSizeInBits)
5066 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5067 SrcEltBits.
append(NumSrcElts - 1, SrcEltBits[0]);
5068 return CastBitData(UndefSrcElts, SrcEltBits);
5075 auto *MemIntr = cast<MemIntrinsicSDNode>(
Op);
5082 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5083 if (!CstTy->
isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5084 (SizeInBits % SubVecSizeInBits) != 0)
5087 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5088 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5089 APInt UndefSubElts(NumSubElts, 0);
5091 APInt(CstEltSizeInBits, 0));
5092 for (
unsigned i = 0; i != NumSubElts; ++i) {
5096 for (
unsigned j = 1; j != NumSubVecs; ++j)
5097 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5101 return CastBitData(UndefSubElts, SubEltBits);
5108 isa<ConstantSDNode>(
Op.getOperand(0).getOperand(0))) {
5110 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5112 APInt UndefSrcElts(NumSrcElts, 0);
5114 const APInt &
C =
Op.getOperand(0).getConstantOperandAPInt(0);
5115 SrcEltBits.
push_back(
C.zextOrTrunc(SrcEltSizeInBits));
5116 SrcEltBits.
append(NumSrcElts - 1,
APInt(SrcEltSizeInBits, 0));
5117 return CastBitData(UndefSrcElts, SrcEltBits);
5125 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5127 APInt UndefSrcElts, UndefSubElts;
5130 UndefSubElts, EltSubBits,
5131 AllowWholeUndefs && AllowUndefs,
5132 AllowPartialUndefs && AllowUndefs) &&
5134 UndefSrcElts, EltSrcBits,
5135 AllowWholeUndefs && AllowUndefs,
5136 AllowPartialUndefs && AllowUndefs)) {
5137 unsigned BaseIdx =
Op.getConstantOperandVal(2);
5138 UndefSrcElts.
insertBits(UndefSubElts, BaseIdx);
5139 for (
unsigned i = 0, e = EltSubBits.
size(); i != e; ++i)
5140 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5141 return CastBitData(UndefSrcElts, EltSrcBits);
5152 UndefElts, EltBits, AllowWholeUndefs,
5153 AllowPartialUndefs)) {
5154 EVT SrcVT =
Op.getOperand(0).getValueType();
5157 unsigned BaseIdx =
Op.getConstantOperandVal(1);
5158 UndefElts = UndefElts.
extractBits(NumSubElts, BaseIdx);
5159 if ((BaseIdx + NumSubElts) != NumSrcElts)
5160 EltBits.
erase(EltBits.
begin() + BaseIdx + NumSubElts, EltBits.
end());
5168 if (
auto *SVN = dyn_cast<ShuffleVectorSDNode>(
Op)) {
5174 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5178 APInt UndefElts0, UndefElts1;
5182 UndefElts0, EltBits0, AllowWholeUndefs,
5183 AllowPartialUndefs))
5187 UndefElts1, EltBits1, AllowWholeUndefs,
5188 AllowPartialUndefs))
5192 for (
int i = 0; i != (int)NumElts; ++i) {
5197 }
else if (M < (
int)NumElts) {
5202 if (UndefElts1[M - NumElts])
5204 EltBits.
push_back(EltBits1[M - NumElts]);
5219 Op,
Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5220 true, AllowPartialUndefs)) {
5221 int SplatIndex = -1;
5222 for (
int i = 0, e = EltBits.
size(); i != e; ++i) {
5225 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5231 if (0 <= SplatIndex) {
5232 SplatVal = EltBits[SplatIndex];
5243 unsigned MaskEltSizeInBits,
5254 for (
const APInt &Elt : EltBits)
5269 bool IsPow2OrUndef =
true;
5270 for (
unsigned I = 0, E = EltBits.
size();
I != E; ++
I)
5271 IsPow2OrUndef &= UndefElts[
I] || EltBits[
I].isPowerOf2();
5272 return IsPow2OrUndef;
5279 EVT VT = V.getValueType();
5285 return V.getOperand(0);
5289 (
isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5291 Not = DAG.
getBitcast(V.getOperand(0).getValueType(), Not);
5301 V.getOperand(0).hasOneUse()) {
5305 V.getScalarValueSizeInBits(), UndefElts,
5309 bool MinSigned =
false;
5310 for (
APInt &Elt : EltBits) {
5311 MinSigned |= Elt.isMinSignedValue();
5316 MVT VT = V.getSimpleValueType();
5326 for (
SDValue &CatOp : CatOps) {
5330 CatOp = DAG.
getBitcast(CatOp.getValueType(), NotCat);
5337 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5352 bool Unary,
unsigned NumStages = 1) {
5353 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
5357 unsigned Offset = Unary ? 0 : NumElts;
5358 unsigned Repetitions = 1u << (NumStages - 1);
5359 unsigned Increment = 1u << NumStages;
5360 assert((NumEltsPerLane >> NumStages) > 0 &&
"Illegal packing compaction");
5362 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5363 for (
unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5364 for (
unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5365 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5366 for (
unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5367 Mask.push_back(Elt + (Lane * NumEltsPerLane) +
Offset);
5377 int NumInnerElts = NumElts / 2;
5378 int NumEltsPerLane = NumElts / NumLanes;
5379 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5385 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
5386 for (
int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5387 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5388 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5389 if (DemandedElts[OuterIdx])
5390 DemandedLHS.
setBit(InnerIdx);
5391 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5392 DemandedRHS.
setBit(InnerIdx);
5401 DemandedLHS, DemandedRHS);
5402 DemandedLHS |= DemandedLHS << 1;
5403 DemandedRHS |= DemandedRHS << 1;
5419 MVT VT =
N.getSimpleValueType();
5426 assert(Mask.empty() &&
"getTargetShuffleMask expects an empty Mask vector");
5427 assert(Ops.
empty() &&
"getTargetShuffleMask expects an empty Ops vector");
5430 bool IsFakeUnary =
false;
5431 switch (
N.getOpcode()) {
5433 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5434 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5435 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5437 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5440 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5441 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5442 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5444 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5447 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5448 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5449 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5451 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5454 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5455 if (isa<ConstantSDNode>(
N.getOperand(1)) &&
5456 isa<ConstantSDNode>(
N.getOperand(2))) {
5457 int BitLen =
N.getConstantOperandVal(1);
5458 int BitIdx =
N.getConstantOperandVal(2);
5464 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5465 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5466 if (isa<ConstantSDNode>(
N.getOperand(2)) &&
5467 isa<ConstantSDNode>(
N.getOperand(3))) {
5468 int BitLen =
N.getConstantOperandVal(2);
5469 int BitIdx =
N.getConstantOperandVal(3);
5471 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5475 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5476 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5478 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5481 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5482 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5484 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5487 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5488 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5490 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5493 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5494 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5496 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5500 "Only 32-bit and 64-bit elements are supported!");
5501 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5502 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5503 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5505 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5511 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5512 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5513 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5515 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5521 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5522 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5528 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5529 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5535 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5536 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5541 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5542 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5547 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5548 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5553 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5561 if (
N.getOperand(0).getValueType() == VT) {
5568 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5570 SDValue MaskNode =
N.getOperand(1);
5580 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5581 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5583 SDValue MaskNode =
N.getOperand(1);
5591 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5592 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5599 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5600 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5604 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5605 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5606 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5608 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5611 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5612 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5613 ImmN =
N.getConstantOperandVal(
N.getNumOperands() - 1);
5615 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5618 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5623 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5628 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5633 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5634 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5635 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5636 SDValue MaskNode =
N.getOperand(2);
5637 SDValue CtrlNode =
N.getOperand(3);
5638 if (
ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5639 unsigned CtrlImm = CtrlOp->getZExtValue();
5650 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5651 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5652 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(1);
5653 SDValue MaskNode =
N.getOperand(2);
5661 assert(
N.getOperand(1).getValueType() == VT &&
"Unexpected value type");
5665 SDValue MaskNode =
N.getOperand(0);
5674 assert(
N.getOperand(0).getValueType() == VT &&
"Unexpected value type");
5675 assert(
N.getOperand(2).getValueType() == VT &&
"Unexpected value type");
5676 IsUnary = IsFakeUnary =
N.getOperand(0) ==
N.getOperand(2);
5680 SDValue MaskNode =
N.getOperand(1);
5697 if (!AllowSentinelZero &&
isAnyZero(Mask))
5705 if (M >= (
int)Mask.size())
5712 if (!IsUnary || IsFakeUnary)
5738 int Size = Mask.size();
5748 int ScalarSizeInBits = VectorSizeInBits /
Size;
5749 assert(!(VectorSizeInBits % ScalarSizeInBits) &&
"Illegal shuffle mask size");
5751 for (
int i = 0; i <
Size; ++i) {
5758 if ((M >= 0 && M <
Size && V1IsZero) || (M >=
Size && V2IsZero)) {
5773 if ((
Size % V.getNumOperands()) == 0) {
5774 int Scale =
Size / V->getNumOperands();
5781 APInt Val = Cst->getAPIntValue();
5782 Val = Val.
extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5786 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5787 Val = Val.
extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5796 if ((V.getNumOperands() %
Size) == 0) {
5797 int Scale = V->getNumOperands() /
Size;
5798 bool AllUndef =
true;
5799 bool AllZero =
true;
5800 for (
int j = 0; j < Scale; ++j) {
5801 SDValue Op = V.getOperand((M * Scale) + j);
5802 AllUndef &=
Op.isUndef();
5825 MVT VT =
N.getSimpleValueType();
5829 int Size = Mask.size();
5831 SDValue V2 = IsUnary ? V1 : Ops[1];
5838 "Illegal split of shuffle value type");
5842 APInt UndefSrcElts[2];
5844 bool IsSrcConstant[2] = {
5846 SrcEltBits[0],
true,
5849 SrcEltBits[1],
true,
5852 for (
int i = 0; i <
Size; ++i) {
5866 unsigned SrcIdx = M /
Size;
5881 (
Size % V.getValueType().getVectorNumElements()) == 0) {
5882 int Scale =
Size / V.getValueType().getVectorNumElements();
5883 int Idx = M / Scale;
5894 SDValue Vec = V.getOperand(0);
5897 int Idx = V.getConstantOperandVal(2);
5898 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5899 if (M <
Idx || (
Idx + NumSubElts) <= M)
5906 if (IsSrcConstant[SrcIdx]) {
5907 if (UndefSrcElts[SrcIdx][M])
5909 else if (SrcEltBits[SrcIdx][M] == 0)
5915 "Different mask size from vector size!");
5921 const APInt &KnownUndef,
5922 const APInt &KnownZero,
5923 bool ResolveKnownZeros=
true) {
5924 unsigned NumElts = Mask.size();
5926 KnownZero.
getBitWidth() == NumElts &&
"Shuffle mask size mismatch");
5928 for (
unsigned i = 0; i != NumElts; ++i) {
5931 else if (ResolveKnownZeros && KnownZero[i])
5940 unsigned NumElts = Mask.size();
5943 for (
unsigned i = 0; i != NumElts; ++i) {
5955 EVT CondVT =
Cond.getValueType();
5968 for (
int i = 0; i != (int)NumElts; ++i) {
5973 if (UndefElts[i] || (!IsBLENDV && EltBits[i].
isZero()) ||
5974 (IsBLENDV && EltBits[i].isNonNegative()))
5986 bool ResolveKnownElts);
5996 bool ResolveKnownElts) {
6000 MVT VT =
N.getSimpleValueType();
6004 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6007 unsigned NumSizeInBytes = NumSizeInBits / 8;
6008 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6010 unsigned Opcode =
N.getOpcode();
6014 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(
N)->getMask();
6016 Mask.append(ShuffleMask.
begin(), ShuffleMask.
end());
6031 uint64_t ZeroMask = IsAndN ? 255 : 0;
6038 assert(UndefElts.
isZero() &&
"Unexpected UNDEF element in AND/ANDNP mask");
6039 for (
int i = 0, e = (
int)EltBits.
size(); i != e; ++i) {
6040 const APInt &ByteBits = EltBits[i];
6041 if (ByteBits != 0 && ByteBits != 255)
6066 size_t MaskSize = std::max(SrcMask0.
size(), SrcMask1.
size());
6070 for (
int i = 0; i != (int)MaskSize; ++i) {
6080 Mask.push_back(i + MaskSize);
6093 if (!
N->isOnlyUserOf(Sub.
getNode()))
6096 uint64_t InsertIdx =
N.getConstantOperandVal(2);
6103 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6104 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6105 "Subvector valuetype mismatch");
6106 InsertIdx *= (MaxElts / NumElts);
6107 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6108 NumSubElts *= (MaxElts / NumElts);
6109 bool SrcIsUndef = Src.isUndef();
6110 for (
int i = 0; i != (int)MaxElts; ++i)
6112 for (
int i = 0; i != (int)NumSubElts; ++i)
6113 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6122 if (
Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6123 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6125 Src.getOperand(0).isUndef() &&
6126 Src.getOperand(1).getValueType() == SubVT &&
6127 Src.getConstantOperandVal(2) == 0) {
6128 for (
int i = 0; i != (int)NumSubElts; ++i)
6130 for (
int i = 0; i != (int)NumSubElts; ++i)
6131 Mask.push_back(i + NumElts);
6146 Depth + 1, ResolveKnownElts))
6156 if (SubMask.
size() != NumSubElts) {
6157 assert(((SubMask.
size() % NumSubElts) == 0 ||
6158 (NumSubElts % SubMask.
size()) == 0) &&
"Illegal submask scale");
6159 if ((NumSubElts % SubMask.
size()) == 0) {
6160 int Scale = NumSubElts / SubMask.
size();
6163 SubMask = ScaledSubMask;
6165 int Scale = SubMask.
size() / NumSubElts;
6166 NumSubElts = SubMask.
size();
6176 for (
int i = 0; i != (int)NumElts; ++i)
6178 for (
int i = 0; i != (int)NumSubElts; ++i) {
6181 int InputIdx = M / NumSubElts;
6182 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6184 Mask[i + InsertIdx] = M;
6196 unsigned DstIdx = 0;
6199 if (!isa<ConstantSDNode>(
N.getOperand(2)) ||
6200 N.getConstantOperandAPInt(2).uge(NumElts))
6202 DstIdx =
N.getConstantOperandVal(2);
6207 for (
unsigned i = 0; i != NumElts; ++i)
6227 if ((MinBitsPerElt % 8) != 0)
6238 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.
getOperand(1)))
6247 unsigned DstByte = DstIdx * NumBytesPerElt;
6258 for (
int i = 0; i != (int)NumSizeInBytes; ++i)
6259 Mask.push_back(NumSizeInBytes + i);
6262 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6263 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6264 for (
unsigned i = 0; i != MinBytesPerElts; ++i)
6265 Mask[DstByte + i] = SrcByte + i;
6266 for (
unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6276 "Unexpected input value type");
6278 APInt EltsLHS, EltsRHS;
6283 bool Offset0 =
false, Offset1 =
false;
6312 bool IsUnary = (N0 == N1);
6320 if (Offset0 || Offset1) {
6322 if ((Offset0 &&
isInRange(M, 0, NumElts)) ||
6323 (Offset1 &&
isInRange(M, NumElts, 2 * NumElts)))
6340 EVT SrcVT = Src.getValueType();
6347 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6348 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 &&
"Illegal truncation");
6349 for (
unsigned i = 0; i != NumSrcElts; ++i)
6350 Mask.push_back(i * Scale);
6359 if (!Amt || (*Amt % 8) != 0)
6369 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6370 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6371 Mask[i + j] = i + j - ByteShift;
6373 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6374 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6375 Mask[i + j - ByteShift] = i + j;
6381 uint64_t ShiftVal =
N.getConstantOperandVal(1);
6383 if (NumBitsPerElt <= ShiftVal) {
6389 if ((ShiftVal % 8) != 0)
6399 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6400 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6401 Mask[i + j] = i + j - ByteShift;
6403 for (
unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6404 for (
unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6405 Mask[i + j - ByteShift] = i + j;
6412 uint64_t RotateVal =
N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6413 if ((RotateVal % 8) != 0)
6416 int Offset = RotateVal / 8;
6418 for (
int i = 0; i != (int)NumElts; ++i) {
6419 int BaseIdx = i * NumBytesPerElt;
6420 for (
int j = 0; j != (int)NumBytesPerElt; ++j) {
6421 Mask.push_back(BaseIdx + ((
Offset + j) % NumBytesPerElt));
6428 if (!Src.getSimpleValueType().isVector()) {
6431 Src.getOperand(0).getValueType().getScalarType() !=
6434 Src = Src.getOperand(0);
6437 Mask.append(NumElts, 0);
6442 EVT SrcVT = Src.getValueType();
6447 (NumBitsPerSrcElt % 8) != 0)
6451 APInt DemandedSrcElts =
6456 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 &&
"Unexpected extension");
6457 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6458 for (
unsigned I = 0;
I != NumElts; ++
I)
6459 Mask.append(Scale,
I);
6468 EVT SrcVT = Src.getValueType();
6490 int MaskWidth = Mask.size();
6492 for (
int i = 0, e = Inputs.
size(); i < e; ++i) {
6493 int lo = UsedInputs.
size() * MaskWidth;
6494 int hi = lo + MaskWidth;
6499 if ((lo <= M) && (M < hi))
6503 if (
none_of(Mask, [lo, hi](
int i) {
return (lo <= i) && (i < hi); })) {
6511 bool IsRepeat =
false;
6512 for (
int j = 0, ue = UsedInputs.
size(); j != ue; ++j) {
6513 if (UsedInputs[j] != Inputs[i])
6517 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6526 Inputs = UsedInputs;
6537 bool ResolveKnownElts) {
6541 EVT VT =
Op.getValueType();
6546 if (ResolveKnownElts)
6551 ResolveKnownElts)) {
6562 bool ResolveKnownElts) {
6563 APInt KnownUndef, KnownZero;
6565 KnownZero, DAG,
Depth, ResolveKnownElts);
6571 bool ResolveKnownElts =
true) {
6572 EVT VT =
Op.getValueType();
6576 unsigned NumElts =
Op.getValueType().getVectorNumElements();
6588 "Unknown broadcast load type");
6599 Opcode,
DL, Tys, Ops, MemVT,
6613 EVT VT =
Op.getValueType();
6614 unsigned Opcode =
Op.getOpcode();
6618 if (
auto *SV = dyn_cast<ShuffleVectorSDNode>(
Op)) {
6619 int Elt = SV->getMaskElt(Index);
6624 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6638 int Elt = ShuffleMask[Index];
6645 assert(0 <= Elt && Elt < (2 * NumElems) &&
"Shuffle index out of range");
6654 uint64_t SubIdx =
Op.getConstantOperandVal(2);
6657 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6664 EVT SubVT =
Op.getOperand(0).getValueType();
6666 uint64_t SubIdx = Index / NumSubElts;
6667 uint64_t SubElt = Index % NumSubElts;
6674 uint64_t SrcIdx =
Op.getConstantOperandVal(1);
6681 EVT SrcVT = Src.getValueType();
6692 isa<ConstantSDNode>(
Op.getOperand(2))) {
6693 if (
Op.getConstantOperandAPInt(2) == Index)
6694 return Op.getOperand(1);
6699 return (Index == 0) ?
Op.getOperand(0)
6703 return Op.getOperand(Index);
6710 const APInt &NonZeroMask,
6711 unsigned NumNonZero,
unsigned NumZero,
6714 MVT VT =
Op.getSimpleValueType();
6717 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.
hasSSE41())) &&
6718 "Illegal vector insertion");
6723 for (
unsigned i = 0; i < NumElts; ++i) {
6724 bool IsNonZero = NonZeroMask[i];
6733 if (NumZero || 0 != i)
6736 assert(0 == i &&
"Expected insertion into zero-index");
6752 const APInt &NonZeroMask,
6753 unsigned NumNonZero,
unsigned NumZero,
6756 if (NumNonZero > 8 && !Subtarget.
hasSSE41())
6770 for (
unsigned I = 0;
I != 4; ++
I) {
6771 if (!NonZeroMask[
I])
6779 assert(V &&
"Failed to fold v16i8 vector to zero");
6784 for (
unsigned i = V ? 4 : 0; i < 16; i += 2) {
6785 bool ThisIsNonZero = NonZeroMask[i];
6786 bool NextIsNonZero = NonZeroMask[i + 1];
6787 if (!ThisIsNonZero && !NextIsNonZero)
6791 if (ThisIsNonZero) {
6792 if (NumZero || NextIsNonZero)
6798 if (NextIsNonZero) {
6800 if (i == 0 && NumZero)
6816 if (i != 0 || NumZero)
6834 const APInt &NonZeroMask,
6835 unsigned NumNonZero,
unsigned NumZero,
6838 if (NumNonZero > 4 && !Subtarget.
hasSSE41())
6854 if (Subtarget.
hasSSE3() && !Subtarget.hasXOP() &&
6855 Op.getOperand(0) ==
Op.getOperand(2) &&
6856 Op.getOperand(1) ==
Op.getOperand(3) &&
6857 Op.getOperand(0) !=
Op.getOperand(1)) {
6858 MVT VT =
Op.getSimpleValueType();
6862 SDValue Ops[4] = {
Op.getOperand(0),
Op.getOperand(1),
6870 std::bitset<4> Zeroable, Undefs;
6871 for (
int i = 0; i < 4; ++i) {
6876 assert(Zeroable.size() - Zeroable.count() > 1 &&
6877 "We expect at least two non-zero elements!");
6882 unsigned FirstNonZeroIdx;
6883 for (
unsigned i = 0; i < 4; ++i) {
6894 if (!FirstNonZero.
getNode()) {
6896 FirstNonZeroIdx = i;
6900 assert(FirstNonZero.
getNode() &&
"Unexpected build vector of all zeros!");
6906 unsigned EltMaskIdx, EltIdx;
6908 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6909 if (Zeroable[EltIdx]) {
6911 Mask[EltIdx] = EltIdx+4;
6915 Elt =
Op->getOperand(EltIdx);
6918 if (Elt.
getOperand(0) != V1 || EltMaskIdx != EltIdx)
6920 Mask[EltIdx] = EltIdx;
6925 SDValue VZeroOrUndef = (Zeroable == Undefs)
6938 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6941 bool CanFold =
true;
6942 for (
unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6956 assert(V1.
getNode() &&
"Expected at least two non-zero elements!");
6959 if (V2.getSimpleValueType() != MVT::v4f32)
6963 unsigned ZMask = Zeroable.to_ulong();
6965 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6966 assert((InsertPSMask & ~0xFFu) == 0 &&
"Invalid mask!");
6977 MVT ShVT = MVT::v16i8;
6980 assert(NumBits % 8 == 0 &&
"Only support byte sized shifts");
6995 EVT PVT = LD->getValueType(0);
6996 if (PVT != MVT::i32 && PVT != MVT::f32)
7002 FI = FINode->getIndex();
7005 isa<FrameIndexSDNode>(
Ptr.getOperand(0))) {
7006 FI = cast<FrameIndexSDNode>(
Ptr.getOperand(0))->getIndex();
7016 SDValue Chain = LD->getChain();
7020 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7037 int64_t StartOffset =
Offset & ~int64_t(RequiredAlign.
value() - 1);
7044 int EltNo = (
Offset - StartOffset) >> 2;
7049 LD->getPointerInfo().getWithOffset(StartOffset));
7062 auto *BaseLd = cast<LoadSDNode>(Elt);
7063 if (!BaseLd->isSimple())
7076 if (
auto *AmtC = dyn_cast<ConstantSDNode>(Elt.
getOperand(1))) {
7077 uint64_t Amt = AmtC->getZExtValue();
7079 ByteOffset += Amt / 8;
7085 if (
auto *IdxC = dyn_cast<ConstantSDNode>(Elt.
getOperand(1))) {
7087 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7089 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7092 ByteOffset +=
Idx * (SrcSizeInBits / 8);
                                        bool IsAfterLegalize) {
  unsigned NumElems = Elts.size();
  int LastLoadedElt = -1;
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
         "Incomplete element masks");
  if (UndefMask.popcount() == NumElems)
         "Register/Memory size mismatch");
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
  if (ByteOffsets[FirstLoadedElt] != 0)
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
                                  EltIdx - FirstLoadedElt);
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (!CheckConsecutiveLoad(LDBase, i)) {
      IsConsecutiveLoad = false;
      IsConsecutiveLoadWithZeros = false;
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
         "Cannot merge volatile or atomic loads.");
  for (auto *LD : Loads)
  if (FirstLoadedElt == 0 &&
      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
    return CreateLoad(VT, LDBase);
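  // At this point the elements have been proven to read consecutive offsets
  // from the base load LDBase, so the whole build vector can be replaced by
  // one wide load (or simply bitcast when a single element's load already
  // spans the full vector width).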
  if (!IsAfterLegalize && VT.isVector()) {
    if ((NumMaskElts % NumElems) == 0) {
      unsigned Scale = NumMaskElts / NumElems;
      for (unsigned i = 0; i < NumElems; ++i) {
        int Offset = ZeroMask[i] ? NumMaskElts : 0;
        for (unsigned j = 0; j != Scale; ++j)
          ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
      SDValue V = CreateLoad(VT, LDBase);
  unsigned HalfNumElems = NumElems / 2;
                                  DAG, Subtarget, IsAfterLegalize);
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
       LoadSizeInBits == 64) &&
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
    for (auto *LD : Loads)
  for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
    unsigned RepeatSize = SubElems * BaseSizeInBits;
    unsigned ScalarSize = std::min(RepeatSize, 64u);
    if (!Subtarget.hasAVX2() && ScalarSize < 32)
    if (RepeatSize > ScalarSize && SubElems == 1)
    for (unsigned i = 0; i != NumElems && Match; ++i) {
      if (RepeatedLoads[i % SubElems].isUndef())
        RepeatedLoads[i % SubElems] = Elt;
      Match &= (RepeatedLoads[i % SubElems] == Elt);
    Match &= !RepeatedLoads.back().isUndef();
    if (RepeatSize > ScalarSize)
                                 RepeatSize / ScalarSize);
            RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
      SDValue Broadcast = RepeatLoad;
      if (RepeatSize > ScalarSize) {
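  // The loop above looks for a build vector that keeps repeating the same
  // small group of loaded elements; such a pattern can be lowered as a
  // broadcast of one wider load instead of assembling each element.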
                                     bool IsAfterLegalize) {
  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
    if (ScalarSize == 16)
    if (ScalarSize == 32)
    assert(ScalarSize == 64 && "Unsupported floating point scalar size");
  for (unsigned I = 0, E = Bits.size(); I != E; ++I)
                          : getConstantScalar(Bits[I]));
  auto getConstantScalar = [&](const APInt &Val) -> Constant * {
    if (ScalarSize == 16)
    if (ScalarSize == 32)
    assert(ScalarSize == 64 && "Unsupported floating point scalar size");
  if (ScalarSize == SplatBitSize)
    return getConstantScalar(SplatValue);
  unsigned NumElm = SplatBitSize / ScalarSize;
  for (unsigned I = 0; I != NumElm; ++I) {
    ConstantVec.push_back(getConstantScalar(Val));
  for (auto *U : N->users()) {
    unsigned Opc = U->getOpcode();
  if (N->hasOneUse()) {
7503 "Unsupported vector type for broadcast.");
7510 assert((NumElts % Sequence.size()) == 0 &&
"Sequence doesn't fit.");
7511 if (Sequence.size() == 1)
7521 if (!Sequence.empty() && Subtarget.hasCDI()) {
7523 unsigned SeqLen = Sequence.size();
7524 bool UpperZeroOrUndef =
7529 if (UpperZeroOrUndef && ((Op0.getOpcode() ==
ISD::BITCAST) ||
7534 : Op0.getOperand(0).getOperand(0);
7537 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||
7538 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) {
7552 unsigned NumUndefElts = UndefElements.
count();
7553 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7554 APInt SplatValue, Undef;
7555 unsigned SplatBitSize;
7558 if (BVOp->
isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7568 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7569 (SplatBitSize < 32 && Subtarget.
hasAVX2())) {
7576 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7586 if (SplatBitSize > 64) {
7592 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7598 Ops, VVT, MPI, Alignment,
7608 if (!Ld || NumElts - NumUndefElts != 1)
7611 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7615 bool ConstSplatVal =
7643 if (ConstSplatVal && (Subtarget.
hasAVX2() || OptForSize)) {
7651 if (ScalarSize == 32 ||
7652 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7653 (CVT == MVT::f16 && Subtarget.
hasAVX2()) ||
7654 (OptForSize && (ScalarSize == 64 || Subtarget.
hasAVX2()))) {
7657 C = CI->getConstantIntValue();
7659 C = CF->getConstantFPValue();
7661 assert(
C &&
"Invalid constant type");
7665 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7678 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7689 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7690 (Subtarget.hasVLX() && ScalarSize == 64)) {
7691 auto *LN = cast<LoadSDNode>(Ld);
7693 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7696 LN->getMemoryVT(), LN->getMemOperand());
7704 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7705 auto *LN = cast<LoadSDNode>(Ld);
7707 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7710 LN->getMemoryVT(), LN->getMemOperand());
7715 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
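  // Constant splats are preferably materialized as a broadcast from the
  // constant pool: only the splat scalar (or a wider repeated block when
  // SplatBitSize > 64) is stored in memory and then broadcast, which is
  // smaller than loading the full vector constant.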
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
  ExtractedFromVec = ShuffleVec;
  MVT VT = Op.getSimpleValueType();
  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();
  if (InsertIndices.size() > 1)
    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
    if (!isa<ConstantSDNode>(ExtIdx))
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
    if (ExtractedFromVec == VecIn1)
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  for (unsigned Idx : InsertIndices)
  MVT VT = Op.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
         "Unexpected type in LowerBUILD_VECTORvXi1!");
  bool IsSplat = true;
  bool HasConstElts = false;
  if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
    Immediate |= (InC->getZExtValue() & 0x1) << idx;
    HasConstElts = true;
  else if (In != Op.getOperand(SplatIdx))
  assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
  for (unsigned InsertIdx : NonConstIdx) {
                      Op.getOperand(InsertIdx),
                                  unsigned BaseIdx, unsigned LastIdx,
  EVT VT = N->getValueType(0);
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
         "Invalid Vector in input!");
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    if (Op->isUndef()) {
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
    if (i * 2 < NumElts) {
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    if (I0 == ExpectedVExtractIdx)
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
    ExpectedVExtractIdx += 2;
                                  unsigned X86Opcode, bool Mode,
                                  bool isUndefLO, bool isUndefHI) {
         "Invalid nodes in input!");
  if (!isUndefLO && !V0->isUndef())
    LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
  if (!isUndefHI && !V1->isUndef())
    HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
    LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
    HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
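  // Expansion of a wide horizontal op: the X86 opcode is applied to the two
  // 128-bit halves of each input when Mode is set, or pairwise across the
  // two inputs otherwise, and the LO/HI results are concatenated back into
  // the wide vector (undef halves stay undef).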
                              unsigned &NumExtracts,
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    unsigned Opcode = Op.getOpcode();
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
    Opc[i % 2] = Opcode;
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
                               unsigned ExpectedUses) {
  unsigned NumExtracts;
  return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
    Mask.push_back(I + E + 1);
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      GenericOpcode = Op.getOpcode();
      switch (GenericOpcode) {
      default: return false;
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
      if (j < NumEltsIn64Bits) {
      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      unsigned ExpectedIndex = i * NumEltsIn128Bits +
                               (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
  for (unsigned i = 0; i != NumElts; ++i)
  unsigned HalfNumElts = NumElts / 2;
  return DAG.getNode(HOpcode, DL, VT, V0, V1);
  unsigned NumNonUndefs =
  if (NumNonUndefs < 2)
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
  for (unsigned i = Half, e = NumElts; i != e; ++i)
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    bool CanFold = true;
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
                                isUndefLO, isUndefHI);
  MVT VT = Op->getSimpleValueType(0);
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
  bool IsShift = false;
  if (Op->getSplatValue())
  if (!isa<ConstantSDNode>(RHS))
  if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
         "Illegal variable permute mask size");
                           SDLoc(IndicesVec), SizeInBits);
                           IndicesVT, IndicesVec);
                           Subtarget, DAG, SDLoc(IndicesVec));
    EVT SrcVT = Idx.getValueType();
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
  unsigned Opcode = 0;
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      ShuffleVT = MVT::v16i8;
    if (Subtarget.hasAVX()) {
      ShuffleVT = MVT::v4f32;
      ShuffleVT = MVT::v16i8;
    if (Subtarget.hasAVX()) {
      ShuffleVT = MVT::v2f64;
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
    else if (Subtarget.hasXOP()) {
    } else if (Subtarget.hasAVX()) {
      EVT VT = Idx.getValueType();
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
    else if (Subtarget.hasAVX()) {
      IndicesVec = ScaleIndices(IndicesVec, 2);
          MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
          DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    if (Subtarget.hasAVX2())
    else if (Subtarget.hasAVX()) {
                                      {0, 1, 2, 3, 0, 1, 2, 3});
                                      {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
    } else if (Subtarget.hasAVX()) {
      if (Subtarget.hasXOP())
    if (Subtarget.hasVBMI())
    if (Subtarget.hasBWI())
    if (Subtarget.hasAVX512())
         "Illegal variable permute shuffle type");
  IndicesVec = ScaleIndices(IndicesVec, Scale);
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
      ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
      : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
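  // When a wider-element variable permute is emulated with a byte-indexed
  // shuffle (e.g. PSHUFB), ScaleIndices rewrites every element index into
  // Scale consecutive byte indices: IndexScale multiplies the index and
  // IndexOffset adds the byte position within each element.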
  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
      SrcVec = Op.getOperand(0);
    else if (SrcVec != Op.getOperand(0))
    SDValue ExtractedIndex = Op->getOperand(1);
      ExtractedIndex = ExtractedIndex.getOperand(0);
    else if (IndicesVec != ExtractedIndex.getOperand(0))
    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
    if (!PermIdx || PermIdx->getAPIntValue() != Idx)
  MVT VT = V.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
      (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9008 bool IsAllConstants =
true;
9009 bool OneUseFrozenUndefs =
true;
9011 unsigned NumConstants = NumElems;
9012 for (
unsigned i = 0; i < NumElems; ++i) {
9019 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->
hasOneUse();
9020 FrozenUndefMask.
setBit(i);
9025 IsAllConstants =
false;
9040 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9044 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9052 if (
unsigned NumFrozenUndefElts = FrozenUndefMask.
popcount();
9053 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9056 for (
unsigned i = 0; i < NumElems; ++i) {
9062 if (!FrozenUndefMask[i])
9063 Elts[i] =
Op.getOperand(i);
9065 BlendMask[i] += NumElems;
9080 unsigned UpperElems = NumElems / 2;
9081 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9082 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.
countl_one();
9083 if (NumUpperUndefsOrZeros >= UpperElems) {
9085 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9086 UpperElems = NumElems - (NumElems / 4);
9088 bool UndefUpper = UndefMask.
countl_one() >= UpperElems;
9092 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9099 return HorizontalOp;
9105 unsigned NumZero = ZeroMask.
popcount();
9106 unsigned NumNonZero = NonZeroMask.
popcount();
9114 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9115 FrozenUndefMask.
isZero() &&
9122 Type *EltType =
Op.getValueType().getScalarType().getTypeForEVT(Context);
9126 for (
unsigned i = 0; i != NumElems; ++i) {
9128 if (
auto *
C = dyn_cast<ConstantSDNode>(Elt))
9129 ConstVecOps[i] = ConstantInt::get(Context,
C->getAPIntValue());
9130 else if (
auto *
C = dyn_cast<ConstantFPSDNode>(Elt))
9131 ConstVecOps[i] = ConstantFP::get(Context,
C->getValueAPF());
9134 "Expected one variable element in this vector");
9148 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9154 if (InsertC < NumEltsInLow128Bits)
9160 assert(Subtarget.
hasAVX() &&
"Must have AVX with >16-byte vector");
9163 for (
unsigned i = 0; i != NumElts; ++i)
9164 ShuffleMask.
push_back(i == InsertC ? NumElts : i);
9170 if (NumNonZero == 1) {
9182 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9183 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9184 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9187 "Expected an SSE value type!");
9196 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9206 if (NumElems == 2 &&
Idx == 1 &&
9212 VT,
Op.getOperand(1)),
9213 NumBits/2, DAG, *
this, dl);
9224 if (EVTBits == 32) {
9231 if (Values.
size() == 1) {
9232 if (EVTBits == 32) {
9239 if (
Op.getNode()->isOnlyUserOf(Item.
getNode()))
9264 if (Subtarget.
hasAVX2() && EVTBits == 32 && Values.
size() == 2) {
9265 SDValue Ops[4] = {
Op.getOperand(0),
Op.getOperand(1),
9269 for (
unsigned i = 2; i != NumElems; ++i)
9270 if (Ops[i % 2] !=
Op.getOperand(i))
9274 if (CanSplat(
Op, NumElems, Ops)) {
9296 HVT, dl,
Op->ops().slice(NumElems / 2, NumElems /2));
9303 if (EVTBits == 64) {
9304 if (NumNonZero == 1) {
9308 Op.getOperand(
Idx));
9315 if (EVTBits == 8 && NumElems == 16)
9317 NumZero, DAG, Subtarget))
9320 if (EltVT == MVT::i16 && NumElems == 8)
9322 NumZero, DAG, Subtarget))
9326 if (EVTBits == 32 && NumElems == 4)
9331 if (NumElems == 4 && NumZero > 0) {
9333 for (
unsigned i = 0; i < 4; ++i) {
9334 bool isZero = !NonZeroMask[i];
9341 for (
unsigned i = 0; i < 2; ++i) {
9348 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9351 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9354 Ops[i] =
getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9364 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9365 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9370 assert(Values.
size() > 1 &&
"Expected non-undef and non-splat vector");
9377 if (Subtarget.
hasSSE41() && EltVT != MVT::f16) {
9379 if (!
Op.getOperand(0).isUndef())
9384 for (
unsigned i = 1; i < NumElems; ++i) {
9385 if (
Op.getOperand(i).isUndef())
continue;
9396 for (
unsigned i = 0; i < NumElems; ++i) {
9397 if (!
Op.getOperand(i).isUndef())
9407 for (
unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9410 for(
unsigned i = 0; i != Scale; ++i)
9412 for (
unsigned i = 0; i != Scale; ++i)
9413 Mask.push_back(NumElems+i);
9416 for (
unsigned i = 0, e = NumElems / (2 * Scale); i !=
e; ++i)
9428 MVT ResVT =
Op.getSimpleValueType();
9431 ResVT.
is512BitVector()) &&
"Value type must be 256-/512-bit wide");
9434 unsigned NumFreezeUndef = 0;
9435 unsigned NumZero = 0;
9436 unsigned NumNonZero = 0;
9437 unsigned NonZeros = 0;
9438 for (
unsigned i = 0; i != NumOperands; ++i) {
9452 assert(i <
sizeof(NonZeros) * CHAR_BIT);
9459 if (NumNonZero > 2) {
9463 Ops.
slice(0, NumOperands/2));
9465 Ops.
slice(NumOperands/2));
9474 MVT SubVT =
Op.getOperand(0).getSimpleValueType();
9476 for (
unsigned i = 0; i != NumOperands; ++i) {
9477 if ((NonZeros & (1 << i)) == 0)
9496 MVT ResVT =
Op.getSimpleValueType();
9500 "Unexpected number of operands in CONCAT_VECTORS");
9504 for (
unsigned i = 0; i != NumOperands; ++i) {
9508 assert(i <
sizeof(NonZeros) * CHAR_BIT);
9520 if (
isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9521 Log2_64(NonZeros) != NumOperands - 1) {
9545 if (NumOperands > 2) {
9549 Ops.
slice(0, NumOperands/2));
9551 Ops.
slice(NumOperands/2));
  MVT VT = Op.getSimpleValueType();
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
                                  unsigned ScalarSizeInBits,
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
                                  unsigned ScalarSizeInBits,
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int NumElts = Mask.size();
  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  int NumLanes = NumElts / NumEltsPerLane;
  for (int i = 0; i != NumLanes; ++i) {
    for (int j = 0; j != NumEltsPerLane; ++j) {
      int M = Mask[(i * NumEltsPerLane) + j];
      int Lane = (M % NumElts) / NumEltsPerLane;
      if (SrcLane >= 0 && SrcLane != Lane)
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LocalM;
9698 else if (RepeatedMask[i % LaneSize] != LocalM)
9728 unsigned EltSizeInBits,
9731 int LaneSize = LaneSizeInBits / EltSizeInBits;
9733 int Size = Mask.size();
9734 for (
int i = 0; i <
Size; ++i) {
9744 if ((Mask[i] %
Size) / LaneSize != i / LaneSize)
9750 int LaneM = Mask[i] /
Size;
9751 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9754 RepeatedMask[i % LaneSize] = LocalM;
9755 else if (RepeatedMask[i % LaneSize] != LocalM)
9768 Mask, RepeatedMask);
9774 int Idx,
int ExpectedIdx) {
9775 assert(0 <=
Idx &&
Idx < MaskSize && 0 <= ExpectedIdx &&
9776 ExpectedIdx < MaskSize &&
"Out of range element index");
9777 if (!
Op || !ExpectedOp ||
Op.getOpcode() != ExpectedOp.
getOpcode())
9780 switch (
Op.getOpcode()) {
9792 return (
Op == ExpectedOp &&
9793 (
int)
Op.getValueType().getVectorNumElements() == MaskSize);
9803 if (
Op == ExpectedOp &&
Op.getOperand(0) ==
Op.getOperand(1)) {
9804 MVT VT =
Op.getSimpleValueType();
9806 if (MaskSize == NumElts) {
9808 int NumEltsPerLane = NumElts / NumLanes;
9809 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9811 (
Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9813 (
Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9814 return SameLane && SameElt;
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
         "Illegal target shuffle mask");
                     !V1.getValueType().isVector()))
                     !V2.getValueType().isVector()))
  for (int i = 0; i < Size; ++i) {
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
      int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
  return IsUnpackwdMask;
  for (unsigned i = 0; i != 4; ++i) {
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  unsigned HalfSize = Mask.size() / 2;
  for (unsigned i = 0; i != HalfSize; ++i) {
    if (Mask[i] != Mask[i + HalfSize])
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
  int FirstElt = Mask[FirstIndex];
  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
         "Unexpected SHUFPD mask size");
  assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
         "Unexpected SHUFPD mask elements");
  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
  assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
         "All undef shuffle mask");
  int FirstElt = Mask[FirstIndex];
  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
      count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
    for (unsigned I = 0, E = Mask.size(); I != E; ++I)
      Imm |= FirstElt << I;
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
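  // Immediate encodings built above: the 4-lane (PSHUFD/SHUFPS-style)
  // immediate uses 2 bits per destination element, while the SHUFPD-style
  // immediate uses 1 bit per element; undef mask entries default to an
  // in-place selection since any value is acceptable for them.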
10060 bool &IsZeroSideLeft) {
10061 int NextElement = -1;
10063 for (
int i = 0, e = Mask.size(); i < e; i++) {
10065 assert(Mask[i] >= -1 &&
"Out of bound mask element!");
10071 if (NextElement < 0) {
10072 NextElement = Mask[i] != 0 ?
VectorType.getVectorNumElements() : 0;
10073 IsZeroSideLeft = NextElement != 0;
10076 if (NextElement != Mask[i])
10089 int Size = Mask.size();
10103 for (
int i = 0; i < NumBytes; ++i) {
10104 int M = Mask[i / NumEltBytes];
10106 PSHUFBMask[i] = DAG.
getUNDEF(MVT::i8);
10109 if (Zeroable[i / NumEltBytes]) {
10110 PSHUFBMask[i] = ZeroMask;
10116 if (V && V != SrcV)
10122 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10126 M = M * NumEltBytes + (i % NumEltBytes);
10129 assert(V &&
"Failed to find a source input");
10144 const APInt &Zeroable,
10147 bool IsLeftZeroSide =
true;
10151 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10156 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10157 "Unexpected number of vector elements");
10159 Subtarget, DAG,
DL);
10161 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10166 unsigned &UnpackOpcode,
bool IsUnary,
10172 bool Undef1 =
true, Undef2 =
true, Zero1 =
true, Zero2 =
true;
10173 for (
int i = 0; i != NumElts; i += 2) {
10174 int M1 = TargetMask[i + 0];
10175 int M2 = TargetMask[i + 1];
10181 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10182 "Zeroable shuffle detected");
10188 (IsUnary ? V1 : V2))) {
10190 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
10191 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
10197 (IsUnary ? V1 : V2))) {
10199 V2 = (Undef2 ? DAG.
getUNDEF(VT) : (IsUnary ? V1 : V2));
10200 V1 = (Undef1 ? DAG.
getUNDEF(VT) : V1);
10205 if (IsUnary && (Zero1 || Zero2)) {
10207 if ((Subtarget.
hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10211 bool MatchLo =
true, MatchHi =
true;
10212 for (
int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10213 int M = TargetMask[i];
10216 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10220 MatchLo &= (M == Unpckl[i]);
10221 MatchHi &= (M == Unpckh[i]);
10224 if (MatchLo || MatchHi) {
10288 unsigned UnpackOpcode;
10300 DAG.
getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10302 return DAG.
getNode(UnpackOpcode,
DL, VT, V1, V1);
10313 unsigned NumElts = Mask.size();
10315 unsigned MaxScale = 64 / EltSizeInBits;
10317 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10318 unsigned SrcEltBits = EltSizeInBits * Scale;
10319 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10321 unsigned NumSrcElts = NumElts / Scale;
10324 unsigned UpperElts = NumElts - NumSrcElts;
10330 if ((NumSrcElts * EltSizeInBits) >= 128) {
10348 MVT SrcVT = Src.getSimpleValueType();
10358 if (NumSrcElts == NumDstElts)
10361 if (NumSrcElts > NumDstElts) {
10367 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10384 if (DstVT != TruncVT)
10408 const APInt &Zeroable,
10411 assert((VT == MVT::v16i8 || VT == MVT::v8i16) &&
"Unexpected VTRUNC type");
10417 unsigned MaxScale = 64 / EltSizeInBits;
10418 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10419 unsigned SrcEltBits = EltSizeInBits * Scale;
10420 unsigned NumSrcElts = NumElts / Scale;
10421 unsigned UpperElts = NumElts - NumSrcElts;
10430 Src.getScalarValueSizeInBits() == SrcEltBits) {
10431 Src = Src.getOperand(0);
10432 }
else if (Subtarget.hasVLX()) {
10445 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10458 const APInt &Zeroable,
10462 "Unexpected VTRUNC type");
10468 unsigned MaxScale = 64 / EltSizeInBits;
10469 for (
unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10471 unsigned SrcEltBits = EltSizeInBits * Scale;
10472 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10477 unsigned NumHalfSrcElts = NumElts / Scale;
10478 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10485 unsigned UpperElts = NumElts - NumSrcElts;
10486 if (UpperElts > 0 &&
10497 return Lo.getOperand(0) ==
Hi.getOperand(0);
10500 auto *LDLo = cast<LoadSDNode>(
Lo);
10501 auto *LDHi = cast<LoadSDNode>(
Hi);
10503 LDHi, LDLo,
Lo.getValueType().getStoreSize(), 1);
10561 bool IsSingleInput) {
10564 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10566 "We should only be called with masks with a power-of-2 size!");
10569 int Offset = MatchEven ? 0 : 1;
10574 bool ViableForN[3] = {
true,
true,
true};
10576 for (
int i = 0, e = Mask.size(); i < e; ++i) {
10582 bool IsAnyViable =
false;
10583 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
10584 if (ViableForN[j]) {
10589 IsAnyViable =
true;
10591 ViableForN[j] =
false;
10598 for (
unsigned j = 0; j != std::size(ViableForN); ++j)
10614 unsigned MaxStages = 1) {
10617 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10618 "Illegal maximum compaction");
10621 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10622 unsigned NumPackedBits = NumSrcBits - BitSize;
10626 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10629 if ((!N1.
isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10630 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10632 if (Subtarget.
hasSSE41() || BitSize == 8) {
10645 if ((N1.
isUndef() || IsZero1 || IsAllOnes1 ||
10647 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10659 for (
unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10667 if (MatchPACK(V1, V2, PackVT))
10674 if (MatchPACK(V1, V1, PackVT))
10686 unsigned PackOpcode;
10689 unsigned MaxStages =
Log2_32(64 / EltBits);
10691 Subtarget, MaxStages))
10695 unsigned NumStages =
Log2_32(CurrentEltBits / EltBits);
10698 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10703 unsigned MaxPackBits = 16;
10704 if (CurrentEltBits > 16 &&
10710 for (
unsigned i = 0; i != NumStages; ++i) {
10711 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10712 unsigned NumSrcElts = SizeBits / SrcEltBits;
10720 CurrentEltBits /= 2;
10723 "Failed to lower compaction shuffle");
10733 const APInt &Zeroable,
10740 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10746 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10759 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
10762 if (Mask[i] %
Size != i)
10765 V = Mask[i] <
Size ? V1 : V2;
10766 else if (V != (Mask[i] <
Size ? V1 : V2))
10794 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
10795 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i +
Size)
10811 const APInt &Zeroable,
bool &ForceV1Zero,
10812 bool &ForceV2Zero,
uint64_t &BlendMask) {
10813 bool V1IsZeroOrUndef =
10815 bool V2IsZeroOrUndef =
10819 ForceV1Zero =
false, ForceV2Zero =
false;
10820 assert(Mask.size() <= 64 &&
"Shuffle mask too big for blend mask");
10822 int NumElts = Mask.size();
10824 int NumEltsPerLane = NumElts / NumLanes;
10825 assert((NumLanes * NumEltsPerLane) == NumElts &&
"Value type mismatch");
10829 bool ForceWholeLaneMasks =
10834 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
10836 bool LaneV1InUse =
false;
10837 bool LaneV2InUse =
false;
10839 for (
int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10840 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10844 if (M == Elt || (0 <= M && M < NumElts &&
10847 LaneV1InUse =
true;
10850 if (M == (Elt + NumElts) ||
10853 LaneBlendMask |= 1ull << LaneElt;
10854 Mask[Elt] = Elt + NumElts;
10855 LaneV2InUse =
true;
10858 if (Zeroable[Elt]) {
10859 if (V1IsZeroOrUndef) {
10860 ForceV1Zero =
true;
10862 LaneV1InUse =
true;
10865 if (V2IsZeroOrUndef) {
10866 ForceV2Zero =
true;
10867 LaneBlendMask |= 1ull << LaneElt;
10868 Mask[Elt] = Elt + NumElts;
10869 LaneV2InUse =
true;
10879 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10880 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10882 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
                                  const APInt &Zeroable,
  bool ForceV1Zero = false, ForceV2Zero = false;
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
    uint64_t LoMask = BlendMask & 0xFF;
    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
          MVT::v16i16, DL, Lo, Hi,
          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
    if (Subtarget.hasVLX())
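    // VPBLENDW only has an 8-bit immediate that is reused for both 128-bit
    // halves of a v16i16 blend, which is why the repeated-mask check above
    // is needed; the Lo/HiMask special case covers masks where one half is
    // taken entirely from a single source.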
11010 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
11011 for (
int j = 0; j < Scale; ++j)
11058 bool ImmBlends =
false) {
11064 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i) {
11068 assert(Mask[i] <
Size * 2 &&
"Shuffle input is out of bounds.");
11070 if (BlendMask[Mask[i] %
Size] < 0)
11071 BlendMask[Mask[i] %
Size] = Mask[i];
11072 else if (BlendMask[Mask[i] %
Size] != Mask[i])
11075 PermuteMask[i] = Mask[i] %
Size;
11097 int NumElts = Mask.size();
11099 int NumLaneElts = NumElts / NumLanes;
11100 int NumHalfLaneElts = NumLaneElts / 2;
11102 bool MatchLo =
true, MatchHi =
true;
11106 for (
int Elt = 0; Elt != NumElts; ++Elt) {
11114 if (M < NumElts && (
Op.isUndef() ||
Op == V1))
11116 else if (NumElts <= M && (
Op.isUndef() ||
Op == V2)) {
11122 bool MatchLoAnyLane =
false, MatchHiAnyLane =
false;
11123 for (
int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11124 int Lo = Lane, Mid = Lane + NumHalfLaneElts,
Hi = Lane + NumLaneElts;
11127 if (MatchLoAnyLane || MatchHiAnyLane) {
11128 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11129 "Failed to match UNPCKLO/UNPCKHI");
11133 MatchLo &= MatchLoAnyLane;
11134 MatchHi &= MatchHiAnyLane;
11135 if (!MatchLo && !MatchHi)
11138 assert((MatchLo ^ MatchHi) &&
"Failed to match UNPCKLO/UNPCKHI");
11144 for (
int Elt = 0; Elt != NumElts; ++Elt) {
11151 bool IsFirstOp = M < NumElts;
11153 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11154 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11155 PermuteMask[Elt] = BaseMaskElt;
11156 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11157 PermuteMask[Elt] = BaseMaskElt + 1;
11158 assert(PermuteMask[Elt] != -1 &&
11159 "Input mask element is defined but failed to assign permute mask");
11181 int Size = Mask.size();
11182 assert(Mask.size() >= 2 &&
"Single element masks are invalid.");
11193 bool UnpackLo = NumLoInputs >= NumHiInputs;
11195 auto TryUnpack = [&](
int ScalarSize,
int Scale) {
11199 for (
int i = 0; i <
Size; ++i) {
11204 int UnpackIdx = i / Scale;
11208 if ((UnpackIdx % 2 == 0) != (Mask[i] <
Size))
11214 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 :
Size / 2)] =
11237 UnpackVT, V1, V2));
11243 for (
int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11244 if (
SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11255 if (NumLoInputs == 0 || NumHiInputs == 0) {
11256 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11257 "We have to have *some* inputs!");
11258 int HalfOffset = NumLoInputs == 0 ?
Size / 2 : 0;
11266 for (
int i = 0; i <
Size; ++i) {
11270 assert(Mask[i] %
Size >= HalfOffset &&
"Found input from wrong half!");
11273 2 * ((Mask[i] %
Size) - HalfOffset) + (Mask[i] <
Size ? 0 : 1);
11302 int NumEltsPerLane = NumElts / NumLanes;
11305 bool Blend1 =
true;
11306 bool Blend2 =
true;
11307 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11308 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11309 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11310 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11311 int M = Mask[Lane + Elt];
11315 Blend1 &= (M == (Lane + Elt));
11316 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
11317 M = M % NumEltsPerLane;
11318 Range1.first = std::min(Range1.first, M);
11319 Range1.second = std::max(Range1.second, M);
11322 Blend2 &= (M == (Lane + Elt));
11323 assert(Lane <= M && M < (Lane + NumEltsPerLane) &&
"Out of range mask");
11324 M = M % NumEltsPerLane;
11325 Range2.first = std::min(Range2.first, M);
11326 Range2.second = std::max(Range2.second, M);
11334 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11335 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11349 for (
int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11350 for (
int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11351 int M = Mask[Lane + Elt];
11355 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11357 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11364 if (Range2.second < Range1.first)
11365 return RotateAndPermute(V1, V2, Range1.first, 0);
11366 if (Range1.second < Range2.first)
11367 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11381 size_t NumUndefs = 0;
11382 std::optional<int> UniqueElt;
11383 for (
int Elt : Mask) {
11388 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11394 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11407 int NumElts = Mask.size();
11409 int NumEltsPerLane = NumElts / NumLanes;
11413 bool IsAlternating =
true;
11414 bool V1Zero =
true, V2Zero =
true;
11418 for (
int i = 0; i < NumElts; ++i) {
11420 if (M >= 0 && M < NumElts) {
11423 V1Zero &= Zeroable[i];
11424 IsAlternating &= (i & 1) == 0;
11425 }
else if (M >= NumElts) {
11426 V2Mask[i] = M - NumElts;
11427 FinalMask[i] = i + NumElts;
11428 V2Zero &= Zeroable[i];
11429 IsAlternating &= (i & 1) == 1;
11436 auto canonicalizeBroadcastableInput = [
DL, VT, &Subtarget,
11439 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11440 if (!Subtarget.
hasAVX2() && (!Subtarget.
hasAVX() || EltSizeInBits < 32 ||
11446 "Expected to demand only the 0'th element.");
11449 int &InputMaskElt =
I.value();
11450 if (InputMaskElt >= 0)
11451 InputMaskElt =
I.index();
11461 canonicalizeBroadcastableInput(V1, V1Mask);
11462 canonicalizeBroadcastableInput(V2, V2Mask);
11487 DL, VT, V1, V2, Mask, Subtarget, DAG))
11495 DL, VT, V1, V2, Mask, Subtarget, DAG))
11504 V1Mask.
assign(NumElts, -1);
11505 V2Mask.
assign(NumElts, -1);
11506 FinalMask.
assign(NumElts, -1);
11507 for (
int i = 0; i != NumElts; i += NumEltsPerLane)
11508 for (
int j = 0; j != NumEltsPerLane; ++j) {
11509 int M = Mask[i + j];
11510 if (M >= 0 && M < NumElts) {
11511 V1Mask[i + (j / 2)] = M;
11512 FinalMask[i + j] = i + (j / 2);
11513 }
else if (M >= NumElts) {
11514 V2Mask[i + (j / 2)] = M - NumElts;
11515 FinalMask[i + j] = i + (j / 2) + NumElts;
  assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
  int MaxSubElts = 64 / EltSizeInBits;
  unsigned RotateAmt, NumSubElts;
                              MaxSubElts, NumSubElts, RotateAmt))
  unsigned NumElts = Mask.size();
  if (!IsLegal && Subtarget.hasSSE3())
  if ((RotateAmt % 16) == 0)
  unsigned ShlAmt = RotateAmt;
  int NumElts = Mask.size();
  for (int i = 0; i < NumElts; ++i) {
           "Unexpected mask index.");
    int StartIdx = i - (M % NumElts);
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
    SDValue MaskV = M < NumElts ? V1 : V2;
    else if (TargetV != MaskV)
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
  if (ByteRotation <= 0)
         "512-bit PALIGNR requires BWI instructions");
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;
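  // Without SSSE3's PALIGNR the byte rotation is emulated as two whole-vector
  // byte shifts plus an OR: one input is shifted left by LoByteShift bytes,
  // the other right by HiByteShift bytes, and the results are combined.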
11755 const APInt &Zeroable,
11759 "Only 32-bit and 64-bit elements are supported!");
11763 &&
"VLX required for 128/256-bit vectors");
11775 unsigned NumElts = Mask.size();
11778 assert((ZeroLo + ZeroHi) < NumElts &&
"Zeroable shuffle detected");
11779 if (!ZeroLo && !ZeroHi)
11783 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11784 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11792 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11793 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11806 const APInt &Zeroable,
11816 if (!ZeroLo && !ZeroHi)
11819 unsigned NumElts = Mask.size();
11820 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11830 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11839 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11844 }
else if (ZeroHi == 0) {
11845 unsigned Shift = Mask[ZeroLo] % NumElts;
11850 }
else if (!Subtarget.
hasSSSE3()) {
11854 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11857 Shift += Mask[ZeroLo] % NumElts;
11893 int MaskOffset,
const APInt &Zeroable,
11895 int Size = Mask.size();
11896 unsigned SizeInBits =
Size * ScalarSizeInBits;
11898 auto CheckZeros = [&](
int Shift,
int Scale,
bool Left) {
11899 for (
int i = 0; i <
Size; i += Scale)
11900 for (
int j = 0; j < Shift; ++j)
11901 if (!Zeroable[i + j + (
Left ? 0 : (Scale - Shift))])
11907 auto MatchShift = [&](
int Shift,
int Scale,
bool Left) {
11908 for (
int i = 0; i !=
Size; i += Scale) {
11909 unsigned Pos =
Left ? i + Shift : i;
11910 unsigned Low =
Left ? i : i + Shift;
11911 unsigned Len = Scale - Shift;
11916 int ShiftEltBits = ScalarSizeInBits * Scale;
11917 bool ByteShift = ShiftEltBits > 64;
11920 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11924 Scale = ByteShift ? Scale / 2 : Scale;
11930 return (
int)ShiftAmt;
11939 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11940 for (
int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11941 for (
int Shift = 1; Shift != Scale; ++Shift)
11942 for (
bool Left : {
true,
false})
11943 if (CheckZeros(Shift, Scale,
Left)) {
11944 int ShiftAmt = MatchShift(Shift, Scale,
Left);
11955 const APInt &Zeroable,
11958 int Size = Mask.size();
11967 Mask, 0, Zeroable, Subtarget);
11970 if (ShiftAmt < 0) {
11972 Mask,
Size, Zeroable, Subtarget);
11983 "Illegal integer vector type");
11985 V = DAG.
getNode(Opcode,
DL, ShiftVT, V,
11995 int Size = Mask.size();
11996 int HalfSize =
Size / 2;
12006 int Len = HalfSize;
12007 for (; Len > 0; --Len)
12008 if (!Zeroable[Len - 1])
12010 assert(Len > 0 &&
"Zeroable shuffle mask");
12015 for (
int i = 0; i != Len; ++i) {
12024 if (i > M || M >= HalfSize)
12027 if (
Idx < 0 || (Src == V &&
Idx == (M - i))) {
12035 if (!Src ||
Idx < 0)
12038 assert((
Idx + Len) <= HalfSize &&
"Illegal extraction mask");
12051 int Size = Mask.size();
12052 int HalfSize =
Size / 2;
12059 for (
int Idx = 0;
Idx != HalfSize; ++
Idx) {
12075 for (
int Hi =
Idx + 1;
Hi <= HalfSize; ++
Hi) {
12077 int Len =
Hi -
Idx;
12091 }
else if ((!
Base || (
Base == V1)) &&
12094 }
else if ((!
Base || (
Base == V2)) &&
12144 assert(Scale > 1 &&
"Need a scale to extend.");
12147 int NumEltsPerLane = 128 / EltBits;
12148 int OffsetLane =
Offset / NumEltsPerLane;
12149 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12150 "Only 8, 16, and 32 bit elements can be extended.");
12151 assert(Scale * EltBits <= 64 &&
"Cannot zero extend past 64 bits.");
12152 assert(0 <=
Offset &&
"Extension offset must be positive.");
12154 "Extension offset must be in the first lane or start an upper lane.");
12157 auto SafeOffset = [&](
int Idx) {
12158 return OffsetLane == (
Idx / NumEltsPerLane);
12162 auto ShuffleOffset = [&](
SDValue V) {
12167 for (
int i = 0; i * Scale < NumElements; ++i) {
12168 int SrcIdx = i +
Offset;
12169 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12182 NumElements / Scale);
12184 InputV = ShuffleOffset(InputV);
12186 DL, ExtVT, InputV, DAG);
12195 if (AnyExt && EltBits == 32) {
12203 if (AnyExt && EltBits == 16 && Scale > 2) {
12204 int PSHUFDMask[4] = {
Offset / 2, -1,
12209 int PSHUFWMask[4] = {1, -1, -1, -1};
12212 VT, DAG.
getNode(OddEvenOp,
DL, MVT::v8i16,
12219 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12220 assert(NumElements == (
int)Mask.size() &&
"Unexpected shuffle mask size!");
12223 int LoIdx =
Offset * EltBits;
12232 int HiIdx = (
Offset + 1) * EltBits;
12244 if (Scale > 4 && EltBits == 8 && Subtarget.
hasSSSE3()) {
12245 assert(NumElements == 16 &&
"Unexpected byte vector width!");
12247 for (
int i = 0; i < 16; ++i) {
12249 if ((i % Scale == 0 && SafeOffset(
Idx))) {
12256 InputV = DAG.
getBitcast(MVT::v16i8, InputV);
12264 int AlignToUnpack =
Offset % (NumElements / Scale);
12265 if (AlignToUnpack) {
12267 for (
int i = AlignToUnpack; i < NumElements; ++i)
12268 ShMask[i - AlignToUnpack] = i;
12270 Offset -= AlignToUnpack;
12276 if (
Offset >= (NumElements / 2)) {
12278 Offset -= (NumElements / 2);
12285 InputV = DAG.
getNode(UnpackLoHi,
DL, InputVT, InputV, Ext);
12289 }
while (Scale > 1);
12310 int NumLanes = Bits / 128;
12312 int NumEltsPerLane = NumElements / NumLanes;
12314 "Exceeds 32-bit integer zero extension limit");
12315 assert((
int)Mask.size() == NumElements &&
"Unexpected shuffle mask size");
12321 bool AnyExt =
true;
12324 for (
int i = 0; i < NumElements; ++i) {
12328 if (i % Scale != 0) {
12340 SDValue V = M < NumElements ? V1 : V2;
12341 M = M % NumElements;
12344 Offset = M - (i / Scale);
12345 }
else if (InputV != V)
12352 (
Offset % NumEltsPerLane) == 0))
12357 if (
Offset && (
Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12360 if ((M % NumElements) != (
Offset + (i / Scale)))
12373 if (
Offset != 0 && Matches < 2)
12377 InputV, Mask, Subtarget, DAG);
12381 assert(Bits % 64 == 0 &&
12382 "The number of bits in a vector must be divisible by 64 on x86!");
12383 int NumExtElements = Bits / 64;
12387 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12388 assert(NumElements % NumExtElements == 0 &&
12389 "The input vector size must be divisible by the extended size.");
12400 auto CanZExtLowHalf = [&]() {
12401 for (
int i = NumElements / 2; i != NumElements; ++i)
12411 if (
SDValue V = CanZExtLowHalf()) {
12426 MVT VT = V.getSimpleValueType();
12432 MVT NewVT = V.getSimpleValueType();
12453 return V->hasOneUse() &&
12457template<
typename T>
12459 T EltVT = VT.getScalarType();
12460 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12461 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12481 find_if(Mask, [&Mask](
int M) {
return M >= (int)Mask.size(); }) -
12484 bool IsV1Zeroable =
true;
12485 for (
int i = 0,
Size = Mask.size(); i <
Size; ++i)
12486 if (i != V2Index && !Zeroable[i]) {
12487 IsV1Zeroable =
false;
12492 if (!IsV1Zeroable) {
12494 V1Mask[V2Index] = -1;
12509 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12513 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12522 if (!IsV1Zeroable) {
12533 }
else if (Mask[V2Index] != (
int)Mask.size() || EltVT == MVT::i8 ||
12534 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12540 if (!IsV1Zeroable) {
12543 assert(VT == ExtVT &&
"Cannot change extended type when non-zeroable!");
12550 unsigned MovOpc = 0;
12551 if (EltVT == MVT::f16)
12553 else if (EltVT == MVT::f32)
12555 else if (EltVT == MVT::f64)
12559 return DAG.
getNode(MovOpc,
DL, ExtVT, V1, V2);
12570 if (V2Index != 0) {
12577 V2Shuffle[V2Index] = 0;
12599 "We can only lower integer broadcasts with AVX2!");
12605 assert(V0VT.
isVector() &&
"Unexpected non-vector vector-sized value!");
12615 if (V0EltSize <= EltSize)
12618 assert(((V0EltSize % EltSize) == 0) &&
12619 "Scalar type sizes must all be powers of 2 on x86!");
12622 const unsigned Scale = V0EltSize / EltSize;
12623 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12635 if (
const int OffsetIdx = BroadcastIdx % Scale)
12649 assert(Mask.size() == 4 &&
"Unsupported mask size!");
12650 assert(Mask[0] >= -1 && Mask[0] < 8 &&
"Out of bound mask element!");
12651 assert(Mask[1] >= -1 && Mask[1] < 8 &&
"Out of bound mask element!");
12652 assert(Mask[2] >= -1 && Mask[2] < 8 &&
"Out of bound mask element!");
12653 assert(Mask[3] >= -1 && Mask[3] < 8 &&
"Out of bound mask element!");
12657 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12659 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12671 assert((Input == 0 || Input == 1) &&
"Only two inputs to shuffles.");
12672 int Size = Mask.size();
12673 for (
int i = 0; i <
Size; ++i)
12674 if (Mask[i] >= 0 && Mask[i] /
Size == Input && Mask[i] %
Size != i)
12689 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12709 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12711 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12717 if (NumElts == 4 &&
12722 NewMask.
append(NumElts, -1);
12742 if (!((Subtarget.
hasSSE3() && VT == MVT::v2f64) ||
12743 (Subtarget.
hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12750 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.
hasAVX2())
12757 if (BroadcastIdx < 0)
12759 assert(BroadcastIdx < (
int)Mask.size() &&
"We only expect to be called with "
12760 "a sorted mask where the broadcast "
12762 int NumActiveElts =
count_if(Mask, [](
int M) {
return M >= 0; });
12768 int BitOffset = BroadcastIdx * NumEltBits;
12771 switch (V.getOpcode()) {
12773 V = V.getOperand(0);
12777 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12778 int OpIdx = BitOffset / OpBitWidth;
12779 V = V.getOperand(OpIdx);
12780 BitOffset %= OpBitWidth;
12785 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12786 unsigned Idx = V.getConstantOperandVal(1);
12787 unsigned BeginOffset =
Idx * EltBitWidth;
12788 BitOffset += BeginOffset;
12789 V = V.getOperand(0);
12793 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12795 int Idx = (int)V.getConstantOperandVal(2);
12796 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12797 int BeginOffset =
Idx * EltBitWidth;
12798 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12799 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12800 BitOffset -= BeginOffset;
12810 assert((BitOffset % NumEltBits) == 0 &&
"Illegal bit-offset");
12811 BroadcastIdx = BitOffset / NumEltBits;
12814 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12823 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12824 return TruncBroadcast;
12830 V = V.getOperand(BroadcastIdx);
12836 cast<LoadSDNode>(V)->isSimple()) {
12846 assert((
int)(
Offset * 8) == BitOffset &&
"Unexpected bit-offset");
12863 assert(SVT == MVT::f64 &&
"Unexpected VT!");
12868 }
else if (!BroadcastFromReg) {
12871 }
else if (BitOffset != 0) {
12879 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12884 if (BitOffset < 128 && NumActiveElts > 1 &&
12885 V.getScalarValueSizeInBits() == NumEltBits) {
12886 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12887 "Unexpected bit-offset");
12889 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12894 if ((BitOffset % 128) != 0)
12897 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12898 "Unexpected bit-offset");
12899 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12900 "Unexpected vector size");
12901 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12909 if (Subtarget.
hasAVX()) {
12917 if (!V.getValueType().isVector()) {
12918 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12919 "Unexpected scalar size");
12928 if (V.getValueSizeInBits() > 128)
12933 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
                                      unsigned &InsertPSMask,
                                      const APInt &Zeroable,
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  unsigned ZMask = 0;
  int VADstIndex = -1;
  int VBDstIndex = -1;
  bool VAUsedInPlace = false;
  for (int i = 0; i < 4; ++i) {
    if (i == CandidateMask[i]) {
      VAUsedInPlace = true;
    if (VADstIndex >= 0 || VBDstIndex >= 0)
    if (CandidateMask[i] < 4) {
  if (VADstIndex < 0 && VBDstIndex < 0)
  unsigned VBSrcIndex = 0;
  if (VADstIndex >= 0) {
    VBSrcIndex = CandidateMask[VADstIndex];
    VBDstIndex = VADstIndex;
    VBSrcIndex = CandidateMask[VBDstIndex] - 4;
  if (!VAUsedInPlace)
  InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  if (matchAsInsertPS(V1, V2, Mask))
  if (matchAsInsertPS(V2, V1, CommutedMask))
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  unsigned InsertPSMask = 0;
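  // INSERTPS can only insert an element taken from its second operand, so
  // the matcher above is tried with the operands in both orders (the second
  // attempt uses the commuted mask) before giving up.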
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
                                                   Mask, Subtarget, DAG))
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
    if (Subtarget.hasAVX()) {
  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
                                           Zeroable, Subtarget, DAG))
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
                                                   Mask, Subtarget, DAG))
    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                            Zeroable, Subtarget, DAG))
  if (Subtarget.hasVLX())
                                            Zeroable, Subtarget, DAG))
  if (IsBlendSupported)
                                            Zeroable, Subtarget, DAG);
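  // A single-input v2i64 shuffle is widened to a v4i32 mask (each 64-bit
  // element expands into two 32-bit elements) so it can be lowered with a
  // single PSHUFD instead of a dedicated 64-bit element shuffle.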
13242 SDValue LowV = V1, HighV = V2;
13244 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
13246 if (NumV2Elements == 1) {
13247 int V2Index =
find_if(Mask, [](
int M) {
return M >= 4; }) - Mask.begin();
13251 int V2AdjIndex = V2Index ^ 1;
13253 if (Mask[V2AdjIndex] < 0) {
13259 NewMask[V2Index] -= 4;
13263 int V1Index = V2AdjIndex;
13264 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13276 NewMask[V1Index] = 2;
13277 NewMask[V2Index] = 0;
13279 }
else if (NumV2Elements == 2) {
13280 if (Mask[0] < 4 && Mask[1] < 4) {
13285 }
else if (Mask[2] < 4 && Mask[3] < 4) {
13300 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13301 Mask[2] < 4 ? Mask[2] : Mask[3],
13302 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13303 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13310 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13311 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13312 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13313 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13315 }
else if (NumV2Elements == 3) {
13336 assert(V2.getSimpleValueType() == MVT::v4f32 &&
"Bad operand type!");
13337 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
13341 Zeroable, Subtarget, DAG))
13344 int NumV2Elements =
count_if(Mask, [](
int M) {
return M >= 4; });
13346 if (NumV2Elements == 0) {
13349 Mask, Subtarget, DAG))
13360 if (Subtarget.
hasAVX()) {
13384 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13398 if (NumV2Elements == 1 && Mask[0] >= 4)
13400 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13440 assert(V2.getSimpleValueType() == MVT::v4i32 &&
"Bad operand type!");
13441 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
13447 Zeroable, Subtarget, DAG))
13450 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13453 if (Subtarget.preferLowerShuffleAsShift()) {
13456 Subtarget, DAG, true))
13458 if (NumV2Elements == 0)
13464 if (NumV2Elements == 0) {
13466 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13468 Mask, Subtarget, DAG))
13477 const int UnpackLoMask[] = {0, 0, 1, 1};
13478 const int UnpackHiMask[] = {2, 2, 3, 3};
13480 Mask = UnpackLoMask;
13482 Mask = UnpackHiMask;
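// Illustrative note (not in the original source): when several mask elements
// repeat the same low source element, the shuffle is rewritten as an unpack of
// V1 with itself, which duplicates adjacent elements:
//   {0, 0, 1, 1} == unpcklps V1, V1     {2, 2, 3, 3} == unpckhps V1, V1
// e.g. a request of {0, 0, -1, 1} matches the {0, 0, 1, 1} pattern (undef
// entries act as wildcards) and is emitted as a single unpcklps.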
13499 if (NumV2Elements == 1)
13501 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13506 bool IsBlendSupported = Subtarget.hasSSE41();
13507 if (IsBlendSupported)
13509 Zeroable, Subtarget, DAG))
13513 Zeroable, Subtarget, DAG))
13523 if (Subtarget.hasVLX())
13524 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13528 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13539 if (IsBlendSupported)
13541 Zeroable, Subtarget, DAG);
13545 Mask, Subtarget, DAG))
13582 assert(Mask.size() == 8 &&
"Shuffle mask length doesn't match!");
13594 for (int i = 0; i != 4; ++i)
13595 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13601 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13605 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13609 int NumHToL = LoInputs.size() - NumLToL;
13611 int NumHToH = HiInputs.size() - NumLToH;
13630 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13631 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13633 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13636 for (int DWord = 0; DWord != 4; ++DWord) {
13637 int M0 = Mask[2 * DWord + 0];
13638 int M1 = Mask[2 * DWord + 1];
13641 if (M0 < 0 && M1 < 0)
13644 bool Match = false;
13645 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13646 auto &DWordPair = DWordPairs[j];
13649 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13650 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13651 PSHUFDMask[DWord] = DOffset + j;
13657 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13662 if (DWordPairs.size() <= 2) {
13663 DWordPairs.resize(2, std::make_pair(-1, -1));
13664 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13665 DWordPairs[1].first, DWordPairs[1].second};
13666 if ((NumHToL + NumHToH) == 0)
13667 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13668 if ((NumLToL + NumLToH) == 0)
13669 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13705 int AOffset,
int BOffset) {
13707 "Must call this with A having 3 or 1 inputs from the A half.");
13709 "Must call this with B having 1 or 3 inputs from the B half.");
13711 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13713 bool ThreeAInputs = AToAInputs.size() == 3;
13719 int ADWord = 0, BDWord = 0;
13720 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13721 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13722 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13723 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13724 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13725 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13726 int TripleNonInputIdx =
13727 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13728 TripleDWord = TripleNonInputIdx / 2;
13732 OneInputDWord = (OneInput / 2) ^ 1;
13739 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13744 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13746 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13748 if ((NumFlippedAToBInputs == 1 &&
13749 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13750 (NumFlippedBToBInputs == 1 &&
13751 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13756 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13758 int FixIdx = PinnedIdx ^ 1;
13759 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13763 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13764 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13765 if (IsFixIdxInput == IsFixFreeIdxInput)
13768 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13769 "We need to be changing the number of flipped inputs!");
13770 int PSHUFHalfMask[] = {0, 1, 2, 3};
13771 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13777 for (int &M : Mask)
13778 if (M >= 0 && M == FixIdx)
13780 else if (M >= 0 && M == FixFreeIdx)
13783 if (NumFlippedBToBInputs != 0) {
13785 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13786 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13788 assert(NumFlippedAToBInputs != 0 &&
"Impossible given predicates!");
13789 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13790 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13795 int PSHUFDMask[] = {0, 1, 2, 3};
13796 PSHUFDMask[ADWord] = BDWord;
13797 PSHUFDMask[BDWord] = ADWord;
13804 for (
int &M : Mask)
13805 if (M >= 0 && M/2 == ADWord)
13806 M = 2 * BDWord + M % 2;
13807 else if (M >= 0 && M/2 == BDWord)
13808 M = 2 * ADWord + M % 2;
13814 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13815 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13816 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13817 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13824 int PSHUFLMask[4] = {-1, -1, -1, -1};
13825 int PSHUFHMask[4] = {-1, -1, -1, -1};
13826 int PSHUFDMask[4] = {-1, -1, -1, -1};
13831 auto fixInPlaceInputs =
13835 if (InPlaceInputs.empty())
13837 if (InPlaceInputs.size() == 1) {
13838 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13839 InPlaceInputs[0] - HalfOffset;
13840 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13843 if (IncomingInputs.empty()) {
13845 for (int Input : InPlaceInputs) {
13846 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13847 PSHUFDMask[Input / 2] = Input / 2;
13852 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13853 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13854 InPlaceInputs[0] - HalfOffset;
13857 int AdjIndex = InPlaceInputs[0] ^ 1;
13858 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13859 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13860 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13862 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13863 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13869 auto moveInputsToRightHalf = [&PSHUFDMask](
13874 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13875 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13877 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13879 int LowWord = Word & ~1;
13880 int HighWord = Word | 1;
13881 return isWordClobbered(SourceHalfMask, LowWord) ||
13882 isWordClobbered(SourceHalfMask, HighWord);
13885 if (IncomingInputs.empty())
13888 if (ExistingInputs.empty()) {
13890 for (int Input : IncomingInputs) {
13893 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13894 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13895 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13896 Input - SourceOffset;
13898 for (
int &M : HalfMask)
13899 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13901 else if (M == Input)
13902 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13904 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13905 Input - SourceOffset &&
13906 "Previous placement doesn't match!");
13911 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13915 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13916 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13918 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13920 "Previous placement doesn't match!");
13926 for (int &M : HalfMask)
13927 if (M >= SourceOffset && M < SourceOffset + 4) {
13928 M = M - SourceOffset + DestOffset;
13929 assert(M >= 0 && "This should never wrap below zero!");
13937 if (IncomingInputs.size() == 1) {
13938 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13939 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13941 SourceHalfMask[InputFixed - SourceOffset] =
13942 IncomingInputs[0] - SourceOffset;
13943 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13945 IncomingInputs[0] = InputFixed;
13947 } else if (IncomingInputs.size() == 2) {
13948 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13949 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13953 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13954 IncomingInputs[1] - SourceOffset};
13959 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13960 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13961 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13962 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13963 InputsFixed[1] = InputsFixed[0] ^ 1;
13964 }
else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13965 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13966 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13967 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13968 InputsFixed[0] = InputsFixed[1] ^ 1;
13969 }
else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13970 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13974 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13975 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13976 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13977 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13983 for (
int i = 0; i < 4; ++i)
13984 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13985 "We can't handle any clobbers here!");
13986 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13987 "Cannot have adjacent inputs here!");
13989 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13990 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13994 for (
int &M : FinalSourceHalfMask)
13995 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13996 M = InputsFixed[1] + SourceOffset;
13997 else if (M == InputsFixed[1] + SourceOffset)
13998 M = (InputsFixed[0] ^ 1) + SourceOffset;
14000 InputsFixed[1] = InputsFixed[0] ^ 1;
14004 for (
int &M : HalfMask)
14005 if (M == IncomingInputs[0])
14006 M = InputsFixed[0] + SourceOffset;
14007 else if (M == IncomingInputs[1])
14008 M = InputsFixed[1] + SourceOffset;
14010 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14011 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14018 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14019 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14020 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14021 for (int &M : HalfMask)
14022 for (int Input : IncomingInputs)
14024 M = FreeDWord * 2 + Input % 2;
14026 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14028 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14048 "Failed to lift all the high half inputs to the low mask!");
14049 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14050 "Failed to lift all the low half inputs to the high mask!");
14058 for (int &M : HiMask)
14074 "Lane crossing shuffle masks not supported");
14077 int Size = Mask.size();
14078 int Scale = NumBytes / Size;
14085 for (int i = 0; i < NumBytes; ++i) {
14086 int M = Mask[i / Scale];
14090 const int ZeroMask = 0x80;
14091 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14092 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14093 if (Zeroable[i / Scale])
14094 V1Idx = V2Idx = ZeroMask;
14098 V1InUse |= (ZeroMask != V1Idx);
14099 V2InUse |= (ZeroMask != V2Idx);
14112 if (V1InUse && V2InUse)
14115 V = V1InUse ? V1 : V2;
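// Illustrative note (not in the original source): PSHUFB zeroes any result
// byte whose control byte has bit 7 set, which is why 0x80 is used as the
// "zero index" above. Each input gets its own control vector: bytes that come
// from the other input (or are known zero) are set to 0x80, so the two PSHUFB
// results can simply be OR'ed together. Sketch for a v8i16 mask widened to
// bytes (Scale == 2): mask element M = 9 (word 1 of the second input) yields
// V1 control bytes {0x80, 0x80} and V2 control bytes {2, 3}.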
14138 assert(V2.getSimpleValueType() == MVT::v8i16 &&
"Bad operand type!");
14139 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
14144 Zeroable, Subtarget, DAG))
14152 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14154 if (NumV2Inputs == 0) {
14158 Subtarget, DAG, false))
14163 Mask, Subtarget, DAG))
14192 "All single-input shuffles should be canonicalized to be V1-input "
14202 if (Subtarget.hasSSE4A())
14208 if (NumV2Inputs == 1)
14210 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14215 bool IsBlendSupported = Subtarget.hasSSE41();
14216 if (IsBlendSupported)
14218 Zeroable, Subtarget, DAG))
14222 Zeroable, Subtarget, DAG))
14250 Zeroable, Subtarget, DAG))
14255 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14256 !Subtarget.hasVLX()) {
14258 unsigned PackOpc = 0;
14259 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14270 } else if (Subtarget.hasSSE41()) {
14273 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14282 } else if (!Subtarget.hasSSSE3()) {
14295 if (NumEvenDrops == 2) {
14296 Result = DAG.getBitcast(MVT::v4i32, Result);
14297 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14305 if (NumOddDrops == 1) {
14306 bool HasSSE41 = Subtarget.hasSSE41();
14314 MVT::v8i16, V1, V2);
14319 Mask, Subtarget, DAG))
14324 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14325 bool V1InUse, V2InUse;
14327 Zeroable, DAG, V1InUse, V2InUse);
14333 Zeroable, Subtarget, DAG);
14342 assert(V2.getSimpleValueType() == MVT::v8f16 &&
"Bad operand type!");
14343 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
14344 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14346 if (Subtarget.hasFP16()) {
14347 if (NumV2Elements == 0) {
14350 Mask, Subtarget, DAG))
14353 if (NumV2Elements == 1 && Mask[0] >= 8)
14355 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14382 MVT ShuffleVT = VT;
14392 for (int &M : AdjustedMask)
14394 M += (Scale - 1) * NumElts;
14407 if (VT != ShuffleVT)
14425 assert(V2.getSimpleValueType() == MVT::v16i8 &&
"Bad operand type!");
14426 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
14446 Zeroable, Subtarget, DAG))
14459 if (Subtarget.hasSSE4A())
14464 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14467 if (NumV2Elements == 0) {
14470 Mask, Subtarget, DAG))
14490 for (int i = 0; i < 16; i += 2)
14491 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14496 auto tryToWidenViaDuplication = [&]() -> SDValue {
14497 if (!canWidenViaDuplication(Mask))
14500 copy_if(Mask, std::back_inserter(LoInputs),
14501 [](int M) { return M >= 0 && M < 8; });
14505 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14509 bool TargetLo = LoInputs.size() >= HiInputs.size();
14510 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14511 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14513 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14515 for (int I : InPlaceInputs) {
14516 PreDupI16Shuffle[I/2] = I/2;
14519 int j = TargetLo ? 0 : 4, je = j + 4;
14520 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14523 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14526 while (j < je && PreDupI16Shuffle[j] >= 0)
14534 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14538 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14543 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14546 bool EvenInUse = false, OddInUse = false;
14547 for (int i = 0; i < 16; i += 2) {
14548 EvenInUse |= (Mask[i + 0] >= 0);
14549 OddInUse |= (Mask[i + 1] >= 0);
14550 if (EvenInUse && OddInUse)
14554 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14555 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14557 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14558 for (int i = 0; i < 16; ++i)
14559 if (Mask[i] >= 0) {
14560 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14561 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14562 if (PostDupI16Shuffle[i / 2] < 0)
14563 PostDupI16Shuffle[i / 2] = MappedMask;
14565 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14566 "Conflicting entries in the original shuffle!");
14571 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14573 if (SDValue V = tryToWidenViaDuplication())
14578 Zeroable, Subtarget, DAG))
14587 Zeroable, Subtarget, DAG))
14591 bool IsSingleInput = V2.isUndef();
14610 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14611 bool V1InUse = false;
14612 bool V2InUse = false;
14615 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14620 if (V1InUse && V2InUse) {
14623 Zeroable, Subtarget, DAG))
14635 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14639 if (Subtarget.hasVBMI())
14644 if (Subtarget.hasXOP()) {
14652 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14660 if (NumV2Elements == 1)
14662 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14675 if (NumEvenDrops) {
14681 assert(NumEvenDrops <= 3 &&
14682 "No support for dropping even elements more than 3 times.");
14684 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14689 if (!IsSingleInput)
14695 IsSingleInput ? V1 : V2);
14696 for (int i = 1; i < NumEvenDrops; ++i) {
14697 Result = DAG.getBitcast(MVT::v8i16, Result);
14704 if (NumOddDrops == 1) {
14708 if (!IsSingleInput)
14713 IsSingleInput ? V1 : V2);
14717 if (NumV2Elements > 0)
14719 Zeroable, Subtarget, DAG);
14726 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14727 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14728 for (int i = 0; i < 16; ++i)
14730 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14736 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14737 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14744 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14747 for (int &M : LoBlendMask)
14750 for (int &M : HiBlendMask)
14776 const APInt &Zeroable,
14779 if (VT == MVT::v8bf16) {
14816 "Only for 256-bit or wider vector shuffles!");
14818 assert(V2.getSimpleValueType() == VT &&
"Bad operand type!");
14824 int SplitNumElements = NumElements / 2;
14830 auto SplitVector = [&](SDValue V) {
14833 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14837 SDValue LoV1, HiV1, LoV2, HiV2;
14838 std::tie(LoV1, HiV1) = SplitVector(V1);
14839 std::tie(LoV2, HiV2) = SplitVector(V2);
14842 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14843 bool &UseHiV1, bool &UseLoV2,
14845 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14846 for (int i = 0; i < SplitNumElements; ++i) {
14847 int M = HalfMask[i];
14848 if (M >= NumElements) {
14849 if (M >= NumElements + SplitNumElements)
14853 } else if (M >= 0) {
14854 if (M >= SplitNumElements)
14862 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14866 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14867 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14869 return !(UseHiV1 || UseHiV2);
14876 for (int i = 0; i < SplitNumElements; ++i) {
14877 int M = HalfMask[i];
14878 if (M >= NumElements) {
14879 V2BlendMask[i] = M - NumElements;
14880 BlendMask[i] = SplitNumElements + i;
14881 } else if (M >= 0) {
14882 V1BlendMask[i] = M;
14887 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14888 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14893 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14896 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14898 if (!UseLoV2 && !UseHiV2)
14900 if (!UseLoV1 && !UseHiV1)
14904 if (UseLoV1 && UseHiV1) {
14908 V1Blend = UseLoV1 ? LoV1 : HiV1;
14909 for (int i = 0; i < SplitNumElements; ++i)
14910 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14911 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14913 if (UseLoV2 && UseHiV2) {
14917 V2Blend = UseLoV2 ? LoV2 : HiV2;
14918 for (int i = 0; i < SplitNumElements; ++i)
14919 if (BlendMask[i] >= SplitNumElements)
14920 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14925 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14943 const APInt &Zeroable,
14946 assert(!V2.isUndef() &&
"This routine must not be used to lower single-input "
14947 "shuffles as it could then recurse on itself.");
14948 int Size = Mask.size();
14953 auto DoBothBroadcast = [&] {
14954 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14957 if (V2BroadcastIdx < 0)
14958 V2BroadcastIdx = M - Size;
14959 else if (M - Size != V2BroadcastIdx)
14961 } else if (M >= 0) {
14962 if (V1BroadcastIdx < 0)
14963 V1BroadcastIdx = M;
14964 else if (M != V1BroadcastIdx)
14969 if (DoBothBroadcast())
14977 int LaneSize = Size / LaneCount;
14979 LaneInputs[0].resize(LaneCount, false);
14980 LaneInputs[1].resize(LaneCount, false);
14981 for (int i = 0; i < Size; ++i)
14983 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14984 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15000 assert(VT == MVT::v4f64 &&
"Only for v4f64 shuffles");
15002 int LHSMask[4] = {-1, -1, -1, -1};
15003 int RHSMask[4] = {-1, -1, -1, -1};
15004 int SHUFPDMask[4] = {-1, -1, -1, -1};
15008 for (int i = 0; i != 4; ++i) {
15012 int LaneBase = i & ~1;
15013 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15014 LaneMask[LaneBase + (M & 1)] = M;
15015 SHUFPDMask[i] = M & 1;
15037 int NumEltsPerLane = NumElts / NumLanes;
15038 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15045 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15046 int NumSublanesPerLane = NumSublanes / NumLanes;
15047 int NumEltsPerSublane = NumElts / NumSublanes;
15055 for (int i = 0; i != NumElts; ++i) {
15060 int SrcSublane = M / NumEltsPerSublane;
15061 int DstLane = i / NumEltsPerLane;
15065 bool Found = false;
15066 int DstSubStart = DstLane * NumSublanesPerLane;
15067 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15068 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15069 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15073 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15074 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15075 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15076 DemandedCrossLane.setBit(InLaneMask[i]);
15086 if (!CanUseSublanes) {
15091 int NumIdentityLanes = 0;
15092 bool OnlyShuffleLowestLane = true;
15093 for (int i = 0; i != NumLanes; ++i) {
15094 int LaneOffset = i * NumEltsPerLane;
15096 i * NumEltsPerLane))
15097 NumIdentityLanes++;
15098 else if (CrossLaneMask[LaneOffset] != 0)
15099 OnlyShuffleLowestLane = false;
15101 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15108 if (CrossLaneMask == Mask || InLaneMask == Mask)
15113 for (int i = 0; i != NumElts; ++i)
15114 if (!DemandedCrossLane[i])
15123 if (SDValue V = getSublanePermute(NumLanes))
15127 if (!CanUseSublanes)
15131 if (SDValue V = getSublanePermute(NumLanes * 2))
15136 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15139 return getSublanePermute(NumLanes * 4);
15145 int Size = Mask.size();
15146 InLaneMask.assign(Mask.begin(), Mask.end());
15147 for (int i = 0; i < Size; ++i) {
15148 int &M = InLaneMask[i];
15151 if (((M % Size) / LaneSize) != (i / LaneSize))
15152 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15168 int Size = Mask.size();
15169 int LaneSize = Size / 2;
15174 if (VT == MVT::v4f64 &&
15175 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15183 bool LaneCrossing[2] = {false, false};
15184 for (int i = 0; i < Size; ++i)
15185 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15186 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15187 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15189 bool LaneUsed[2] = {false, false};
15190 for (int i = 0; i < Size; ++i)
15192 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15193 AllLanes = LaneUsed[0] && LaneUsed[1];
15198 "This last part of this routine only works on single input shuffles");
15204 "In-lane shuffle mask expected");
15224 const APInt &Zeroable,
15227 if (V2.isUndef()) {
15237 VT, MemVT, Ld, Ofs, DAG))
15252 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15253 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15256 if (WidenedMask[0] == 0 && IsHighZero) {
15276 if (!IsLowZero && !IsHighZero) {
15287 OnlyUsesV1 ? V1 : V2,
15295 if (Subtarget.hasVLX()) {
15296 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15297 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15298 ((WidenedMask[1] % 2) << 1);
15318 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15319 (WidenedMask[1] >= 0 || IsHighZero) &&
"Undef half?");
15321 unsigned PermMask = 0;
15322 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15323 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
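// Illustrative note (not in the original source): VPERM2F128/VPERM2I128 use an
// 8-bit immediate in which bits [1:0] select the 128-bit half feeding the low
// result half and bits [5:4] the half feeding the high result half (0/1 = V1
// low/high, 2/3 = V2 low/high); bits 3 and 7 zero the corresponding half
// instead. Sketch with the widened mask above:
//   WidenedMask = {1, 2}, nothing zeroable -> PermMask = 0x21
//   WidenedMask = {0, -}, high half zero   -> PermMask = 0x80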
15326 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15328 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15345 assert(!V2.isUndef() &&
"This is only useful with multiple inputs.");
15350 int NumElts = Mask.size();
15358 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15359 int Srcs[2] = {-1, -1};
15361 for (int i = 0; i != NumLaneElts; ++i) {
15362 int M = Mask[(Lane * NumLaneElts) + i];
15369 int LaneSrc = M / NumLaneElts;
15371 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15373 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15378 Srcs[Src] = LaneSrc;
15379 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15386 LaneSrcs[Lane][0] = Srcs[0];
15387 LaneSrcs[Lane][1] = Srcs[1];
15390 assert(M1.size() == M2.size() && "Unexpected mask size");
15391 for (int i = 0, e = M1.size(); i != e; ++i)
15392 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15398 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15399 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15403 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15404 "Unexpected mask element");
15409 if (MatchMasks(InLaneMask, RepeatMask)) {
15411 MergeMasks(InLaneMask, RepeatMask);
15416 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15419 if (MatchMasks(InLaneMask, RepeatMask)) {
15421 MergeMasks(InLaneMask, RepeatMask);
15430 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
15432 if (LaneSrcs[Lane][0] >= 0)
15435 for (
int i = 0; i != NumLaneElts; ++i) {
15436 int M = Mask[(Lane * NumLaneElts) + i];
15441 if (RepeatMask[i] < 0)
15442 RepeatMask[i] = M % NumLaneElts;
15444 if (RepeatMask[i] < NumElts) {
15445 if (RepeatMask[i] != M % NumLaneElts)
15447 LaneSrcs[Lane][0] = M / NumLaneElts;
15449 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15451 LaneSrcs[Lane][1] = M / NumLaneElts;
15455 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15460 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
15461 int Src = LaneSrcs[Lane][0];
15462 for (
int i = 0; i != NumLaneElts; ++i) {
15465 M = Src * NumLaneElts + i;
15466 NewMask[Lane * NumLaneElts + i] = M;
15473 if (isa<ShuffleVectorSDNode>(NewV1) &&
15474 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15477 for (
int Lane = 0; Lane != NumLanes; ++Lane) {
15478 int Src = LaneSrcs[Lane][1];
15479 for (
int i = 0; i != NumLaneElts; ++i) {
15482 M = Src * NumLaneElts + i;
15483 NewMask[Lane * NumLaneElts + i] = M;
15490 if (isa<ShuffleVectorSDNode>(NewV2) &&
15491 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15494 for (
int i = 0; i != NumElts; ++i) {
15499 NewMask[i] = RepeatMask[i % NumLaneElts];
15500 if (NewMask[i] < 0)
15503 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15516 int &HalfIdx1, int &HalfIdx2) {
15517 assert((Mask.size() == HalfMask.size() * 2) &&
15518 "Expected input mask to be twice as long as output");
15523 if (UndefLower == UndefUpper)
15526 unsigned HalfNumElts = HalfMask.size();
15527 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15530 for (unsigned i = 0; i != HalfNumElts; ++i) {
15531 int M = Mask[i + MaskIndexOffset];
15539 int HalfIdx = M / HalfNumElts;
15542 int HalfElt = M % HalfNumElts;
15546 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15547 HalfMask[i] = HalfElt;
15548 HalfIdx1 = HalfIdx;
15551 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15552 HalfMask[i] = HalfElt + HalfNumElts;
15553 HalfIdx2 = HalfIdx;
15568 int HalfIdx2, bool UndefLower,
15577 auto getHalfVector = [&](int HalfIdx) {
15580 SDValue V = (HalfIdx < 2 ? V1 : V2);
15581 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15587 SDValue Half1 = getHalfVector(HalfIdx1);
15588 SDValue Half2 = getHalfVector(HalfIdx2);
15598 unsigned Offset = UndefLower ? HalfNumElts : 0;
15611 "Expected 256-bit or 512-bit vector");
15618 "Completely undef shuffle mask should have been simplified already");
15642 int HalfIdx1, HalfIdx2;
15647 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15650 unsigned NumLowerHalves =
15651 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15652 unsigned NumUpperHalves =
15653 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15654 assert(NumLowerHalves + NumUpperHalves <= 2 &&
"Only 1 or 2 halves allowed");
15662 if (NumUpperHalves == 0)
15666 if (NumUpperHalves == 1) {
15670 if (EltWidth == 32 && NumLowerHalves && HalfVT.
is128BitVector() &&
15673 Subtarget.hasFastVariableCrossLaneShuffle()))
15679 if (EltWidth == 64 && V2.isUndef())
15691 assert(NumUpperHalves == 2 &&
"Half vector count went wrong");
15696 if (NumUpperHalves == 0) {
15699 if (Subtarget.
hasAVX2() && EltWidth == 64)
15722 int NumLaneElts = NumElts / NumLanes;
15727 for (unsigned BroadcastSize : {16, 32, 64}) {
15736 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15737 for (int j = 0; j != NumBroadcastElts; ++j) {
15738 int M = Mask[i + j];
15741 int &R = RepeatMask[j];
15742 if (0 != ((M % NumElts) / NumLaneElts))
15744 if (0 <= R && R != M)
15752 if (!FindRepeatingBroadcastMask(RepeatMask))
15760 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15761 for (int j = 0; j != NumBroadcastElts; ++j)
15762 BroadcastMask[i + j] = j;
15766 if (BroadcastMask == Mask)
15784 auto ShuffleSubLanes = [&](int SubLaneScale) {
15785 int NumSubLanes = NumLanes * SubLaneScale;
15786 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15791 int TopSrcSubLane = -1;
15797 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15802 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15803 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15806 int Lane = (M % NumElts) / NumLaneElts;
15807 if ((0 <= SrcLane) && (SrcLane != Lane))
15810 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15811 SubLaneMask[Elt] = LocalM;
15819 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15821 for (int i = 0; i != NumSubLaneElts; ++i) {
15822 if (M1[i] < 0 || M2[i] < 0)
15824 if (M1[i] != M2[i])
15830 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15831 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15835 for (int i = 0; i != NumSubLaneElts; ++i) {
15836 int M = SubLaneMask[i];
15839 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15840 "Unexpected mask element");
15841 RepeatedSubLaneMask[i] = M;
15846 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15847 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15848 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15853 if (Dst2SrcSubLanes[DstSubLane] < 0)
15856 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15857 "Unexpected source lane");
15861 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15862 int Lane = SubLane / SubLaneScale;
15863 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15864 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15865 int M = RepeatedSubLaneMask[Elt];
15868 int Idx = (SubLane * NumSubLaneElts) + Elt;
15869 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15875 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15876 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15877 if (SrcSubLane < 0)
15879 for (int j = 0; j != NumSubLaneElts; ++j)
15880 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15885 if (RepeatedMask == Mask || SubLaneMask == Mask)
15899 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15902 MinSubLaneScale = 2;
15904 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15906 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15907 MinSubLaneScale = MaxSubLaneScale = 4;
15909 for (
int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15910 if (
SDValue Shuffle = ShuffleSubLanes(Scale))
15917 bool &ForceV1Zero, bool &ForceV2Zero,
15919 const APInt &Zeroable) {
15922 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15923 "Unexpected data type for VSHUFPD");
15925 "Illegal shuffle mask");
15927 bool ZeroLane[2] = { true, true };
15928 for (int i = 0; i < NumElts; ++i)
15929 ZeroLane[i & 1] &= Zeroable[i];
15933 bool IsSHUFPD = true;
15934 bool IsCommutable = true;
15936 for (int i = 0; i < NumElts; ++i) {
15941 int Val = (i & 6) + NumElts * (i & 1);
15942 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15943 if (Mask[i] < Val || Mask[i] > Val + 1)
15945 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15946 IsCommutable = false;
15947 SHUFPDMask[i] = Mask[i] % 2;
15950 if (!IsSHUFPD && !IsCommutable)
15953 if (!IsSHUFPD && IsCommutable)
15956 ForceV1Zero = ZeroLane[0];
15957 ForceV2Zero = ZeroLane[1];
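// Illustrative note (not in the original source): VSHUFPD works per 128-bit
// lane, so result element i may only take one of the two doubles of its own
// lane from the operand selected by (i & 1) - that is the Val/CommutVal range
// check above - and the immediate gets one bit per element selecting low or
// high within that lane. Sketch for v4f64: Mask = {1, 5, 2, 7} passes the
// check and gives SHUFPDMask = {1, 1, 0, 1}, i.e. immediate 0b1011.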
15964 const APInt &Zeroable,
15967 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15968 "Unexpected data type for VSHUFPD");
15970 unsigned Immediate = 0;
15971 bool ForceV1Zero =
false, ForceV2Zero =
false;
15992 const APInt &Zeroable,
15994 assert(VT == MVT::v32i8 &&
"Unexpected type!");
16001 if (Zeroable.
countl_one() < (Mask.size() - 8))
16013 { 0, 1, 2, 3, 16, 17, 18, 19,
16014 4, 5, 6, 7, 20, 21, 22, 23 });
16041 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16045 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16047 size_t Size = Mask.size();
16048 assert(Size % 2 == 0 && "Expected even mask size");
16049 for (unsigned I = 0; I < Size; I += 2) {
16050 if (Mask[I] != (int)(Begin0 + I / 2) ||
16051 Mask[I + 1] != (int)(Begin1 + I / 2))
16058 size_t FirstQtr = NumElts / 2;
16059 size_t ThirdQtr = NumElts + NumElts / 2;
16060 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16061 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16062 if (!IsFirstHalf && !IsSecondHalf)
16072 if (Shuffles.
size() != 2)
16075 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16076 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16079 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16080 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16081 FirstHalf = Shuffles[0];
16082 SecondHalf = Shuffles[1];
16083 }
else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16084 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16085 FirstHalf = Shuffles[1];
16086 SecondHalf = Shuffles[0];
16115 assert(V2.getSimpleValueType() == MVT::v4f64 &&
"Bad operand type!");
16116 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
16122 if (V2.isUndef()) {
16125 Mask, Subtarget, DAG))
16135 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16136 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16149 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16154 Mask, DAG, Subtarget))
16167 Zeroable, Subtarget, DAG))
16172 Zeroable, Subtarget, DAG))
16183 !
all_of(Mask, [](
int M) {
return M < 2 || (4 <= M && M < 6); }) &&
16190 if (V1IsInPlace || V2IsInPlace)
16192 Zeroable, Subtarget, DAG);
16197 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16204 if (!(Subtarget.
hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16206 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16210 if (Subtarget.hasVLX())
16212 Zeroable, Subtarget, DAG))
16219 Zeroable, Subtarget, DAG);
16235 assert(V2.getSimpleValueType() == MVT::v4i64 &&
"Bad operand type!");
16236 assert(Mask.size() == 4 &&
"Unexpected mask size for v4 shuffle!");
16237 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16244 Zeroable, Subtarget, DAG))
16253 if (Subtarget.preferLowerShuffleAsShift())
16256 Subtarget, DAG, true))
16259 if (V2.isUndef()) {
16286 if (Subtarget.hasVLX()) {
16288 Zeroable, Subtarget, DAG))
16292 Zeroable, Subtarget, DAG))
16310 if (V1IsInPlace || V2IsInPlace)
16312 Zeroable, Subtarget, DAG);
16317 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16329 if (!V1IsInPlace && !V2IsInPlace)
16331 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16336 Zeroable, Subtarget, DAG);
16348 assert(V2.getSimpleValueType() == MVT::v8f32 &&
"Bad operand type!");
16349 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
16352 Zeroable, Subtarget, DAG))
16370 Zeroable, Subtarget, DAG))
16378 "Repeated masks must be half the mask width!");
16402 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16407 if (V2.isUndef()) {
16424 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16428 if (Subtarget.hasVLX())
16430 Zeroable, Subtarget, DAG))
16454 Zeroable, Subtarget, DAG);
16470 assert(V2.getSimpleValueType() == MVT::v8i32 &&
"Bad operand type!");
16471 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
16472 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16474 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16480 Zeroable, Subtarget, DAG))
16499 Zeroable, Subtarget, DAG))
16508 if (Subtarget.preferLowerShuffleAsShift()) {
16511 Subtarget, DAG,
true))
16513 if (NumV2Elements == 0)
16523 bool Is128BitLaneRepeatedShuffle =
16525 if (Is128BitLaneRepeatedShuffle) {
16526 assert(RepeatedMask.
size() == 4 &&
"Unexpected repeated mask size!");
16542 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16548 if (Subtarget.hasVLX()) {
16550 Zeroable, Subtarget, DAG))
16554 Zeroable, Subtarget, DAG))
16566 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16569 if (V2.isUndef()) {
16588 CastV1, CastV2, DAG);
16595 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16600 Zeroable, Subtarget, DAG);
16612 assert(V2.getSimpleValueType() == MVT::v16i16 &&
"Bad operand type!");
16613 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
16614 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16620 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16629 Zeroable, Subtarget, DAG))
16649 Subtarget, DAG, false))
16660 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16663 if (V2.isUndef()) {
16678 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16691 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16696 Zeroable, Subtarget, DAG))
16700 if (Subtarget.hasBWI())
16706 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16711 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16735 assert(V2.getSimpleValueType() == MVT::v32i8 &&
"Bad operand type!");
16736 assert(Mask.size() == 32 &&
"Unexpected mask size for v32 shuffle!");
16737 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16743 Zeroable, Subtarget, DAG))
16752 Zeroable, Subtarget, DAG))
16789 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16801 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16809 Zeroable, Subtarget, DAG))
16813 if (Subtarget.hasVBMI())
16819 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16824 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16830 if (Subtarget.hasVLX())
16832 Mask, Zeroable, DAG))
16859 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16861 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16863 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16879 if (ElementBits < 32) {
16897 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16929 "Unexpected element type size for 128bit shuffle.");
16939 assert(Widened128Mask.
size() == 4 &&
"Shuffle widening mismatch");
16942 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16943 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16944 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16955 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16957 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16967 bool IsInsert = true;
16969 for (int i = 0; i < 4; ++i) {
16970 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16971 if (Widened128Mask[i] < 0)
16975 if (Widened128Mask[i] < 4) {
16976 if (Widened128Mask[i] != i) {
16982 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16989 if (IsInsert && V2Index >= 0) {
17002 Widened128Mask.clear();
17008 int PermMask[4] = {-1, -1, -1, -1};
17010 for (int i = 0; i < 4; ++i) {
17011 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17012 if (Widened128Mask[i] < 0)
17015 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17022 PermMask[i] = Widened128Mask[i] % 4;
17035 assert(V2.getSimpleValueType() == MVT::v8f64 &&
"Bad operand type!");
17036 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
17038 if (V2.isUndef()) {
17040 if (
isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17046 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17047 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17048 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17049 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
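// Illustrative note (not in the original source): immediate-form VPERMILPD has
// one control bit per double that selects the low (0) or high (1) element of
// that double's own 128-bit lane, so a single-input v8f64 shuffle that never
// crosses lanes folds into an 8-bit immediate; each comparison above
// contributes one bit. For example Mask = {1, 1, 2, 2, 4, 5, 6, 6} sets bits
// 0, 1 and 5, giving an immediate of 0x23.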
17061 V2, Subtarget, DAG))
17069 Zeroable, Subtarget, DAG))
17077 Zeroable, Subtarget, DAG))
17089 assert(V2.getSimpleValueType() == MVT::v16f32 &&
"Bad operand type!");
17090 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
17096 assert(RepeatedMask.
size() == 4 &&
"Unexpected repeated mask size!");
17113 Zeroable, Subtarget, DAG))
17121 Zeroable, Subtarget, DAG))
17125 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17131 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17136 if (V2.isUndef() &&
17144 Zeroable, Subtarget, DAG))
17156 assert(V2.getSimpleValueType() == MVT::v8i64 &&
"Bad operand type!");
17157 assert(Mask.size() == 8 &&
"Unexpected mask size for v8 shuffle!");
17160 if (Subtarget.preferLowerShuffleAsShift())
17163 Subtarget, DAG,
true))
17166 if (V2.isUndef()) {
17188 V2, Subtarget, DAG))
17199 Zeroable, Subtarget, DAG))
17203 if (Subtarget.hasBWI())
17217 Zeroable, Subtarget, DAG))
17229 assert(V2.getSimpleValueType() == MVT::v16i32 &&
"Bad operand type!");
17230 assert(Mask.size() == 16 &&
"Unexpected mask size for v16 shuffle!");
17232 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17238 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17242 if (Subtarget.preferLowerShuffleAsShift()) {
17245 Subtarget, DAG,
true))
17247 if (NumV2Elements == 0)
17257 bool Is128BitLaneRepeatedShuffle =
17259 if (Is128BitLaneRepeatedShuffle) {
17260 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17273 Subtarget, DAG, false))
17276 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17283 Zeroable, Subtarget, DAG))
17287 if (Subtarget.hasBWI())
17298 CastV1, CastV2, DAG);
17305 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17310 Zeroable, Subtarget, DAG))
17314 Zeroable, Subtarget, DAG))
17326 assert(V2.getSimpleValueType() == MVT::v32i16 &&
"Bad operand type!");
17327 assert(Mask.size() == 32 &&
"Unexpected mask size for v32 shuffle!");
17328 assert(Subtarget.hasBWI() &&
"We can only lower v32i16 with AVX-512-BWI!");
17334 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17349 Subtarget, DAG,
false))
17357 if (V2.isUndef()) {
17369 RepeatedMask, Subtarget, DAG);
17374 Zeroable, Subtarget, DAG))
17378 Zeroable, Subtarget, DAG))
17385 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17397 assert(V2.getSimpleValueType() == MVT::v64i8 &&
"Bad operand type!");
17398 assert(Mask.size() == 64 &&
"Unexpected mask size for v64 shuffle!");
17399 assert(Subtarget.hasBWI() &&
"We can only lower v64i8 with AVX-512-BWI!");
17405 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17436 Zeroable, Subtarget, DAG))
17440 Zeroable, Subtarget, DAG))
17446 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17450 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17454 Zeroable, Subtarget, DAG))
17461 Mask, Subtarget, DAG))
17466 bool V1InUse, V2InUse;
17468 DAG, V1InUse, V2InUse);
17475 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17479 if (Subtarget.hasVBMI())
17492 const APInt &Zeroable,
17496 "Cannot lower 512-bit vectors w/ basic ISA!");
17500 int NumElts = Mask.size();
17501 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17503 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17505 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17518 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17530 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17531 if (!Subtarget.hasBWI())
17573 int NumElts = Mask.size();
17574 for (int i = 0; i != NumElts; ++i) {
17577 "Unexpected mask index.");
17582 if (ShiftAmt < 0) {
17589 if (ShiftAmt != M - i)
17592 assert(ShiftAmt >= 0 && "All undef?");
17606 int MaskOffset,
const APInt &Zeroable) {
17607 int Size = Mask.size();
17609 auto CheckZeros = [&](int Shift, bool Left) {
17610 for (int j = 0; j < Shift; ++j)
17611 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17617 auto MatchShift = [&](int Shift, bool Left) {
17618 unsigned Pos = Left ? Shift : 0;
17619 unsigned Low = Left ? 0 : Shift;
17620 unsigned Len = Size - Shift;
17624 for (int Shift = 1; Shift != Size; ++Shift)
17625 for (bool Left : {true, false})
17626 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
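// Illustrative note (not in the original source): this search tries every
// shift amount in both directions and accepts one when the positions the
// shift would fill with zeros are all zeroable (CheckZeros) and the remaining
// Len elements form the contiguous identity run starting at Pos/Low that
// MatchShift looks for. Sketch for a v8i1 mask: a request of
// {z, z, 0, 1, 2, 3, 4, 5} (z = zeroable) matches Shift = 2, Left = true,
// i.e. a single KSHIFTL of the mask register by two bits.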
17641 const APInt &Zeroable,
17645 "Cannot lower 512-bit vectors w/o basic ISA!");
17647 int NumElts = Mask.size();
17648 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17651 int SubvecElts = 0;
17653 for (int i = 0; i != NumElts; ++i) {
17654 if (Mask[i] >= 0) {
17658 Src = Mask[i] / NumElts;
17659 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17665 assert(SubvecElts != NumElts && "Identity shuffle?");
17668 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17672 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17673 assert(Src >= 0 && "Expected a source!");
17676 Src == 0 ? V1 : V2,
17690 for (SDValue V : { V1, V2 }) {
17693 if (ShiftAmt >= 0) {
17703 ShiftAmt += WideElts - NumElts;
17706 Res = DAG.
getNode(Opcode,
DL, WideVT, Res,
17733 ExtVT = MVT::v2i64;
17736 ExtVT = MVT::v4i32;
17741 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17751 assert(Subtarget.hasBWI() &&
"Expected AVX512BW support");
17759 ExtVT = MVT::v64i8;
17769 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17770 (Subtarget.hasDQI() && (NumElems < 32)))
17780 int NumElements = Mask.size();
17782 int NumV1Elements = 0, NumV2Elements = 0;
17786 else if (M < NumElements)
17794 if (NumV2Elements > NumV1Elements)
17797 assert(NumV1Elements > 0 && "No V1 indices");
17799 if (NumV2Elements == 0)
17807 if (NumV1Elements == NumV2Elements) {
17808 int LowV1Elements = 0, LowV2Elements = 0;
17809 for (int M : Mask.slice(0, NumElements / 2))
17810 if (M >= NumElements)
17814 if (LowV2Elements > LowV1Elements)
17816 if (LowV2Elements == LowV1Elements) {
17817 int SumV1Indices = 0, SumV2Indices = 0;
17818 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17819 if (Mask[i] >= NumElements)
17821 else if (Mask[i] >= 0)
17823 if (SumV2Indices < SumV1Indices)
17825 if (SumV2Indices == SumV1Indices) {
17826 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17827 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17828 if (Mask[i] >= NumElements)
17829 NumV2OddIndices += i % 2;
17830 else if (Mask[i] >= 0)
17831 NumV1OddIndices += i % 2;
17832 if (NumV2OddIndices < NumV1OddIndices)
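// Illustrative note (not in the original source): the commute decision is a
// cascade of tie-breakers - swap the operands when V2 supplies more elements;
// on a tie, when V2 supplies more of the low half; then when the sum of V2's
// destination indices is smaller; and finally when V2 lands in fewer odd
// positions. The point is a canonical operand order so later matching only
// has to recognise one orientation of each mask, e.g. {4, 5, 0, 1} on
// (V1, V2) becomes {0, 1, 4, 5} on (V2, V1).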
17846 if (!V.getValueType().isSimple())
17849 MVT VT = V.getSimpleValueType().getScalarType();
17850 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17855 if ((VT == MVT::i16 || VT == MVT::i8) &&
17856 V.getSimpleValueType().getSizeInBits() < 512)
17859 auto HasMaskOperation = [&](SDValue V) {
17862 switch (V->getOpcode()) {
17881 if (!V->hasOneUse())
17887 if (HasMaskOperation(V))
17912 MVT VT = Op.getSimpleValueType();
17918 "Can't lower MMX shuffles");
17920 bool V1IsUndef = V1.isUndef();
17921 bool V2IsUndef = V2.isUndef();
17922 if (V1IsUndef && V2IsUndef)
17935 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17937 for (int &M : NewMask)
17938 if (M >= NumElements)
17944 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17945 (void)MaskUpperLimit;
17947 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17948 "Out of bounds shuffle index");
17953 APInt KnownUndef, KnownZero;
17956 APInt Zeroable = KnownUndef | KnownZero;
17982 int NewNumElts = NumElements / 2;
17990 bool UsedZeroVector = false;
17992 "V2's non-undef elements are used?!");
17993 for (int i = 0; i != NewNumElts; ++i)
17995 WidenedMask[i] = i + NewNumElts;
17996 UsedZeroVector = true;
18000 if (UsedZeroVector)
18021 assert(NumElements == (int)Mask.size() &&
18022 "canonicalizeShuffleMaskWithHorizOp "
18023 "shouldn't alter the shuffle mask size");
18052 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18067 if (NumVecBits != 128 && NumVecBits != 256)
18070 if (NumElementBits == 32 || NumElementBits == 64) {
18071 unsigned NumLargeElements = 512 / NumElementBits;
18079 Subtarget, DAG,
DL);
18083 Subtarget, DAG,
DL);
18091 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18092 VecVT == MVT::v16i16) {
18097 Passthru = Passthru.isUndef()
18116 MVT VT = Op.getSimpleValueType();
18135 MVT VT = Op.getSimpleValueType();
18157 MVT CondVT = Cond.getSimpleValueType();
18158 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18159 if (CondEltSize == 1)
18170 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18183 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18187 if (CondEltSize != EltSize) {
18204 !Subtarget.hasXOP()) {
18210 if (FreeCond && (FreeLHS || FreeRHS))
18230 case MVT::v16i16: {
18243 MVT VT = Op.getSimpleValueType();
18246 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18261 unsigned IdxVal = Idx->getAsZExtVal();
18267 if (VT == MVT::f32) {
18273 if (!Op.hasOneUse())
18278 User->getValueType(0) != MVT::i32))
18285 if (VT == MVT::i32 || VT == MVT::i64)
18299 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18300 MVT EltVT = Op.getSimpleValueType();
18303 "Unexpected vector type in ExtractBitFromMaskVector");
18311 if (NumElts == 1) {
18323 unsigned IdxVal = IdxC->getZExtValue();
18340 MVT VT = N->getSimpleValueType(0);
18344 switch (User->getOpcode()) {
18350 return DemandedElts;
18352 DemandedElts.setBit(User->getConstantOperandVal(1));
18355 if (!User->getValueType(0).isSimple() ||
18356 !User->getValueType(0).isVector()) {
18358 return DemandedElts;
18366 return DemandedElts;
18369 return DemandedElts;
18373X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18379 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18418 unsigned IdxVal = IdxC->getZExtValue();
18432 IdxVal &= ElemsPerChunk - 1;
18439 MVT VT = Op.getSimpleValueType();
18441 if (VT == MVT::i16) {
18446 if (Subtarget.hasFP16())
18466 if (VT == MVT::i8) {
18471 int DWordIdx = IdxVal / 4;
18472 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18476 int ShiftVal = (IdxVal % 4) * 8;
18483 int WordIdx = IdxVal / 2;
18484 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18488 int ShiftVal = (IdxVal % 2) * 8;
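// Illustrative note (not in the original source): when only a few elements of
// the vector are demanded, an i8 extract is rewritten as a cheaper wider
// extract: pull out the containing 32-bit (or 16-bit) chunk, shift right by
// the byte offset inside it, and truncate. Sketch: extracting byte 6 of a
// v16i8 through the word path uses WordIdx = 3 and ShiftVal = 0, while byte 7
// uses WordIdx = 3 and ShiftVal = 8.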
18502 Mask[0] = static_cast<int>(IdxVal);
18518 int Mask[2] = { 1, -1 };
18537 if (!isa<ConstantSDNode>(Idx)) {
18556 MVT VT = Op.getSimpleValueType();
18561 if (EltVT == MVT::i1)
18568 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18570 if (EltVT == MVT::bf16) {
18582 if (!(Subtarget.hasBWI() ||
18583 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18584 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18597 for (unsigned I = 0; I != NumElts; ++I)
18602 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18606 if (N2C->getAPIntValue().uge(NumElts))
18608 uint64_t IdxVal = N2C->getZExtValue();
18613 if (IsZeroElt || IsAllOnesElt) {
18616 if (IsAllOnesElt &&
18617 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18618 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18622 CstVectorElts[IdxVal] = OnesCst;
18631 for (unsigned i = 0; i != NumElts; ++i)
18632 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18648 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18649 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18656 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18658 "Vectors will always have power-of-two number of elements.");
18663 if (IdxVal >= NumEltsIn128 &&
18664 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18665 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18669 for (unsigned i = 0; i != NumElts; ++i)
18670 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18679 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18691 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18692 EltVT == MVT::f16 || EltVT == MVT::i64) {
18699 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18710 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18712 if (VT == MVT::v8i16) {
18716 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18721 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18724 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18728 if (EltVT == MVT::f32) {
18758 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18768 MVT OpVT =
Op.getSimpleValueType();
18789 "Expected an SSE type!");
18793 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18806 assert(
Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18813 assert(
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18814 "Only vXi1 extract_subvectors need custom lowering");
18818 uint64_t IdxVal =
Op.getConstantOperandVal(1);
18835unsigned X86TargetLowering::getGlobalWrapperKind(
18836 const GlobalValue *GV, const unsigned char OpFlags) const {
18870 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18873 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18895 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18908 return LowerGlobalOrExternal(Op, DAG, false);
18914 unsigned char OpFlags =
18916 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18917 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18922 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18936 bool ForCall) const {
18941 const char *ExternalSym = nullptr;
18942 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18943 GV = G->getGlobal();
18946 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18947 ExternalSym = ES->getSymbol();
18952 unsigned char OpFlags;
18970 int64_t GlobalOffset = 0;
18983 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18986 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19011 return LowerGlobalOrExternal(Op, DAG, false);
19015 const EVT PtrVT, unsigned ReturnReg,
19016 unsigned char OperandFlags,
19017 bool LoadGlobalBaseReg = false,
19018 bool LocalDynamic = false) {
19026 if (LocalDynamic && UseTLSDESC) {
19033 "Unexpected TLSDESC DAG");
19037 "Unexpected TLSDESC DAG");
19039 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19041 "Unexpected TLSDESC DAG");
19042 Ret = SDValue(CopyFromRegOp, 0);
19055 if (LoadGlobalBaseReg) {
19061 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19063 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19111 bool Is64Bit,
bool Is64BitLP64) {
19121 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19160 unsigned char OperandFlags = 0;
19217 if (Subtarget.is64Bit()) {
19229 PositionIndependent);
19236 unsigned char OpFlag = 0;
19237 unsigned WrapperKind = 0;
19241 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19276 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19302 SDValue TlsArray = Subtarget.is64Bit()
19317 if (Subtarget.is64Bit())
19348 if (Subtarget.is64Bit() && Subtarget.
isTargetELF()) {
19389 "Unexpected opcode!");
19390 bool IsStrict =
Op->isStrictFPOpcode();
19391 unsigned OpNo = IsStrict ? 1 : 0;
19393 MVT SrcVT = Src.getSimpleValueType();
19394 MVT VT =
Op.getSimpleValueType();
19396 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19397 (VT != MVT::f32 && VT != MVT::f64))
19403 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19410 {Op.getOperand(0), InVec});
19430 "Unexpected opcode!");
19431 bool IsStrict =
Op->isStrictFPOpcode();
19432 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
19433 MVT SrcVT = Src.getSimpleValueType();
19434 MVT VT =
Op.getSimpleValueType();
19436 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19441 assert(Subtarget.hasFP16() &&
"Expected FP16");
19445 SDValue CvtVec = DAG.
getNode(
Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19446 {Op.getOperand(0), InVec});
19464 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19467 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19471 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19474 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19492 !isa<ConstantSDNode>(Extract.getOperand(1)))
19513 if (FromVT != Vec128VT)
19537 MVT SrcVT = X.getSimpleValueType();
19538 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19543 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19555 unsigned ToIntOpcode =
19557 unsigned ToFPOpcode =
19576 bool IsStrict = Op->isStrictFPOpcode();
19577 MVT VT = Op->getSimpleValueType(0);
19578 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19580 if (Subtarget.hasDQI()) {
19581 assert(!Subtarget.hasVLX() && "Unexpected features");
19583 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19584 Src.getSimpleValueType() == MVT::v4i64) &&
19585 "Unsupported custom type");
19588 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19590 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19600 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19601                   {Op->getOperand(0), Src});
19604 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19617 if (VT != MVT::v4f32 || IsSigned)
19629 for (int i = 0; i != 4; ++i) {
19635 {Op.getOperand(0), Elt});
19636 Chains[i] = SignCvts[i].getValue(1);
19647 {Chain, SignCvt, SignCvt});
19664 bool IsStrict = Op->isStrictFPOpcode();
19665 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19667 MVT VT = Op.getSimpleValueType();
19675 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19678 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19683 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19685 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19687 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19690 if (VT == MVT::v16i32)
19692 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19695 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19696 (VT == MVT::v2i64 || VT == MVT::v4i64))
19703 bool IsStrict = Op->isStrictFPOpcode();
19704 unsigned OpNo = IsStrict ? 1 : 0;
19707 MVT SrcVT = Src.getSimpleValueType();
19708 MVT VT = Op.getSimpleValueType();
19717 return LowerWin64_INT128_TO_FP(Op, DAG);
19726 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19738 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19744 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19745 "Unknown SINT_TO_FP to lower!");
19751 if (SrcVT == MVT::i32 && UseSSEReg)
19753 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19762 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19771 if (VT == MVT::f128 || !Subtarget.hasX87())
19775 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19779 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19789 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19790 std::pair<SDValue, SDValue> Tmp =
19791 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
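// BuildFILD implements the classic x87 path for integer-to-FP: the integer is
// spilled to a stack slot and reloaded with FILD, which converts it to the
// 80-bit x87 format; the result is stored back out at the destination
// precision when it has to live in an SSE register. As a sketch of the
// sequence for an i64 source on 32-bit x86 (illustrative only):
//   mov  dword ptr [slot], lo
//   mov  dword ptr [slot+4], hi
//   fild qword ptr [slot]       ; signed i64 -> x87 f80
//   fstp qword ptr [slot]       ; round to f64 if the result is used in SSE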
19806 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19808 Tys = DAG.getVTList(DstVT, MVT::Other);
19810 SDValue FILDOps[] = {Chain, Pointer};
19814 Chain = Result.getValue(1);
19824 SDValue FSTOps[] = {Chain, Result, StackSlot};
19832 DstVT, DL, Chain, StackSlot,
19834 Chain = Result.getValue(1);
19837 return { Result, Chain };
19846 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19847 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19857 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19874 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19882 APInt(64, 0x4330000000000000ULL))));
19885 APInt(64, 0x4530000000000000ULL))));
19899 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
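// The constants above implement the standard bias trick for u64 -> f64 without
// a 64-bit unsigned convert: the low 32 bits are combined with the exponent
// pattern of 2^52 (0x43300000...) and the high 32 bits with 2^84
// (0x45300000...), each giving an exactly representable double; subtracting
// the biases and adding the halves reconstructs the value. A scalar sketch,
// assuming IEEE-754 doubles (illustrative, not part of this lowering):
//   double Lo = llvm::bit_cast<double>(0x4330000000000000ULL | (X & 0xffffffffULL)) - 0x1p52;
//   double Hi = llvm::bit_cast<double>(0x4530000000000000ULL | (X >> 32)) - 0x1p84;
//   double Res = Hi + Lo;   // == (double)X up to one final rounding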
19922 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19925 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19944 if (Op.getNode()->isStrictFPOpcode()) {
19949 {Chain, Or, Bias});
19956 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19958 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
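// The simpler form of the same idea handles u32 -> f64: OR the 32-bit value
// into the mantissa of 2^52 and subtract the bias. No rounding is involved
// because every u32 fits exactly in a double's mantissa. Scalar sketch,
// assuming IEEE-754 doubles (illustrative only):
//   uint64_t Bits = 0x4330000000000000ULL | X;              // 2^52 + X, exact
//   double   Res  = llvm::bit_cast<double>(Bits) - 0x1p52;  // == (double)X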
19972 if (Op.getSimpleValueType() != MVT::v2f64)
19975 bool IsStrict = Op->isStrictFPOpcode();
19977 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19981 if (!Subtarget.hasVLX()) {
19989 {Op.getOperand(0), N0});
20001 {Op.getOperand(0), N0});
20011 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20018 {Op.getOperand(0), Or, VBias});
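// This is the vectorized form of the same 2^52 bias trick: the two i32 lanes
// are widened by pairing them with the 0x43300000 high words, the result is
// bitcast to v2f64, and VBias (2^52 splat into both lanes) is subtracted to
// recover the exact unsigned values.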
20025 bool IsStrict = Op->isStrictFPOpcode();
20026 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20027 MVT VecIntVT = V.getSimpleValueType();
20028 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20029 "Unsupported custom type");
20033 assert(!Subtarget.hasVLX() && "Unexpected features");
20034 MVT VT = Op->getSimpleValueType(0);
20037 if (VT == MVT::v8f64)
20040 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20042 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20043 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20053 {Op->getOperand(0), V});
20067 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20068     Op->getSimpleValueType(0) == MVT::v4f64) {
20088 {Op.getOperand(0), Or, VBias});
20104 bool Is128 = VecIntVT == MVT::v4i32;
20105 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20108 if (VecFloatVT != Op->getSimpleValueType(0))
20129 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20170 {Op.getOperand(0), HighBitcast, VecCstFSub});
20172 {FHigh.getValue(1), LowBitcast, FHigh});
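// For unsigned vXi32 -> vXf32 without AVX-512, each lane is split into 16-bit
// halves so both halves convert exactly: the low halves are tagged with the
// exponent bits of 2^23 and the high halves with 2^39, the combined bias is
// subtracted from the high part (VecCstFSub above), and the two parts are
// added. A per-lane sketch, assuming IEEE-754 floats (illustrative only):
//   uint32_t lo = (v & 0xffff) | 0x4b000000;  // float bits of 2^23 + lo16
//   uint32_t hi = (v >> 16)    | 0x53000000;  // float bits of 2^39 + hi16 * 2^16
//   float fhi = llvm::bit_cast<float>(hi) - (0x1p39f + 0x1p23f);
//   float res = llvm::bit_cast<float>(lo) + fhi;   // == (float)v after rounding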
20182 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20202 bool IsStrict = Op->isStrictFPOpcode();
20203 unsigned OpNo = IsStrict ? 1 : 0;
20207 MVT SrcVT = Src.getSimpleValueType();
20208 MVT DstVT = Op->getSimpleValueType(0);
20212 if (DstVT == MVT::f128)
20224 return LowerWin64_INT128_TO_FP(Op, DAG);
20230 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20237 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
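// A 32-bit unsigned value always fits in the non-negative range of i64, so on
// 64-bit targets UINT_TO_FP of an i32 is simply re-expressed as SINT_TO_FP of
// the zero-extended value; no fixup is needed. Scalar sketch (illustrative):
//   (double)X32u == (double)(int64_t)(uint64_t)X32u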
20252 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20257 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20260 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20261 (DstVT == MVT::f32 || DstVT == MVT::f64))
20266 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20267 Align SlotAlign(8);
20270 if (SrcVT == MVT::i32) {
20273 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20276 std::pair<SDValue, SDValue> Tmp =
20277 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20284 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20290 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20293 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20310 APInt FF(64, 0x5F80000000000000ULL);
20313 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
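// The constant-pool value built from FF above (0x5F800000 is the float
// encoding of 2^64) supports the x87 path for u64 -> FP: the value is
// converted with a signed FILD, and when its top bit was set (so FILD saw it
// as negative) a "fudge" of 2^64 is added back to correct the result; the
// offset into the constant pool selects either 0.0f or that fudge. Scalar
// sketch, assuming IEEE-754 and ignoring rounding details (illustrative only):
//   double D = (double)(int64_t)X;   // signed conversion, low by 2^64 if X >= 2^63
//   if ((int64_t)X < 0)
//     D += 0x1p64;                   // add 2^64 back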
20332 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20336 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20338 if (DstVT == MVT::f80)
20346 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20363 bool IsStrict = Op->isStrictFPOpcode();
20366 EVT DstTy = Op.getValueType();
20371 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20380 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20384 if (!IsSigned && DstTy != MVT::i64) {
20387 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20391 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20392 DstTy.getSimpleVT() >= MVT::i16 &&
20393 "Unknown FP_TO_INT to lower!");
20398 unsigned MemSize = DstTy.getStoreSize();
20407 if (UnsignedFixup) {
20427 bool LosesInfo = false;
20428 if (TheVT == MVT::f64)
20432 else if (TheVT == MVT::f80)
20437 "FP conversion should have been exact");
20447 Chain = Cmp.getValue(1);
20472 { Chain, Value, FltOfs });
20473 Chain = Value.getValue(1);
20483 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20486 SDValue Ops[] = { Chain, StackSlot };
20489 assert(FLDSize <= MemSize && "Stack slot not big enough");
20493 Chain = Value.getValue(1);
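// The UnsignedFixup path above builds FP_TO_UINT i64 on top of the signed
// conversion: values below 2^63 convert directly, larger values have 2^63
// subtracted first (FltOfs) and the missing top bit is folded back into the
// integer result. Scalar sketch, assuming truncating (round-toward-zero)
// conversion semantics (illustrative only):
//   uint64_t FToU64(double X) {
//     if (X < 0x1p63)
//       return (uint64_t)(int64_t)X;                         // in signed range
//     return (uint64_t)(int64_t)(X - 0x1p63) | (1ULL << 63); // fold 2^63 back
//   }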
20516 MVT VT = Op.getSimpleValueType();
20518 MVT InVT = In.getSimpleValueType();
20519 unsigned Opc = Op.getOpcode();
20523 "Unexpected extension opcode");
20525 "Expected same number of elements");
20529 "Unexpected element type");
20533 "Unexpected element type");
20537 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20538 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20562 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20578 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20592 MVT VT = Op->getSimpleValueType(0);
20594 MVT InVT = In.getSimpleValueType();
20608 if (!Subtarget.hasBWI()) {
20617 MVT WideVT = ExtVT;
20643 return SelectedVal;
20649 MVT SVT = In.getSimpleValueType();
20668 "Unexpected PACK opcode");
20675 EVT SrcVT = In.getValueType();
20678 if (SrcVT == DstVT)
20688 assert(SrcSizeInBits > DstSizeInBits &&
"Illegal truncation");
20696 EVT InVT = MVT::i16, OutVT = MVT::i8;
20705 if (SrcSizeInBits <= 128) {
20722 if (Hi.isUndef()) {
20729 unsigned SubSizeInBits = SrcSizeInBits / 2;
20731 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20752 int Scale = 64 / OutVT.getScalarSizeInBits();
20765 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20797 EVT SrcVT = In.getValueType();
20814 EVT SrcVT = In.getValueType();
20821 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20822 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20825 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20826 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20831 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20832     (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20833     (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20838 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20844 if (Subtarget.hasAVX512() && NumStages > 1)
20847 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20848 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20869 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20873 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20874 if (MinSignBits < NumSignBits) {
20882 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20884 if (*ShAmt == MinSignBits) {
20900 MVT SrcVT = In.getSimpleValueType();
20903 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20904 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20920 unsigned PackOpcode;
20933 MVT SrcVT = In.getSimpleValueType();
20937 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20938 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20943 if (Subtarget.hasSSSE3() && NumElems == 8) {
20944 if (SrcSVT == MVT::i16)
20946 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20965 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20968 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20972 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20984 MVT VT = Op.getSimpleValueType();
20986 MVT InVT = In.getSimpleValueType();
20992 if (Subtarget.hasBWI()) {
21008 "Unexpected vector type.");
21010 assert((NumElts == 8 || NumElts == 16) &&
"Unexpected number of elements");
21022 if (InVT == MVT::v16i8) {
21026 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21029 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21055 if (Subtarget.hasDQI())
21062 MVT VT = Op.getSimpleValueType();
21064 MVT InVT = In.getSimpleValueType();
21066 "Invalid TRUNCATE operation");
21071 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21073 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21074 "Unexpected subtarget!");
21116 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21117 assert(VT == MVT::v32i8 && "Unexpected VT!");
21125 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21133 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21136 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21147 static const int ShufMask[] = {0, 2, 4, 6};
21149 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
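// Without a native truncate, v4i64 -> v4i32 is handled by bitcasting each
// 128-bit half to v4i32 and shuffling out the low (even-indexed) dwords,
// which is what the {0, 2, 4, 6} mask above selects. The v8i16 case below
// follows the same idea at i16 granularity before recombining the halves.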
21152 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21156 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21157 -1, -1, -1, -1, -1, -1, -1, -1,
21158 16, 17, 20, 21, 24, 25, 28, 29,
21159 -1, -1, -1, -1, -1, -1, -1, -1 };
21164 static const int ShufMask2[] = {0, 2, -1, -1};
21176 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21187 MVT SrcVT = Src.getSimpleValueType();
21189 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21208 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21221 bool IsStrict = Op->isStrictFPOpcode();
21224 MVT VT = Op->getSimpleValueType(0);
21225 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21227 MVT SrcVT = Src.getSimpleValueType();
21234 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21235                    {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21236                                        {NVT, MVT::Other}, {Chain, Src})});
21237 return DAG.getNode(Op.getOpcode(), dl, VT,
21243 if (VT.isVector()) {
21244 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21245 MVT ResVT = MVT::v4i32;
21246 MVT TruncVT = MVT::v4i1;
21253 if (!IsSigned && !Subtarget.hasVLX()) {
21256 ResVT = MVT::v8i32;
21257 TruncVT = MVT::v8i1;
21258 Opc = Op.getOpcode();
21268 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21271 Res = DAG.getNode(Opc, dl, ResVT, Src);
21283 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21288 if (EleVT != MVT::i64)
21289 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21291 if (SrcVT != MVT::v8f16) {
21302 dl, {ResVT, MVT::Other}, {Chain, Src});
21325 if (VT.getVectorElementType() == MVT::i16) {
21328 "Expected f32/f64 vector!");
21333 dl, {NVT, MVT::Other}, {Chain, Src});
21349 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21350 assert(!IsSigned && "Expected unsigned conversion!");
21356 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21357 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21359 assert(!IsSigned && "Expected unsigned conversion!");
21360 assert(!Subtarget.hasVLX() && "Unexpected features!");
21361 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21362 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21388 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21389 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21391 assert(!Subtarget.hasVLX() && "Unexpected features!");
21392 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21402 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21406 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21417 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21418 if (!Subtarget.hasVLX()) {
21427 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21435 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21441 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21444 return DAG.getNode(Opc, dl, VT, Tmp);
21449 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21450 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21451 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21452 assert(!IsSigned && "Expected unsigned conversion!");
21461 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21463 if (!IsSigned && UseSSEReg) {
21470 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21471 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21472 unsigned DstBits = VT.getScalarSizeInBits();
21502 if (VT == MVT::i64)
21505 assert(VT == MVT::i32 && "Unexpected VT!");
21510 if (Subtarget.is64Bit()) {
21533 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21534 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21549 if (UseSSEReg && IsSigned)
21553 if (SrcVT == MVT::f128) {
21560 MakeLibCallOptions CallOptions;
21561 std::pair<SDValue, SDValue> Tmp =
21562 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21571 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21577 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21583 EVT DstVT = Op.getSimpleValueType();
21584 MVT SrcVT = Src.getSimpleValueType();
21589 if (SrcVT == MVT::f16)
21596 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21601 EVT DstVT = N->getValueType(0);
21603 EVT SrcVT = Src.getValueType();
21605 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21618 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21620 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21625 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21626 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21633 Chain = Src.getValue(1);
21638 StoreOps, DstVT, MPI, std::nullopt,
21641 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
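// LRINT_LLRINTHelper rounds according to the current rounding mode by going
// through the x87 FIST family: the value is stored to the stack slot when it
// is not already in an x87-friendly form, reloaded onto the x87 stack,
// written back as an i64 with FISTP, and finally loaded as the integer
// result, which is what the getStore/getLoad pair above does around the
// memory intrinsic. Conceptual sequence (sketch only):
//   fld   qword ptr [slot]     ; load the FP value onto the x87 stack
//   fistp qword ptr [slot]     ; store as i64, rounded per the current mode
//   mov   rax, qword ptr [slot]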