bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
if (Subtarget.isAtom())
else if (Subtarget.is64Bit())
if (Subtarget.hasSlowDivide32())
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
if (Subtarget.canUseCMPXCHG16B())
else if (Subtarget.canUseCMPXCHG8B())
if (Subtarget.is64Bit())
for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
if (Subtarget.canUseCMOV()) {
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (!Subtarget.useSoftFloat()) {
if (!Subtarget.is64Bit()) {
if (Subtarget.hasSSE2()) {
for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
if (Subtarget.is64Bit()) {
if (Subtarget.hasAVX10_2()) {
for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, /* ... */}) {
if (Subtarget.is64Bit()) {
if (!Subtarget.hasSSE2()) {
if (Subtarget.is64Bit()) {
} else if (!Subtarget.is64Bit())
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (Subtarget.is64Bit())
if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
if (!Subtarget.hasBMI()) {
if (Subtarget.is64Bit()) {
if (Subtarget.hasLZCNT()) {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
/* ... */ (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
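// A hedged reading of the ternary above: with F16C available, the fp16
// conversion node this action is attached to (elided in this excerpt) gets
// Custom lowering onto hardware VCVTPH2PS/VCVTPS2PH; without F16C it is
// Expanded, which for scalar half typically ends up as a libcall.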
for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
if (Subtarget.is64Bit())
if (Subtarget.hasPOPCNT()) {
if (!Subtarget.hasMOVBE())
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
if (Subtarget.hasSSEPrefetch())
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (!Subtarget.is64Bit())
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
if (Subtarget.canUseCMPXCHG16B())
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
    !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && /* ... */)
if (Subtarget.isTargetPS())
bool Is64Bit = Subtarget.is64Bit();
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
/* ... */ : &X86::FR16RegClass);
/* ... */ : &X86::FR32RegClass);
/* ... */ : &X86::FR64RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setF16Action(MVT::f16, Promote);
} else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
           (UseX87 || Is64Bit)) {
for (auto VT : { MVT::f32, MVT::f64 }) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f));
addLegalFPImmediate(APFloat(+1.0f));
addLegalFPImmediate(APFloat(-0.0f));
addLegalFPImmediate(APFloat(-1.0f));
addLegalFPImmediate(APFloat(+0.0f));
addLegalFPImmediate(APFloat(+0.0));
addLegalFPImmediate(APFloat(+1.0));
addLegalFPImmediate(APFloat(-0.0));
addLegalFPImmediate(APFloat(-1.0));
addLegalFPImmediate(APFloat(+0.0));
addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt2);
addLegalFPImmediate(TmpFlt2);
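// Why these particular immediates: x87 can materialize +0.0 and +1.0 with
// FLDZ/FLD1 and negate with FCHS, so +/-0.0 and +/-1.0 are essentially free
// and are registered as legal FP immediates above. The TmpFlt/TmpFlt2 pairs
// appear to register the same values built programmatically (sign-flipped
// copies); that reading is an inference, the construction is elided here.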
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
/* ... */ : &X86::VR128RegClass);
for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                 MVT::v4f32, MVT::v8f32, MVT::v16f32,
                 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
/* ... */ : &X86::VR128RegClass);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
/* ... */ : &X86::VR128RegClass); // same VR128X-vs-VR128 ternary repeats for
                                  // each 128-bit vector type added here
for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
if (Subtarget.hasPCLMUL()) {
for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
setF16Action(MVT::v8f16, Expand);
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
if (!Subtarget.hasAVX512())
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
if (VT == MVT::v2i64) continue;
if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
/* ... */ : &X86::VR256RegClass); // same VR256X-vs-VR256 ternary repeats for
                                  // each 256-bit vector type added here
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
if (!Subtarget.hasAVX512())
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
if (VT == MVT::v4i64) continue;
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                 MVT::v2f64, MVT::v4f64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
setF16Action(MVT::v16f16, Expand);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
    Subtarget.hasF16C()) {
for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
if (!Subtarget.hasDQI()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
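// Note the progression above: under AVX-512 the vXi1 mask-register types
// (v1i1..v16i1) get their own operation actions, and with DQI+VLX further
// 128/256-bit FP conversion patterns become available. This grouping is
// inferred from the feature guards visible in the excerpt.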
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
bool HasBWI = Subtarget.hasBWI();
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
if (Subtarget.hasDQI())
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
if (Subtarget.hasDQI() || Subtarget.hasFP16())
if (Subtarget.hasDQI())
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
                 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
setF16Action(MVT::v32f16, Expand);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
if (Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
                /* ... */}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
               MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
               MVT::v16f32, MVT::v8f64})
if (Subtarget.hasDQI()) {
/* ... */ "Unexpected operation action!");
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
if (Subtarget.hasDQI()) {
if (Subtarget.hasCDI()) {
for (auto VT : {MVT::i256, MVT::i512}) {
if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
               MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
               MVT::v16i16, MVT::v8i8})
for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
if (Subtarget.hasVLX())
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
               MVT::v4f64, MVT::v2i64, MVT::v2f64})
if (Subtarget.hasVBMI2())
for (MVT VT : {MVT::v32i16, MVT::v64i8})
if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
for (auto VT : { MVT::v16i1, MVT::v32i1 })
for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
                MVT::v16f16, MVT::v8f16}) {
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
auto setGroup = [&](MVT VT) {
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
if (Subtarget.hasVLX()) {
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
if (!Subtarget.useSoftFloat() &&
    (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
/* ... */ : &X86::VR128RegClass);
addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
setF16Action(VT, Expand);
if (!Subtarget.hasBF16())
if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
    Subtarget.useAVX512Regs()) {
setF16Action(MVT::v32bf16, Expand);
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
if (Subtarget.hasBWI()) {
if (Subtarget.hasFP16()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
if (!Subtarget.is64Bit()) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
if (Subtarget.isTargetWin64()) {
if (Subtarget.is32Bit() &&
    (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
if (Subtarget.isOSWindows()) {
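// Sketch of how the tables filled above are consumed later (an assumption,
// not part of this excerpt): legalization queries the recorded action, e.g.
//   if (TLI.getOperationAction(ISD::CTLZ, MVT::i32) == TargetLowering::Legal)
//     ; // a plain LZCNT can be emitted on targets where hasLZCNT() held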
unsigned IdxVal = Op.getConstantOperandVal(2);
if (IdxVal == 0 && Vec.isUndef())
MVT OpVT = Op.getSimpleValueType();
assert(IdxVal + SubVecNumElems <= NumElems && /* ... */
       "Unexpected index value in INSERT_SUBVECTOR");
/* ... */ Undef, SubVec, ZeroIdx);
assert(IdxVal != 0 && "Unexpected index");
assert(IdxVal != 0 && "Unexpected index");
/* ... */ [](SDValue V) { return V.isUndef(); })) {
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
if (ShiftRight != 0)
if (IdxVal + SubVecNumElems == NumElems) {
if (SubVecNumElems * 2 == NumElems) {
/* ... */ Undef, Vec, ZeroIdx);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
unsigned LowShift = NumElems - IdxVal;
unsigned HighShift = IdxVal + SubVecNumElems;
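// Worked example for the shift math above: with NumElems = 16,
// SubVecNumElems = 4, IdxVal = 4 we get ShiftLeft = 12 and ShiftRight = 8.
// Shifting the widened subvector left by 12 clears everything above it,
// and shifting right by 8 parks its first bit at position 4 (= IdxVal),
// zeroing the surrounding bits.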
/* ... */ bool AllowWholeUndefs = true, bool AllowPartialUndefs = false) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
EVT VT = Op.getValueType();
unsigned NumElts = SizeInBits / EltSizeInBits;
if ((SizeInBits % EltSizeInBits) != 0)
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
       "Constant bit sizes don't match");
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
UndefElts = APInt(NumElts, 0);
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
if (!AllowWholeUndefs)
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
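// Worked example for the repack above: rescaling two i64 constants into
// four i32 results. insertBits writes the 64-bit values at offsets 0 and 64
// of MaskBits; extractBits(32, BitOffset) then reads offsets 0/32/64/96, so
// a source element 0x00000001'00000002 yields the i32 elements 0x2 then 0x1
// (APInt bit offsets count from the least significant bit).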
/* ... */ unsigned UndefBitIndex) {
Undefs.setBit(UndefBitIndex);
Mask = CInt->getValue();
Mask = CFP->getValueAPF().bitcastToAPInt();
Type *Ty = CDS->getType();
Type *EltTy = CDS->getElementType();
if (!IsInteger && !IsFP)
for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(), /* ... */);
return CastBitData(UndefSrcElts, SrcEltBits);
return CastBitData(UndefSrcElts, SrcEltBits);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
return CastBitData(UndefSrcElts, SrcEltBits);
if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
return CastBitData(UndefSrcElts, SrcEltBits);
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
if ((SizeInBits % SrcEltSizeInBits) != 0)
APInt UndefSrcElts(NumSrcElts, 0);
for (unsigned i = 0; i != NumSrcElts; ++i)
return CastBitData(UndefSrcElts, SrcEltBits);
SDValue Ptr = MemIntr->getBasePtr();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
SDValue Ptr = MemIntr->getBasePtr();
unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
    (SizeInBits % SubVecSizeInBits) != 0)
unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
APInt UndefSubElts(NumSubElts, 0);
/* ... */ APInt(CstEltSizeInBits, 0));
for (unsigned i = 0; i != NumSubElts; ++i) {
for (unsigned j = 1; j != NumSubVecs; ++j)
SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
return CastBitData(UndefSubElts, SubEltBits);
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
APInt UndefSrcElts, UndefSubElts;
/* ... */ UndefSubElts, EltSubBits,
          AllowWholeUndefs && AllowUndefs,
          AllowPartialUndefs && AllowUndefs) &&
/* ... */ UndefSrcElts, EltSrcBits,
          AllowWholeUndefs && AllowUndefs,
          AllowPartialUndefs && AllowUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltSrcBits[BaseIdx + i] = EltSubBits[i];
return CastBitData(UndefSrcElts, EltSrcBits);
/* ... */ EltBits, AllowWholeUndefs, AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
unsigned BaseIdx = BaseOfs / EltSizeInBits;
/* ... */ (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if ((!AllowWholeUndefs || !AllowPartialUndefs) && /* ... */)
APInt UndefElts0, UndefElts1;
/* ... */ UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs))
/* ... */ UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs))
for (int i = 0; i != (int)NumElts; ++i) {
} else if (M < (int)NumElts) {
if (UndefElts1[M - NumElts])
EltBits.push_back(EltBits1[M - NumElts]);
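// The tail above handles a shuffle of two constant inputs: for each mask
// entry M, M < NumElts selects from the first decoded constant (EltBits0)
// and larger M indexes EltBits1[M - NumElts], so constant bits survive one
// level of shuffling.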
MVT VT = N.getSimpleValueType();
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
bool IsFakeUnary = false;
switch (N.getOpcode()) {
// Each immediate-controlled two-input shuffle case follows the same pattern,
// shown once here; in this excerpt it repeats verbatim for the remaining
// cases because the distinguishing case labels were not captured:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
// EXTRQI/INSERTQI-style cases additionally decode a bit length and index:
int BitLen = N.getConstantOperandVal(1);
int BitIdx = N.getConstantOperandVal(2);
int BitLen = N.getConstantOperandVal(2);
int BitIdx = N.getConstantOperandVal(3);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
/* ... */ "Only 32-bit and 64-bit elements are supported!");
// Some cases push the operands in swapped order:
Ops.push_back(N.getOperand(1));
Ops.push_back(N.getOperand(0));
// Single-input cases only validate operand 0 before decoding the immediate:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
if (N.getOperand(0).getValueType() == VT) {
// Variable-mask cases fetch the mask node (and, for the VPERMIL2-style
// case, a control immediate) instead of a trailing constant:
SDValue MaskNode = N.getOperand(1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
SDValue MaskNode = N.getOperand(2);
SDValue CtrlNode = N.getOperand(3);
unsigned CtrlImm = CtrlOp->getZExtValue();
Ops.push_back(N.getOperand(1));
SDValue MaskNode = N.getOperand(0);
assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(2));
SDValue MaskNode = N.getOperand(1);
// Compression: selected elements are packed to the front, the remainder
// taken from the pass-through vector.
SDValue PassThru = N.getOperand(1);
/* ... */ "Illegal compression mask");
for (unsigned I = 0; I != NumElems; ++I) {
while (Mask.size() != NumElems) {
Mask.push_back(NumElems + Mask.size());
Ops.push_back(CmpVec);
Ops.push_back(PassThru);
// Expansion: the inverse mapping, scattering packed elements outward.
SDValue PassThru = N.getOperand(1);
/* ... */ "Illegal expansion mask");
unsigned ExpIndex = 0;
for (unsigned I = 0; I != NumElems; ++I) {
Mask.push_back(I + NumElems);
Mask.push_back(ExpIndex++);
Ops.push_back(ExpVec);
Ops.push_back(PassThru);
if (!AllowSentinelZero && isAnyZero(Mask))
if (M >= (int)Mask.size())
Ops.push_back(N.getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N.getOperand(1));
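// Summary of the switch above: every case validates that its operands match
// the shuffle's value type, decodes the trailing immediate where present,
// and flags "fake unary" shuffles whose two operands are the same node so
// callers can treat them as single-input; the COMPRESS/EXPAND-style cases
// instead synthesize a mask from the packed or scattered element positions.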
/* ... */ bool ResolveKnownElts) {
MVT VT = N.getSimpleValueType();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
uint64_t ZeroMask = IsAndN ? 255 : 0;
assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
Ops.push_back(IsAndN ? N1 : N0);
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
for (int i = 0; i != (int)MaskSize; ++i) {
Mask.push_back(i + MaskSize);
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
if (NumBitsPerElt == 64) {
for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
for (unsigned M = 0; M != NumSubElts; ++M)
Mask.push_back((I * NumElts) + M);
Ops.push_back(N.getOperand(I));
EVT SubVT = Sub.getValueType();
uint64_t InsertIdx = N.getConstantOperandVal(2);
if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
Mask.resize(NumElts);
std::iota(Mask.begin(), Mask.end(), 0);
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
    Src.getOperand(0).isUndef() &&
    Src.getOperand(1).getValueType() == SubVT &&
    Src.getConstantOperandVal(2) == 0 &&
    (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) && /* ... */) {
Mask.resize(NumElts);
std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
Ops.push_back(Src.getOperand(1));
if (!N->isOnlyUserOf(Sub.getNode()))
unsigned NumSubSrcSrcElts = /* ... */;
unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
       "Subvector valuetype mismatch");
InsertIdx *= (MaxElts / NumElts);
ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
NumSubElts *= (MaxElts / NumElts);
bool SrcIsUndef = Src.isUndef();
for (int i = 0; i != (int)MaxElts; ++i)
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
Ops.push_back(SubSrcSrc);
/* ... */ Depth + 1, ResolveKnownElts))
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
        (NumSubElts % SubMask.size()) == 0) &&
       "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SubMask = ScaledSubMask;
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
for (int i = 0; i != (int)NumElts; ++i)
for (int i = 0; i != (int)NumSubElts; ++i) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
Mask[i + InsertIdx] = M;
unsigned DstIdx = 0;
/* ... */ N.getConstantOperandAPInt(2).uge(NumElts))
DstIdx = N.getConstantOperandVal(2);
Ops.push_back(N.getOperand(0));
for (unsigned i = 0; i != NumElts; ++i)
if ((MinBitsPerElt % 8) != 0)
unsigned DstByte = DstIdx * NumBytesPerElt;
Ops.push_back(SrcVec);
Ops.push_back(SrcVec);
Ops.push_back(N.getOperand(0));
for (int i = 0; i != (int)NumSizeInBytes; ++i)
Mask.push_back(NumSizeInBytes + i);
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
/* ... */ "Unexpected input value type");
APInt EltsLHS, EltsRHS;
bool Offset0 = false, Offset1 = false;
bool IsUnary = (N0 == N1);
if (Offset0 || Offset1) {
if ((Offset0 && isInRange(M, 0, NumElts)) ||
    (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
Ops.push_back(N.getOperand(1));
Ops.push_back(N.getOperand(2));
EVT SrcVT = Src.getValueType();
unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
for (unsigned i = 0; i != NumSrcElts; ++i)
Mask.push_back(i * Scale);
for (unsigned I = 0; I != NumElts; ++I)
if (DemandedElts[I] && !UndefElts[I] &&
    (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
Ops.push_back(N.getOperand(0));
for (unsigned I = 0; I != NumElts; ++I) {
if (!DemandedElts[I] || UndefElts[I])
unsigned ByteShift = EltBits[I].getZExtValue() / 8;
unsigned Lo = I * NumBytesPerElt;
unsigned Hi = Lo + NumBytesPerElt;
std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift, /* ... */);
uint64_t ShiftVal = N.getConstantOperandVal(1);
if (NumBitsPerElt <= ShiftVal) {
if ((ShiftVal % 8) != 0)
Ops.push_back(N.getOperand(0));
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
if ((RotateVal % 8) != 0)
Ops.push_back(N.getOperand(0));
int Offset = RotateVal / 8;
for (int i = 0; i != (int)NumElts; ++i) {
int BaseIdx = i * NumBytesPerElt;
for (int j = 0; j != (int)NumBytesPerElt; ++j) {
Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
if (!Src.getSimpleValueType().isVector()) {
if (/* ... */ Src.getOperand(0).getValueType().getScalarType() != /* ... */)
Src = Src.getOperand(0);
Mask.append(NumElts, 0);
EVT SrcVT = Src.getValueType();
/* ... */ (NumBitsPerSrcElt % 8) != 0)
APInt DemandedSrcElts = /* ... */;
assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
for (unsigned I = 0; I != NumElts; ++I)
Mask.append(Scale, I);
EVT SrcVT = Src.getValueType();
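// Worked example for the rotate mask just above: a 32-bit element rotated
// by 8 bits gives Offset = 1, so result byte j draws from source byte
// (1 + j) % 4 of the same element -- the element rotate decomposes into a
// pure byte shuffle, which is exactly what the mask records.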
/* ... */ bool IsAfterLegalize, unsigned Depth = 0) {
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
for (unsigned i = 0; i < NumElems; ++i) {
if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
/* ... */ "Incomplete element masks");
if (UndefMask.popcount() == NumElems)
/* ... */ "Register/Memory size mismatch");
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
if (ByteOffsets[FirstLoadedElt] != 0)
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
        Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
int Stride = EltIdx - FirstLoadedElt;
unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
    (BaseMemSizeInBits % BaseSizeInBits) == 0) {
unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
/* ... */ "Cannot merge volatile or atomic loads.");
for (auto *LD : Loads)
if (FirstLoadedElt == 0 &&
    (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
    (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
return CreateLoad(VT, LDBase);
if (!IsAfterLegalize && VT.isVector()) {
if ((NumMaskElts % NumElems) == 0) {
unsigned Scale = NumMaskElts / NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
int Offset = ZeroMask[i] ? NumMaskElts : 0;
for (unsigned j = 0; j != Scale; ++j)
ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
SDValue V = CreateLoad(VT, LDBase);
unsigned HalfNumElems = NumElems / 2;
/* ... */ DAG, Subtarget, IsAfterLegalize, Depth + 1);
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
    ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
     LoadSizeInBits == 64) && /* ... */)
if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
for (auto *LD : Loads)
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
if (!Subtarget.hasAVX2() && ScalarSize < 32)
if (RepeatSize > ScalarSize && SubElems == 1)
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
Match &= (RepeatedLoads[i % SubElems] == Elt);
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (RepeatSize > ScalarSize)
/* ... */ RepeatSize / ScalarSize);
/* ... */ RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize, /* ... */
SDValue Broadcast = RepeatLoad;
if (RepeatSize > ScalarSize) {
/* ... */ VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
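// The routine above tries strategies in order: a single wide load when all
// defined elements come from consecutive memory (zero-filled tails tolerated
// if the base load is dereferenceable), a wide load blended against zero via
// ClearMask, a half-width recursion, and finally a broadcast of a repeated
// subvector load pattern (including a reversed-element retry).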
9109 "Illegal variable permute mask size");
9117 SDLoc(IndicesVec), SizeInBits);
9121 IndicesVT, IndicesVec);
9133 Subtarget, DAG,
SDLoc(IndicesVec));
9158 for (
uint64_t i = 0; i != Scale; ++i) {
9159 IndexScale |= Scale << (i * NumDstBits);
9160 IndexOffset |= i << (i * NumDstBits);
9170 unsigned Opcode = 0;
9179 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9183 ShuffleVT = MVT::v16i8;
9188 if (Subtarget.
hasAVX()) {
9190 ShuffleVT = MVT::v4f32;
9193 ShuffleVT = MVT::v16i8;
9198 if (Subtarget.
hasAVX()) {
9202 ShuffleVT = MVT::v2f64;
9214 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9216 else if (Subtarget.hasXOP()) {
9225 }
else if (Subtarget.
hasAVX()) {
9248 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9250 else if (Subtarget.
hasAVX()) {
9252 IndicesVec = ScaleIndices(IndicesVec, 2);
9255 MVT::v32i8, DAG.
getBitcast(MVT::v32i8, SrcVec),
9256 DAG.
getBitcast(MVT::v32i8, IndicesVec),
DL, DAG, Subtarget));
9263 else if (Subtarget.
hasAVX()) {
9266 {0, 1, 2, 3, 0, 1, 2, 3});
9268 {4, 5, 6, 7, 4, 5, 6, 7});
9269 if (Subtarget.hasXOP())
9286 if (!Subtarget.hasVLX()) {
9288 SrcVec =
widenSubVector(WidenSrcVT, SrcVec,
false, Subtarget, DAG,
9290 IndicesVec =
widenSubVector(MVT::v8i64, IndicesVec,
false, Subtarget,
9291 DAG,
SDLoc(IndicesVec));
9297 }
else if (Subtarget.
hasAVX()) {
9305 if (Subtarget.hasXOP())
9320 if (Subtarget.hasVBMI())
9324 if (Subtarget.hasBWI())
9340 "Illegal variable permute shuffle type");
9344 IndicesVec = ScaleIndices(IndicesVec, Scale);
9347 IndicesVec = DAG.
getBitcast(ShuffleIdxVT, IndicesVec);
9351 ? DAG.
getNode(Opcode,
DL, ShuffleVT, IndicesVec, SrcVec)
9352 : DAG.
getNode(Opcode,
DL, ShuffleVT, SrcVec, IndicesVec);
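// Worked example for the ScaleIndices math above (a sketch, assuming byte
// destination elements, i.e. NumDstBits = 8): driving a byte shuffle from
// 16-bit indices uses Scale = 2, so the loop packs IndexScale = 0x0202 and
// IndexOffset = 0x0100 per word lane; multiplying each word index i by
// IndexScale and adding IndexOffset yields the byte pair {2*i, 2*i + 1}.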
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
/* ... */ Zeroable, Subtarget, DAG))
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (Subtarget.preferLowerShuffleAsShift()) {
/* ... */ Subtarget, DAG, true))
if (NumV2Elements == 0)
if (NumV2Elements == 0) {
if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
/* ... */ Mask, Subtarget, DAG))
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
Mask = UnpackLoMask;
Mask = UnpackHiMask;
if (NumV2Elements == 1)
/* ... */ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
                                              /* ... */))
if (IsBlendSupported)
/* ... */ Zeroable, Subtarget, DAG);
/* ... */ Mask, Subtarget, DAG))
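// Strategy order for v4i32 above, as the guards suggest: shifts first on
// targets that prefer them, unpack-style splats for single-input masks,
// single-element insertion when exactly one V2 lane is used, then SSE4.1
// blends, with VALIGN (VLX-gated) and byte rotates as fallbacks before the
// generic two-input blend path.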
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
int NumHToL = LoInputs.size() - NumLToL;
int NumHToH = HiInputs.size() - NumLToH;
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = {-1, -1, -1, -1};
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
if (M0 < 0 && M1 < 0)
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                        DWordPairs[1].first, DWordPairs[1].second};
std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
/* ... */ int AOffset, int BOffset) {
/* ... */ "Must call this with A having 3 or 1 inputs from the A half.");
/* ... */ "Must call this with B having 1 or 3 inputs from the B half.");
/* ... */ "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
    TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
OneInputDWord = (OneInput / 2) ^ 1;
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + /* ... */;
int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + /* ... */;
if ((NumFlippedAToBInputs == 1 &&
     (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
    (NumFlippedBToBInputs == 1 &&
     (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                               /* ... */) {
int FixIdx = PinnedIdx ^ 1;
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
assert(IsFixIdxInput != IsFixFreeIdxInput &&
       "We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
else if (M >= 0 && M == FixFreeIdx)
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
for (int &M : Mask)
if (M >= 0 && M / 2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M / 2 == BDWord)
M = 2 * ADWord + M % 2;
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
auto fixInPlaceInputs = /* ... */ {
if (InPlaceInputs.empty())
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
    InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
if (IncomingInputs.empty()) {
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
    InPlaceInputs[0] - HalfOffset;
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
auto moveInputsToRightHalf = [&PSHUFDMask](/* ... */) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                           /* ... */) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
       isWordClobbered(SourceHalfMask, HighWord);
if (IncomingInputs.empty())
if (ExistingInputs.empty()) {
for (int Input : IncomingInputs) {
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
    Input - SourceOffset;
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
           Input - SourceOffset &&
       "Previous placement doesn't match!");
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == /* ... */
       "Previous placement doesn't match!");
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                 /* ... */;
SourceHalfMask[InputFixed - SourceOffset] =
    IncomingInputs[0] - SourceOffset;
IncomingInputs[0] = InputFixed;
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
    isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                      IncomingInputs[1] - SourceOffset};
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
    SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
           SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
       "We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
       "Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
M = FreeDWord * 2 + Input % 2;
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                      /* ... */);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                      /* ... */);
/* ... */ "Failed to lift all the high half inputs to the low mask!");
assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
       "Failed to lift all the low half inputs to the high mask!");
for (int &M : HiMask)
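// The half-shuffle fixing above composes at most PSHUFLW + PSHUFHW + PSHUFD:
// inputs are first balanced so each 64-bit half sources from a single dword
// pair, flipped word entries are repaired in place, and the final PSHUFD
// only moves whole dwords; every step also rewrites Mask so later steps see
// a consistent view.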
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
/* ... */ Zeroable, Subtarget, DAG))
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
/* ... */ Subtarget, DAG, false))
/* ... */ Mask, Subtarget, DAG))
/* ... */ "All single-input shuffles should be canonicalized to be V1-input "
          /* ... */);
if (Subtarget.hasSSE4A())
if (NumV2Inputs == 1)
/* ... */ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
    !Subtarget.hasVLX()) {
unsigned PackOpc = 0;
if (NumEvenDrops == 2 && Subtarget.hasAVX2() && /* ... */)
} else if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
} else if (!Subtarget.hasSSSE3()) {
if (NumEvenDrops == 2) {
Result = DAG.getBitcast(MVT::v4i32, Result);
Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
if (NumOddDrops == 1) {
bool HasSSE41 = Subtarget.hasSSE41();
/* ... */ MVT::v8i16, V1, V2);
/* ... */ Mask, Subtarget, DAG))
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
/* ... */ Zeroable, DAG, V1InUse, V2InUse);
/* ... */ Zeroable, Subtarget, DAG);
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
/* ... */ Zeroable, Subtarget, DAG))
if (Subtarget.hasSSE4A())
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
if (NumV2Elements == 0) {
/* ... */ Mask, Subtarget, DAG))
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
copy_if(Mask, std::back_inserter(LoInputs),
        [](int M) { return M >= 0 && M < 8; });
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I / 2] = I / 2;
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
while (j < je && PreDupI16Shuffle[j] >= 0)
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
/* ... */ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
bool EvenInUse = false, OddInUse = false;
for (int i = 0; i < 16; i += 2) {
EvenInUse |= (Mask[i + 0] >= 0);
OddInUse |= (Mask[i + 1] >= 0);
if (EvenInUse && OddInUse)
/* ... */ MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
          OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
       "Conflicting entries in the original shuffle!");
/* ... */ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
if (SDValue V = tryToWidenViaDuplication())
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
bool IsSingleInput = V2.isUndef();
if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
/* ... */ DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
if (V1InUse && V2InUse) {
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
if (Subtarget.hasVBMI())
if (Subtarget.hasXOP()) {
/* ... */ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
if (NumV2Elements == 1)
/* ... */ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
if (NumEvenDrops) {
assert(NumEvenDrops <= 3 &&
       "No support for dropping even elements more than 3 times.");
for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
if (!IsSingleInput)
/* ... */ IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
if (NumOddDrops == 1) {
if (!IsSingleInput)
/* ... */ IsSingleInput ? V1 : V2);
if (NumV2Elements > 0)
/* ... */ Zeroable, Subtarget, DAG);
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
    none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
VHiHalf = DAG.getUNDEF(MVT::v8i16);
for (int &M : LoBlendMask)
for (int &M : HiBlendMask)
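// Worked sketch of the even-drop path above (assuming the usual mask+pack
// lowering): with NumEvenDrops == 1 the wanted bytes sit at even offsets
// {0, 2, ..., 30}; masking each 16-bit lane to its low byte and packing once
// compacts them into 16 bytes, and each additional drop level repeats the
// mask+pack round on the previous result.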
int NumLaneElts = NumElts / NumLanes;
for (unsigned BroadcastSize : {16, 32, 64}) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
if (0 <= R && R != M)
if (!FindRepeatingBroadcastMask(RepeatMask))
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
if (BroadcastMask == Mask)
auto ShuffleSubLanes = [&](int SubLaneScale) {
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
int TopSrcSubLane = -1;
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
if (M1[i] != M2[i])
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
       "Unexpected mask element");
RepeatedSubLaneMask[i] = M;
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
if (Dst2SrcSubLanes[DstSubLane] < 0)
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
       "Unexpected source lane");
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
if (RepeatedMask == Mask || SubLaneMask == Mask)
int MinSubLaneScale = 1, MaxSubLaneScale = 1;
MinSubLaneScale = 2;
/* ... */ (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
if (Subtarget.hasBWI() && VT == MVT::v64i8)
MinSubLaneScale = MaxSubLaneScale = 4;
for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
if (SDValue Shuffle = ShuffleSubLanes(Scale))
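// Scale selection above, as inferred from the guards: candidate sub-lane
// scales start at 2, v32i8 single-input shuffles may also try 4, and v64i8
// with BWI is pinned to 4. Each candidate must decompose the mask into a
// per-sub-lane repeated mask plus a sub-lane permutation (RepeatedMask and
// SubLaneMask), and is rejected if either equals the original mask.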
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
if (Subtarget.preferLowerShuffleAsShift()) {
/* ... */ Subtarget, DAG, true))
if (NumV2Elements == 0)
bool Is128BitLaneRepeatedShuffle = /* ... */;
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
if (Subtarget.hasVLX()) {
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
/* ... */ CastV1, CastV2, DAG);
/* ... */ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG);
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
/* ... */ DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Subtarget, DAG, false))
/* ... */ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
/* ... */ DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
/* ... */ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
/* ... */ Zeroable, Subtarget, DAG))
if (Subtarget.hasBWI())
/* ... */ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
/* ... */ DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ Zeroable, Subtarget, DAG))
/* ... */ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
/* ... */ DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
/* ... */ Zeroable, Subtarget, DAG))
if (Subtarget.hasVBMI())
/* ... */ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
/* ... */ DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
if (Subtarget.hasVLX())
/* ... */ Mask, Zeroable, DAG))
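// The v32i8 path mirrors the v16i16 one above: blend/rotate attempts first,
// then the lane-permute-and-repeat fallback, with a VBMI-gated variable
// permute and a VLX-gated final option at the end (ordering inferred from
// the feature guards visible in this excerpt).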