135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
163 if (Subtarget.hasSlowDivide32())
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
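// Together these checks appear to select the widest lock-free atomic width
// available: 128-bit when CMPXCHG16B is usable, otherwise 64-bit with
// CMPXCHG8B (the exact action settings live in the elided lines).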
184 if (Subtarget.is64Bit())
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
207 if (Subtarget.canUseCMOV()) {
210 if (Subtarget.is64Bit())
219 if (Subtarget.is64Bit())
227 if (Subtarget.is64Bit())
238 if (Subtarget.is64Bit())
242 if (!Subtarget.useSoftFloat()) {
306 if (!Subtarget.is64Bit()) {
312 if (Subtarget.hasSSE2()) {
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
321 if (Subtarget.is64Bit()) {
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
336 if (Subtarget.is64Bit()) {
347 if (!Subtarget.hasSSE2()) {
352 if (Subtarget.is64Bit()) {
357 } else if (!Subtarget.is64Bit())
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382                  MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
386 if (Subtarget.is64Bit())
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
415 if (!Subtarget.hasBMI()) {
418 if (Subtarget.is64Bit()) {
424 if (Subtarget.hasLZCNT()) {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
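// With F16C the half<->float conversions can be custom-lowered to the
// VCVTPH2PS/VCVTPS2PH instructions; without it (or under soft-float) the
// operation is expanded instead.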
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
486 if (!Subtarget.hasMOVBE())
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
535 if (Subtarget.hasSSEPrefetch())
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
551 if (!Subtarget.is64Bit())
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
561 if (Subtarget.canUseCMPXCHG16B())
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
579 if (Subtarget.isTargetPS())
587 bool Is64Bit = Subtarget.is64Bit();
642 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
646 : &X86::FR16RegClass);
648 : &X86::FR32RegClass);
650 : &X86::FR64RegClass);
658 for (auto VT : { MVT::f32, MVT::f64 }) {
679 setF16Action(MVT::f16, Promote);
736 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
737 (UseX87 || Is64Bit)) {
775 for (auto VT : { MVT::f32, MVT::f64 }) {
788 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
789 addLegalFPImmediate(APFloat(+0.0f));
790 addLegalFPImmediate(APFloat(+1.0f));
791 addLegalFPImmediate(APFloat(-0.0f));
792 addLegalFPImmediate(APFloat(-1.0f));
794 addLegalFPImmediate(APFloat(+0.0f));
799 addLegalFPImmediate(APFloat(+0.0));
800 addLegalFPImmediate(APFloat(+1.0));
801 addLegalFPImmediate(APFloat(-0.0));
802 addLegalFPImmediate(APFloat(-1.0));
804 addLegalFPImmediate(APFloat(+0.0));
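// x87 can synthesize these immediates directly (FLDZ/FLD1, plus FCHS for
// the negated forms), so they are registered as legal rather than being
// loaded from the constant pool.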
835 addLegalFPImmediate(TmpFlt);
837 addLegalFPImmediate(TmpFlt);
843 addLegalFPImmediate(TmpFlt2);
845 addLegalFPImmediate(TmpFlt2);
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
896 : &X86::VR128RegClass);
973 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1065 : &X86::VR128RegClass);
1093 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1095 : &X86::VR128RegClass);
1100 : &X86::VR128RegClass);
1102 : &X86::VR128RegClass);
1104 : &X86::VR128RegClass);
1106 : &X86::VR128RegClass);
1108 : &X86::VR128RegClass);
1110 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1117 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1118                  MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1158 if (Subtarget.hasPCLMUL()) {
1159 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1191 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1211 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1219 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1224 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1230 setF16Action(MVT::v8f16, Expand);
1255 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1301 if (!Subtarget.hasAVX512())
1329 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1333 if (VT == MVT::v2i64) continue;
1347 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1353 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1358 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1363 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1375 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1376 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1416 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1431 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1443 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1448 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1449                  MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1455 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1459 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1460 bool HasInt256 = Subtarget.hasInt256();
1463 : &X86::VR256RegClass);
1465 : &X86::VR256RegClass);
1467 : &X86::VR256RegClass);
1469 : &X86::VR256RegClass);
1471 : &X86::VR256RegClass);
1473 : &X86::VR256RegClass);
1475 : &X86::VR256RegClass);
1477 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1540 if (!Subtarget.hasAVX512())
1545 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1551 if (VT == MVT::v4i64) continue;
1572 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1602 if (Subtarget.hasAnyFMA()) {
1603 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1604                  MVT::v2f64, MVT::v4f64 }) {
1610 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1651 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1659 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1681 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1682                  MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1689 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1690                  MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1695 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1696                 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1707 setF16Action(MVT::v16f16, Expand);
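// setF16Action(VT, Expand) treats the f16 vector type as storage-only:
// loads/stores remain usable while most arithmetic on it is expanded
// (typically by promoting to f32).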
1717 if (Subtarget.hasPCLMUL()) {
1718 for (auto VT : {MVT::v8i32, MVT::v4i64}) {
1731 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1732                  MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1737 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1738 Subtarget.hasF16C()) {
1739 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1743 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1758 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1786 if (!Subtarget.hasDQI()) {
1799 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1805 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1808 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1821 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1824 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1825 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1834 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1835 bool HasBWI = Subtarget.hasBWI();
1855 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1871 if (Subtarget.hasDQI())
1878 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1885 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1922 if (!Subtarget.hasVLX()) {
1923 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1924                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1950 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1967 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1994 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2018 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2027 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2048 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2054 if (Subtarget.hasDQI())
2057 if (Subtarget.hasCDI()) {
2059 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2064 if (Subtarget.hasVPOPCNTDQ()) {
2065 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2072 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2073                  MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2076 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2077                  MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2088 setF16Action(MVT::v32f16, Expand);
2097 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2104 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2113 if (Subtarget.hasVBMI2()) {
2114 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2124 if (Subtarget.hasPCLMUL()) {
2125 for (auto VT : {MVT::v16i32, MVT::v8i64}) {
2136 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2137 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2147 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2148 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2149 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2150 MVT::v16f32, MVT::v8f64})
2159 if (Subtarget.hasDQI()) {
2164 "Unexpected operation action!");
2172 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2180 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2189 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2190                  MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2193 if (Subtarget.hasDQI()) {
2204 if (Subtarget.hasCDI()) {
2205 for (auto VT : {MVT::i256, MVT::i512}) {
2206 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2213 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2218 if (Subtarget.hasVPOPCNTDQ()) {
2219 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2226 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2227 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2228 MVT::v16i16, MVT::v8i8})
2233 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2237 if (Subtarget.hasVLX())
2238 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2239 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2243 if (Subtarget.hasVBMI2())
2244 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2248 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2249 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2255 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2259 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2272 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2280 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2281                 MVT::v16f16, MVT::v8f16}) {
2290 if (Subtarget.hasBITALG()) {
2291 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2296 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2297 auto setGroup = [&](MVT VT) {
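// setGroup is a helper that applies the same batch of action settings to
// one FP16 vector type; below it is invoked once per legal register width
// (v32f16 with 512-bit registers, v8f16/v16f16 under VLX).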
2367 if (Subtarget.useAVX512Regs()) {
2368 setGroup(MVT::v32f16);
2419 if (Subtarget.hasVLX()) {
2420 setGroup(MVT::v8f16);
2421 setGroup(MVT::v16f16);
2472 if (!Subtarget.useSoftFloat() &&
2473 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2475 : &X86::VR128RegClass);
2476 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2477 : &X86::VR256RegClass);
2483 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2484 setF16Action(VT, Expand);
2485 if (!Subtarget.hasBF16())
2502 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2503 Subtarget.useAVX512Regs()) {
2505 setF16Action(MVT::v32bf16, Expand);
2516 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2528 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2541 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2547 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2560 if (Subtarget.hasBWI()) {
2565 if (Subtarget.hasFP16()) {
2597 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2605 if (!Subtarget.is64Bit()) {
2615 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2616 if (VT == MVT::i64 && !Subtarget.is64Bit())
2638 if (Subtarget.isTargetWin64()) {
2657 if (Subtarget.is32Bit() &&
2658 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2686 if (Subtarget.isOSWindows()) {
4659 unsigned IdxVal = Op.getConstantOperandVal(2);
4665 if (IdxVal == 0 && Vec.isUndef())
4668 MVT OpVT = Op.getSimpleValueType();
4687 assert(IdxVal + SubVecNumElems <= NumElems &&
4689 "Unexpected index value in INSERT_SUBVECTOR");
4709 Undef, SubVec, ZeroIdx);
4712 assert(IdxVal != 0 && "Unexpected index");
4719 assert(IdxVal != 0 && "Unexpected index");
4722 [](SDValue V) { return V.isUndef(); })) {
4727 unsigned ShiftLeft = NumElems - SubVecNumElems;
4728 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4731 if (ShiftRight != 0)
4739 if (IdxVal + SubVecNumElems == NumElems) {
4742 if (SubVecNumElems * 2 == NumElems) {
4752 Undef, Vec, ZeroIdx);
4769 unsigned ShiftLeft = NumElems - SubVecNumElems;
4770 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4773 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4796 unsigned LowShift = NumElems - IdxVal;
4803 unsigned HighShift = IdxVal + SubVecNumElems;
5087 bool AllowWholeUndefs = true,
5088 bool AllowPartialUndefs = false) {
5089 assert(EltBits.empty() && "Expected an empty EltBits vector");
5093 EVT VT = Op.getValueType();
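// This helper decomposes a constant node into per-element bits: it gathers
// the source elements' APInts plus an undef mask, then rescales them from
// the source element width to the requested EltSizeInBits.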
5095 unsigned NumElts = SizeInBits / EltSizeInBits;
5098 if ((SizeInBits % EltSizeInBits) != 0)
5104 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5105 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5106 "Constant bit sizes don't match");
5109 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5114 if (NumSrcElts == NumElts) {
5115 UndefElts = UndefSrcElts;
5116 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5121 APInt UndefBits(SizeInBits, 0);
5122 APInt MaskBits(SizeInBits, 0);
5124 for (unsigned i = 0; i != NumSrcElts; ++i) {
5125 unsigned BitOffset = i * SrcEltSizeInBits;
5126 if (UndefSrcElts[i])
5127 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5128 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5132 UndefElts = APInt(NumElts, 0);
5135 for (unsigned i = 0; i != NumElts; ++i) {
5136 unsigned BitOffset = i * EltSizeInBits;
5141 if (!AllowWholeUndefs)
5149 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5152 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5159 unsigned UndefBitIndex) {
5163 Undefs.setBit(UndefBitIndex);
5167 Mask = CInt->getValue();
5171 Mask = CFP->getValueAPF().bitcastToAPInt();
5175 Type *Ty = CDS->getType();
5177 Type *EltTy = CDS->getElementType();
5181 if (!IsInteger && !IsFP)
5184 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5186 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5188 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5199 return CastBitData(UndefSrcElts, SrcEltBits);
5206 return CastBitData(UndefSrcElts, SrcEltBits);
5210 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5212 return CastBitData(UndefSrcElts, SrcEltBits);
5220 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5222 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5225 return CastBitData(UndefSrcElts, SrcEltBits);
5233 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5237 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5238 if ((SizeInBits % SrcEltSizeInBits) != 0)
5241 APInt UndefSrcElts(NumSrcElts, 0);
5243 for (unsigned i = 0; i != NumSrcElts; ++i)
5248 return CastBitData(UndefSrcElts, SrcEltBits);
5258 SDValue Ptr = MemIntr->getBasePtr();
5261 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5263 APInt UndefSrcElts(NumSrcElts, 0);
5265 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5266 if (UndefSrcElts[0])
5267 UndefSrcElts.setBits(0, NumSrcElts);
5268 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5269 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5270 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5271 return CastBitData(UndefSrcElts, SrcEltBits);
5279 SDValue Ptr = MemIntr->getBasePtr();
5285 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5286 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5287 (SizeInBits % SubVecSizeInBits) != 0)
5290 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5291 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5292 APInt UndefSubElts(NumSubElts, 0);
5294 APInt(CstEltSizeInBits, 0));
5295 for (unsigned i = 0; i != NumSubElts; ++i) {
5299 for (unsigned j = 1; j != NumSubVecs; ++j)
5300 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5304 return CastBitData(UndefSubElts, SubEltBits);
5313 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5315 APInt UndefSrcElts(NumSrcElts, 0);
5317 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5318 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5319 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5320 return CastBitData(UndefSrcElts, SrcEltBits);
5328 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5330 APInt UndefSrcElts, UndefSubElts;
5333 UndefSubElts, EltSubBits,
5334 AllowWholeUndefs && AllowUndefs,
5335 AllowPartialUndefs && AllowUndefs) &&
5337 UndefSrcElts, EltSrcBits,
5338 AllowWholeUndefs && AllowUndefs,
5339 AllowPartialUndefs && AllowUndefs)) {
5340 unsigned BaseIdx = Op.getConstantOperandVal(2);
5341 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5342 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5343 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5344 return CastBitData(UndefSrcElts, EltSrcBits);
5351 EltBits, AllowWholeUndefs,
5352 AllowPartialUndefs)) {
5353 EVT SrcVT = Op.getOperand(0).getValueType();
5354 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5357 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5360 (BaseOfs % EltSizeInBits) == 0 &&
"Bad subvector index");
5362 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5363 if ((BaseIdx + NumSubElts) != NumSrcElts)
5364 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5377 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5381 APInt UndefElts0, UndefElts1;
5385 UndefElts0, EltBits0, AllowWholeUndefs,
5386 AllowPartialUndefs))
5390 UndefElts1, EltBits1, AllowWholeUndefs,
5391 AllowPartialUndefs))
5395 for (int i = 0; i != (int)NumElts; ++i) {
5400 } else if (M < (int)NumElts) {
5405 if (UndefElts1[M - NumElts])
5407 EltBits.push_back(EltBits1[M - NumElts]);
5635 MVT VT = N.getSimpleValueType();
5642 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5643 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5646 bool IsFakeUnary = false;
5647 switch (N.getOpcode()) {
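// Each shuffle-opcode case below asserts its operand types, pulls the
// immediate control operand (ImmN) when present, and records whether both
// inputs are the same node (IsFakeUnary) so callers can treat it as unary.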
5649 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5650 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5651 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5656 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5657 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5658 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5660 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5663 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5664 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5665 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5670 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5673 int BitLen = N.getConstantOperandVal(1);
5674 int BitIdx = N.getConstantOperandVal(2);
5680 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5681 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5684 int BitLen = N.getConstantOperandVal(2);
5685 int BitIdx = N.getConstantOperandVal(3);
5687 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5691 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5692 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5694 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5697 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5698 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5700 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5703 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5704 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5706 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5709 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5710 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5712 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5716 "Only 32-bit and 64-bit elements are supported!");
5717 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5718 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5719 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5721 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5722 Ops.push_back(N.getOperand(1));
5723 Ops.push_back(N.getOperand(0));
5727 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5728 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5729 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5731 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5732 Ops.push_back(N.getOperand(1));
5733 Ops.push_back(N.getOperand(0));
5737 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5738 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5744 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5745 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5751 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5752 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5757 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5758 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5763 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5764 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5769 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5777 if (N.getOperand(0).getValueType() == VT) {
5784 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5786 SDValue MaskNode = N.getOperand(1);
5796 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5797 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5799 SDValue MaskNode = N.getOperand(1);
5807 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5808 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5815 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5816 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5820 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5821 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5822 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5824 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5827 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5828 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5829 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5831 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5834 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5839 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5844 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5849 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5850 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5851 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5852 SDValue MaskNode = N.getOperand(2);
5853 SDValue CtrlNode = N.getOperand(3);
5855 unsigned CtrlImm = CtrlOp->getZExtValue();
5866 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5867 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5868 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5869 SDValue MaskNode = N.getOperand(2);
5877 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5880 Ops.push_back(N.getOperand(1));
5881 SDValue MaskNode = N.getOperand(0);
5890 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5891 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5892 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5894 Ops.push_back(N.getOperand(0));
5895 Ops.push_back(N.getOperand(2));
5896 SDValue MaskNode = N.getOperand(1);
5906 SDValue PassThru = N.getOperand(1);
5913 "Illegal compression mask");
5914 for (unsigned I = 0; I != NumElems; ++I) {
5918 while (Mask.size() != NumElems) {
5919 Mask.push_back(NumElems + Mask.size());
5921 Ops.push_back(CmpVec);
5922 Ops.push_back(PassThru);
5927 SDValue PassThru = N.getOperand(1);
5934 "Illegal expansion mask");
5935 unsigned ExpIndex = 0;
5936 for (unsigned I = 0; I != NumElems; ++I) {
5938 Mask.push_back(I + NumElems);
5940 Mask.push_back(ExpIndex++);
5942 Ops.push_back(ExpVec);
5943 Ops.push_back(PassThru);
5955 if (!AllowSentinelZero && isAnyZero(Mask))
5963 if (M >= (int)Mask.size())
5969 Ops.push_back(N.getOperand(0));
5970 if (!IsUnary || IsFakeUnary)
5971 Ops.push_back(N.getOperand(1));
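// The decoded mask may still contain sentinel (undef/zero) entries; the
// isAnyZero check above rejects zeroing sentinels when the caller has
// disallowed them.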
6254 bool ResolveKnownElts) {
6258 MVT VT = N.getSimpleValueType();
6262 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6265 unsigned NumSizeInBytes = NumSizeInBits / 8;
6266 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6268 unsigned Opcode = N.getOpcode();
6274 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6275 Ops.push_back(N.getOperand(0));
6276 Ops.push_back(N.getOperand(1));
6289 uint64_t ZeroMask = IsAndN ? 255 : 0;
6296 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6297 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6298 const APInt &ByteBits = EltBits[i];
6299 if (ByteBits != 0 && ByteBits != 255)
6303 Ops.push_back(IsAndN ? N1 : N0);
6324 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6328 for (int i = 0; i != (int)MaskSize; ++i) {
6338 Mask.push_back(i + MaskSize);
6342 Ops.push_back(N.getOperand(0));
6343 Ops.push_back(N.getOperand(1));
6348 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6349 if (NumBitsPerElt == 64) {
6350 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6351 for (unsigned M = 0; M != NumSubElts; ++M)
6352 Mask.push_back((I * NumElts) + M);
6353 Ops.push_back(N.getOperand(I));
6362 EVT SubVT = Sub.getValueType();
6364 uint64_t InsertIdx = N.getConstantOperandVal(2);
6366 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6367 Mask.resize(NumElts);
6368 std::iota(Mask.begin(), Mask.end(), 0);
6374 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6376     Src.getOperand(0).isUndef() &&
6377     Src.getOperand(1).getValueType() == SubVT &&
6378     Src.getConstantOperandVal(2) == 0 &&
6379     (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6381 Mask.resize(NumElts);
6382 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6383 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6384 Ops.push_back(Src.getOperand(1));
6388 if (!N->isOnlyUserOf(Sub.getNode()))
6403 unsigned NumSubSrcSrcElts =
6405 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6406 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6407 "Subvector valuetype mismatch");
6408 InsertIdx *= (MaxElts / NumElts);
6409 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6410 NumSubElts *= (MaxElts / NumElts);
6411 bool SrcIsUndef = Src.isUndef();
6412 for (int i = 0; i != (int)MaxElts; ++i)
6414 for (int i = 0; i != (int)NumSubElts; ++i)
6415 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6418 Ops.push_back(SubSrcSrc);
6425 Depth + 1, ResolveKnownElts))
6435 if (SubMask.size() != NumSubElts) {
6436 assert(((SubMask.size() % NumSubElts) == 0 ||
6437         (NumSubElts % SubMask.size()) == 0) &&
6438        "Illegal submask scale");
6439 if ((NumSubElts % SubMask.size()) == 0) {
6440 int Scale = NumSubElts / SubMask.size();
6443 SubMask = ScaledSubMask;
6445 int Scale = SubMask.size() / NumSubElts;
6446 NumSubElts = SubMask.size();
6456 for (int i = 0; i != (int)NumElts; ++i)
6458 for (int i = 0; i != (int)NumSubElts; ++i) {
6461 int InputIdx = M / NumSubElts;
6462 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6464 Mask[i + InsertIdx] = M;
6476 unsigned DstIdx = 0;
6480 N.getConstantOperandAPInt(2).uge(NumElts))
6482 DstIdx = N.getConstantOperandVal(2);
6486 Ops.push_back(N.getOperand(0));
6487 for (unsigned i = 0; i != NumElts; ++i)
6507 if ((MinBitsPerElt % 8) != 0)
6527 unsigned DstByte = DstIdx * NumBytesPerElt;
6533 Ops.push_back(SrcVec);
6536 Ops.push_back(SrcVec);
6537 Ops.push_back(N.getOperand(0));
6538 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6539 Mask.push_back(NumSizeInBytes + i);
6542 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6543 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6544 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6545 Mask[DstByte + i] = SrcByte + i;
6546 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6556 "Unexpected input value type");
6558 APInt EltsLHS, EltsRHS;
6563 bool Offset0 = false, Offset1 = false;
6592 bool IsUnary = (N0 == N1);
6600 if (Offset0 || Offset1) {
6602 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6603     (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6612 Ops.push_back(N.getOperand(1));
6613 Ops.push_back(N.getOperand(2));
6620 EVT SrcVT = Src.getValueType();
6625 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6626 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6627 for (unsigned i = 0; i != NumSrcElts; ++i)
6628 Mask.push_back(i * Scale);
6644 for (unsigned I = 0; I != NumElts; ++I)
6645 if (DemandedElts[I] && !UndefElts[I] &&
6646     (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6650 Ops.push_back(N.getOperand(0));
6652 for (unsigned I = 0; I != NumElts; ++I) {
6653 if (!DemandedElts[I] || UndefElts[I])
6655 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6656 unsigned Lo = I * NumBytesPerElt;
6657 unsigned Hi = Lo + NumBytesPerElt;
6661 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6663 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6670 uint64_t ShiftVal = N.getConstantOperandVal(1);
6672 if (NumBitsPerElt <= ShiftVal) {
6678 if ((ShiftVal % 8) != 0)
6682 Ops.push_back(N.getOperand(0));
6688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6690 Mask[i + j] = i + j - ByteShift;
6692 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6693 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6694 Mask[i + j - ByteShift] = i + j;
6701 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6702 if ((RotateVal % 8) != 0)
6704 Ops.push_back(N.getOperand(0));
6705 int Offset = RotateVal / 8;
6707 for (int i = 0; i != (int)NumElts; ++i) {
6708 int BaseIdx = i * NumBytesPerElt;
6709 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6710 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6717 if (!Src.getSimpleValueType().isVector()) {
6720 Src.getOperand(0).getValueType().getScalarType() !=
6723 Src = Src.getOperand(0);
6726 Mask.append(NumElts, 0);
6731 EVT SrcVT = Src.getValueType();
6736 (NumBitsPerSrcElt % 8) != 0)
6740 APInt DemandedSrcElts =
6745 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 &&
"Unexpected extension");
6746 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6747 for (unsigned I = 0; I != NumElts; ++I)
6748 Mask.append(Scale, I);
6757 EVT SrcVT = Src.getValueType();
7400 bool IsAfterLegalize,
7401 unsigned Depth = 0) {
7407 unsigned NumElems = Elts.size();
7409 int LastLoadedElt = -1;
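// Track, per element, the load each element comes from and its byte offset
// within that load, so that runs of consecutive loads can later be merged
// into a single wide load.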
7419 for (unsigned i = 0; i < NumElems; ++i) {
7438 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7440 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7441 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7449 "Incomplete element masks");
7452 if (UndefMask.popcount() == NumElems)
7463 "Register/Memory size mismatch");
7465 assert(LDBase && "Did not find base load for merging consecutive loads");
7467 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7468 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7469 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7470 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7473 if (ByteOffsets[FirstLoadedElt] != 0)
7480 int64_t ByteOffset = ByteOffsets[EltIdx];
7481 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7482 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7483 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7484 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7486 int Stride = EltIdx - FirstLoadedElt;
7492 unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7493 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7494 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7495 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7505 bool IsConsecutiveLoad = true;
7506 bool IsConsecutiveLoadWithZeros = true;
7507 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7509 if (!CheckConsecutiveLoad(LDBase, i)) {
7510 IsConsecutiveLoad = false;
7511 IsConsecutiveLoadWithZeros = false;
7514 } else if (ZeroMask[i]) {
7515 IsConsecutiveLoad = false;
7522 "Cannot merge volatile or atomic loads.");
7526 for (auto *LD : Loads)
7541 if (FirstLoadedElt == 0 &&
7542 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7543 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7554 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7557 return CreateLoad(VT, LDBase);
7561 if (!IsAfterLegalize && VT.isVector()) {
7563 if ((NumMaskElts % NumElems) == 0) {
7564 unsigned Scale = NumMaskElts / NumElems;
7566 for (unsigned i = 0; i < NumElems; ++i) {
7569 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7570 for (unsigned j = 0; j != Scale; ++j)
7571 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7573 SDValue V = CreateLoad(VT, LDBase);
7583 unsigned HalfNumElems = NumElems / 2;
7589 DAG, Subtarget, IsAfterLegalize, Depth + 1);
7597 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7598 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7599 LoadSizeInBits == 64) &&
7606 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7614 for (auto *LD : Loads)
7625 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7626 unsigned RepeatSize = SubElems * BaseSizeInBits;
7627 unsigned ScalarSize = std::min(RepeatSize, 64u);
7628 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7633 if (RepeatSize > ScalarSize && SubElems == 1)
7638 for (unsigned i = 0; i != NumElems && Match; ++i) {
7642 if (RepeatedLoads[i % SubElems].isUndef())
7643 RepeatedLoads[i % SubElems] = Elt;
7645 Match &= (RepeatedLoads[i % SubElems] == Elt);
7649 Match &= !RepeatedLoads.front().isUndef();
7650 Match &= !RepeatedLoads.back().isUndef();
7658 if (RepeatSize > ScalarSize)
7660 RepeatSize / ScalarSize);
7666 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
7668 SDValue Broadcast = RepeatLoad;
7669 if (RepeatSize > ScalarSize) {
7695 VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
7697 std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
9124 "Illegal variable permute mask size");
9132 SDLoc(IndicesVec), SizeInBits);
9136 IndicesVT, IndicesVec);
9148 Subtarget, DAG, SDLoc(IndicesVec));
9173 for (uint64_t i = 0; i != Scale; ++i) {
9174 IndexScale |= Scale << (i * NumDstBits);
9175 IndexOffset |= i << (i * NumDstBits);
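// When the shuffle operates on narrower elements than the indices, each
// index is rescaled: multiplied by Scale and offset by the sub-element's
// byte position, via the IndexScale/IndexOffset constants built here.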
9185 unsigned Opcode = 0;
9194 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9198 ShuffleVT = MVT::v16i8;
9203 if (Subtarget.hasAVX()) {
9205 ShuffleVT = MVT::v4f32;
9208 ShuffleVT = MVT::v16i8;
9213 if (Subtarget.hasAVX()) {
9217 ShuffleVT = MVT::v2f64;
9229 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9231 else if (Subtarget.hasXOP()) {
9240 } else if (Subtarget.hasAVX()) {
9263 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9265 else if (Subtarget.hasAVX()) {
9267 IndicesVec = ScaleIndices(IndicesVec, 2);
9270 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9271 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9278 else if (Subtarget.hasAVX()) {
9281 {0, 1, 2, 3, 0, 1, 2, 3});
9283 {4, 5, 6, 7, 4, 5, 6, 7});
9284 if (Subtarget.hasXOP())
9301 if (!Subtarget.hasVLX()) {
9303 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9305 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9306                             DAG, SDLoc(IndicesVec));
9312 } else if (Subtarget.hasAVX()) {
9320 if (Subtarget.hasXOP())
9335 if (Subtarget.hasVBMI())
9339 if (Subtarget.hasBWI())
9355 "Illegal variable permute shuffle type");
9359 IndicesVec = ScaleIndices(IndicesVec, Scale);
9362 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9366 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9367 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
14127 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14139 for (int i = 0; i != 4; ++i)
14140 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14146 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14150 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14154 int NumHToL = LoInputs.size() - NumLToL;
14156 int NumHToH = HiInputs.size() - NumLToH;
14175 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14176 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14178 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14181 for (int DWord = 0; DWord != 4; ++DWord) {
14182 int M0 = Mask[2 * DWord + 0];
14183 int M1 = Mask[2 * DWord + 1];
14186 if (M0 < 0 && M1 < 0)
14189 bool Match = false;
14190 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14191 auto &DWordPair = DWordPairs[j];
14194 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14195 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14196 PSHUFDMask[DWord] = DOffset + j;
14202 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14207 if (DWordPairs.size() <= 2) {
14208 DWordPairs.resize(2, std::make_pair(-1, -1));
14209 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14210 DWordPairs[1].first, DWordPairs[1].second};
14215 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14216 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14217 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14219 if ((NumHToL + NumHToH) == 0)
14220 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14221 if ((NumLToL + NumLToH) == 0)
14222 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
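// If every input landed in a single half, one PSHUFLW/PSHUFHW over the
// paired dwords plus the PSHUFD computed above is enough to realize the
// shuffle.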
14258 int AOffset, int BOffset) {
14260 "Must call this with A having 3 or 1 inputs from the A half.");
14262 "Must call this with B having 1 or 3 inputs from the B half.");
14264 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14266 bool ThreeAInputs = AToAInputs.size() == 3;
14272 int ADWord = 0, BDWord = 0;
14273 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14274 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14275 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14276 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14277 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14278 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14279 int TripleNonInputIdx =
14280 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14281 TripleDWord = TripleNonInputIdx / 2;
14285 OneInputDWord = (OneInput / 2) ^ 1;
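// For a 3:1 input split, the triple-input side keeps the dword containing
// its unused word, while the lone input is parked in the adjacent (XOR 1)
// dword, keeping the two target dwords disjoint.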
14292 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14297 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14299 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14301 if ((NumFlippedAToBInputs == 1 &&
14302 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14303 (NumFlippedBToBInputs == 1 &&
14304 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14309 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14311 int FixIdx = PinnedIdx ^ 1;
14312 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14316 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14317 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14318 if (IsFixIdxInput == IsFixFreeIdxInput)
14321 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14322 "We need to be changing the number of flipped inputs!");
14323 int PSHUFHalfMask[] = {0, 1, 2, 3};
14324 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14330 for (int &M : Mask)
14331 if (M >= 0 && M == FixIdx)
14333 else if (M >= 0 && M == FixFreeIdx)
14336 if (NumFlippedBToBInputs != 0) {
14338 BToAInputs.
size() == 3 ? TripleNonInputIdx : OneInput;
14339 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14341 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14342 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14343 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14348 int PSHUFDMask[] = {0, 1, 2, 3};
14349 PSHUFDMask[ADWord] = BDWord;
14350 PSHUFDMask[BDWord] = ADWord;
14357 for (int &M : Mask)
14358 if (M >= 0 && M/2 == ADWord)
14359 M = 2 * BDWord + M % 2;
14360 else if (M >= 0 && M/2 == BDWord)
14361 M = 2 * ADWord + M % 2;
14367 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14368 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14369 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14370 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14377 int PSHUFLMask[4] = {-1, -1, -1, -1};
14378 int PSHUFHMask[4] = {-1, -1, -1, -1};
14379 int PSHUFDMask[4] = {-1, -1, -1, -1};
14384 auto fixInPlaceInputs =
14388 if (InPlaceInputs.empty())
14390 if (InPlaceInputs.size() == 1) {
14391 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14392 InPlaceInputs[0] - HalfOffset;
14393 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14396 if (IncomingInputs.empty()) {
14398 for (int Input : InPlaceInputs) {
14399 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14405 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14406 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14407 InPlaceInputs[0] - HalfOffset;
14410 int AdjIndex = InPlaceInputs[0] ^ 1;
14411 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14413 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14415 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14416 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14422 auto moveInputsToRightHalf = [&PSHUFDMask](
14427 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14428 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14430 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14432 int LowWord = Word & ~1;
14433 int HighWord = Word | 1;
14434 return isWordClobbered(SourceHalfMask, LowWord) ||
14435 isWordClobbered(SourceHalfMask, HighWord);
14438 if (IncomingInputs.empty())
14441 if (ExistingInputs.empty()) {
14443 for (int Input : IncomingInputs) {
14446 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14447 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14448 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14449     Input - SourceOffset;
14451 for (int &M : HalfMask)
14452 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14454 else if (M == Input)
14455 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14457 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14458 Input - SourceOffset &&
14459 "Previous placement doesn't match!");
14464 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14468 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14469 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14471 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14473 "Previous placement doesn't match!");
14479 for (int &M : HalfMask)
14480 if (M >= SourceOffset && M < SourceOffset + 4) {
14481 M = M - SourceOffset + DestOffset;
14482 assert(M >= 0 && "This should never wrap below zero!");
14490 if (IncomingInputs.size() == 1) {
14491 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14492 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14494 SourceHalfMask[InputFixed - SourceOffset] =
14495 IncomingInputs[0] - SourceOffset;
14497 IncomingInputs[0] = InputFixed;
14499 } else if (IncomingInputs.size() == 2) {
14500 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14501 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14505 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14506 IncomingInputs[1] - SourceOffset};
14511 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14512 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14513 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14514 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14515 InputsFixed[1] = InputsFixed[0] ^ 1;
14516 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14517            SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14518 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14519 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14520 InputsFixed[0] = InputsFixed[1] ^ 1;
14521 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14522            SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14526 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14527 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14528 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14529 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14535 for (int i = 0; i < 4; ++i)
14536 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14537 "We can't handle any clobbers here!");
14538 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14539 "Cannot have adjacent inputs here!");
14541 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14542 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14546 for (int &M : FinalSourceHalfMask)
14547 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14548 M = InputsFixed[1] + SourceOffset;
14549 else if (M == InputsFixed[1] + SourceOffset)
14550 M = (InputsFixed[0] ^ 1) + SourceOffset;
14552 InputsFixed[1] = InputsFixed[0] ^ 1;
14556 for (int &M : HalfMask)
14557 if (M == IncomingInputs[0])
14558 M = InputsFixed[0] + SourceOffset;
14559 else if (M == IncomingInputs[1])
14560 M = InputsFixed[1] + SourceOffset;
14562 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14563 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14570 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14571 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14572 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14573 for (int &M : HalfMask)
14574 for (int Input : IncomingInputs)
14576 M = FreeDWord * 2 + Input % 2;
14578 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14580 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14600 "Failed to lift all the high half inputs to the low mask!");
14601 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14602        "Failed to lift all the low half inputs to the high mask!");
14610 for (int &M : HiMask)
14691 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14696 Zeroable, Subtarget, DAG))
14704 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14706 if (NumV2Inputs == 0) {
14710 Subtarget, DAG, false))
14715 Mask, Subtarget, DAG))
14744 "All single-input shuffles should be canonicalized to be V1-input "
14754 if (Subtarget.hasSSE4A())
14760 if (NumV2Inputs == 1)
14762 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14767 bool IsBlendSupported = Subtarget.hasSSE41();
14768 if (IsBlendSupported)
14770 Zeroable, Subtarget, DAG))
14774 Zeroable, Subtarget, DAG))
14802 Zeroable, Subtarget, DAG))
14807 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14808 !Subtarget.hasVLX()) {
14810 unsigned PackOpc = 0;
14811 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14822 } else if (Subtarget.hasSSE41()) {
14825 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14834 } else if (!Subtarget.hasSSSE3()) {
14847 if (NumEvenDrops == 2) {
14848 Result = DAG.getBitcast(MVT::v4i32, Result);
14849 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14857 if (NumOddDrops == 1) {
14858 bool HasSSE41 = Subtarget.hasSSE41();
14866 MVT::v8i16, V1, V2);
14871 Mask, Subtarget, DAG))
14876 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14877 bool V1InUse, V2InUse;
14879 Zeroable, DAG, V1InUse, V2InUse);
14885 Zeroable, Subtarget, DAG);
14978 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14998 Zeroable, Subtarget, DAG))
15011 if (Subtarget.hasSSE4A())
15016 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15019 if (NumV2Elements == 0) {
15022 Mask, Subtarget, DAG))
15042 for (int i = 0; i < 16; i += 2)
15043 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15048 auto tryToWidenViaDuplication = [&]() -> SDValue {
15049 if (!canWidenViaDuplication(Mask))
15052 copy_if(Mask, std::back_inserter(LoInputs),
15053 [](int M) { return M >= 0 && M < 8; });
15057 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15061 bool TargetLo = LoInputs.size() >= HiInputs.size();
15062 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15063 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15065 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15067 for (int I : InPlaceInputs) {
15068 PreDupI16Shuffle[I/2] = I/2;
15071 int j = TargetLo ? 0 : 4, je = j + 4;
15072 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15075 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15078 while (j < je && PreDupI16Shuffle[j] >= 0)
15086 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15090 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15095 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15098 bool EvenInUse = false, OddInUse = false;
15099 for (int i = 0; i < 16; i += 2) {
15100 EvenInUse |= (Mask[i + 0] >= 0);
15101 OddInUse |= (Mask[i + 1] >= 0);
15102 if (EvenInUse && OddInUse)
15106 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15107 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15109 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15110 for (int i = 0; i < 16; ++i)
15111 if (Mask[i] >= 0) {
15112 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15113 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15114 if (PostDupI16Shuffle[i / 2] < 0)
15115 PostDupI16Shuffle[i / 2] = MappedMask;
15117 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15118 "Conflicting entries in the original shuffle!");
15123 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15125 if (SDValue V = tryToWidenViaDuplication())
15130 Zeroable, Subtarget, DAG))
15139 Zeroable, Subtarget, DAG))
15143 bool IsSingleInput = V2.isUndef();
15162 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15163 bool V1InUse = false;
15164 bool V2InUse = false;
15167 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15172 if (V1InUse && V2InUse) {
15175 Zeroable, Subtarget, DAG))
15187 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15191 if (Subtarget.hasVBMI())
15196 if (Subtarget.hasXOP()) {
15204 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15212 if (NumV2Elements == 1)
15214 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15227 if (NumEvenDrops) {
15233 assert(NumEvenDrops <= 3 &&
15234 "No support for dropping even elements more than 3 times.");
15236 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15241 if (!IsSingleInput)
15247 IsSingleInput ? V1 : V2);
15248 for (int i = 1; i < NumEvenDrops; ++i) {
15249 Result = DAG.getBitcast(MVT::v8i16, Result);
15256 if (NumOddDrops == 1) {
15260 if (!IsSingleInput)
15265 IsSingleInput ? V1 : V2);
15269 if (NumV2Elements > 0)
15271 Zeroable, Subtarget, DAG);
15278 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15279 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15280 for (int i = 0; i < 16; ++i)
15282 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15288 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15289     none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15296 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15299 for (int &M : LoBlendMask)
15302 for (int &M : HiBlendMask)
16305 int NumLaneElts = NumElts / NumLanes;
16310 for (unsigned BroadcastSize : {16, 32, 64}) {
16319 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16320 for (int j = 0; j != NumBroadcastElts; ++j) {
16321 int M = Mask[i + j];
16324 int &R = RepeatMask[j];
16325 if (0 != ((M % NumElts) / NumLaneElts))
16327 if (0 <= R && R != M)
16335 if (!FindRepeatingBroadcastMask(RepeatMask))
16343 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16344 for (int j = 0; j != NumBroadcastElts; ++j)
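// Rebuild the mask the broadcast would produce; if it matches the original
// mask exactly (checked just below), the shuffle is a repeated sub-lane
// broadcast.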
16345 BroadcastMask[i + j] = j;
16349 if (BroadcastMask == Mask)
16367 auto ShuffleSubLanes = [&](int SubLaneScale) {
16368 int NumSubLanes = NumLanes * SubLaneScale;
16369 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16374 int TopSrcSubLane = -1;
16380 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16385 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16386 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16389 int Lane = (M % NumElts) / NumLaneElts;
16390 if ((0 <= SrcLane) && (SrcLane != Lane))
16393 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16394 SubLaneMask[Elt] = LocalM;
16402 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16404 for (int i = 0; i != NumSubLaneElts; ++i) {
16405 if (M1[i] < 0 || M2[i] < 0)
16407 if (M1[i] != M2[i])
16413 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16414 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16418 for (int i = 0; i != NumSubLaneElts; ++i) {
16419 int M = SubLaneMask[i];
16422 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16423 "Unexpected mask element");
16424 RepeatedSubLaneMask[i] = M;
16429 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16430 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16431 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16436 if (Dst2SrcSubLanes[DstSubLane] < 0)
16439 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16440 "Unexpected source lane");
16444 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16445 int Lane = SubLane / SubLaneScale;
16446 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16447 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16448 int M = RepeatedSubLaneMask[Elt];
16451 int Idx = (SubLane * NumSubLaneElts) + Elt;
16452 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16458 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16459 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16460 if (SrcSubLane < 0)
16462 for (int j = 0; j != NumSubLaneElts; ++j)
16463 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16468 if (RepeatedMask == Mask || SubLaneMask == Mask)
16482 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16485 MinSubLaneScale = 2;
16487 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16489 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16490 MinSubLaneScale = MaxSubLaneScale = 4;
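// Try progressively finer sub-lane splits (doubling Scale each time) until
// one yields a repeating per-sub-lane mask that ShuffleSubLanes can lower.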
16492 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16493 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17058 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17059 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17061 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
17067 Zeroable, Subtarget, DAG))
17086 Zeroable, Subtarget, DAG))
17095 if (Subtarget.preferLowerShuffleAsShift()) {
17098 Subtarget, DAG, true))
17100 if (NumV2Elements == 0)
17110 bool Is128BitLaneRepeatedShuffle =
17112 if (Is128BitLaneRepeatedShuffle) {
17113 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17129 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
17135 if (Subtarget.hasVLX()) {
17137 Zeroable, Subtarget, DAG))
17141 Zeroable, Subtarget, DAG))
17153 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17175 CastV1, CastV2, DAG);
17182 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17187 Zeroable, Subtarget, DAG);
17200 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17201 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17207 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17216 Zeroable, Subtarget, DAG))
17236 Subtarget, DAG, false))
17247 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17265 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17278 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17283 Zeroable, Subtarget, DAG))
17287 if (Subtarget.hasBWI())
17293 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17298 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17323 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17324 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17330 Zeroable, Subtarget, DAG))
17339 Zeroable, Subtarget, DAG))
17376 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17388 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17396 Zeroable, Subtarget, DAG))
17400 if (Subtarget.hasVBMI())
17406 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17411 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17417 if (Subtarget.hasVLX())
17419 Mask, Zeroable, DAG))