1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-ccmp-bias", cl::init(6),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
94 "supports conditional compare instructions."),
96
98 "x86-br-merging-likely-bias", cl::init(0),
99 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
100 "that all conditionals will be executed. For example for merging "
101 "the conditionals (a == b && c > d), if its known that a == b is "
102 "likely, then it is likely that if the conditionals are split "
103 "both sides will be executed, so it may be desirable to increase "
104 "the instruction cost threshold. Set to -1 to never merge likely "
105 "branches."),
106 cl::Hidden);
107
109 "x86-br-merging-unlikely-bias", cl::init(-1),
110 cl::desc(
111 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
112 "that all conditionals will be executed. For example for merging "
113 "the conditionals (a == b && c > d), if its known that a == b is "
114 "unlikely, then it is unlikely that if the conditionals are split "
115 "both sides will be executed, so it may be desirable to decrease "
116 "the instruction cost threshold. Set to -1 to never merge unlikely "
117 "branches."),
118 cl::Hidden);
119
120 static cl::opt<bool> MulConstantOptimization(
121 "mul-constant-optimization", cl::init(true),
122 cl::desc("Replace 'mul x, Const' with more effective instructions like "
123 "SHIFT, LEA, etc."),
124 cl::Hidden);
125
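// Illustrative sketch (not part of the original file): the listing above drops
// the `static cl::opt<...>` declaration lines for several of these options, so
// the block below shows the general shape they follow. The option and variable
// names here are hypothetical placeholders, and this assumes
// "llvm/Support/CommandLine.h" is among the (elided) includes above.
static cl::opt<int> ExampleCostThreshold(
    "x86-example-cost-threshold", cl::init(2), cl::Hidden,
    cl::desc("A hypothetical cost threshold; the cl::opt object converts "
             "implicitly to int wherever the value is read."));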
126 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
127 const X86Subtarget &STI)
128 : TargetLowering(TM), Subtarget(STI) {
129 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
130 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
131
132 // Set up the TargetLowering object.
133
134 // X86 is weird. It always uses i8 for shift amounts and setcc results.
136 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
138
139 // X86 instruction cache is coherent with its data cache so we can use the
140 // default expansion to a no-op.
142
143 // For 64-bit, since we have so many registers, use the ILP scheduler.
144 // For 32-bit, use the register pressure specific scheduling.
145 // For Atom, always use ILP scheduling.
146 if (Subtarget.isAtom())
148 else if (Subtarget.is64Bit())
150 else
152 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
154
155 // Bypass expensive divides and use cheaper ones.
156 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
157 if (Subtarget.hasSlowDivide32())
158 addBypassSlowDiv(32, 8);
159 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
160 addBypassSlowDiv(64, 32);
161 }
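  // Illustrative sketch (not part of the original file): what addBypassSlowDiv
  // requests. The BypassSlowDivision transform guards a wide divide with a
  // runtime check and uses a narrower divide when both operands fit, roughly:
  //
  //   uint32_t div32(uint32_t a, uint32_t b) {
  //     if (((a | b) & 0xffffff00u) == 0)   // both values fit in 8 bits
  //       return (uint8_t)a / (uint8_t)b;   // cheap 8-bit divide
  //     return a / b;                       // full-width divide
  //   }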
162
163 // Setup Windows compiler runtime calls.
164 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
165 static const struct {
166 const RTLIB::Libcall Op;
167 const char * const Name;
168 const CallingConv::ID CC;
169 } LibraryCalls[] = {
170 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
171 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
172 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
173 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
174 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
175 };
176
177 for (const auto &LC : LibraryCalls) {
178 setLibcallName(LC.Op, LC.Name);
179 setLibcallCallingConv(LC.Op, LC.CC);
180 }
181 }
182
183 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
184 // MSVCRT doesn't have powi; fall back to pow
185 setLibcallName(RTLIB::POWI_F32, nullptr);
186 setLibcallName(RTLIB::POWI_F64, nullptr);
187 }
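  // Illustrative sketch (an assumption about generic legalization, not code
  // from this file): with the POWI libcall names cleared above, a call such as
  // @llvm.powi.f64(double %x, i32 %n) has no library routine to fall back on,
  // and the legalizer instead expands it through pow, conceptually:
  //
  //   double powi_fallback(double x, int n) { return pow(x, (double)n); }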
188
189 if (Subtarget.canUseCMPXCHG16B())
191 else if (Subtarget.canUseCMPXCHG8B())
193 else
195
196 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
197
199
200 // Set up the register classes.
201 addRegisterClass(MVT::i8, &X86::GR8RegClass);
202 addRegisterClass(MVT::i16, &X86::GR16RegClass);
203 addRegisterClass(MVT::i32, &X86::GR32RegClass);
204 if (Subtarget.is64Bit())
205 addRegisterClass(MVT::i64, &X86::GR64RegClass);
206
207 for (MVT VT : MVT::integer_valuetypes())
209
210 // We don't accept any truncstore of integer registers.
211 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
212 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
215 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
216 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
217
218 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
219
220 // SETOEQ and SETUNE require checking two conditions.
221 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
224 }
225
226 // Integer absolute.
227 if (Subtarget.canUseCMOV()) {
228 setOperationAction(ISD::ABS , MVT::i16 , Custom);
229 setOperationAction(ISD::ABS , MVT::i32 , Custom);
230 if (Subtarget.is64Bit())
231 setOperationAction(ISD::ABS , MVT::i64 , Custom);
232 }
233
234 // Absolute difference.
235 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
236 setOperationAction(Op , MVT::i8 , Custom);
237 setOperationAction(Op , MVT::i16 , Custom);
238 setOperationAction(Op , MVT::i32 , Custom);
239 if (Subtarget.is64Bit())
240 setOperationAction(Op , MVT::i64 , Custom);
241 }
242
243 // Signed saturation subtraction.
247 if (Subtarget.is64Bit())
249
250 // Funnel shifts.
251 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
252 // For slow shld targets we only lower for code size.
253 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
254
255 setOperationAction(ShiftOp , MVT::i8 , Custom);
256 setOperationAction(ShiftOp , MVT::i16 , Custom);
257 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
258 if (Subtarget.is64Bit())
259 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
260 }
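  // Illustrative sketch (not part of the original file): the semantics of the
  // funnel shifts configured above. For 32-bit operands, ISD::FSHL returns the
  // high word of the concatenated pair shifted left, which is exactly what the
  // x86 SHLD instruction computes:
  //
  //   uint32_t fshl32(uint32_t hi, uint32_t lo, unsigned amt) {
  //     amt &= 31;                 // shift amount is taken modulo the width
  //     if (amt == 0) return hi;   // avoid the out-of-range shift below
  //     return (hi << amt) | (lo >> (32 - amt));
  //   }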
261
262 if (!Subtarget.useSoftFloat()) {
263 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
264 // operation.
269 // We have an algorithm for SSE2, and we turn this into a 64-bit
270 // FILD or VCVTUSI2SS/SD for other targets.
273 // We have an algorithm for SSE2->double, and we turn this into a
274 // 64-bit FILD followed by conditional FADD for other targets.
277
278 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
279 // this operation.
282 // SSE has no i16 to fp conversion, only i32. We promote in the handler
283 // to allow f80 to use i16 and f64 to use i16 with sse1 only
286 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
289 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
290 // are Legal, f80 is custom lowered.
293
294 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
295 // this operation.
297 // FIXME: This doesn't generate invalid exception when it should. PR44019.
303 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
304 // are Legal, f80 is custom lowered.
307
308 // Handle FP_TO_UINT by promoting the destination to a larger signed
309 // conversion.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
314 // FIXME: This doesn't generate invalid exception when it should. PR44019.
320
325
326 if (!Subtarget.is64Bit()) {
329 }
330 }
331
332 if (Subtarget.hasSSE2()) {
333 // Custom lowering for saturating float to int conversions.
334 // We handle promotion to larger result types manually.
335 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
338 }
339 if (Subtarget.is64Bit()) {
342 }
343 }
344
345 // Handle address space casts between mixed sized pointers.
348
349 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
350 if (!Subtarget.hasSSE2()) {
353 if (Subtarget.is64Bit()) {
355 // Without SSE, i64->f64 goes through memory.
357 }
358 } else if (!Subtarget.is64Bit())
360
361 // Scalar integer divide and remainder are lowered to use operations that
362 // produce two results, to match the available instructions. This exposes
363 // the two-result form to trivial CSE, which is able to combine x/y and x%y
364 // into a single instruction.
365 //
366 // Scalar integer multiply-high is also lowered to use two-result
367 // operations, to match the available instructions. However, plain multiply
368 // (low) operations are left as Legal, as there are single-result
369 // instructions for this in x86. Using the two-result multiply instructions
370 // when both high and low results are needed must be arranged by dagcombine.
371 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
378 }
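  // Illustrative sketch (not part of the original file): why the two-result
  // form matters. In source like the function below, x / y and x % y CSE into
  // one ISD::SDIVREM node, which maps onto a single idiv producing both the
  // quotient and the remainder:
  //
  //   int quot_and_rem(int x, int y, int *rem) { *rem = x % y; return x / y; }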
379
380 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
382 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
383 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
386 }
387 if (Subtarget.is64Bit())
392
393 setOperationAction(ISD::FREM , MVT::f32 , Expand);
394 setOperationAction(ISD::FREM , MVT::f64 , Expand);
395 setOperationAction(ISD::FREM , MVT::f80 , Expand);
396 setOperationAction(ISD::FREM , MVT::f128 , Expand);
397
398 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
404 }
405
406 // Promote the i8 variants and force them on up to i32 which has a shorter
407 // encoding.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
410 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs. For BSF,
411 // we emit a REP prefix to encode it as TZCNT for modern CPUs, so it makes
412 // sense to promote that too.
413 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
415
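  // Illustrative sketch (an assumption about how the promotion plays out, not
  // code from this file): an i8 cttz computed in i32 stays well defined for a
  // zero input by ORing in a bit just above the narrow width, conceptually:
  //
  //   unsigned cttz8(uint8_t x) {
  //     return __builtin_ctz((unsigned)x | 0x100u);   // x == 0 yields 8
  //   }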
416 if (!Subtarget.hasBMI()) {
417 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
419 if (Subtarget.is64Bit()) {
420 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
422 }
423 }
424
425 if (Subtarget.hasLZCNT()) {
426 // When promoting the i8 variants, force them to i32 for a shorter
427 // encoding.
428 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
430 } else {
431 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
432 if (VT == MVT::i64 && !Subtarget.is64Bit())
433 continue;
436 }
437 }
438
441 // Special handling for half-precision floating point conversions.
442 // If we don't have F16C support, then lower half float conversions
443 // into library calls.
445 Op, MVT::f32,
446 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
447 // There's never any support for operations beyond MVT::f32.
448 setOperationAction(Op, MVT::f64, Expand);
449 setOperationAction(Op, MVT::f80, Expand);
450 setOperationAction(Op, MVT::f128, Expand);
451 }
452
453 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
456 }
457
458 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
460 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
461 setTruncStoreAction(VT, MVT::f16, Expand);
462 setTruncStoreAction(VT, MVT::bf16, Expand);
463
466 }
467
471 if (Subtarget.is64Bit())
473 if (Subtarget.hasPOPCNT()) {
474 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
475 // popcntw is longer to encode than popcntl and also has a false dependency
476 // on the dest that popcntl hasn't had since Cannon Lake.
477 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
478 } else {
483 }
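  // Illustrative sketch (not part of the original file): the effect of
  // promoting i16 CTPOP to i32 above. The 16-bit value is zero-extended and
  // counted with the 32-bit popcnt, so popcntw is never emitted:
  //
  //   unsigned popcount16(uint16_t x) { return __builtin_popcount((unsigned)x); }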
484
486
487 if (!Subtarget.hasMOVBE())
489
490 // X86 wants to expand cmov itself.
491 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
496 }
497 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
498 if (VT == MVT::i64 && !Subtarget.is64Bit())
499 continue;
502 }
503
504 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
507
509 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
510 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
514 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
515 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
516
517 // Darwin ABI issue.
518 for (auto VT : { MVT::i32, MVT::i64 }) {
519 if (VT == MVT::i64 && !Subtarget.is64Bit())
520 continue;
527 }
528
529 // 64-bit shl, sra, srl (iff 32-bit x86)
530 for (auto VT : { MVT::i32, MVT::i64 }) {
531 if (VT == MVT::i64 && !Subtarget.is64Bit())
532 continue;
536 }
537
538 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
540
542
543 // Expand certain atomics
544 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
552 }
553
554 if (!Subtarget.is64Bit())
556
557 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
558 // All CPUs supporting AVX will atomically load/store aligned 128-bit
559 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
562 }
563
564 if (Subtarget.canUseCMPXCHG16B())
566
567 // FIXME - use subtarget debug flags
568 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
569 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
570 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
572 }
573
576
579
580 setOperationAction(ISD::TRAP, MVT::Other, Legal);
582 if (Subtarget.isTargetPS())
584 else
586
587 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
589 setOperationAction(ISD::VAEND , MVT::Other, Expand);
590 bool Is64Bit = Subtarget.is64Bit();
591 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
592 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
593
596
598
599 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
602
604
605 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
606 setOperationAction(ISD::FABS, VT, Action);
607 setOperationAction(ISD::FNEG, VT, Action);
609 setOperationAction(ISD::FREM, VT, Action);
610 setOperationAction(ISD::FMA, VT, Action);
611 setOperationAction(ISD::FMINNUM, VT, Action);
612 setOperationAction(ISD::FMAXNUM, VT, Action);
615 setOperationAction(ISD::FSIN, VT, Action);
616 setOperationAction(ISD::FCOS, VT, Action);
617 setOperationAction(ISD::FSINCOS, VT, Action);
618 setOperationAction(ISD::FTAN, VT, Action);
619 setOperationAction(ISD::FSQRT, VT, Action);
620 setOperationAction(ISD::FPOW, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
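  // Illustrative sketch (not part of the original file): how the helper above
  // is used. Each call marks the whole list of FP opcodes with one action for
  // the given type; later in this constructor it is invoked along the lines of
  //
  //   setF16Action(MVT::f16, Promote);    // scalar f16 math promoted to f32
  //   setF16Action(MVT::v8f16, Expand);   // vector f16 math expanded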
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
660
661 // Use XORP to simulate FNEG.
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
686
716
717 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
718 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
719
720 // Lower this to MOVMSK plus an AND.
723
724 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
725 (UseX87 || Is64Bit)) {
726 // Use SSE for f32, x87 for f64.
727 // Set up the FP register classes.
728 addRegisterClass(MVT::f32, &X86::FR32RegClass);
729 if (UseX87)
730 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
731
732 // Use ANDPS to simulate FABS.
734
735 // Use XORP to simulate FNEG.
737
738 if (UseX87)
740
741 // Use ANDPS and ORPS to simulate FCOPYSIGN.
742 if (UseX87)
745
746 // We don't support sin/cos/fmod
750
751 if (UseX87) {
752 // Always expand sin/cos functions even though x87 has an instruction.
756 }
757 } else if (UseX87) {
758 // f32 and f64 in x87.
759 // Set up the FP register classes.
760 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
761 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
762
763 for (auto VT : { MVT::f32, MVT::f64 }) {
766
767 // Always expand sin/cos functions even though x87 has an instruction.
771 }
772 }
773
774 // Expand FP32 immediates into loads from the stack, save special cases.
775 if (isTypeLegal(MVT::f32)) {
776 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
777 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
778 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
779 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
780 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
781 } else // SSE immediates.
782 addLegalFPImmediate(APFloat(+0.0f)); // xorps
783 }
784 // Expand FP64 immediates into loads from the stack, save special cases.
785 if (isTypeLegal(MVT::f64)) {
786 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
787 addLegalFPImmediate(APFloat(+0.0)); // FLD0
788 addLegalFPImmediate(APFloat(+1.0)); // FLD1
789 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
790 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
791 } else // SSE immediates.
792 addLegalFPImmediate(APFloat(+0.0)); // xorpd
793 }
794 // Support fp16 0 immediate.
795 if (isTypeLegal(MVT::f16))
796 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
797
798 // Handle constrained floating-point operations of scalar.
811
812 // We don't support FMA.
815
816 // f80 always uses X87.
817 if (UseX87) {
818 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
821 {
823 addLegalFPImmediate(TmpFlt); // FLD0
824 TmpFlt.changeSign();
825 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
826
827 bool ignored;
828 APFloat TmpFlt2(+1.0);
830 &ignored);
831 addLegalFPImmediate(TmpFlt2); // FLD1
832 TmpFlt2.changeSign();
833 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
834 }
835
836 // Always expand sin/cos functions even though x87 has an instruction.
837 // clang-format off
842 // clang-format on
843
855
856 // Handle constrained floating-point operations of scalar.
862 if (isTypeLegal(MVT::f16)) {
865 } else {
867 }
868 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
869 // as Custom.
871 }
872
873 // f128 uses xmm registers, but most operations require libcalls.
874 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
875 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
876 : &X86::VR128RegClass);
877
878 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
879
890
894
895 // clang-format off
903 // clang-format on
904 // No STRICT_FSINCOS
907
910 // We need to custom handle any FP_ROUND with an f128 input, but
911 // LegalizeDAG uses the result type to know when to run a custom handler.
912 // So we have to list all legal floating point result types here.
913 if (isTypeLegal(MVT::f32)) {
916 }
917 if (isTypeLegal(MVT::f64)) {
920 }
921 if (isTypeLegal(MVT::f80)) {
924 }
925
927
928 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
929 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
930 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
931 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
932 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
933 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
934 }
935
936 // Always use a library call for pow.
937 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
938 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
939 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
940 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
941
950
951 // Some FP actions are always expanded for vector types.
952 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
953 MVT::v4f32, MVT::v8f32, MVT::v16f32,
954 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
955 // clang-format off
969 // clang-format on
970 }
971
972 // First set operation action for all vector types to either promote
973 // (for widening) or expand (for scalarization). Then we will selectively
974 // turn on ones that can be effectively codegen'd.
1014 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1015 setTruncStoreAction(InnerVT, VT, Expand);
1016
1017 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1018 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1019
1020 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1021 // types, we have to deal with them whether we ask for Expansion or not.
1022 // Setting Expand causes its own optimisation problems though, so leave
1023 // them legal.
1024 if (VT.getVectorElementType() == MVT::i1)
1025 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1026
1027 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1028 // split/scalarized right now.
1029 if (VT.getVectorElementType() == MVT::f16 ||
1030 VT.getVectorElementType() == MVT::bf16)
1031 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1032 }
1033 }
1034
1035 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1036 // with -msoft-float, disable use of MMX as well.
1037 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1038 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1039 // No operations on x86mmx supported, everything uses intrinsics.
1040 }
1041
1042 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1043 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1044 : &X86::VR128RegClass);
1045
1048
1049 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1050 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1057
1058 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1059 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1060
1066 }
1067
1068 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1069 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1070 : &X86::VR128RegClass);
1071
1072 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1073 // registers cannot be used even for integer operations.
1074 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1075 : &X86::VR128RegClass);
1076 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1077 : &X86::VR128RegClass);
1078 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1079 : &X86::VR128RegClass);
1080 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1081 : &X86::VR128RegClass);
1082 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1083 : &X86::VR128RegClass);
1084
1085 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1088 }
1089
1090 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1091 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1096 }
1097
1098 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1099 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1100 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1101
1102 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1103 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1104 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1105 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1106 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1107 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1108 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1109 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1110 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1111 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1114
1115 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1116 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1117 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1118
1119 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1120 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1122
1123 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1124
1125 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1126 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1127 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1128 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1129 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1130 }
1131
1142
1147
1148 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1154
1155 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1156 // setcc all the way to isel and prefer SETGT in some isel patterns.
1159 }
1160
1161 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1162 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1174 }
1175
1176 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1180
1181 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1182 continue;
1183
1186 }
1187 setF16Action(MVT::v8f16, Expand);
1188 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1189 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1190 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1191 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1192 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1193 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1195
1196 // Custom lower v2i64 and v2f64 selects.
1203
1210
1211 // Custom legalize these to avoid over promotion or custom promotion.
1212 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1217 }
1218
1223
1226
1229
1230 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1235
1240
1241 // We want to legalize this to an f64 load rather than an i64 load on
1242 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1243 // store.
1244 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1245 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1246 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1247 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1248 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1250
1251 // Add 32-bit vector stores to help vectorization opportunities.
1252 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1254
1258 if (!Subtarget.hasAVX512())
1260
1264
1266
1283
1284 // In the customized shift lowering, the legal v4i32/v2i64 cases
1285 // in AVX2 will be recognized.
1286 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1290 if (VT == MVT::v2i64) continue;
1295 }
1296
1302 }
1303
1304 if (Subtarget.hasGFNI()) {
1309 }
1310
1311 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1312 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1313 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1314 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1315
1316 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1319 }
1320
1321 // These might be better off as horizontal vector ops.
1326 }
1327
1328 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1329 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1332 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1336 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1342
1344 }
1345
1346 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1347 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1348 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1349 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1350 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1351 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1352 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1353 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1354
1358
1359 // FIXME: Do we need to handle scalar-to-vector here?
1360 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1361 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1362
1363 // We directly match byte blends in the backend as they match the VSELECT
1364 // condition form.
1366
1367 // SSE41 brings specific instructions for doing vector sign extend even in
1368 // cases where we don't have SRA.
1369 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1372 }
1373
1374 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1375 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1376 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1377 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1378 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1379 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1380 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1381 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1382 }
1383
1384 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1385 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1386 // do the pre and post work in the vector domain.
1389 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1390 // so that DAG combine doesn't try to turn it into uint_to_fp.
1393 }
1394 }
1395
1396 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1398 }
1399
1400 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1401 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1402 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1405 }
1406
1407 // XOP can efficiently perform BITREVERSE with VPPERM.
1408 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1410 }
1411
1412 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1413 bool HasInt256 = Subtarget.hasInt256();
1414
1415 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1416 : &X86::VR256RegClass);
1417 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1418 : &X86::VR256RegClass);
1419 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1420 : &X86::VR256RegClass);
1421 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1422 : &X86::VR256RegClass);
1423 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1424 : &X86::VR256RegClass);
1425 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1426 : &X86::VR256RegClass);
1427 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1428 : &X86::VR256RegClass);
1429
1430 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1443
1445
1449
1452 }
1453
1454 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1455 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1456
1457 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1458 // even though v8i16 is a legal type.
1459 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1460 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1461 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1466
1473
1485
1486 if (!Subtarget.hasAVX512())
1488
1489 // In the customized shift lowering, the legal v8i32/v4i64 cases
1490 // in AVX2 will be recognized.
1491 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1497 if (VT == MVT::v4i64) continue;
1502 }
1503
1504 // These types need custom splitting if their input is a 128-bit vector.
1509
1513 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1514 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1517
1518 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1522 }
1523
1528
1529 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1534
1535 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1536 // setcc all the way to isel and prefer SETGT in some isel patterns.
1539 }
1540
1541 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1542 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1547
1548 if (Subtarget.hasAnyFMA()) {
1549 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1550 MVT::v2f64, MVT::v4f64 }) {
1553 }
1554 }
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1557 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1558 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1559 }
1560
1561 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1562 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1563 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1564 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1565
1566 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1567 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1568 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1569 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1571 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1572 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1573 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1574
1575 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1576 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1577
1578 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1579 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1580 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1581 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1582 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1583
1584 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1586 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1588 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1596
1597 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1598 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1601 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1603 }
1604
1605 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1608 }
1609
1610 if (HasInt256) {
1611 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1612 // when we have a 256bit-wide blend with immediate.
1615
1616 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1617 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1618 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1619 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1620 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1621 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1622 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1623 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1624 }
1625 }
1626
1627 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1628 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1629 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1631 }
1632
1633 // Extract subvector is special because the value type
1634 // (result) is 128-bit but the source is 256-bit wide.
1635 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1636 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1638 }
1639
1640 // Custom lower several nodes for 256-bit types.
1641 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1642 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1652 }
1653 setF16Action(MVT::v16f16, Expand);
1654 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1655 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1657 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1658 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1659 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1660 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1661
1662 if (HasInt256) {
1664
1665 // Custom legalize 2x32 to get a little better code.
1668
1669 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1670 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1672 }
1673 }
1674
1675 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1676 Subtarget.hasF16C()) {
1677 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1680 }
1681 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1684 }
1685 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1686 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1687 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1688 }
1689 }
1690
1691 // This block controls legalization of the mask vector sizes that are
1692 // available with AVX512. 512-bit vectors are in a separate block controlled
1693 // by useAVX512Regs.
1694 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1695 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1696 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1697 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1698 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1699 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1700
1704
1705 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1706 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1707 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1708 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1709 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1710 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1711 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1712 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1717
1718 // There is no byte sized k-register load or store without AVX512DQ.
1719 if (!Subtarget.hasDQI()) {
1720 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1721 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1722 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1723 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1724
1729 }
1730
1731 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1732 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1736 }
1737
1738 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1740
1741 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1745
1752 }
1753
1754 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1756 }
1757 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1758 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 }
1762 }
1763
1764 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1765 // elements. 512-bits can be disabled based on prefer-vector-width and
1766 // required-vector-width function attributes.
1767 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1768 bool HasBWI = Subtarget.hasBWI();
1769
1770 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1771 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1772 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1773 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1774 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1775 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1776 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1777
1778 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1779 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1780 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1781 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1782 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1783 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1784 if (HasBWI)
1785 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1786 }
1787
1788 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1796 }
1797 setOperationAction(ISD::LRINT, MVT::v16f32,
1798 Subtarget.hasDQI() ? Legal : Custom);
1799 setOperationAction(ISD::LRINT, MVT::v8f64,
1800 Subtarget.hasDQI() ? Legal : Custom);
1801 if (Subtarget.hasDQI())
1802 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1803
1804 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1809 }
1810
1811 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1816 }
1817
1824
1836
1837 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1838 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1839 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1840 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1841 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1842 if (HasBWI)
1843 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1844
1845 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1846 // to 512-bit rather than use the AVX2 instructions so that we can use
1847 // k-masks.
1848 if (!Subtarget.hasVLX()) {
1849 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1850 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1853 }
1854 }
1855
1857 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1858 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1868
1869 if (HasBWI) {
1870 // Extends from v64i1 masks to 512-bit vectors.
1874 }
1875
1876 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1889
1891 }
1892
1893 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1896 }
1897
1898 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1899 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1900 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1901 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1902
1903 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1904 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1905 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1906 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1907
1908 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1909 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1910 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1911 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1912 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1913 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1914 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1915 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1916
1917 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1918 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1919
1920 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1930
1931 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1932 // setcc all the way to isel and prefer SETGT in some isel patterns.
1935 }
1936
1937 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1938 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1943
1944 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1951 }
1952
1953 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1954 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1955 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1957 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1958 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1959 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1965 }
1966
1967 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1968 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1969 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1970 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1971 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1972 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1973
1974 if (Subtarget.hasDQI()) {
1978 setOperationAction(Opc, MVT::v8i64, Custom);
1979 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1980 }
1981
1982 if (Subtarget.hasCDI()) {
1983 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1984 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1986 }
1987 } // Subtarget.hasCDI()
1988
1989 if (Subtarget.hasVPOPCNTDQ()) {
1990 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1992 }
1993
1994 // Extract subvector is special because the value type
1995 // (result) is 256-bit but the source is 512-bit wide.
1996 // 128-bit was made Legal under AVX1.
1997 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1998 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2000
2001 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2002 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2012 }
2013 setF16Action(MVT::v32f16, Expand);
2018 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2019 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2020
2021 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2026 }
2027 if (HasBWI) {
2028 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2031 }
2032 } else {
2033 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2034 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2035 }
2036
2037 if (Subtarget.hasVBMI2()) {
2038 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2041 }
2042
2043 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2044 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2045 }
2046
2047 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2048 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2050 }// useAVX512Regs
2051
2052 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2053 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2054 MVT::v4i64}) {
2057 }
2058 }
2059
2060 // This block controls legalization for operations that don't have
2061 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2062 // narrower widths.
2063 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2064 // These operations are handled on non-VLX by artificially widening in
2065 // isel patterns.
2066
2070
2071 if (Subtarget.hasDQI()) {
2072 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2073 // v2f32 UINT_TO_FP is already custom under SSE2.
2076 "Unexpected operation action!");
2077 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2082 }
2083
2084 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2090 }
2091
2092 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2095 }
2096
2097 // Custom legalize 2x32 to get a little better code.
2100
2101 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2102 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2104
2105 if (Subtarget.hasDQI()) {
2109 setOperationAction(Opc, MVT::v2i64, Custom);
2110 setOperationAction(Opc, MVT::v4i64, Custom);
2111 }
2112 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2113 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2114 }
2115
2116 if (Subtarget.hasCDI()) {
2117 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2119 }
2120 } // Subtarget.hasCDI()
2121
2122 if (Subtarget.hasVPOPCNTDQ()) {
2123 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2125 }
2126 }
2127
2128 // This block controls legalization of v32i1/v64i1, which are available with
2129 // AVX512BW.
2130 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2131 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2132 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2133
2134 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2145 }
2146
2147 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2149
2150 // Extends from v32i1 masks to 256-bit vectors.
2154
2155 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2156 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2157 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2158 }
2159
2160 // These operations are handled on non-VLX by artificially widening in
2161 // isel patterns.
2162 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2163
2164 if (Subtarget.hasBITALG()) {
2165 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2167 }
2168 }
2169
2170 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2171 auto setGroup = [&] (MVT VT) {
2182
2195
2197
2200
2206
2212
2216 };
2217
2218 // AVX512_FP16 scalar operations
2219 setGroup(MVT::f16);
2233
2236
2237 if (Subtarget.useAVX512Regs()) {
2238 setGroup(MVT::v32f16);
2244 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2251
2256 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2258 MVT::v32i16);
2259 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2261 MVT::v32i16);
2262 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2264 MVT::v32i16);
2265 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2267 MVT::v32i16);
2268
2272
2273 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2274 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2275 }
2276
2277 if (Subtarget.hasVLX()) {
2278 setGroup(MVT::v8f16);
2279 setGroup(MVT::v16f16);
2280
2291
2302
2303 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2306
2310
2311 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2312 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2313 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2314 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2315
2316 // Need to custom widen these to prevent scalarization.
2317 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2318 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2319 }
2320 }
2321
2322 if (!Subtarget.useSoftFloat() &&
2323 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2324 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2325 : &X86::VR128RegClass);
2326 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2327 : &X86::VR256RegClass);
2328 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2329 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2330 // Set the operation action Custom to do the customization later.
2333 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2334 setF16Action(VT, Expand);
2339 }
2340 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2341 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2342 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2343 }
2345 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2346 }
2347
2348 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2349 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2350 setF16Action(MVT::v32bf16, Expand);
2351 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2352 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2354 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2358 }
2359
2360 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2361 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2362 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2363 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2364 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2365 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2366
2367 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2368 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2369 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2370 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2371 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2372
2373 if (Subtarget.hasBWI()) {
2374 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2375 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2376 }
2377
2378 if (Subtarget.hasFP16()) {
2379 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2388 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2397 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2402 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2407 }
2408 }
2409
2410 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2411 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2412 }
2413
2414 // We want to custom lower some of our intrinsics.
2418 if (!Subtarget.is64Bit()) {
2420 }
2421
2422 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2423 // handle type legalization for these operations here.
2424 //
2425 // FIXME: We really should do custom legalization for addition and
2426 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2427 // than generic legalization for 64-bit multiplication-with-overflow, though.
2428 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2429 if (VT == MVT::i64 && !Subtarget.is64Bit())
2430 continue;
2431 // Add/Sub/Mul with overflow operations are custom lowered.
2438
2439 // Support carry in as value rather than glue.
2445 }
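  // Illustrative sketch (not part of the original file): the kind of source
  // this custom lowering serves. Clang's checked-arithmetic builtin produces
  // @llvm.sadd.with.overflow, whose overflow result maps onto the x86 flags:
  //
  //   bool add_checked(int a, int b, int *out) {
  //     return __builtin_add_overflow(a, b, out);   // true on signed overflow
  //   }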
2446
2447 if (!Subtarget.is64Bit()) {
2448 // These libcalls are not available in 32-bit.
2449 setLibcallName(RTLIB::SHL_I128, nullptr);
2450 setLibcallName(RTLIB::SRL_I128, nullptr);
2451 setLibcallName(RTLIB::SRA_I128, nullptr);
2452 setLibcallName(RTLIB::MUL_I128, nullptr);
2453 // The MULO libcall is not part of libgcc, only compiler-rt.
2454 setLibcallName(RTLIB::MULO_I64, nullptr);
2455 }
2456 // The MULO libcall is not part of libgcc, only compiler-rt.
2457 setLibcallName(RTLIB::MULO_I128, nullptr);
2458
2459 // Combine sin / cos into _sincos_stret if it is available.
2460 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2461 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2464 }
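  // Illustrative sketch (not part of the original file): the pattern this
  // combine targets. When the same argument feeds both sinf and cosf, the two
  // calls merge into a single sincos-style libcall (__sincosf_stret on
  // Darwin-like targets where it is available):
  //
  //   void polar(float t, float *x, float *y) { *x = cosf(t); *y = sinf(t); }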
2465
2466 if (Subtarget.isTargetWin64()) {
2467 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2468 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2469 setOperationAction(ISD::SREM, MVT::i128, Custom);
2470 setOperationAction(ISD::UREM, MVT::i128, Custom);
2479 }
2480
2481 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2482 // is. We should promote the value to 64-bits to solve this.
2483 // This is what the CRT headers do - `fmodf` is an inline header
2484 // function casting to f64 and calling `fmod`.
2485 if (Subtarget.is32Bit() &&
2486 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2487 // clang-format off
2488 for (ISD::NodeType Op :
2499 if (isOperationExpand(Op, MVT::f32))
2500 setOperationAction(Op, MVT::f32, Promote);
2501 // clang-format on
2502
2503 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2504 // it, but it's just a wrapper around ldexp.
2505 if (Subtarget.isOSWindows()) {
2507 if (isOperationExpand(Op, MVT::f32))
2508 setOperationAction(Op, MVT::f32, Promote);
2509 }
2510
2511 // We have target-specific dag combine patterns for the following nodes:
2522 ISD::SHL,
2523 ISD::SRA,
2524 ISD::SRL,
2525 ISD::OR,
2526 ISD::AND,
2532 ISD::ADD,
2533 ISD::FADD,
2534 ISD::FSUB,
2535 ISD::FNEG,
2536 ISD::FMA,
2540 ISD::SUB,
2541 ISD::LOAD,
2542 ISD::LRINT,
2544 ISD::MLOAD,
2545 ISD::STORE,
2559 ISD::SETCC,
2560 ISD::MUL,
2561 ISD::XOR,
2569
2571
2572 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2574 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2576 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2578
2579 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2580 // that needs to be benchmarked and balanced with the potential use of vector
2581 // load/store types (PR33329, PR33914).
2584
2585 // Default loop alignment, which can be overridden by -align-loops.
2587
2588 // An out-of-order CPU can speculatively execute past a predictable branch,
2589 // but a conditional move could be stalled by an expensive earlier operation.
2590 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2591 EnableExtLdPromotion = true;
2593
2595
2596 // Default to having -disable-strictnode-mutation on
2597 IsStrictFPEnabled = true;
2598}
2599
2600// This has so far only been implemented for 64-bit MachO.
2602 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2603}
2604
2606 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2607 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2608}
2609
2611 const SDLoc &DL) const {
2612 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2613 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2614 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2615 return SDValue(Node, 0);
2616}
2617
2620 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2621 !Subtarget.hasBWI())
2622 return TypeSplitVector;
2623
2624 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2625 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2626 return TypeSplitVector;
2627
2628 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2629 VT.getVectorElementType() != MVT::i1)
2630 return TypeWidenVector;
2631
2633}
2634
2635FastISel *
2637 const TargetLibraryInfo *libInfo) const {
2638 return X86::createFastISel(funcInfo, libInfo);
2639}
2640
2641//===----------------------------------------------------------------------===//
2642// Other Lowering Hooks
2643//===----------------------------------------------------------------------===//
2644
2646 bool AssumeSingleUse) {
2647 if (!AssumeSingleUse && !Op.hasOneUse())
2648 return false;
2649 if (!ISD::isNormalLoad(Op.getNode()))
2650 return false;
2651
2652 // If this is an unaligned vector, make sure the target supports folding it.
2653 auto *Ld = cast<LoadSDNode>(Op.getNode());
2654 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2655 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2656 return false;
2657
2658 // TODO: If this is a non-temporal load and the target has an instruction
2659 // for it, it should not be folded. See "useNonTemporalLoad()".
2660
2661 return true;
2662}
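// For example, with the checks above an unaligned (align < 16) 128-bit SSE
// load is only considered foldable when the target has AVX or reports fast
// unaligned SSE memory accesses.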
2663
2665 const X86Subtarget &Subtarget,
2666 bool AssumeSingleUse) {
2667 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2668 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2669 return false;
2670
2671 // We can not replace a wide volatile load with a broadcast-from-memory,
2672 // because that would narrow the load, which isn't legal for volatiles.
2673 auto *Ld = cast<LoadSDNode>(Op.getNode());
2674 return !Ld->isVolatile() ||
2675 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2676}
2677
2679 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2680}
2681
2683 if (Op.hasOneUse()) {
2684 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2685 return (ISD::ZERO_EXTEND == Opcode);
2686 }
2687 return false;
2688}
2689
2690static bool isLogicOp(unsigned Opcode) {
2691 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2692 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2693}
2694
2695static bool isTargetShuffle(unsigned Opcode) {
2696 switch(Opcode) {
2697 default: return false;
2698 case X86ISD::BLENDI:
2699 case X86ISD::PSHUFB:
2700 case X86ISD::PSHUFD:
2701 case X86ISD::PSHUFHW:
2702 case X86ISD::PSHUFLW:
2703 case X86ISD::SHUFP:
2704 case X86ISD::INSERTPS:
2705 case X86ISD::EXTRQI:
2706 case X86ISD::INSERTQI:
2707 case X86ISD::VALIGN:
2708 case X86ISD::PALIGNR:
2709 case X86ISD::VSHLDQ:
2710 case X86ISD::VSRLDQ:
2711 case X86ISD::MOVLHPS:
2712 case X86ISD::MOVHLPS:
2713 case X86ISD::MOVSHDUP:
2714 case X86ISD::MOVSLDUP:
2715 case X86ISD::MOVDDUP:
2716 case X86ISD::MOVSS:
2717 case X86ISD::MOVSD:
2718 case X86ISD::MOVSH:
2719 case X86ISD::UNPCKL:
2720 case X86ISD::UNPCKH:
2721 case X86ISD::VBROADCAST:
2722 case X86ISD::VPERMILPI:
2723 case X86ISD::VPERMILPV:
2724 case X86ISD::VPERM2X128:
2725 case X86ISD::SHUF128:
2726 case X86ISD::VPERMIL2:
2727 case X86ISD::VPERMI:
2728 case X86ISD::VPPERM:
2729 case X86ISD::VPERMV:
2730 case X86ISD::VPERMV3:
2731 case X86ISD::VZEXT_MOVL:
2732 return true;
2733 }
2734}
2735
2736static bool isTargetShuffleVariableMask(unsigned Opcode) {
2737 switch (Opcode) {
2738 default: return false;
2739 // Target Shuffles.
2740 case X86ISD::PSHUFB:
2741 case X86ISD::VPERMILPV:
2742 case X86ISD::VPERMIL2:
2743 case X86ISD::VPPERM:
2744 case X86ISD::VPERMV:
2745 case X86ISD::VPERMV3:
2746 return true;
2747 // 'Faux' Target Shuffles.
2748 case ISD::OR:
2749 case ISD::AND:
2750 case X86ISD::ANDNP:
2751 return true;
2752 }
2753}
2754
2757 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2759 int ReturnAddrIndex = FuncInfo->getRAIndex();
2760
2761 if (ReturnAddrIndex == 0) {
2762 // Set up a frame object for the return address.
2763 unsigned SlotSize = RegInfo->getSlotSize();
2764 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2765 -(int64_t)SlotSize,
2766 false);
2767 FuncInfo->setRAIndex(ReturnAddrIndex);
2768 }
2769
2770 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2771}
2772
2774 bool HasSymbolicDisplacement) {
2775 // Offset should fit into 32 bit immediate field.
2776 if (!isInt<32>(Offset))
2777 return false;
2778
2779 // If we don't have a symbolic displacement - we don't have any extra
2780 // restrictions.
2781 if (!HasSymbolicDisplacement)
2782 return true;
2783
2784 // We can fold large offsets in the large code model because we always use
2785 // 64-bit offsets.
2786 if (CM == CodeModel::Large)
2787 return true;
2788
2789 // For the kernel code model we know that all objects reside in the negative
2790 // half of the 32-bit address space. We must not accept negative offsets, since
2791 // they may push the address out of that range, but pretty large positive ones are fine.
2792 if (CM == CodeModel::Kernel)
2793 return Offset >= 0;
2794
2795 // For the other non-large code models we assume that the last small object is
2796 // placed at least 16MB before the end of the 31-bit boundary, so offsets below
2797 // 16MB are safe. We may also accept pretty large negative constants, knowing
2798 // that all objects are in the positive half of the address space.
2799 return Offset < 16 * 1024 * 1024;
2800}
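// For example (illustrative), with a symbolic displacement: under the kernel
// code model an offset of +16 is accepted while -16 is rejected, and under the
// small code model both are accepted since they stay below 16MB.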
2801
2802/// Return true if the condition is a signed comparison operation.
2803static bool isX86CCSigned(unsigned X86CC) {
2804 switch (X86CC) {
2805 default:
2806 llvm_unreachable("Invalid integer condition!");
2807 case X86::COND_E:
2808 case X86::COND_NE:
2809 case X86::COND_B:
2810 case X86::COND_A:
2811 case X86::COND_BE:
2812 case X86::COND_AE:
2813 return false;
2814 case X86::COND_G:
2815 case X86::COND_GE:
2816 case X86::COND_L:
2817 case X86::COND_LE:
2818 return true;
2819 }
2820}
2821
2823 switch (SetCCOpcode) {
2824 // clang-format off
2825 default: llvm_unreachable("Invalid integer condition!");
2826 case ISD::SETEQ: return X86::COND_E;
2827 case ISD::SETGT: return X86::COND_G;
2828 case ISD::SETGE: return X86::COND_GE;
2829 case ISD::SETLT: return X86::COND_L;
2830 case ISD::SETLE: return X86::COND_LE;
2831 case ISD::SETNE: return X86::COND_NE;
2832 case ISD::SETULT: return X86::COND_B;
2833 case ISD::SETUGT: return X86::COND_A;
2834 case ISD::SETULE: return X86::COND_BE;
2835 case ISD::SETUGE: return X86::COND_AE;
2836 // clang-format on
2837 }
2838}
2839
2840/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2841/// condition code, returning the condition code and the LHS/RHS of the
2842/// comparison to make.
2844 bool isFP, SDValue &LHS, SDValue &RHS,
2845 SelectionDAG &DAG) {
2846 if (!isFP) {
2847 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2848 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2849 // X > -1 -> X == 0, jump !sign.
2850 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2851 return X86::COND_NS;
2852 }
2853 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2854 // X < 0 -> X == 0, jump on sign.
2855 return X86::COND_S;
2856 }
2857 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2858 // X >= 0 -> X == 0, jump on !sign.
2859 return X86::COND_NS;
2860 }
2861 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2862 // X < 1 -> X <= 0
2863 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2864 return X86::COND_LE;
2865 }
2866 }
2867
2868 return TranslateIntegerX86CC(SetCCOpcode);
2869 }
2870
2871 // First determine if it is required or is profitable to flip the operands.
2872
2873 // If LHS is a foldable load, but RHS is not, flip the condition.
2874 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2875 !ISD::isNON_EXTLoad(RHS.getNode())) {
2876 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2877 std::swap(LHS, RHS);
2878 }
2879
2880 switch (SetCCOpcode) {
2881 default: break;
2882 case ISD::SETOLT:
2883 case ISD::SETOLE:
2884 case ISD::SETUGT:
2885 case ISD::SETUGE:
2886 std::swap(LHS, RHS);
2887 break;
2888 }
2889
2890 // On a floating point condition, the flags are set as follows:
2891 // ZF PF CF op
2892 // 0 | 0 | 0 | X > Y
2893 // 0 | 0 | 1 | X < Y
2894 // 1 | 0 | 0 | X == Y
2895 // 1 | 1 | 1 | unordered
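  // For example, an ISD::SETOLT had its operands swapped above, so it is
  // evaluated as Y > X and maps to COND_A below (ZF==0, CF==0 in the table).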
2896 switch (SetCCOpcode) {
2897 // clang-format off
2898 default: llvm_unreachable("Condcode should be pre-legalized away");
2899 case ISD::SETUEQ:
2900 case ISD::SETEQ: return X86::COND_E;
2901 case ISD::SETOLT: // flipped
2902 case ISD::SETOGT:
2903 case ISD::SETGT: return X86::COND_A;
2904 case ISD::SETOLE: // flipped
2905 case ISD::SETOGE:
2906 case ISD::SETGE: return X86::COND_AE;
2907 case ISD::SETUGT: // flipped
2908 case ISD::SETULT:
2909 case ISD::SETLT: return X86::COND_B;
2910 case ISD::SETUGE: // flipped
2911 case ISD::SETULE:
2912 case ISD::SETLE: return X86::COND_BE;
2913 case ISD::SETONE:
2914 case ISD::SETNE: return X86::COND_NE;
2915 case ISD::SETUO: return X86::COND_P;
2916 case ISD::SETO: return X86::COND_NP;
2917 case ISD::SETOEQ:
2918 case ISD::SETUNE: return X86::COND_INVALID;
2919 // clang-format on
2920 }
2921}
2922
2923/// Is there a floating point cmov for the specific X86 condition code?
2924/// The current x86 ISA includes the following FP cmov instructions:
2925/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2926static bool hasFPCMov(unsigned X86CC) {
2927 switch (X86CC) {
2928 default:
2929 return false;
2930 case X86::COND_B:
2931 case X86::COND_BE:
2932 case X86::COND_E:
2933 case X86::COND_P:
2934 case X86::COND_A:
2935 case X86::COND_AE:
2936 case X86::COND_NE:
2937 case X86::COND_NP:
2938 return true;
2939 }
2940}
2941
2942static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2943 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2944 VT.is512BitVector();
2945}
2946
2948 const CallInst &I,
2949 MachineFunction &MF,
2950 unsigned Intrinsic) const {
2952 Info.offset = 0;
2953
2954 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2955 if (!IntrData) {
2956 switch (Intrinsic) {
2957 case Intrinsic::x86_aesenc128kl:
2958 case Intrinsic::x86_aesdec128kl:
2960 Info.ptrVal = I.getArgOperand(1);
2961 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2962 Info.align = Align(1);
2964 return true;
2965 case Intrinsic::x86_aesenc256kl:
2966 case Intrinsic::x86_aesdec256kl:
2968 Info.ptrVal = I.getArgOperand(1);
2969 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2970 Info.align = Align(1);
2972 return true;
2973 case Intrinsic::x86_aesencwide128kl:
2974 case Intrinsic::x86_aesdecwide128kl:
2976 Info.ptrVal = I.getArgOperand(0);
2977 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2978 Info.align = Align(1);
2980 return true;
2981 case Intrinsic::x86_aesencwide256kl:
2982 case Intrinsic::x86_aesdecwide256kl:
2984 Info.ptrVal = I.getArgOperand(0);
2985 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2986 Info.align = Align(1);
2988 return true;
2989 case Intrinsic::x86_cmpccxadd32:
2990 case Intrinsic::x86_cmpccxadd64:
2991 case Intrinsic::x86_atomic_bts:
2992 case Intrinsic::x86_atomic_btc:
2993 case Intrinsic::x86_atomic_btr: {
2995 Info.ptrVal = I.getArgOperand(0);
2996 unsigned Size = I.getType()->getScalarSizeInBits();
2997 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2998 Info.align = Align(Size);
3001 return true;
3002 }
3003 case Intrinsic::x86_atomic_bts_rm:
3004 case Intrinsic::x86_atomic_btc_rm:
3005 case Intrinsic::x86_atomic_btr_rm: {
3007 Info.ptrVal = I.getArgOperand(0);
3008 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3009 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3010 Info.align = Align(Size);
3013 return true;
3014 }
3015 case Intrinsic::x86_aadd32:
3016 case Intrinsic::x86_aadd64:
3017 case Intrinsic::x86_aand32:
3018 case Intrinsic::x86_aand64:
3019 case Intrinsic::x86_aor32:
3020 case Intrinsic::x86_aor64:
3021 case Intrinsic::x86_axor32:
3022 case Intrinsic::x86_axor64:
3023 case Intrinsic::x86_atomic_add_cc:
3024 case Intrinsic::x86_atomic_sub_cc:
3025 case Intrinsic::x86_atomic_or_cc:
3026 case Intrinsic::x86_atomic_and_cc:
3027 case Intrinsic::x86_atomic_xor_cc: {
3029 Info.ptrVal = I.getArgOperand(0);
3030 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3031 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3032 Info.align = Align(Size);
3035 return true;
3036 }
3037 }
3038 return false;
3039 }
3040
3041 switch (IntrData->Type) {
3044 case TRUNCATE_TO_MEM_VI32: {
3046 Info.ptrVal = I.getArgOperand(0);
3047 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3049 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3050 ScalarVT = MVT::i8;
3051 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3052 ScalarVT = MVT::i16;
3053 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3054 ScalarVT = MVT::i32;
3055
3056 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3057 Info.align = Align(1);
3059 break;
3060 }
3061 case GATHER:
3062 case GATHER_AVX2: {
3064 Info.ptrVal = nullptr;
3065 MVT DataVT = MVT::getVT(I.getType());
3066 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3067 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3068 IndexVT.getVectorNumElements());
3069 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3070 Info.align = Align(1);
3072 break;
3073 }
3074 case SCATTER: {
3076 Info.ptrVal = nullptr;
3077 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3078 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3079 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3080 IndexVT.getVectorNumElements());
3081 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3082 Info.align = Align(1);
3084 break;
3085 }
3086 default:
3087 return false;
3088 }
3089
3090 return true;
3091}
3092
3093/// Returns true if the target can instruction select the
3094/// specified FP immediate natively. If false, the legalizer will
3095/// materialize the FP immediate as a load from a constant pool.
3097 bool ForCodeSize) const {
3098 for (const APFloat &FPImm : LegalFPImmediates)
3099 if (Imm.bitwiseIsEqual(FPImm))
3100 return true;
3101 return false;
3102}
3103
3105 ISD::LoadExtType ExtTy,
3106 EVT NewVT) const {
3107 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3108
3109 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3110 // relocations target a movq or addq instruction: don't let the load shrink.
3111 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3112 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3113 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3114 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3115
3116 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3117 // those uses are extracted directly into a store, then the extract + store
3118 // can be store-folded. Therefore, it's probably not worth splitting the load.
3119 EVT VT = Load->getValueType(0);
3120 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3121 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3122 // Skip uses of the chain value. Result 0 of the node is the load value.
3123 if (UI.getUse().getResNo() != 0)
3124 continue;
3125
3126 // If this use is not an extract + store, it's probably worth splitting.
3127 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3128 UI->use_begin()->getOpcode() != ISD::STORE)
3129 return true;
3130 }
3131 // All non-chain uses are extract + store.
3132 return false;
3133 }
3134
3135 return true;
3136}
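// For example, a multi-use 256-bit load whose non-chain uses are all
// extract_subvector nodes that feed stores is kept whole (return false above)
// so each extract+store can be store-folded; any other use pattern permits
// narrowing the load.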
3137
3138/// Returns true if it is beneficial to convert a load of a constant
3139/// to just the constant itself.
3141 Type *Ty) const {
3142 assert(Ty->isIntegerTy());
3143
3144 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3145 if (BitSize == 0 || BitSize > 64)
3146 return false;
3147 return true;
3148}
3149
3151 // If we are using XMM registers in the ABI and the condition of the select is
3152 // a floating-point compare and we have blendv or conditional move, then it is
3153 // cheaper to select instead of doing a cross-register move and creating a
3154 // load that depends on the compare result.
3155 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3156 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3157}
3158
3160 // TODO: It might be a win to ease or lift this restriction, but the generic
3161 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3162 if (VT.isVector() && Subtarget.hasAVX512())
3163 return false;
3164
3165 return true;
3166}
3167
3169 SDValue C) const {
3170 // TODO: We handle scalars using custom code, but generic combining could make
3171 // that unnecessary.
3172 APInt MulC;
3173 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3174 return false;
3175
3176 // Find the type this will be legalized to. Otherwise we might prematurely
3177 // convert this to shl+add/sub and then still have to type legalize those ops.
3178 // Another choice would be to defer the decision for illegal types until
3179 // after type legalization. But constant splat vectors of i64 can't make it
3180 // through type legalization on 32-bit targets so we would need to special
3181 // case vXi64.
3182 while (getTypeAction(Context, VT) != TypeLegal)
3183 VT = getTypeToTransformTo(Context, VT);
3184
3185 // If vector multiply is legal, assume that's faster than shl + add/sub.
3186 // Multiply is a complex op with higher latency and lower throughput in
3187 // most implementations; sub-vXi32 vector multiplies are always fast,
3188 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3189 // is always going to be slow.
3190 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3191 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3192 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3193 return false;
3194
3195 // shl+add, shl+sub, shl+add+neg
3196 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3197 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3198}
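// For example, a constant-splat multiply by 17 is decomposed as (x << 4) + x
// and by 31 as (x << 5) - x, but only when the legal vector multiply above was
// not already judged cheap.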
3199
3201 unsigned Index) const {
3203 return false;
3204
3205 // Mask vectors support all subregister combinations and operations that
3206 // extract half of vector.
3207 if (ResVT.getVectorElementType() == MVT::i1)
3208 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3209 (Index == ResVT.getVectorNumElements()));
3210
3211 return (Index % ResVT.getVectorNumElements()) == 0;
3212}
3213
3215 unsigned Opc = VecOp.getOpcode();
3216
3217 // Assume target opcodes can't be scalarized.
3218 // TODO - do we have any exceptions?
3219 if (Opc >= ISD::BUILTIN_OP_END)
3220 return false;
3221
3222 // If the vector op is not supported, try to convert to scalar.
3223 EVT VecVT = VecOp.getValueType();
3224 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3225 return true;
3226
3227 // If the vector op is supported, but the scalar op is not, the transform may
3228 // not be worthwhile.
3229 EVT ScalarVT = VecVT.getScalarType();
3230 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3231}
3232
3234 bool) const {
3235 // TODO: Allow vectors?
3236 if (VT.isVector())
3237 return false;
3238 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3239}
3240
3242 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3243 return Subtarget.hasBMI() ||
3244 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3245}
3246
3248 // Speculate ctlz only if we can directly use LZCNT.
3249 return Subtarget.hasLZCNT();
3250}
3251
3253 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3254 // expensive than a straight movsd. On the other hand, it's important to
3255 // shrink long double fp constant since fldt is very slow.
3256 return !Subtarget.hasSSE2() || VT == MVT::f80;
3257}
3258
3260 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3261 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3262}
3263
3265 const SelectionDAG &DAG,
3266 const MachineMemOperand &MMO) const {
3267 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3268 BitcastVT.getVectorElementType() == MVT::i1)
3269 return false;
3270
3271 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3272 return false;
3273
3274 // If both types are legal vectors, it's always ok to convert them.
3275 if (LoadVT.isVector() && BitcastVT.isVector() &&
3276 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3277 return true;
3278
3279 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3280}
3281
3283 const MachineFunction &MF) const {
3284 // Do not merge to float value size (128 bits) if the no-implicit-float
3285 // attribute is set.
3286 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3287
3288 if (NoFloat) {
3289 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3290 return (MemVT.getSizeInBits() <= MaxIntSize);
3291 }
3292 // Make sure we don't merge greater than our preferred vector
3293 // width.
3294 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3295 return false;
3296
3297 return true;
3298}
3299
3301 return Subtarget.hasFastLZCNT();
3302}
3303
3305 const Instruction &AndI) const {
3306 return true;
3307}
3308
3310 EVT VT = Y.getValueType();
3311
3312 if (VT.isVector())
3313 return false;
3314
3315 if (!Subtarget.hasBMI())
3316 return false;
3317
3318 // There are only 32-bit and 64-bit forms for 'andn'.
3319 if (VT != MVT::i32 && VT != MVT::i64)
3320 return false;
3321
3322 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3323}
3324
3326 EVT VT = Y.getValueType();
3327
3328 if (!VT.isVector())
3329 return hasAndNotCompare(Y);
3330
3331 // Vector.
3332
3333 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3334 return false;
3335
3336 if (VT == MVT::v4i32)
3337 return true;
3338
3339 return Subtarget.hasSSE2();
3340}
3341
3343 return X.getValueType().isScalarInteger(); // 'bt'
3344}
3345
3349 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3350 SelectionDAG &DAG) const {
3351 // Does baseline recommend not to perform the fold by default?
3353 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3354 return false;
3355 // For scalars this transform is always beneficial.
3356 if (X.getValueType().isScalarInteger())
3357 return true;
3358 // If all the shift amounts are identical, then the transform is beneficial even
3359 // with rudimentary SSE2 shifts.
3360 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3361 return true;
3362 // If we have AVX2 with its powerful shift operations, then it's also good.
3363 if (Subtarget.hasAVX2())
3364 return true;
3365 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3366 return NewShiftOpcode == ISD::SHL;
3367}
3368
3370 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3371 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3372 if (!VT.isInteger())
3373 return ShiftOpc;
3374
3375 bool PreferRotate = false;
3376 if (VT.isVector()) {
3377 // For vectors, if we have rotate instruction support, then it's definitely
3378 // best. Otherwise it's not clear what's best, so just don't make changes.
3379 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3380 VT.getScalarType() == MVT::i64);
3381 } else {
3382 // For scalar, if we have bmi2 prefer rotate for rorx. Otherwise prefer
3383 // rotate unless we have a zext mask+shr.
3384 PreferRotate = Subtarget.hasBMI2();
3385 if (!PreferRotate) {
3386 unsigned MaskBits =
3387 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3388 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3389 }
3390 }
3391
3392 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3393 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3394
3395 if (PreferRotate && MayTransformRotate)
3396 return ISD::ROTL;
3397
3398 // If this is a vector we don't really get much benefit from swapping around constants.
3399 // Maybe we could check if the DAG has the flipped node already in the
3400 // future.
3401 if (VT.isVector())
3402 return ShiftOpc;
3403
3404 // See if it's beneficial to swap the shift type.
3405 if (ShiftOpc == ISD::SHL) {
3406 // If the current setup has imm64 mask, then inverse will have
3407 // at least imm32 mask (or be zext i32 -> i64).
3408 if (VT == MVT::i64)
3409 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3410 : ShiftOpc;
3411
3412 // We can only benefit if the mask requires at least 7 bits. We
3413 // don't want to replace shl of 1,2,3 as they can be implemented
3414 // with lea/add.
3415 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3416 }
3417
3418 if (VT == MVT::i64)
3419 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3420 // extremely efficient.
3421 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3422
3423 // Keep small shifts as shl so we can generate add/lea.
3424 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3425 }
3426
3427 // We prefer rotate for vectors or if we won't get a zext mask with SRL
3428 // (PreferRotate will be set in the latter case).
3429 if (PreferRotate || VT.isVector())
3430 return ShiftOpc;
3431
3432 // Non-vector type and we have a zext mask with SRL.
3433 return ISD::SRL;
3434}
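// Rough example for the scalar path above (no BMI2): for (x >> 24) & 0xff on
// i32 the remaining mask is 8 bits, so PreferRotate stays false and the
// srl+and pair is kept instead of being turned into a rotate.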
3435
3438 const Value *Lhs,
3439 const Value *Rhs) const {
3440 using namespace llvm::PatternMatch;
3441 int BaseCost = BrMergingBaseCostThresh.getValue();
3442 // With CCMP, branches can be merged in a more efficient way.
3443 if (BaseCost >= 0 && Subtarget.hasCCMP())
3444 BaseCost += BrMergingCcmpBias;
3445 // a == b && a == c is a fast pattern on x86.
3447 if (BaseCost >= 0 && Opc == Instruction::And &&
3448 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3449 Pred == ICmpInst::ICMP_EQ &&
3450 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3451 Pred == ICmpInst::ICMP_EQ)
3452 BaseCost += 1;
3453 return {BaseCost, BrMergingLikelyBias.getValue(),
3454 BrMergingUnlikelyBias.getValue()};
3455}
3456
3458 return N->getOpcode() != ISD::FP_EXTEND;
3459}
3460
3462 const SDNode *N, CombineLevel Level) const {
3463 assert(((N->getOpcode() == ISD::SHL &&
3464 N->getOperand(0).getOpcode() == ISD::SRL) ||
3465 (N->getOpcode() == ISD::SRL &&
3466 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3467 "Expected shift-shift mask");
3468 // TODO: Should we always create i64 masks? Or only folded immediates?
3469 EVT VT = N->getValueType(0);
3470 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3471 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3472 // Only fold if the shift values are equal - so it folds to AND.
3473 // TODO - we should fold if either is a non-uniform vector but we don't do
3474 // the fold for non-splats yet.
3475 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3476 }
3478}
3479
3481 EVT VT = Y.getValueType();
3482
3483 // For vectors, we don't have a preference, but we probably want a mask.
3484 if (VT.isVector())
3485 return false;
3486
3487 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3488 if (VT == MVT::i64 && !Subtarget.is64Bit())
3489 return false;
3490
3491 return true;
3492}
3493
3496 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3498 !Subtarget.isOSWindows())
3501 ExpansionFactor);
3502}
3503
3505 // Any legal vector type can be splatted more efficiently than
3506 // loading/spilling from memory.
3507 return isTypeLegal(VT);
3508}
3509
3511 MVT VT = MVT::getIntegerVT(NumBits);
3512 if (isTypeLegal(VT))
3513 return VT;
3514
3515 // PMOVMSKB can handle this.
3516 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3517 return MVT::v16i8;
3518
3519 // VPMOVMSKB can handle this.
3520 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3521 return MVT::v32i8;
3522
3523 // TODO: Allow 64-bit type for 32-bit target.
3524 // TODO: 512-bit types should be allowed, but make sure that those
3525 // cases are handled in combineVectorSizedSetCCEquality().
3526
3528}
3529
3530/// Val is the undef sentinel value or equal to the specified value.
3531static bool isUndefOrEqual(int Val, int CmpVal) {
3532 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3533}
3534
3535/// Return true if every element in Mask is the undef sentinel value or equal to
3536/// the specified value.
3537static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3538 return llvm::all_of(Mask, [CmpVal](int M) {
3539 return (M == SM_SentinelUndef) || (M == CmpVal);
3540 });
3541}
3542
3543/// Return true if every element in Mask, beginning from position Pos and ending
3544/// in Pos+Size is the undef sentinel value or equal to the specified value.
3545static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3546 unsigned Size) {
3547 return llvm::all_of(Mask.slice(Pos, Size),
3548 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3549}
3550
3551/// Val is either the undef or zero sentinel value.
3552static bool isUndefOrZero(int Val) {
3553 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3554}
3555
3556/// Return true if every element in Mask, beginning from position Pos and ending
3557/// in Pos+Size is the undef sentinel value.
3558static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3559 return llvm::all_of(Mask.slice(Pos, Size),
3560 [](int M) { return M == SM_SentinelUndef; });
3561}
3562
3563/// Return true if the mask creates a vector whose lower half is undefined.
3565 unsigned NumElts = Mask.size();
3566 return isUndefInRange(Mask, 0, NumElts / 2);
3567}
3568
3569/// Return true if the mask creates a vector whose upper half is undefined.
3571 unsigned NumElts = Mask.size();
3572 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3573}
3574
3575/// Return true if Val falls within the specified range (L, H].
3576static bool isInRange(int Val, int Low, int Hi) {
3577 return (Val >= Low && Val < Hi);
3578}
3579
3580/// Return true if the value of any element in Mask falls within the specified
3581/// range [Low, Hi).
3582static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3583 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3584}
3585
3586/// Return true if the value of any element in Mask is the zero sentinel value.
3587static bool isAnyZero(ArrayRef<int> Mask) {
3588 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3589}
3590
3591/// Return true if Val is undef or if its value falls within the
3592/// specified range [Low, Hi).
3593static bool isUndefOrInRange(int Val, int Low, int Hi) {
3594 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3595}
3596
3597/// Return true if every element in Mask is undef or if its value
3598/// falls within the specified range [Low, Hi).
3599static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3600 return llvm::all_of(
3601 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3602}
3603
3604/// Return true if Val is undef, zero or if its value falls within the
3605/// specified range [Low, Hi).
3606static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3607 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3608}
3609
3610/// Return true if every element in Mask is undef, zero or if its value
3611/// falls within the specified range [Low, Hi).
3612static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3613 return llvm::all_of(
3614 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3615}
3616
3617/// Return true if every element in Mask is an in-place blend/select mask or is
3618/// undef.
3620 unsigned NumElts = Mask.size();
3621 for (auto [I, M] : enumerate(Mask))
3622 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3623 return false;
3624 return true;
3625}
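// For example, with four elements the mask <0, 5, 2, 7> picks lane i from one
// of the two inputs for every i, so it is a blend; <1, 5, 2, 7> is not.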
3626
3627/// Return true if every element in Mask, beginning
3628/// from position Pos and ending in Pos + Size, falls within the specified
3629/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3630static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3631 unsigned Size, int Low, int Step = 1) {
3632 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3633 if (!isUndefOrEqual(Mask[i], Low))
3634 return false;
3635 return true;
3636}
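// For example, the mask <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4
// matches the sequence 4, 5, 6, 7 (the undef element is skipped).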
3637
3638/// Return true if every element in Mask, beginning
3639/// from position Pos and ending in Pos+Size, falls within the specified
3640/// sequential range [Low, Low+Size), or is undef or is zero.
3642 unsigned Size, int Low,
3643 int Step = 1) {
3644 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3645 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3646 return false;
3647 return true;
3648}
3649
3650/// Return true if every element in Mask, beginning
3651/// from position Pos and ending in Pos+Size is undef or is zero.
3652static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3653 unsigned Size) {
3654 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3655}
3656
3657/// Return true if every element of a single input is referenced by the shuffle
3658/// mask, i.e. it just permutes them all.
3660 unsigned NumElts = Mask.size();
3661 APInt DemandedElts = APInt::getZero(NumElts);
3662 for (int M : Mask)
3663 if (isInRange(M, 0, NumElts))
3664 DemandedElts.setBit(M);
3665 return DemandedElts.isAllOnes();
3666}
3667
3668/// Helper function to test whether a shuffle mask could be
3669/// simplified by widening the elements being shuffled.
3670///
3671/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3672/// leaves it in an unspecified state.
3673///
3674/// NOTE: This must handle normal vector shuffle masks and *target* vector
3675/// shuffle masks. The latter have the special property of a '-2' representing
3676/// a zero-ed lane of a vector.
3678 SmallVectorImpl<int> &WidenedMask) {
3679 WidenedMask.assign(Mask.size() / 2, 0);
3680 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3681 int M0 = Mask[i];
3682 int M1 = Mask[i + 1];
3683
3684 // If both elements are undef, it's trivial.
3685 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3686 WidenedMask[i / 2] = SM_SentinelUndef;
3687 continue;
3688 }
3689
3690 // Check for an undef mask and a mask value properly aligned to fit with
3691 // a pair of values. If we find such a case, use the non-undef mask's value.
3692 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3693 WidenedMask[i / 2] = M1 / 2;
3694 continue;
3695 }
3696 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3697 WidenedMask[i / 2] = M0 / 2;
3698 continue;
3699 }
3700
3701 // When zeroing, we need to spread the zeroing across both lanes to widen.
3702 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3703 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3705 WidenedMask[i / 2] = SM_SentinelZero;
3706 continue;
3707 }
3708 return false;
3709 }
3710
3711 // Finally check if the two mask values are adjacent and aligned with
3712 // a pair.
3713 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3714 WidenedMask[i / 2] = M0 / 2;
3715 continue;
3716 }
3717
3718 // Otherwise we can't safely widen the elements used in this shuffle.
3719 return false;
3720 }
3721 assert(WidenedMask.size() == Mask.size() / 2 &&
3722 "Incorrect size of mask after widening the elements!");
3723
3724 return true;
3725}
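// For example, the 8-element mask <0, 1, 4, 5, -1, -1, 6, 7> widens to the
// 4-element mask <0, 2, -1, 3>, while <0, 2, ...> cannot be widened because
// the pair (0, 2) is not two adjacent source elements.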
3726
3728 const APInt &Zeroable,
3729 bool V2IsZero,
3730 SmallVectorImpl<int> &WidenedMask) {
3731 // Create an alternative mask with info about zeroable elements.
3732 // Here we do not set undef elements as zeroable.
3733 SmallVector<int, 64> ZeroableMask(Mask);
3734 if (V2IsZero) {
3735 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3736 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3737 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3738 ZeroableMask[i] = SM_SentinelZero;
3739 }
3740 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3741}
3742
3744 SmallVector<int, 32> WidenedMask;
3745 return canWidenShuffleElements(Mask, WidenedMask);
3746}
3747
3748// Attempt to narrow/widen shuffle mask until it matches the target number of
3749// elements.
3750static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3751 SmallVectorImpl<int> &ScaledMask) {
3752 unsigned NumSrcElts = Mask.size();
3753 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3754 "Illegal shuffle scale factor");
3755
3756 // Narrowing is guaranteed to work.
3757 if (NumDstElts >= NumSrcElts) {
3758 int Scale = NumDstElts / NumSrcElts;
3759 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3760 return true;
3761 }
3762
3763 // We have to repeat the widening until we reach the target size, but we can
3764 // split out the first widening as it sets up ScaledMask for us.
3765 if (canWidenShuffleElements(Mask, ScaledMask)) {
3766 while (ScaledMask.size() > NumDstElts) {
3767 SmallVector<int, 16> WidenedMask;
3768 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3769 return false;
3770 ScaledMask = std::move(WidenedMask);
3771 }
3772 return true;
3773 }
3774
3775 return false;
3776}
3777
3778static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3779 SmallVector<int, 32> ScaledMask;
3780 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3781}
3782
3783/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3785 return isNullConstant(Elt) || isNullFPConstant(Elt);
3786}
3787
3788// Build a vector of constants.
3789// Use an UNDEF node if MaskElt == -1.
3790// Split 64-bit constants in the 32-bit mode.
3792 const SDLoc &dl, bool IsMask = false) {
3793
3795 bool Split = false;
3796
3797 MVT ConstVecVT = VT;
3798 unsigned NumElts = VT.getVectorNumElements();
3799 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3800 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3801 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3802 Split = true;
3803 }
3804
3805 MVT EltVT = ConstVecVT.getVectorElementType();
3806 for (unsigned i = 0; i < NumElts; ++i) {
3807 bool IsUndef = Values[i] < 0 && IsMask;
3808 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3809 DAG.getConstant(Values[i], dl, EltVT);
3810 Ops.push_back(OpNode);
3811 if (Split)
3812 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3813 DAG.getConstant(0, dl, EltVT));
3814 }
3815 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3816 if (Split)
3817 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3818 return ConstsNode;
3819}
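// For example, on a 32-bit target a v2i64 index vector <1, 2> is built here
// as the v4i32 constant <1, 0, 2, 0> and then bitcast back to v2i64.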
3820
3821static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3822 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3823 assert(Bits.size() == Undefs.getBitWidth() &&
3824 "Unequal constant and undef arrays");
3826 bool Split = false;
3827
3828 MVT ConstVecVT = VT;
3829 unsigned NumElts = VT.getVectorNumElements();
3830 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3831 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3832 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3833 Split = true;
3834 }
3835
3836 MVT EltVT = ConstVecVT.getVectorElementType();
3837 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3838 if (Undefs[i]) {
3839 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3840 continue;
3841 }
3842 const APInt &V = Bits[i];
3843 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3844 if (Split) {
3845 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3846 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3847 } else if (EltVT == MVT::f32) {
3849 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3850 } else if (EltVT == MVT::f64) {
3852 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3853 } else {
3854 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3855 }
3856 }
3857
3858 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3859 return DAG.getBitcast(VT, ConstsNode);
3860}
3861
3863 SelectionDAG &DAG, const SDLoc &dl) {
3864 APInt Undefs = APInt::getZero(Bits.size());
3865 return getConstVector(Bits, Undefs, VT, DAG, dl);
3866}
3867
3868/// Returns a vector of specified type with all zero elements.
3869static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3870 SelectionDAG &DAG, const SDLoc &dl) {
3871 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3872 VT.getVectorElementType() == MVT::i1) &&
3873 "Unexpected vector type");
3874
3875 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3876 // type. This ensures they get CSE'd. But if the integer type is not
3877 // available, use a floating-point +0.0 instead.
3878 SDValue Vec;
3879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3880 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3881 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3882 } else if (VT.isFloatingPoint() &&
3884 Vec = DAG.getConstantFP(+0.0, dl, VT);
3885 } else if (VT.getVectorElementType() == MVT::i1) {
3886 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3887 "Unexpected vector type");
3888 Vec = DAG.getConstant(0, dl, VT);
3889 } else {
3890 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3891 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3892 }
3893 return DAG.getBitcast(VT, Vec);
3894}
3895
3896// Helper to determine if the ops are both subvectors extracted from a
3897// single source. If we allow commute they don't have to be in order (Lo/Hi).
3898static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3899 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3900 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3901 LHS.getValueType() != RHS.getValueType() ||
3902 LHS.getOperand(0) != RHS.getOperand(0))
3903 return SDValue();
3904
3905 SDValue Src = LHS.getOperand(0);
3906 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3907 return SDValue();
3908
3909 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3910 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3911 RHS.getConstantOperandAPInt(1) == NumElts) ||
3912 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3913 LHS.getConstantOperandAPInt(1) == NumElts))
3914 return Src;
3915
3916 return SDValue();
3917}
3918
3919static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3920 const SDLoc &dl, unsigned vectorWidth) {
3921 EVT VT = Vec.getValueType();
3922 EVT ElVT = VT.getVectorElementType();
3923 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3924 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3925 VT.getVectorNumElements() / Factor);
3926
3927 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3928 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3929 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3930
3931 // This is the index of the first element of the vectorWidth-bit chunk
3932 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
3933 IdxVal &= ~(ElemsPerChunk - 1);
3934
3935 // If the input is a buildvector just emit a smaller one.
3936 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3937 return DAG.getBuildVector(ResultVT, dl,
3938 Vec->ops().slice(IdxVal, ElemsPerChunk));
3939
3940 // Check if we're extracting the upper undef of a widening pattern.
3941 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3942 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3943 isNullConstant(Vec.getOperand(2)))
3944 return DAG.getUNDEF(ResultVT);
3945
3946 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3948}
3949
3950/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3951/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3952/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3953/// instructions or a simple subregister reference. Idx is an index in the
3954/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3955/// lowering EXTRACT_VECTOR_ELT operations easier.
3956static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3957 SelectionDAG &DAG, const SDLoc &dl) {
3959 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3960 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3961}
3962
3963/// Generate a DAG to grab 256-bits from a 512-bit vector.
3964static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3965 SelectionDAG &DAG, const SDLoc &dl) {
3966 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3967 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3968}
3969
3970static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3971 SelectionDAG &DAG, const SDLoc &dl,
3972 unsigned vectorWidth) {
3973 assert((vectorWidth == 128 || vectorWidth == 256) &&
3974 "Unsupported vector width");
3975 // Inserting UNDEF just returns Result.
3976 if (Vec.isUndef())
3977 return Result;
3978 EVT VT = Vec.getValueType();
3979 EVT ElVT = VT.getVectorElementType();
3980 EVT ResultVT = Result.getValueType();
3981
3982 // Insert the relevant vectorWidth bits.
3983 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3984 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3985
3986 // This is the index of the first element of the vectorWidth-bit chunk
3987 // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
3988 IdxVal &= ~(ElemsPerChunk - 1);
3989
3990 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3991 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3992}
3993
3994/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3995/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3996/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3997/// simple superregister reference. Idx is an index in the 128 bits
3998/// we want. It need not be aligned to a 128-bit boundary. That makes
3999/// lowering INSERT_VECTOR_ELT operations easier.
4000static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4001 SelectionDAG &DAG, const SDLoc &dl) {
4002 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4003 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4004}
4005
4006/// Widen a vector to a larger size with the same scalar type, with the new
4007/// elements either zero or undef.
4008static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4009 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4010 const SDLoc &dl) {
4012 Vec.getValueType().getScalarType() == VT.getScalarType() &&
4013 "Unsupported vector widening type");
4014 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4015 : DAG.getUNDEF(VT);
4016 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4017 DAG.getIntPtrConstant(0, dl));
4018}
4019
4020/// Widen a vector to a larger size with the same scalar type, with the new
4021/// elements either zero or undef.
4022static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4023 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4024 const SDLoc &dl, unsigned WideSizeInBits) {
4025 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4026 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4027 "Unsupported vector widening type");
4028 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4029 MVT SVT = Vec.getSimpleValueType().getScalarType();
4030 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4031 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4032}
4033
4034/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4035/// and bitcast with integer types.
4036static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4037 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4038 unsigned NumElts = VT.getVectorNumElements();
4039 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4040 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4041 return VT;
4042}
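// For example, a v2i1 mask widens to v8i1 when AVX512DQ is available (so it
// can be bitcast to i8) and to v16i1 otherwise (bitcast to i16).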
4043
4044/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4045/// bitcast with integer types.
4046static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4047 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4048 const SDLoc &dl) {
4049 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4050 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4051}
4052
4053// Helper function to collect subvector ops that are concatenated together,
4054// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4055// The subvectors in Ops are guaranteed to be the same type.
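// For example, insert_subvector(insert_subvector(undef, X, 0), Y, NumElts/2)
// is collected as { X, Y }, and a plain ISD::CONCAT_VECTORS node simply yields
// its operands.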
4057 SelectionDAG &DAG) {
4058 assert(Ops.empty() && "Expected an empty ops vector");
4059
4060 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4061 Ops.append(N->op_begin(), N->op_end());
4062 return true;
4063 }
4064
4065 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4066 SDValue Src = N->getOperand(0);
4067 SDValue Sub = N->getOperand(1);
4068 const APInt &Idx = N->getConstantOperandAPInt(2);
4069 EVT VT = Src.getValueType();
4070 EVT SubVT = Sub.getValueType();
4071
4072 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4073 // insert_subvector(undef, x, lo)
4074 if (Idx == 0 && Src.isUndef()) {
4075 Ops.push_back(Sub);
4076 Ops.push_back(DAG.getUNDEF(SubVT));
4077 return true;
4078 }
4079 if (Idx == (VT.getVectorNumElements() / 2)) {
4080 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4081 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4082 Src.getOperand(1).getValueType() == SubVT &&
4083 isNullConstant(Src.getOperand(2))) {
4084 // Attempt to recurse into inner (matching) concats.
4085 SDValue Lo = Src.getOperand(1);
4086 SDValue Hi = Sub;
4087 SmallVector<SDValue, 2> LoOps, HiOps;
4088 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4089 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4090 LoOps.size() == HiOps.size()) {
4091 Ops.append(LoOps);
4092 Ops.append(HiOps);
4093 return true;
4094 }
4095 Ops.push_back(Lo);
4096 Ops.push_back(Hi);
4097 return true;
4098 }
4099 // insert_subvector(x, extract_subvector(x, lo), hi)
4100 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4101 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4102 Ops.append(2, Sub);
4103 return true;
4104 }
4105 // insert_subvector(undef, x, hi)
4106 if (Src.isUndef()) {
4107 Ops.push_back(DAG.getUNDEF(SubVT));
4108 Ops.push_back(Sub);
4109 return true;
4110 }
4111 }
4112 }
4113 }
4114
4115 return false;
4116}
4117
4118// Helper to check if \p V can be split into subvectors and the upper subvectors
4119// are all undef, in which case return the lower subvector.
4121 SelectionDAG &DAG) {
4122 SmallVector<SDValue> SubOps;
4123 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4124 return SDValue();
4125
4126 unsigned NumSubOps = SubOps.size();
4127 unsigned HalfNumSubOps = NumSubOps / 2;
4128 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4129
4130 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4131 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4132 return SDValue();
4133
4134 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4135 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4136 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4137}
4138
4139// Helper to check if we can access all the constituent subvectors without any
4140// extract ops.
4143 return collectConcatOps(N, Ops, DAG);
4144}
4145
4146static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4147 const SDLoc &dl) {
4148 EVT VT = Op.getValueType();
4149 unsigned NumElems = VT.getVectorNumElements();
4150 unsigned SizeInBits = VT.getSizeInBits();
4151 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4152 "Can't split odd sized vector");
4153
4154 // If this is a splat value (with no-undefs) then use the lower subvector,
4155 // which should be a free extraction.
4156 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4157 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4158 return std::make_pair(Lo, Lo);
4159
4160 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4161 return std::make_pair(Lo, Hi);
4162}
4163
4164/// Break an operation into 2 half sized ops and then concatenate the results.
4166 unsigned NumOps = Op.getNumOperands();
4167 EVT VT = Op.getValueType();
4168
4169 // Extract the LHS Lo/Hi vectors
4170 SmallVector<SDValue> LoOps(NumOps, SDValue());
4171 SmallVector<SDValue> HiOps(NumOps, SDValue());
4172 for (unsigned I = 0; I != NumOps; ++I) {
4173 SDValue SrcOp = Op.getOperand(I);
4174 if (!SrcOp.getValueType().isVector()) {
4175 LoOps[I] = HiOps[I] = SrcOp;
4176 continue;
4177 }
4178 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4179 }
4180
4181 EVT LoVT, HiVT;
4182 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4183 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4184 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4185 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4186}
4187
4188/// Break an unary integer operation into 2 half sized ops and then
4189/// concatenate the result back.
4191 const SDLoc &dl) {
4192 // Make sure we only try to split 256/512-bit types to avoid creating
4193 // narrow vectors.
4194 [[maybe_unused]] EVT VT = Op.getValueType();
4195 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4196 Op.getOperand(0).getValueType().is512BitVector()) &&
4197 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4198 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4199 VT.getVectorNumElements() &&
4200 "Unexpected VTs!");
4201 return splitVectorOp(Op, DAG, dl);
4202}
4203
4204/// Break a binary integer operation into 2 half sized ops and then
4205/// concatenate the result back.
4207 const SDLoc &dl) {
4208 // Assert that all the types match.
4209 [[maybe_unused]] EVT VT = Op.getValueType();
4210 assert(Op.getOperand(0).getValueType() == VT &&
4211 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4212 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4213 return splitVectorOp(Op, DAG, dl);
4214}
4215
4216// Helper for splitting operands of an operation to legal target size and
4217// apply a function on each part.
4218// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4219// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4220// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4221// The argument Builder is a function that will be applied on each split part:
4222// SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
4223template <typename F>
4224SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4225 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4226 F Builder, bool CheckBWI = true) {
4227 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4228 unsigned NumSubs = 1;
4229 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4230 (!CheckBWI && Subtarget.useAVX512Regs())) {
4231 if (VT.getSizeInBits() > 512) {
4232 NumSubs = VT.getSizeInBits() / 512;
4233 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4234 }
4235 } else if (Subtarget.hasAVX2()) {
4236 if (VT.getSizeInBits() > 256) {
4237 NumSubs = VT.getSizeInBits() / 256;
4238 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4239 }
4240 } else {
4241 if (VT.getSizeInBits() > 128) {
4242 NumSubs = VT.getSizeInBits() / 128;
4243 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4244 }
4245 }
4246
4247 if (NumSubs == 1)
4248 return Builder(DAG, DL, Ops);
4249
4250 SmallVector<SDValue, 4> Subs;
4251 for (unsigned i = 0; i != NumSubs; ++i) {
4252 SmallVector<SDValue, 2> SubOps;
4253 for (SDValue Op : Ops) {
4254 EVT OpVT = Op.getValueType();
4255 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4256 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4257 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4258 }
4259 Subs.push_back(Builder(DAG, DL, SubOps));
4260 }
4261 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4262}
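// One possible usage sketch (the opcode and value names here are purely
// illustrative, not taken from a specific caller):
//   auto AvgBuilder = [](SelectionDAG &G, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return G.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1},
//                                  AvgBuilder);
// On an AVX2-only target with VT = v64i8 this invokes AvgBuilder twice on
// 256-bit halves and concatenates the results.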
4263
4264// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4265// targets.
4266static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4267 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4268 const X86Subtarget &Subtarget) {
4269 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4270 MVT SVT = VT.getScalarType();
4271
4272 // If we have a 32/64 splatted constant, splat it to DstTy to
4273 // encourage a foldable broadcast'd operand.
4274 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4275 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4276 // AVX512 broadcasts 32/64-bit operands.
4277 // TODO: Support float once getAVX512Node is used by fp-ops.
4278 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4279 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4280 return SDValue();
4281 // If we're not widening, don't bother if we're not bitcasting.
4282 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4283 return SDValue();
4284 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4285 APInt SplatValue, SplatUndef;
4286 unsigned SplatBitSize;
4287 bool HasAnyUndefs;
4288 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4289 HasAnyUndefs, OpEltSizeInBits) &&
4290 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4291 return DAG.getConstant(SplatValue, DL, DstVT);
4292 }
4293 return SDValue();
4294 };
4295
4296 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4297
4298 MVT DstVT = VT;
4299 if (Widen)
4300 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4301
4302 // Canonicalize src operands.
4303 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4304 for (SDValue &Op : SrcOps) {
4305 MVT OpVT = Op.getSimpleValueType();
4306 // Just pass through scalar operands.
4307 if (!OpVT.isVector())
4308 continue;
4309 assert(OpVT == VT && "Vector type mismatch");
4310
4311 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4312 Op = BroadcastOp;
4313 continue;
4314 }
4315
4316 // Just widen the subvector by inserting into an undef wide vector.
4317 if (Widen)
4318 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4319 }
4320
4321 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4322
4323 // Perform the 512-bit op then extract the bottom subvector.
4324 if (Widen)
4325 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4326 return Res;
4327}
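// In effect: on an AVX512F target without VLX, a 256-bit node built through
// getAVX512Node is widened to 512 bits (vector operands inserted into undef
// wide vectors, splatted 32/64-bit constants rebuilt at the wide type), the op
// is emitted at 512 bits, and the low 256 bits are extracted afterwards; with
// VLX (or an already 512-bit type) the node is emitted at its original width.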
4328
4329/// Insert i1-subvector to i1-vector.
4330static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4331 const X86Subtarget &Subtarget) {
4332
4333 SDLoc dl(Op);
4334 SDValue Vec = Op.getOperand(0);
4335 SDValue SubVec = Op.getOperand(1);
4336 SDValue Idx = Op.getOperand(2);
4337 unsigned IdxVal = Op.getConstantOperandVal(2);
4338
4339 // Inserting undef is a nop. We can just return the original vector.
4340 if (SubVec.isUndef())
4341 return Vec;
4342
4343 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4344 return Op;
4345
4346 MVT OpVT = Op.getSimpleValueType();
4347 unsigned NumElems = OpVT.getVectorNumElements();
4348 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4349
4350 // Extend to natively supported kshift.
4351 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4352
4353 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4354 // if necessary.
4355 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4356 // May need to promote to a legal type.
4357 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4358 DAG.getConstant(0, dl, WideOpVT),
4359 SubVec, Idx);
4360 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4361 }
4362
4363 MVT SubVecVT = SubVec.getSimpleValueType();
4364 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4365 assert(IdxVal + SubVecNumElems <= NumElems &&
4366 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4367 "Unexpected index value in INSERT_SUBVECTOR");
4368
4369 SDValue Undef = DAG.getUNDEF(WideOpVT);
4370
4371 if (IdxVal == 0) {
4372 // Zero lower bits of the Vec
4373 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4374 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4375 ZeroIdx);
4376 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4377 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4378 // Merge them together, SubVec should be zero extended.
4379 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4380 DAG.getConstant(0, dl, WideOpVT),
4381 SubVec, ZeroIdx);
4382 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4383 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4384 }
4385
4386 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4387 Undef, SubVec, ZeroIdx);
4388
4389 if (Vec.isUndef()) {
4390 assert(IdxVal != 0 && "Unexpected index");
4391 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4392 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4393 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4394 }
4395
4396 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4397 assert(IdxVal != 0 && "Unexpected index");
4398 // If upper elements of Vec are known undef, then just shift into place.
4399 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4400 [](SDValue V) { return V.isUndef(); })) {
4401 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4402 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4403 } else {
4404 NumElems = WideOpVT.getVectorNumElements();
4405 unsigned ShiftLeft = NumElems - SubVecNumElems;
4406 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4407 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4408 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4409 if (ShiftRight != 0)
4410 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4411 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4412 }
4413 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4414 }
4415
4416 // Simple case when we put subvector in the upper part
4417 if (IdxVal + SubVecNumElems == NumElems) {
4418 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4419 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4420 if (SubVecNumElems * 2 == NumElems) {
4421 // Special case, use legal zero extending insert_subvector. This allows
4422 // isel to optimize when bits are known zero.
4423 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4424 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4425 DAG.getConstant(0, dl, WideOpVT),
4426 Vec, ZeroIdx);
4427 } else {
4428 // Otherwise use explicit shifts to zero the bits.
4429 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4430 Undef, Vec, ZeroIdx);
4431 NumElems = WideOpVT.getVectorNumElements();
4432 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4433 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4434 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4435 }
4436 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4438 }
4439
4440 // Inserting into the middle is more complicated.
4441
4442 NumElems = WideOpVT.getVectorNumElements();
4443
4444 // Widen the vector if needed.
4445 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4446
4447 unsigned ShiftLeft = NumElems - SubVecNumElems;
4448 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4449
4450 // Do an optimization for the most frequently used types.
4451 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4452 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4453 Mask0.flipAllBits();
4454 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4455 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4456 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4457 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4458 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4459 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4460 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4461 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4462
4463 // Reduce to original width if needed.
4464 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4465 }
4466
4467 // Clear the upper bits of the subvector and move it to its insert position.
4468 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4469 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4470 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4471 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4472
4473 // Isolate the bits below the insertion point.
4474 unsigned LowShift = NumElems - IdxVal;
4475 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4476 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4477 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4478 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4479
4480 // Isolate the bits after the last inserted bit.
4481 unsigned HighShift = IdxVal + SubVecNumElems;
4482 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4483 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4484 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4485 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4486
4487 // Now OR all 3 pieces together.
4488 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4489 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4490
4491 // Reduce to original width if needed.
4492 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4493}
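// Worked example for the middle-insertion path: inserting a v2i1 subvector at
// index 4 of a v16i1 vector gives ShiftLeft = 16 - 2 = 14 and
// ShiftRight = 16 - 2 - 4 = 10, so the KSHIFTL/KSHIFTR pair leaves the two
// subvector bits isolated at positions [4,6), the destination has those bit
// positions cleared, and a final OR merges the pieces.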
4494
4495static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4496 const SDLoc &dl) {
4497 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4498 EVT SubVT = V1.getValueType();
4499 EVT SubSVT = SubVT.getScalarType();
4500 unsigned SubNumElts = SubVT.getVectorNumElements();
4501 unsigned SubVectorWidth = SubVT.getSizeInBits();
4502 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4503 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4504 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4505}
4506
4507/// Returns a vector of specified type with all bits set.
4508/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4509/// Then bitcast to their original type, ensuring they get CSE'd.
4510static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4511 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4512 "Expected a 128/256/512-bit vector type");
4513 unsigned NumElts = VT.getSizeInBits() / 32;
4514 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4515 return DAG.getBitcast(VT, Vec);
4516}
4517
4518static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4519 SDValue In, SelectionDAG &DAG) {
4520 EVT InVT = In.getValueType();
4521 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4522 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4523 ISD::ZERO_EXTEND == Opcode) &&
4524 "Unknown extension opcode");
4525
4526 // For 256-bit vectors, we only need the lower (128-bit) input half.
4527 // For 512-bit vectors, we only need the lower input half or quarter.
4528 if (InVT.getSizeInBits() > 128) {
4529 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4530 "Expected VTs to be the same size!");
4531 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4532 In = extractSubVector(In, 0, DAG, DL,
4533 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4534 InVT = In.getValueType();
4535 }
4536
4537 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4538 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4539
4540 return DAG.getNode(Opcode, DL, VT, In);
4541}
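// Example: extending v32i8 to v8i32 (both 256-bit) only requires the low
// 128 bits of the input, so the input is narrowed to v16i8 and, because the
// element counts now differ (16 vs 8), the opcode becomes the matching
// *_EXTEND_VECTOR_INREG form, which extends just the low 8 bytes.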
4542
4543// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4544static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4545 SDValue Mask, SelectionDAG &DAG) {
4546 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4547 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4548 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4549}
4550
4551static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4552 bool Lo, bool Unary) {
4553 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4554 "Illegal vector type to unpack");
4555 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4556 int NumElts = VT.getVectorNumElements();
4557 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4558 for (int i = 0; i < NumElts; ++i) {
4559 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4560 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4561 Pos += (Unary ? 0 : NumElts * (i % 2));
4562 Pos += (Lo ? 0 : NumEltsInLane / 2);
4563 Mask.push_back(Pos);
4564 }
4565}
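// For example, v8i32 (two 128-bit lanes) with Lo=true and Unary=false yields
// <0, 8, 1, 9, 4, 12, 5, 13>, the per-lane interleave performed by
// VPUNPCKLDQ on 256-bit registers.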
4566
4567/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4568/// imposed by AVX and specific to the unary pattern. Example:
4569/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4570/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4571static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4572 bool Lo) {
4573 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4574 int NumElts = VT.getVectorNumElements();
4575 for (int i = 0; i < NumElts; ++i) {
4576 int Pos = i / 2;
4577 Pos += (Lo ? 0 : NumElts / 2);
4578 Mask.push_back(Pos);
4579 }
4580}
4581
4582// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4583static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4584 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4585 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4586 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4587 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4588 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4589 int M = Mask[I];
4590 if (M < 0)
4591 continue;
4592 SDValue V = (M < NumElts) ? V1 : V2;
4593 if (V.isUndef())
4594 continue;
4595 Ops[I] = V.getOperand(M % NumElts);
4596 }
4597 return DAG.getBuildVector(VT, dl, Ops);
4598 }
4599
4600 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4601}
4602
4603/// Returns a vector_shuffle node for an unpackl operation.
4604static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4605 SDValue V1, SDValue V2) {
4606 SmallVector<int, 8> Mask;
4607 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4608 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4609}
4610
4611/// Returns a vector_shuffle node for an unpackh operation.
4612static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4613 SDValue V1, SDValue V2) {
4614 SmallVector<int, 8> Mask;
4615 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4616 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4617}
4618
4619/// Returns a node that packs the LHS + RHS nodes together at half width.
4620/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4621/// TODO: Add subvector splitting if/when we have a need for it.
4622static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4623 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4624 bool PackHiHalf = false) {
4625 MVT OpVT = LHS.getSimpleValueType();
4626 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4627 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4628 assert(OpVT == RHS.getSimpleValueType() &&
4629 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4630 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4631 "Unexpected PACK operand types");
4632 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4633 "Unexpected PACK result type");
4634
4635 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4636 if (EltSizeInBits == 32) {
4637 SmallVector<int> PackMask;
4638 int Offset = PackHiHalf ? 1 : 0;
4639 int NumElts = VT.getVectorNumElements();
4640 for (int I = 0; I != NumElts; I += 4) {
4641 PackMask.push_back(I + Offset);
4642 PackMask.push_back(I + Offset + 2);
4643 PackMask.push_back(I + Offset + NumElts);
4644 PackMask.push_back(I + Offset + NumElts + 2);
4645 }
4646 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4647 DAG.getBitcast(VT, RHS), PackMask);
4648 }
4649
4650 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4651 if (!PackHiHalf) {
4652 if (UsePackUS &&
4653 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4654 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4655 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4656
4657 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4658 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4659 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4660 }
4661
4662 // Fallback to sign/zero extending the requested half and pack.
4663 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4664 if (UsePackUS) {
4665 if (PackHiHalf) {
4666 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4667 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4668 } else {
4669 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4670 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4671 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4672 };
4673 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4674 };
4675
4676 if (!PackHiHalf) {
4677 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4678 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4679 }
4680 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4681 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4682 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4683}
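// For example, with VT = v16i16 and v8i32 inputs this emits PACKUSDW directly
// when the upper 16 bits of every element are already known zero, PACKSSDW
// when the inputs are known to fit as signed i16, and otherwise masks or
// shifts the requested half into the low 16 bits first so that pack
// saturation cannot change the result.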
4684
4685/// Return a vector_shuffle of the specified vector of zero or undef vector.
4686/// This produces a shuffle where the low element of V2 is swizzled into the
4687/// zero/undef vector, landing at element Idx.
4688/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4689static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4690 bool IsZero,
4691 const X86Subtarget &Subtarget,
4692 SelectionDAG &DAG) {
4693 MVT VT = V2.getSimpleValueType();
4694 SDValue V1 = IsZero
4695 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4696 int NumElems = VT.getVectorNumElements();
4697 SmallVector<int, 16> MaskVec(NumElems);
4698 for (int i = 0; i != NumElems; ++i)
4699 // If this is the insertion idx, put the low elt of V2 here.
4700 MaskVec[i] = (i == Idx) ? NumElems : i;
4701 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4702}
4703
4704static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4705 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4706 Ptr.getOpcode() == X86ISD::WrapperRIP)
4707 Ptr = Ptr.getOperand(0);
4708 return dyn_cast<ConstantPoolSDNode>(Ptr);
4709}
4710
4711// TODO: Add support for non-zero offsets.
4712static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4713 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4714 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4715 return nullptr;
4716 return CNode->getConstVal();
4717}
4718
4719static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4720 if (!Load || !ISD::isNormalLoad(Load))
4721 return nullptr;
4722 return getTargetConstantFromBasePtr(Load->getBasePtr());
4723}
4724
4725static const Constant *getTargetConstantFromNode(SDValue Op) {
4726 Op = peekThroughBitcasts(Op);
4727 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4728}
4729
4730const Constant *
4731X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4732 assert(LD && "Unexpected null LoadSDNode");
4733 return getTargetConstantFromNode(LD);
4734}
4735
4736// Extract raw constant bits from constant pools.
4737static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4738 APInt &UndefElts,
4739 SmallVectorImpl<APInt> &EltBits,
4740 bool AllowWholeUndefs = true,
4741 bool AllowPartialUndefs = false) {
4742 assert(EltBits.empty() && "Expected an empty EltBits vector");
4743
4744 Op = peekThroughBitcasts(Op);
4745
4746 EVT VT = Op.getValueType();
4747 unsigned SizeInBits = VT.getSizeInBits();
4748 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4749 unsigned NumElts = SizeInBits / EltSizeInBits;
4750
4751 // Bitcast a source array of element bits to the target size.
4752 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4753 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4754 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4755 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4756 "Constant bit sizes don't match");
4757
4758 // Don't split if we don't allow undef bits.
4759 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4760 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4761 return false;
4762
4763 // If we're already the right size, don't bother bitcasting.
4764 if (NumSrcElts == NumElts) {
4765 UndefElts = UndefSrcElts;
4766 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4767 return true;
4768 }
4769
4770 // Extract all the undef/constant element data and pack into single bitsets.
4771 APInt UndefBits(SizeInBits, 0);
4772 APInt MaskBits(SizeInBits, 0);
4773
4774 for (unsigned i = 0; i != NumSrcElts; ++i) {
4775 unsigned BitOffset = i * SrcEltSizeInBits;
4776 if (UndefSrcElts[i])
4777 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4778 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4779 }
4780
4781 // Split the undef/constant single bitset data into the target elements.
4782 UndefElts = APInt(NumElts, 0);
4783 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4784
4785 for (unsigned i = 0; i != NumElts; ++i) {
4786 unsigned BitOffset = i * EltSizeInBits;
4787 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4788
4789 // Only treat an element as UNDEF if all bits are UNDEF.
4790 if (UndefEltBits.isAllOnes()) {
4791 if (!AllowWholeUndefs)
4792 return false;
4793 UndefElts.setBit(i);
4794 continue;
4795 }
4796
4797 // If only some bits are UNDEF then treat them as zero (or bail if not
4798 // supported).
4799 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4800 return false;
4801
4802 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4803 }
4804 return true;
4805 };
4806
4807 // Collect constant bits and insert into mask/undef bit masks.
4808 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4809 unsigned UndefBitIndex) {
4810 if (!Cst)
4811 return false;
4812 if (isa<UndefValue>(Cst)) {
4813 Undefs.setBit(UndefBitIndex);
4814 return true;
4815 }
4816 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4817 Mask = CInt->getValue();
4818 return true;
4819 }
4820 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4821 Mask = CFP->getValueAPF().bitcastToAPInt();
4822 return true;
4823 }
4824 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4825 Type *Ty = CDS->getType();
4826 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4827 Type *EltTy = CDS->getElementType();
4828 bool IsInteger = EltTy->isIntegerTy();
4829 bool IsFP =
4830 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4831 if (!IsInteger && !IsFP)
4832 return false;
4833 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4834 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4835 if (IsInteger)
4836 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4837 else
4838 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4839 I * EltBits);
4840 return true;
4841 }
4842 return false;
4843 };
4844
4845 // Handle UNDEFs.
4846 if (Op.isUndef()) {
4847 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4848 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4849 return CastBitData(UndefSrcElts, SrcEltBits);
4850 }
4851
4852 // Extract scalar constant bits.
4853 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4854 APInt UndefSrcElts = APInt::getZero(1);
4855 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4856 return CastBitData(UndefSrcElts, SrcEltBits);
4857 }
4858 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4859 APInt UndefSrcElts = APInt::getZero(1);
4860 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4861 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4862 return CastBitData(UndefSrcElts, SrcEltBits);
4863 }
4864
4865 // Extract constant bits from build vector.
4866 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4867 BitVector Undefs;
4868 SmallVector<APInt> SrcEltBits;
4869 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4870 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4871 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4872 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4873 if (Undefs[I])
4874 UndefSrcElts.setBit(I);
4875 return CastBitData(UndefSrcElts, SrcEltBits);
4876 }
4877 }
4878
4879 // Extract constant bits from constant pool vector.
4880 if (auto *Cst = getTargetConstantFromNode(Op)) {
4881 Type *CstTy = Cst->getType();
4882 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4883 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4884 return false;
4885
4886 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4887 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4888 if ((SizeInBits % SrcEltSizeInBits) != 0)
4889 return false;
4890
4891 APInt UndefSrcElts(NumSrcElts, 0);
4892 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4893 for (unsigned i = 0; i != NumSrcElts; ++i)
4894 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4895 UndefSrcElts, i))
4896 return false;
4897
4898 return CastBitData(UndefSrcElts, SrcEltBits);
4899 }
4900
4901 // Extract constant bits from a broadcasted constant pool scalar.
4902 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4903 EltSizeInBits <= VT.getScalarSizeInBits()) {
4904 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4905 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4906 return false;
4907
4908 SDValue Ptr = MemIntr->getBasePtr();
4909 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4910 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4911 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4912
4913 APInt UndefSrcElts(NumSrcElts, 0);
4914 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4915 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4916 if (UndefSrcElts[0])
4917 UndefSrcElts.setBits(0, NumSrcElts);
4918 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4919 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4920 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4921 return CastBitData(UndefSrcElts, SrcEltBits);
4922 }
4923 }
4924 }
4925
4926 // Extract constant bits from a subvector broadcast.
4927 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4928 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4929 SDValue Ptr = MemIntr->getBasePtr();
4930 // The source constant may be larger than the subvector broadcast,
4931 // ensure we extract the correct subvector constants.
4932 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4933 Type *CstTy = Cst->getType();
4934 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4935 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4936 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4937 (SizeInBits % SubVecSizeInBits) != 0)
4938 return false;
4939 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4940 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4941 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4942 APInt UndefSubElts(NumSubElts, 0);
4943 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4944 APInt(CstEltSizeInBits, 0));
4945 for (unsigned i = 0; i != NumSubElts; ++i) {
4946 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4947 UndefSubElts, i))
4948 return false;
4949 for (unsigned j = 1; j != NumSubVecs; ++j)
4950 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4951 }
4952 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4953 UndefSubElts);
4954 return CastBitData(UndefSubElts, SubEltBits);
4955 }
4956 }
4957
4958 // Extract a rematerialized scalar constant insertion.
4959 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4960 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4961 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4962 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4963 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4964
4965 APInt UndefSrcElts(NumSrcElts, 0);
4966 SmallVector<APInt, 64> SrcEltBits;
4967 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4968 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4969 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4970 return CastBitData(UndefSrcElts, SrcEltBits);
4971 }
4972
4973 // Insert constant bits from a base and sub vector sources.
4974 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4975 // If bitcasting to larger elements we might lose track of undefs - to be
4976 // safe, don't allow any.
4977 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4978 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4979
4980 APInt UndefSrcElts, UndefSubElts;
4981 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4982 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4983 UndefSubElts, EltSubBits,
4984 AllowWholeUndefs && AllowUndefs,
4985 AllowPartialUndefs && AllowUndefs) &&
4986 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4987 UndefSrcElts, EltSrcBits,
4988 AllowWholeUndefs && AllowUndefs,
4989 AllowPartialUndefs && AllowUndefs)) {
4990 unsigned BaseIdx = Op.getConstantOperandVal(2);
4991 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4992 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4993 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4994 return CastBitData(UndefSrcElts, EltSrcBits);
4995 }
4996 }
4997
4998 // Extract constant bits from a subvector's source.
4999 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5000 // TODO - support extract_subvector through bitcasts.
5001 if (EltSizeInBits != VT.getScalarSizeInBits())
5002 return false;
5003
5004 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5005 UndefElts, EltBits, AllowWholeUndefs,
5006 AllowPartialUndefs)) {
5007 EVT SrcVT = Op.getOperand(0).getValueType();
5008 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5009 unsigned NumSubElts = VT.getVectorNumElements();
5010 unsigned BaseIdx = Op.getConstantOperandVal(1);
5011 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5012 if ((BaseIdx + NumSubElts) != NumSrcElts)
5013 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5014 if (BaseIdx != 0)
5015 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5016 return true;
5017 }
5018 }
5019
5020 // Extract constant bits from shuffle node sources.
5021 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5022 // TODO - support shuffle through bitcasts.
5023 if (EltSizeInBits != VT.getScalarSizeInBits())
5024 return false;
5025
5026 ArrayRef<int> Mask = SVN->getMask();
5027 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5028 llvm::any_of(Mask, [](int M) { return M < 0; }))
5029 return false;
5030
5031 APInt UndefElts0, UndefElts1;
5032 SmallVector<APInt, 32> EltBits0, EltBits1;
5033 if (isAnyInRange(Mask, 0, NumElts) &&
5034 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5035 UndefElts0, EltBits0, AllowWholeUndefs,
5036 AllowPartialUndefs))
5037 return false;
5038 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5039 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5040 UndefElts1, EltBits1, AllowWholeUndefs,
5041 AllowPartialUndefs))
5042 return false;
5043
5044 UndefElts = APInt::getZero(NumElts);
5045 for (int i = 0; i != (int)NumElts; ++i) {
5046 int M = Mask[i];
5047 if (M < 0) {
5048 UndefElts.setBit(i);
5049 EltBits.push_back(APInt::getZero(EltSizeInBits));
5050 } else if (M < (int)NumElts) {
5051 if (UndefElts0[M])
5052 UndefElts.setBit(i);
5053 EltBits.push_back(EltBits0[M]);
5054 } else {
5055 if (UndefElts1[M - NumElts])
5056 UndefElts.setBit(i);
5057 EltBits.push_back(EltBits1[M - NumElts]);
5058 }
5059 }
5060 return true;
5061 }
5062
5063 return false;
5064}
5065
5066namespace llvm {
5067namespace X86 {
5068bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5069 APInt UndefElts;
5070 SmallVector<APInt, 16> EltBits;
5071 if (getTargetConstantBitsFromNode(
5072 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5073 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5074 int SplatIndex = -1;
5075 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5076 if (UndefElts[i])
5077 continue;
5078 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5079 SplatIndex = -1;
5080 break;
5081 }
5082 SplatIndex = i;
5083 }
5084 if (0 <= SplatIndex) {
5085 SplatVal = EltBits[SplatIndex];
5086 return true;
5087 }
5088 }
5089
5090 return false;
5091}
5092} // namespace X86
5093} // namespace llvm
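// A usage sketch of X86::isConstantSplat (the surrounding rewrite is
// hypothetical):
//   APInt SplatVal;
//   if (X86::isConstantSplat(Op1, SplatVal, /*AllowPartialUndefs=*/false) &&
//       SplatVal.isPowerOf2()) {
//     // e.g. turn a multiply by a splatted power-of-two into a vector shift.
//   }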
5094
5095static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5096 unsigned MaskEltSizeInBits,
5097 SmallVectorImpl<uint64_t> &RawMask,
5098 APInt &UndefElts) {
5099 // Extract the raw target constant bits.
5100 SmallVector<APInt, 64> EltBits;
5101 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5102 EltBits, /* AllowWholeUndefs */ true,
5103 /* AllowPartialUndefs */ false))
5104 return false;
5105
5106 // Insert the extracted elements into the mask.
5107 for (const APInt &Elt : EltBits)
5108 RawMask.push_back(Elt.getZExtValue());
5109
5110 return true;
5111}
5112
5113// Match not(xor X, -1) -> X.
5114// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5115// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5116// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5117static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5118 V = peekThroughBitcasts(V);
5119 if (V.getOpcode() == ISD::XOR &&
5120 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5121 isAllOnesConstant(V.getOperand(1))))
5122 return V.getOperand(0);
5123 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5124 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5125 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5126 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5127 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5128 Not, V.getOperand(1));
5129 }
5130 }
5131 if (V.getOpcode() == X86ISD::PCMPGT &&
5132 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5133 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5134 V.getOperand(0).hasOneUse()) {
5135 APInt UndefElts;
5136 SmallVector<APInt> EltBits;
5137 if (getTargetConstantBitsFromNode(V.getOperand(0),
5138 V.getScalarValueSizeInBits(), UndefElts,
5139 EltBits)) {
5140 // Don't fold min_signed_value -> (min_signed_value - 1)
5141 bool MinSigned = false;
5142 for (APInt &Elt : EltBits) {
5143 MinSigned |= Elt.isMinSignedValue();
5144 Elt -= 1;
5145 }
5146 if (!MinSigned) {
5147 SDLoc DL(V);
5148 MVT VT = V.getSimpleValueType();
5149 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5150 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5151 }
5152 }
5153 }
5154 SmallVector<SDValue, 2> CatOps;
5155 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5156 for (SDValue &CatOp : CatOps) {
5157 SDValue NotCat = IsNOT(CatOp, DAG);
5158 if (!NotCat) return SDValue();
5159 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5160 }
5161 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5162 }
5163 return SDValue();
5164}
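// Example of the PCMPGT rule above: not(pcmpgt(<i32 5>, X)) becomes
// pcmpgt(X, <i32 4>), since !(5 > x) == (x >= 5) == (x > 4) for signed x; the
// rewrite is skipped whenever a constant element is the minimum signed value,
// where subtracting 1 would wrap.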
5165
5166/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5167/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5168/// Note: This ignores saturation, so inputs must be checked first.
5169static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5170 bool Unary, unsigned NumStages = 1) {
5171 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5172 unsigned NumElts = VT.getVectorNumElements();
5173 unsigned NumLanes = VT.getSizeInBits() / 128;
5174 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5175 unsigned Offset = Unary ? 0 : NumElts;
5176 unsigned Repetitions = 1u << (NumStages - 1);
5177 unsigned Increment = 1u << NumStages;
5178 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5179
5180 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5181 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5182 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5183 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5184 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5185 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5186 }
5187 }
5188}
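// For example, a single-stage binary pack viewed as v16i8 produces
// <0, 2, 4, ..., 14, 16, 18, ..., 30>: the even (low) bytes of the first v8i16
// source followed by the even bytes of the second, which is exactly the
// truncation pattern of PACKUSWB/PACKSSWB once saturation is ignored.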
5189
5190// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5191static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5192 APInt &DemandedLHS, APInt &DemandedRHS) {
5193 int NumLanes = VT.getSizeInBits() / 128;
5194 int NumElts = DemandedElts.getBitWidth();
5195 int NumInnerElts = NumElts / 2;
5196 int NumEltsPerLane = NumElts / NumLanes;
5197 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5198
5199 DemandedLHS = APInt::getZero(NumInnerElts);
5200 DemandedRHS = APInt::getZero(NumInnerElts);
5201
5202 // Map DemandedElts to the packed operands.
5203 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5204 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5205 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5206 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5207 if (DemandedElts[OuterIdx])
5208 DemandedLHS.setBit(InnerIdx);
5209 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5210 DemandedRHS.setBit(InnerIdx);
5211 }
5212 }
5213}
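// For example, for a v16i8 pack result (a single 128-bit lane), demanding
// result elements 3 and 11 translates to demanding element 3 of the LHS and
// element 3 of the RHS respectively, since the low 8 result bytes come from
// the LHS and the high 8 from the RHS.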
5214
5215// Split the demanded elts of a HADD/HSUB node between its operands.
5216static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5217 APInt &DemandedLHS, APInt &DemandedRHS) {
5218 int NumLanes = VT.getSizeInBits() / 128;
5219 int NumElts = DemandedElts.getBitWidth();
5220 int NumEltsPerLane = NumElts / NumLanes;
5221 int HalfEltsPerLane = NumEltsPerLane / 2;
5222
5223 DemandedLHS = APInt::getZero(NumElts);
5224 DemandedRHS = APInt::getZero(NumElts);
5225
5226 // Map DemandedElts to the horizontal operands.
5227 for (int Idx = 0; Idx != NumElts; ++Idx) {
5228 if (!DemandedElts[Idx])
5229 continue;
5230 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5231 int LocalIdx = Idx % NumEltsPerLane;
5232 if (LocalIdx < HalfEltsPerLane) {
5233 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5234 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5235 } else {
5236 LocalIdx -= HalfEltsPerLane;
5237 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5238 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5239 }
5240 }
5241}
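// For example, for a v8i32 HADD, demanding result element 1 requires source
// elements 2 and 3 of the LHS, while demanding result element 6 (upper half of
// the second 128-bit lane) requires source elements 4 and 5 of the RHS.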
5242
5243/// Calculates the shuffle mask corresponding to the target-specific opcode.
5244/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5245/// operands in \p Ops, and returns true.
5246/// Sets \p IsUnary to true if only one source is used. Note that this will set
5247/// IsUnary for shuffles which use a single input multiple times, and in those
5248/// cases it will adjust the mask to only have indices within that single input.
5249/// It is an error to call this with non-empty Mask/Ops vectors.
5250static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5251 SmallVectorImpl<SDValue> &Ops,
5252 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5253 if (!isTargetShuffle(N.getOpcode()))
5254 return false;
5255
5256 MVT VT = N.getSimpleValueType();
5257 unsigned NumElems = VT.getVectorNumElements();
5258 unsigned MaskEltSize = VT.getScalarSizeInBits();
5259 SmallVector<uint64_t, 32> RawMask;
5260 APInt RawUndefs;
5261 uint64_t ImmN;
5262
5263 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5264 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5265
5266 IsUnary = false;
5267 bool IsFakeUnary = false;
5268 switch (N.getOpcode()) {
5269 case X86ISD::BLENDI:
5270 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5271 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5272 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5273 DecodeBLENDMask(NumElems, ImmN, Mask);
5274 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5275 break;
5276 case X86ISD::SHUFP:
5277 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5278 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5279 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5280 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5281 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5282 break;
5283 case X86ISD::INSERTPS:
5284 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5285 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5286 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5287 DecodeINSERTPSMask(ImmN, Mask);
5288 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5289 break;
5290 case X86ISD::EXTRQI:
5291 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5292 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5293 isa<ConstantSDNode>(N.getOperand(2))) {
5294 int BitLen = N.getConstantOperandVal(1);
5295 int BitIdx = N.getConstantOperandVal(2);
5296 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5297 IsUnary = true;
5298 }
5299 break;
5300 case X86ISD::INSERTQI:
5301 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5302 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5303 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5304 isa<ConstantSDNode>(N.getOperand(3))) {
5305 int BitLen = N.getConstantOperandVal(2);
5306 int BitIdx = N.getConstantOperandVal(3);
5307 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5308 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5309 }
5310 break;
5311 case X86ISD::UNPCKH:
5312 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5313 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5314 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5315 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5316 break;
5317 case X86ISD::UNPCKL:
5318 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5319 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5320 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5321 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5322 break;
5323 case X86ISD::MOVHLPS:
5324 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5325 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5326 DecodeMOVHLPSMask(NumElems, Mask);
5327 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5328 break;
5329 case X86ISD::MOVLHPS:
5330 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5331 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5332 DecodeMOVLHPSMask(NumElems, Mask);
5333 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5334 break;
5335 case X86ISD::VALIGN:
5336 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5337 "Only 32-bit and 64-bit elements are supported!");
5338 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5339 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5340 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5341 DecodeVALIGNMask(NumElems, ImmN, Mask);
5342 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5343 Ops.push_back(N.getOperand(1));
5344 Ops.push_back(N.getOperand(0));
5345 break;
5346 case X86ISD::PALIGNR:
5347 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5348 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5349 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5350 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5351 DecodePALIGNRMask(NumElems, ImmN, Mask);
5352 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5353 Ops.push_back(N.getOperand(1));
5354 Ops.push_back(N.getOperand(0));
5355 break;
5356 case X86ISD::VSHLDQ:
5357 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5358 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5359 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5360 DecodePSLLDQMask(NumElems, ImmN, Mask);
5361 IsUnary = true;
5362 break;
5363 case X86ISD::VSRLDQ:
5364 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5365 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5366 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5367 DecodePSRLDQMask(NumElems, ImmN, Mask);
5368 IsUnary = true;
5369 break;
5370 case X86ISD::PSHUFD:
5371 case X86ISD::VPERMILPI:
5372 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5373 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5374 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5375 IsUnary = true;
5376 break;
5377 case X86ISD::PSHUFHW:
5378 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5379 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5380 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5381 IsUnary = true;
5382 break;
5383 case X86ISD::PSHUFLW:
5384 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5385 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5386 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5387 IsUnary = true;
5388 break;
5389 case X86ISD::VZEXT_MOVL:
5390 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5391 DecodeZeroMoveLowMask(NumElems, Mask);
5392 IsUnary = true;
5393 break;
5394 case X86ISD::VBROADCAST:
5395 // We only decode broadcasts of same-sized vectors; peeking through to
5396 // extracted subvectors is likely to cause hasOneUse issues with
5397 // SimplifyDemandedBits etc.
5398 if (N.getOperand(0).getValueType() == VT) {
5399 DecodeVectorBroadcast(NumElems, Mask);
5400 IsUnary = true;
5401 break;
5402 }
5403 return false;
5404 case X86ISD::VPERMILPV: {
5405 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5406 IsUnary = true;
5407 SDValue MaskNode = N.getOperand(1);
5408 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5409 RawUndefs)) {
5410 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5411 break;
5412 }
5413 return false;
5414 }
5415 case X86ISD::PSHUFB: {
5416 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5417 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5418 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5419 IsUnary = true;
5420 SDValue MaskNode = N.getOperand(1);
5421 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5422 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5423 break;
5424 }
5425 return false;
5426 }
5427 case X86ISD::VPERMI:
5428 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5429 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5430 DecodeVPERMMask(NumElems, ImmN, Mask);
5431 IsUnary = true;
5432 break;
5433 case X86ISD::MOVSS:
5434 case X86ISD::MOVSD:
5435 case X86ISD::MOVSH:
5436 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5437 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5438 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5439 break;
5440 case X86ISD::VPERM2X128:
5441 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5442 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5443 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5444 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5445 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5446 break;
5447 case X86ISD::SHUF128:
5448 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5449 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5450 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5451 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5452 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5453 break;
5454 case X86ISD::MOVSLDUP:
5455 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5456 DecodeMOVSLDUPMask(NumElems, Mask);
5457 IsUnary = true;
5458 break;
5459 case X86ISD::MOVSHDUP:
5460 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5461 DecodeMOVSHDUPMask(NumElems, Mask);
5462 IsUnary = true;
5463 break;
5464 case X86ISD::MOVDDUP:
5465 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5466 DecodeMOVDDUPMask(NumElems, Mask);
5467 IsUnary = true;
5468 break;
5469 case X86ISD::VPERMIL2: {
5470 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5471 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5472 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5473 SDValue MaskNode = N.getOperand(2);
5474 SDValue CtrlNode = N.getOperand(3);
5475 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5476 unsigned CtrlImm = CtrlOp->getZExtValue();
5477 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5478 RawUndefs)) {
5479 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5480 Mask);
5481 break;
5482 }
5483 }
5484 return false;
5485 }
5486 case X86ISD::VPPERM: {
5487 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5488 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5489 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5490 SDValue MaskNode = N.getOperand(2);
5491 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5492 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5493 break;
5494 }
5495 return false;
5496 }
5497 case X86ISD::VPERMV: {
5498 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5499 IsUnary = true;
5500 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5501 Ops.push_back(N.getOperand(1));
5502 SDValue MaskNode = N.getOperand(0);
5503 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5504 RawUndefs)) {
5505 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5506 break;
5507 }
5508 return false;
5509 }
5510 case X86ISD::VPERMV3: {
5511 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5512 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5513 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5514 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5515 Ops.push_back(N.getOperand(0));
5516 Ops.push_back(N.getOperand(2));
5517 SDValue MaskNode = N.getOperand(1);
5518 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5519 RawUndefs)) {
5520 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5521 break;
5522 }
5523 return false;
5524 }
5525 default:
5526 llvm_unreachable("unknown target shuffle node");
5527 }
5528
5529 // Empty mask indicates the decode failed.
5530 if (Mask.empty())
5531 return false;
5532
5533 // Check if we're getting a shuffle mask with zero'd elements.
5534 if (!AllowSentinelZero && isAnyZero(Mask))
5535 return false;
5536
5537 // If we have a fake unary shuffle, the shuffle mask is spread across two
5538 // inputs that are actually the same node. Re-map the mask to always point
5539 // into the first input.
5540 if (IsFakeUnary)
5541 for (int &M : Mask)
5542 if (M >= (int)Mask.size())
5543 M -= Mask.size();
5544
5545 // If we didn't already add operands in the opcode-specific code, default to
5546 // adding 1 or 2 operands starting at 0.
5547 if (Ops.empty()) {
5548 Ops.push_back(N.getOperand(0));
5549 if (!IsUnary || IsFakeUnary)
5550 Ops.push_back(N.getOperand(1));
5551 }
5552
5553 return true;
5554}
5555
5556// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5557static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5558 SmallVectorImpl<SDValue> &Ops,
5559 SmallVectorImpl<int> &Mask) {
5560 bool IsUnary;
5561 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5562}
5563
5564/// Compute whether each element of a shuffle is zeroable.
5565///
5566/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5567/// Either it is an undef element in the shuffle mask, the element of the input
5568/// referenced is undef, or the element of the input referenced is known to be
5569/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5570/// as many lanes with this technique as possible to simplify the remaining
5571/// shuffle.
5572static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5573 SDValue V1, SDValue V2,
5574 APInt &KnownUndef, APInt &KnownZero) {
5575 int Size = Mask.size();
5576 KnownUndef = KnownZero = APInt::getZero(Size);
5577
5578 V1 = peekThroughBitcasts(V1);
5579 V2 = peekThroughBitcasts(V2);
5580
5581 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5582 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5583
5584 int VectorSizeInBits = V1.getValueSizeInBits();
5585 int ScalarSizeInBits = VectorSizeInBits / Size;
5586 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5587
5588 for (int i = 0; i < Size; ++i) {
5589 int M = Mask[i];
5590 // Handle the easy cases.
5591 if (M < 0) {
5592 KnownUndef.setBit(i);
5593 continue;
5594 }
5595 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5596 KnownZero.setBit(i);
5597 continue;
5598 }
5599
5600 // Determine shuffle input and normalize the mask.
5601 SDValue V = M < Size ? V1 : V2;
5602 M %= Size;
5603
5604 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5605 if (V.getOpcode() != ISD::BUILD_VECTOR)
5606 continue;
5607
5608 // If the BUILD_VECTOR has fewer (wider) elements, then the bitcasted portion of
5609 // the (larger) source element must be UNDEF/ZERO.
5610 if ((Size % V.getNumOperands()) == 0) {
5611 int Scale = Size / V->getNumOperands();
5612 SDValue Op = V.getOperand(M / Scale);
5613 if (Op.isUndef())
5614 KnownUndef.setBit(i);
5615 if (X86::isZeroNode(Op))
5616 KnownZero.setBit(i);
5617 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5618 APInt Val = Cst->getAPIntValue();
5619 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5620 if (Val == 0)
5621 KnownZero.setBit(i);
5622 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5623 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5624 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5625 if (Val == 0)
5626 KnownZero.setBit(i);
5627 }
5628 continue;
5629 }
5630
5631 // If the BUILD_VECTOR has more (narrower) elements, then all the (smaller) source
5632 // elements must be UNDEF or ZERO.
5633 if ((V.getNumOperands() % Size) == 0) {
5634 int Scale = V->getNumOperands() / Size;
5635 bool AllUndef = true;
5636 bool AllZero = true;
5637 for (int j = 0; j < Scale; ++j) {
5638 SDValue Op = V.getOperand((M * Scale) + j);
5639 AllUndef &= Op.isUndef();
5640 AllZero &= X86::isZeroNode(Op);
5641 }
5642 if (AllUndef)
5643 KnownUndef.setBit(i);
5644 if (AllZero)
5645 KnownZero.setBit(i);
5646 continue;
5647 }
5648 }
5649}
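// For example, shuffling V1 = <a, b, undef, d> against V2 = zeroinitializer
// with mask <0, 6, 2, -1> reports element 1 as known zero (it reads V2),
// elements 2 and 3 as known undef (an undef V1 element and an undef mask
// index), and nothing about element 0.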
5650
5651/// Decode a target shuffle mask and inputs and see if any values are
5652/// known to be undef or zero from their inputs.
5653/// Returns true if the target shuffle mask was decoded.
5654/// FIXME: Merge this with computeZeroableShuffleElements?
5655static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5656 SmallVectorImpl<SDValue> &Ops,
5657 APInt &KnownUndef, APInt &KnownZero) {
5658 bool IsUnary;
5659 if (!isTargetShuffle(N.getOpcode()))
5660 return false;
5661
5662 MVT VT = N.getSimpleValueType();
5663 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5664 return false;
5665
5666 int Size = Mask.size();
5667 SDValue V1 = Ops[0];
5668 SDValue V2 = IsUnary ? V1 : Ops[1];
5669 KnownUndef = KnownZero = APInt::getZero(Size);
5670
5671 V1 = peekThroughBitcasts(V1);
5672 V2 = peekThroughBitcasts(V2);
5673
5674 assert((VT.getSizeInBits() % Size) == 0 &&
5675 "Illegal split of shuffle value type");
5676 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5677
5678 // Extract known constant input data.
5679 APInt UndefSrcElts[2];
5680 SmallVector<APInt, 32> SrcEltBits[2];
5681 bool IsSrcConstant[2] = {
5682 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5683 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5684 /*AllowPartialUndefs*/ false),
5685 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5686 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5687 /*AllowPartialUndefs*/ false)};
5688
5689 for (int i = 0; i < Size; ++i) {
5690 int M = Mask[i];
5691
5692 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5693 if (M < 0) {
5694 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5695 if (SM_SentinelUndef == M)
5696 KnownUndef.setBit(i);
5697 if (SM_SentinelZero == M)
5698 KnownZero.setBit(i);
5699 continue;
5700 }
5701
5702 // Determine shuffle input and normalize the mask.
5703 unsigned SrcIdx = M / Size;
5704 SDValue V = M < Size ? V1 : V2;
5705 M %= Size;
5706
5707 // We are referencing an UNDEF input.
5708 if (V.isUndef()) {
5709 KnownUndef.setBit(i);
5710 continue;
5711 }
5712
5713 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5714 // TODO: We currently only set UNDEF for integer types - floats use the same
5715 // registers as vectors and many of the scalar folded loads rely on the
5716 // SCALAR_TO_VECTOR pattern.
5717 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5718 (Size % V.getValueType().getVectorNumElements()) == 0) {
5719 int Scale = Size / V.getValueType().getVectorNumElements();
5720 int Idx = M / Scale;
5721 if (Idx != 0 && !VT.isFloatingPoint())
5722 KnownUndef.setBit(i);
5723 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5724 KnownZero.setBit(i);
5725 continue;
5726 }
5727
5728 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5729 // base vectors.
5730 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5731 SDValue Vec = V.getOperand(0);
5732 int NumVecElts = Vec.getValueType().getVectorNumElements();
5733 if (Vec.isUndef() && Size == NumVecElts) {
5734 int Idx = V.getConstantOperandVal(2);
5735 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5736 if (M < Idx || (Idx + NumSubElts) <= M)
5737 KnownUndef.setBit(i);
5738 }
5739 continue;
5740 }
5741
5742 // Attempt to extract from the source's constant bits.
5743 if (IsSrcConstant[SrcIdx]) {
5744 if (UndefSrcElts[SrcIdx][M])
5745 KnownUndef.setBit(i);
5746 else if (SrcEltBits[SrcIdx][M] == 0)
5747 KnownZero.setBit(i);
5748 }
5749 }
5750
5751 assert(VT.getVectorNumElements() == (unsigned)Size &&
5752 "Different mask size from vector size!");
5753 return true;
5754}
5755
5756// Replace target shuffle mask elements with known undef/zero sentinels.
5757 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5758                                               const APInt &KnownUndef,
5759                                               const APInt &KnownZero,
5760                                               bool ResolveKnownZeros = true) {
5761 unsigned NumElts = Mask.size();
5762 assert(KnownUndef.getBitWidth() == NumElts &&
5763 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5764
5765 for (unsigned i = 0; i != NumElts; ++i) {
5766 if (KnownUndef[i])
5767 Mask[i] = SM_SentinelUndef;
5768 else if (ResolveKnownZeros && KnownZero[i])
5769 Mask[i] = SM_SentinelZero;
5770 }
5771}
5772
5773// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5774 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5775                                               APInt &KnownUndef,
5776 APInt &KnownZero) {
5777 unsigned NumElts = Mask.size();
5778 KnownUndef = KnownZero = APInt::getZero(NumElts);
5779
5780 for (unsigned i = 0; i != NumElts; ++i) {
5781 int M = Mask[i];
5782 if (SM_SentinelUndef == M)
5783 KnownUndef.setBit(i);
5784 if (SM_SentinelZero == M)
5785 KnownZero.setBit(i);
5786 }
5787}
5788
5789// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
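// For example (illustrative): a v4i32 VSELECT with the constant condition
// <-1, 0, -1, 0> becomes the blend mask <0, 5, 2, 7>, selecting lanes 0/2
// from the true operand and lanes 1/3 from the false operand.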
5790 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5791                                          SDValue Cond, bool IsBLENDV = false) {
5792 EVT CondVT = Cond.getValueType();
5793 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5794 unsigned NumElts = CondVT.getVectorNumElements();
5795
5796 APInt UndefElts;
5797 SmallVector<APInt, 32> EltBits;
5798 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5799 /*AllowWholeUndefs*/ true,
5800 /*AllowPartialUndefs*/ false))
5801 return false;
5802
5803 Mask.resize(NumElts, SM_SentinelUndef);
5804
5805 for (int i = 0; i != (int)NumElts; ++i) {
5806 Mask[i] = i;
5807 // Arbitrarily choose from the 2nd operand if the select condition element
5808 // is undef.
5809 // TODO: Can we do better by matching patterns such as even/odd?
5810 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5811 (IsBLENDV && EltBits[i].isNonNegative()))
5812 Mask[i] += NumElts;
5813 }
5814
5815 return true;
5816}
5817
5818// Forward declaration (for getFauxShuffleMask recursive check).
5819static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5820                                    SmallVectorImpl<SDValue> &Inputs,
5821                                    SmallVectorImpl<int> &Mask,
5822                                    const SelectionDAG &DAG, unsigned Depth,
5823 bool ResolveKnownElts);
5824
5825// Attempt to decode ops that could be represented as a shuffle mask.
5826 // The decoded shuffle mask may contain a different number of elements than
5827 // the destination value type.
5828// TODO: Merge into getTargetShuffleInputs()
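// For example (illustrative): (and X, C) where C has the byte pattern
// <0xFF, 0x00, 0xFF, 0xFF, ...> decodes to the per-byte shuffle mask
// <0, SM_SentinelZero, 2, 3, ...> with X as the only input.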
5829static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5830                                SmallVectorImpl<int> &Mask,
5831                                SmallVectorImpl<SDValue> &Ops,
5832                                const SelectionDAG &DAG, unsigned Depth,
5833 bool ResolveKnownElts) {
5834 Mask.clear();
5835 Ops.clear();
5836
5837 MVT VT = N.getSimpleValueType();
5838 unsigned NumElts = VT.getVectorNumElements();
5839 unsigned NumSizeInBits = VT.getSizeInBits();
5840 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5841 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5842 return false;
5843 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5844 unsigned NumSizeInBytes = NumSizeInBits / 8;
5845 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5846
5847 unsigned Opcode = N.getOpcode();
5848 switch (Opcode) {
5849 case ISD::VECTOR_SHUFFLE: {
5850 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5851 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5852 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5853 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5854 Ops.push_back(N.getOperand(0));
5855 Ops.push_back(N.getOperand(1));
5856 return true;
5857 }
5858 return false;
5859 }
5860 case ISD::AND:
5861 case X86ISD::ANDNP: {
5862 // Attempt to decode as a per-byte mask.
5863 APInt UndefElts;
5864 SmallVector<APInt, 32> EltBits;
5865 SDValue N0 = N.getOperand(0);
5866 SDValue N1 = N.getOperand(1);
5867 bool IsAndN = (X86ISD::ANDNP == Opcode);
5868 uint64_t ZeroMask = IsAndN ? 255 : 0;
5869 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5870 /*AllowWholeUndefs*/ false,
5871 /*AllowPartialUndefs*/ false))
5872 return false;
5873 // We can't assume an undef src element gives an undef dst - the other src
5874 // might be zero.
5875 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5876 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5877 const APInt &ByteBits = EltBits[i];
5878 if (ByteBits != 0 && ByteBits != 255)
5879 return false;
5880 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5881 }
5882 Ops.push_back(IsAndN ? N1 : N0);
5883 return true;
5884 }
5885 case ISD::OR: {
5886 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5887 // is a valid shuffle index.
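    // For example (illustrative): if the decoded per-lane masks show the LHS
    // is zero in lanes 1 and 3 and the RHS is zero in lanes 0 and 2, the OR
    // becomes the blend <0, MaskSize + 1, 2, MaskSize + 3>.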
5888 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5889 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5890 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5891 return false;
5892
5893 SmallVector<int, 64> SrcMask0, SrcMask1;
5894 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5895     APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5896     APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5897     if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5898 Depth + 1, true) ||
5899 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5900 Depth + 1, true))
5901 return false;
5902
5903 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5904 SmallVector<int, 64> Mask0, Mask1;
5905 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5906 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5907 for (int i = 0; i != (int)MaskSize; ++i) {
5908 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5909 // loops converting between OR and BLEND shuffles due to
5910 // canWidenShuffleElements merging away undef elements, meaning we
5911 // fail to recognise the OR as the undef element isn't known zero.
5912 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5913 Mask.push_back(SM_SentinelZero);
5914 else if (Mask1[i] == SM_SentinelZero)
5915 Mask.push_back(i);
5916 else if (Mask0[i] == SM_SentinelZero)
5917 Mask.push_back(i + MaskSize);
5918 else
5919 return false;
5920 }
5921 Ops.push_back(N0);
5922 Ops.push_back(N1);
5923 return true;
5924 }
5925 case ISD::INSERT_SUBVECTOR: {
5926 SDValue Src = N.getOperand(0);
5927 SDValue Sub = N.getOperand(1);
5928 EVT SubVT = Sub.getValueType();
5929 unsigned NumSubElts = SubVT.getVectorNumElements();
5930 if (!N->isOnlyUserOf(Sub.getNode()))
5931 return false;
5932 SDValue SubBC = peekThroughBitcasts(Sub);
5933 uint64_t InsertIdx = N.getConstantOperandVal(2);
5934 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5935 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5936 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5937 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5938 SDValue SubBCSrc = SubBC.getOperand(0);
5939 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5940 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5941 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5942 "Subvector valuetype mismatch");
5943 InsertIdx *= (MaxElts / NumElts);
5944 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5945 NumSubElts *= (MaxElts / NumElts);
5946 bool SrcIsUndef = Src.isUndef();
5947 for (int i = 0; i != (int)MaxElts; ++i)
5948 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5949 for (int i = 0; i != (int)NumSubElts; ++i)
5950 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5951 if (!SrcIsUndef)
5952 Ops.push_back(Src);
5953 Ops.push_back(SubBCSrc);
5954 return true;
5955 }
5956 // Handle CONCAT(SUB0, SUB1).
5957 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5958 // cross lane shuffles.
5959 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5960 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5961 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5962 Src.getOperand(0).isUndef() &&
5963 Src.getOperand(1).getValueType() == SubVT &&
5964 Src.getConstantOperandVal(2) == 0) {
5965 for (int i = 0; i != (int)NumSubElts; ++i)
5966 Mask.push_back(i);
5967 for (int i = 0; i != (int)NumSubElts; ++i)
5968 Mask.push_back(i + NumElts);
5969 Ops.push_back(Src.getOperand(1));
5970 Ops.push_back(Sub);
5971 return true;
5972 }
5973 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5974 SmallVector<int, 64> SubMask;
5975 SmallVector<SDValue, 2> SubInputs;
5976 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5977 EVT SubSrcVT = SubSrc.getValueType();
5978 if (!SubSrcVT.isVector())
5979 return false;
5980
5981 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5982 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5983 Depth + 1, ResolveKnownElts))
5984 return false;
5985
5986 // Subvector shuffle inputs must not be larger than the subvector.
5987 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5988 return SubVT.getFixedSizeInBits() <
5989 SubInput.getValueSizeInBits().getFixedValue();
5990 }))
5991 return false;
5992
5993 if (SubMask.size() != NumSubElts) {
5994 assert(((SubMask.size() % NumSubElts) == 0 ||
5995 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5996 if ((NumSubElts % SubMask.size()) == 0) {
5997 int Scale = NumSubElts / SubMask.size();
5998 SmallVector<int,64> ScaledSubMask;
5999 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6000 SubMask = ScaledSubMask;
6001 } else {
6002 int Scale = SubMask.size() / NumSubElts;
6003 NumSubElts = SubMask.size();
6004 NumElts *= Scale;
6005 InsertIdx *= Scale;
6006 }
6007 }
6008 Ops.push_back(Src);
6009 Ops.append(SubInputs.begin(), SubInputs.end());
6010 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6011 Mask.append(NumElts, SM_SentinelZero);
6012 else
6013 for (int i = 0; i != (int)NumElts; ++i)
6014 Mask.push_back(i);
6015 for (int i = 0; i != (int)NumSubElts; ++i) {
6016 int M = SubMask[i];
6017 if (0 <= M) {
6018 int InputIdx = M / NumSubElts;
6019 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6020 }
6021 Mask[i + InsertIdx] = M;
6022 }
6023 return true;
6024 }
6025 case X86ISD::PINSRB:
6026 case X86ISD::PINSRW:
6027   case ISD::SCALAR_TO_VECTOR:
6028   case ISD::INSERT_VECTOR_ELT: {
6029     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6030 // vector, for matching src/dst vector types.
6031 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6032
6033 unsigned DstIdx = 0;
6034 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6035 // Check we have an in-range constant insertion index.
6036 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6037 N.getConstantOperandAPInt(2).uge(NumElts))
6038 return false;
6039 DstIdx = N.getConstantOperandVal(2);
6040
6041 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6042 if (X86::isZeroNode(Scl)) {
6043 Ops.push_back(N.getOperand(0));
6044 for (unsigned i = 0; i != NumElts; ++i)
6045 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6046 return true;
6047 }
6048 }
6049
6050 // Peek through trunc/aext/zext/bitcast.
6051 // TODO: aext shouldn't require SM_SentinelZero padding.
6052 // TODO: handle shift of scalars.
6053 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6054 while (Scl.getOpcode() == ISD::TRUNCATE ||
6055 Scl.getOpcode() == ISD::ANY_EXTEND ||
6056 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6057 (Scl.getOpcode() == ISD::BITCAST &&
6058             Scl.getScalarValueSizeInBits() ==
6059                 Scl.getOperand(0).getScalarValueSizeInBits())) {
6060       Scl = Scl.getOperand(0);
6061 MinBitsPerElt =
6062 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6063 }
6064 if ((MinBitsPerElt % 8) != 0)
6065 return false;
6066
6067 // Attempt to find the source vector the scalar was extracted from.
6068 SDValue SrcExtract;
6069 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6070 Scl.getOpcode() == X86ISD::PEXTRW ||
6071 Scl.getOpcode() == X86ISD::PEXTRB) &&
6072 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6073 SrcExtract = Scl;
6074 }
6075 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6076 return false;
6077
6078 SDValue SrcVec = SrcExtract.getOperand(0);
6079 EVT SrcVT = SrcVec.getValueType();
6080 if (!SrcVT.getScalarType().isByteSized())
6081 return false;
6082 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6083 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6084 unsigned DstByte = DstIdx * NumBytesPerElt;
6085 MinBitsPerElt =
6086 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6087
6088 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6089 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6090 Ops.push_back(SrcVec);
6091 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6092 } else {
6093 Ops.push_back(SrcVec);
6094 Ops.push_back(N.getOperand(0));
6095 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6096 Mask.push_back(NumSizeInBytes + i);
6097 }
6098
6099 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6100 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6101 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6102 Mask[DstByte + i] = SrcByte + i;
6103 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6104 Mask[DstByte + i] = SM_SentinelZero;
6105 return true;
6106 }
6107 case X86ISD::PACKSS:
6108 case X86ISD::PACKUS: {
6109 SDValue N0 = N.getOperand(0);
6110 SDValue N1 = N.getOperand(1);
6111 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6112 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6113 "Unexpected input value type");
6114
6115 APInt EltsLHS, EltsRHS;
6116 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6117
6118 // If we know input saturation won't happen (or we don't care for particular
6119 // lanes), we can treat this as a truncation shuffle.
6120 bool Offset0 = false, Offset1 = false;
6121 if (Opcode == X86ISD::PACKSS) {
6122 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6123 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6124 (!(N1.isUndef() || EltsRHS.isZero()) &&
6125 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6126 return false;
6127 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6128 // PACKSS then it was likely being used for sign-extension for a
6129 // truncation, so just peek through and adjust the mask accordingly.
6130 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6131 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6132 Offset0 = true;
6133 N0 = N0.getOperand(0);
6134 }
6135 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6136 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6137 Offset1 = true;
6138 N1 = N1.getOperand(0);
6139 }
6140 } else {
6141 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6142 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6143 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6144 (!(N1.isUndef() || EltsRHS.isZero()) &&
6145 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6146 return false;
6147 }
6148
6149 bool IsUnary = (N0 == N1);
6150
6151 Ops.push_back(N0);
6152 if (!IsUnary)
6153 Ops.push_back(N1);
6154
6155 createPackShuffleMask(VT, Mask, IsUnary);
6156
6157 if (Offset0 || Offset1) {
6158 for (int &M : Mask)
6159 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6160 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6161 ++M;
6162 }
6163 return true;
6164 }
6165 case ISD::VSELECT:
6166 case X86ISD::BLENDV: {
6167 SDValue Cond = N.getOperand(0);
6168 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6169 Ops.push_back(N.getOperand(1));
6170 Ops.push_back(N.getOperand(2));
6171 return true;
6172 }
6173 return false;
6174 }
6175 case X86ISD::VTRUNC: {
6176 SDValue Src = N.getOperand(0);
6177 EVT SrcVT = Src.getValueType();
6178 // Truncated source must be a simple vector.
6179 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6180 (SrcVT.getScalarSizeInBits() % 8) != 0)
6181 return false;
6182 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6183 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6184 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6185 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6186 for (unsigned i = 0; i != NumSrcElts; ++i)
6187 Mask.push_back(i * Scale);
6188 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6189 Ops.push_back(Src);
6190 return true;
6191 }
6192 case X86ISD::VSHLI:
6193 case X86ISD::VSRLI: {
6194 uint64_t ShiftVal = N.getConstantOperandVal(1);
6195 // Out of range bit shifts are guaranteed to be zero.
6196 if (NumBitsPerElt <= ShiftVal) {
6197 Mask.append(NumElts, SM_SentinelZero);
6198 return true;
6199 }
6200
6201 // We can only decode 'whole byte' bit shifts as shuffles.
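    // For example (illustrative): a v2i64 VSHLI by 16 bits becomes the
    // per-byte mask <Z,Z,0,1,2,3,4,5, Z,Z,8,9,10,11,12,13>, where Z is
    // SM_SentinelZero.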
6202 if ((ShiftVal % 8) != 0)
6203 break;
6204
6205 uint64_t ByteShift = ShiftVal / 8;
6206 Ops.push_back(N.getOperand(0));
6207
6208 // Clear mask to all zeros and insert the shifted byte indices.
6209 Mask.append(NumSizeInBytes, SM_SentinelZero);
6210
6211 if (X86ISD::VSHLI == Opcode) {
6212 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6213 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6214 Mask[i + j] = i + j - ByteShift;
6215 } else {
6216 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6217 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6218 Mask[i + j - ByteShift] = i + j;
6219 }
6220 return true;
6221 }
6222 case X86ISD::VROTLI:
6223 case X86ISD::VROTRI: {
6224 // We can only decode 'whole byte' bit rotates as shuffles.
6225 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6226 if ((RotateVal % 8) != 0)
6227 return false;
6228 Ops.push_back(N.getOperand(0));
6229 int Offset = RotateVal / 8;
6230 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6231 for (int i = 0; i != (int)NumElts; ++i) {
6232 int BaseIdx = i * NumBytesPerElt;
6233 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6234 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6235 }
6236 }
6237 return true;
6238 }
6239 case X86ISD::VBROADCAST: {
6240 SDValue Src = N.getOperand(0);
6241 if (!Src.getSimpleValueType().isVector()) {
6242 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6243 !isNullConstant(Src.getOperand(1)) ||
6244 Src.getOperand(0).getValueType().getScalarType() !=
6245 VT.getScalarType())
6246 return false;
6247 Src = Src.getOperand(0);
6248 }
6249 Ops.push_back(Src);
6250 Mask.append(NumElts, 0);
6251 return true;
6252 }
6254 SDValue Src = N.getOperand(0);
6255 EVT SrcVT = Src.getValueType();
6256 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6257
6258 // Extended source must be a simple vector.
6259 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6260 (NumBitsPerSrcElt % 8) != 0)
6261 return false;
6262
6263 // We can only handle all-signbits extensions.
6264 APInt DemandedSrcElts =
6265 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6266 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6267 return false;
6268
6269 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6270 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6271 for (unsigned I = 0; I != NumElts; ++I)
6272 Mask.append(Scale, I);
6273 Ops.push_back(Src);
6274 return true;
6275 }
6276 case ISD::ZERO_EXTEND:
6277 case ISD::ANY_EXTEND:
6278   case ISD::ZERO_EXTEND_VECTOR_INREG:
6279   case ISD::ANY_EXTEND_VECTOR_INREG: {
6280     SDValue Src = N.getOperand(0);
6281 EVT SrcVT = Src.getValueType();
6282
6283 // Extended source must be a simple vector.
6284 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6285 (SrcVT.getScalarSizeInBits() % 8) != 0)
6286 return false;
6287
6288 bool IsAnyExtend =
6289 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6290 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6291 IsAnyExtend, Mask);
6292 Ops.push_back(Src);
6293 return true;
6294 }
6295 }
6296
6297 return false;
6298}
6299
6300/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
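/// For example (illustrative): with Inputs = {X, X} and Mask = <0, 5, 2, 7>,
/// the repeated input is dropped and the mask is rebased to <0, 1, 2, 3> over
/// the single remaining input.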
6301 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6302                                               SmallVectorImpl<int> &Mask) {
6303 int MaskWidth = Mask.size();
6304 SmallVector<SDValue, 16> UsedInputs;
6305 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6306 int lo = UsedInputs.size() * MaskWidth;
6307 int hi = lo + MaskWidth;
6308
6309 // Strip UNDEF input usage.
6310 if (Inputs[i].isUndef())
6311 for (int &M : Mask)
6312 if ((lo <= M) && (M < hi))
6313 M = SM_SentinelUndef;
6314
6315 // Check for unused inputs.
6316 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6317 for (int &M : Mask)
6318 if (lo <= M)
6319 M -= MaskWidth;
6320 continue;
6321 }
6322
6323 // Check for repeated inputs.
6324 bool IsRepeat = false;
6325 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6326 if (UsedInputs[j] != Inputs[i])
6327 continue;
6328 for (int &M : Mask)
6329 if (lo <= M)
6330 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6331 IsRepeat = true;
6332 break;
6333 }
6334 if (IsRepeat)
6335 continue;
6336
6337 UsedInputs.push_back(Inputs[i]);
6338 }
6339 Inputs = UsedInputs;
6340}
6341
6342/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6343/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6344/// Returns true if the target shuffle mask was decoded.
6345static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6346                                    SmallVectorImpl<SDValue> &Inputs,
6347                                    SmallVectorImpl<int> &Mask,
6348                                    APInt &KnownUndef, APInt &KnownZero,
6349 const SelectionDAG &DAG, unsigned Depth,
6350 bool ResolveKnownElts) {
6351   if (Depth >= SelectionDAG::MaxRecursionDepth)
6352     return false; // Limit search depth.
6353
6354 EVT VT = Op.getValueType();
6355 if (!VT.isSimple() || !VT.isVector())
6356 return false;
6357
6358 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6359 if (ResolveKnownElts)
6360 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6361 return true;
6362 }
6363 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6364 ResolveKnownElts)) {
6365 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6366 return true;
6367 }
6368 return false;
6369}
6370
6371static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6372                                    SmallVectorImpl<SDValue> &Inputs,
6373                                    SmallVectorImpl<int> &Mask,
6374                                    const SelectionDAG &DAG, unsigned Depth,
6375 bool ResolveKnownElts) {
6376 APInt KnownUndef, KnownZero;
6377 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6378 KnownZero, DAG, Depth, ResolveKnownElts);
6379}
6380
6381 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6382                                    SmallVectorImpl<int> &Mask,
6383                                    const SelectionDAG &DAG, unsigned Depth = 0,
6384 bool ResolveKnownElts = true) {
6385 EVT VT = Op.getValueType();
6386 if (!VT.isSimple() || !VT.isVector())
6387 return false;
6388
6389 unsigned NumElts = Op.getValueType().getVectorNumElements();
6390 APInt DemandedElts = APInt::getAllOnes(NumElts);
6391 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6392 ResolveKnownElts);
6393}
6394
6395// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6396static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6397 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6398 SelectionDAG &DAG) {
6399 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6400 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6401 "Unknown broadcast load type");
6402
6403   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6404 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6405 return SDValue();
6406
6407   SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6408                                          TypeSize::getFixed(Offset), DL);
6409   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6410 SDValue Ops[] = {Mem->getChain(), Ptr};
6411 SDValue BcstLd = DAG.getMemIntrinsicNode(
6412 Opcode, DL, Tys, Ops, MemVT,
6413       DAG.getMachineFunction().getMachineMemOperand(
6414           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6415 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6416 return BcstLd;
6417}
6418
6419/// Returns the scalar element that will make up the i'th
6420/// element of the result of the vector shuffle.
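/// For example (illustrative): for (vector_shuffle<4,1,6,3> V1, V2) and
/// Index = 2, this recurses into V2 and returns whichever scalar defines
/// V2[2].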
6421 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6422                                    SelectionDAG &DAG, unsigned Depth) {
6423   if (Depth >= SelectionDAG::MaxRecursionDepth)
6424     return SDValue(); // Limit search depth.
6425
6426 EVT VT = Op.getValueType();
6427 unsigned Opcode = Op.getOpcode();
6428 unsigned NumElems = VT.getVectorNumElements();
6429
6430 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6431 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6432 int Elt = SV->getMaskElt(Index);
6433
6434 if (Elt < 0)
6435 return DAG.getUNDEF(VT.getVectorElementType());
6436
6437 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6438 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6439 }
6440
6441 // Recurse into target specific vector shuffles to find scalars.
6442 if (isTargetShuffle(Opcode)) {
6443 MVT ShufVT = VT.getSimpleVT();
6444 MVT ShufSVT = ShufVT.getVectorElementType();
6445 int NumElems = (int)ShufVT.getVectorNumElements();
6446 SmallVector<int, 16> ShuffleMask;
6447     SmallVector<SDValue, 16> ShuffleOps;
6448     if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6449 return SDValue();
6450
6451 int Elt = ShuffleMask[Index];
6452 if (Elt == SM_SentinelZero)
6453 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6454 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6455 if (Elt == SM_SentinelUndef)
6456 return DAG.getUNDEF(ShufSVT);
6457
6458 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6459 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6460 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6461 }
6462
6463 // Recurse into insert_subvector base/sub vector to find scalars.
6464 if (Opcode == ISD::INSERT_SUBVECTOR) {
6465 SDValue Vec = Op.getOperand(0);
6466 SDValue Sub = Op.getOperand(1);
6467 uint64_t SubIdx = Op.getConstantOperandVal(2);
6468 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6469
6470 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6471 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6472 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6473 }
6474
6475 // Recurse into concat_vectors sub vector to find scalars.
6476 if (Opcode == ISD::CONCAT_VECTORS) {
6477 EVT SubVT = Op.getOperand(0).getValueType();
6478 unsigned NumSubElts = SubVT.getVectorNumElements();
6479 uint64_t SubIdx = Index / NumSubElts;
6480 uint64_t SubElt = Index % NumSubElts;
6481 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6482 }
6483
6484 // Recurse into extract_subvector src vector to find scalars.
6485 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6486 SDValue Src = Op.getOperand(0);
6487 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6488 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6489 }
6490
6491 // We only peek through bitcasts of the same vector width.
6492 if (Opcode == ISD::BITCAST) {
6493 SDValue Src = Op.getOperand(0);
6494 EVT SrcVT = Src.getValueType();
6495 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6496 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6497 return SDValue();
6498 }
6499
6500 // Actual nodes that may contain scalar elements
6501
6502 // For insert_vector_elt - either return the index matching scalar or recurse
6503 // into the base vector.
6504 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6505 isa<ConstantSDNode>(Op.getOperand(2))) {
6506 if (Op.getConstantOperandAPInt(2) == Index)
6507 return Op.getOperand(1);
6508 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6509 }
6510
6511 if (Opcode == ISD::SCALAR_TO_VECTOR)
6512 return (Index == 0) ? Op.getOperand(0)
6513 : DAG.getUNDEF(VT.getVectorElementType());
6514
6515 if (Opcode == ISD::BUILD_VECTOR)
6516 return Op.getOperand(Index);
6517
6518 return SDValue();
6519}
6520
6521// Use PINSRB/PINSRW/PINSRD to create a build vector.
6522 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6523                                         const APInt &NonZeroMask,
6524 unsigned NumNonZero, unsigned NumZero,
6525 SelectionDAG &DAG,
6526 const X86Subtarget &Subtarget) {
6527 MVT VT = Op.getSimpleValueType();
6528 unsigned NumElts = VT.getVectorNumElements();
6529 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6530 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6531 "Illegal vector insertion");
6532
6533 SDValue V;
6534 bool First = true;
6535
6536 for (unsigned i = 0; i < NumElts; ++i) {
6537 bool IsNonZero = NonZeroMask[i];
6538 if (!IsNonZero)
6539 continue;
6540
6541     // If the build vector contains zeros or our first insertion is not the
6542     // first index, then insert into a zero vector to break any register
6543     // dependency; else use SCALAR_TO_VECTOR.
6544 if (First) {
6545 First = false;
6546 if (NumZero || 0 != i)
6547 V = getZeroVector(VT, Subtarget, DAG, DL);
6548 else {
6549 assert(0 == i && "Expected insertion into zero-index");
6550 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6551 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6552 V = DAG.getBitcast(VT, V);
6553 continue;
6554 }
6555 }
6556 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6557 DAG.getIntPtrConstant(i, DL));
6558 }
6559
6560 return V;
6561}
6562
6563/// Custom lower build_vector of v16i8.
6564 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6565                                      const APInt &NonZeroMask,
6566 unsigned NumNonZero, unsigned NumZero,
6567 SelectionDAG &DAG,
6568 const X86Subtarget &Subtarget) {
6569 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6570 return SDValue();
6571
6572 // SSE4.1 - use PINSRB to insert each byte directly.
6573 if (Subtarget.hasSSE41())
6574 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6575 DAG, Subtarget);
6576
6577 SDValue V;
6578
6579   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. If both of the
6580   // lowest two 16-bit halves are non-zero, then convert to MOVD.
6581 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6582 !NonZeroMask.extractBits(2, 2).isZero()) {
6583 for (unsigned I = 0; I != 4; ++I) {
6584 if (!NonZeroMask[I])
6585 continue;
6586 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6587 if (I != 0)
6588 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6589 DAG.getConstant(I * 8, DL, MVT::i8));
6590 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6591 }
6592 assert(V && "Failed to fold v16i8 vector to zero");
6593 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6594 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6595 V = DAG.getBitcast(MVT::v8i16, V);
6596 }
6597 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6598 bool ThisIsNonZero = NonZeroMask[i];
6599 bool NextIsNonZero = NonZeroMask[i + 1];
6600 if (!ThisIsNonZero && !NextIsNonZero)
6601 continue;
6602
6603 SDValue Elt;
6604 if (ThisIsNonZero) {
6605 if (NumZero || NextIsNonZero)
6606 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6607 else
6608 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6609 }
6610
6611 if (NextIsNonZero) {
6612 SDValue NextElt = Op.getOperand(i + 1);
6613 if (i == 0 && NumZero)
6614 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6615 else
6616 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6617 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6618 DAG.getConstant(8, DL, MVT::i8));
6619 if (ThisIsNonZero)
6620 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6621 else
6622 Elt = NextElt;
6623 }
6624
6625 // If our first insertion is not the first index or zeros are needed, then
6626 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6627 // elements undefined).
6628 if (!V) {
6629 if (i != 0 || NumZero)
6630 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6631 else {
6632 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6633 V = DAG.getBitcast(MVT::v8i16, V);
6634 continue;
6635 }
6636 }
6637 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6638 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6639 DAG.getIntPtrConstant(i / 2, DL));
6640 }
6641
6642 return DAG.getBitcast(MVT::v16i8, V);
6643}
6644
6645/// Custom lower build_vector of v8i16.
6646 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6647                                      const APInt &NonZeroMask,
6648 unsigned NumNonZero, unsigned NumZero,
6649 SelectionDAG &DAG,
6650 const X86Subtarget &Subtarget) {
6651 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6652 return SDValue();
6653
6654   // Use PINSRW to insert each element directly.
6655 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6656 Subtarget);
6657}
6658
6659/// Custom lower build_vector of v4i32 or v4f32.
6660 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6661                                      SelectionDAG &DAG,
6662 const X86Subtarget &Subtarget) {
6663 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6664 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6665 // Because we're creating a less complicated build vector here, we may enable
6666 // further folding of the MOVDDUP via shuffle transforms.
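  // For example (illustrative): build_vector (a, b, a, b) becomes
  // (bitcast (movddup (bitcast (build_vector a, b, undef, undef) to v2f64))).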
6667 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6668 Op.getOperand(0) == Op.getOperand(2) &&
6669 Op.getOperand(1) == Op.getOperand(3) &&
6670 Op.getOperand(0) != Op.getOperand(1)) {
6671 MVT VT = Op.getSimpleValueType();
6672 MVT EltVT = VT.getVectorElementType();
6673 // Create a new build vector with the first 2 elements followed by undef
6674 // padding, bitcast to v2f64, duplicate, and bitcast back.
6675 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6676 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6677 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6678 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6679 return DAG.getBitcast(VT, Dup);
6680 }
6681
6682 // Find all zeroable elements.
6683 std::bitset<4> Zeroable, Undefs;
6684 for (int i = 0; i < 4; ++i) {
6685 SDValue Elt = Op.getOperand(i);
6686 Undefs[i] = Elt.isUndef();
6687 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6688 }
6689 assert(Zeroable.size() - Zeroable.count() > 1 &&
6690 "We expect at least two non-zero elements!");
6691
6692 // We only know how to deal with build_vector nodes where elements are either
6693 // zeroable or extract_vector_elt with constant index.
6694 SDValue FirstNonZero;
6695 unsigned FirstNonZeroIdx;
6696 for (unsigned i = 0; i < 4; ++i) {
6697 if (Zeroable[i])
6698 continue;
6699 SDValue Elt = Op.getOperand(i);
6700 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6701 !isa<ConstantSDNode>(Elt.getOperand(1)))
6702 return SDValue();
6703 // Make sure that this node is extracting from a 128-bit vector.
6704 MVT VT = Elt.getOperand(0).getSimpleValueType();
6705 if (!VT.is128BitVector())
6706 return SDValue();
6707 if (!FirstNonZero.getNode()) {
6708 FirstNonZero = Elt;
6709 FirstNonZeroIdx = i;
6710 }
6711 }
6712
6713 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6714 SDValue V1 = FirstNonZero.getOperand(0);
6715 MVT VT = V1.getSimpleValueType();
6716
6717 // See if this build_vector can be lowered as a blend with zero.
6718 SDValue Elt;
6719 unsigned EltMaskIdx, EltIdx;
6720 int Mask[4];
6721 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6722 if (Zeroable[EltIdx]) {
6723 // The zero vector will be on the right hand side.
6724 Mask[EltIdx] = EltIdx+4;
6725 continue;
6726 }
6727
6728 Elt = Op->getOperand(EltIdx);
6729     // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6730 EltMaskIdx = Elt.getConstantOperandVal(1);
6731 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6732 break;
6733 Mask[EltIdx] = EltIdx;
6734 }
6735
6736 if (EltIdx == 4) {
6737 // Let the shuffle legalizer deal with blend operations.
6738 SDValue VZeroOrUndef = (Zeroable == Undefs)
6739 ? DAG.getUNDEF(VT)
6740 : getZeroVector(VT, Subtarget, DAG, DL);
6741 if (V1.getSimpleValueType() != VT)
6742 V1 = DAG.getBitcast(VT, V1);
6743 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6744 }
6745
6746   // See if we can lower this build_vector to an INSERTPS.
6747 if (!Subtarget.hasSSE41())
6748 return SDValue();
6749
6750 SDValue V2 = Elt.getOperand(0);
6751 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6752 V1 = SDValue();
6753
6754 bool CanFold = true;
6755 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6756 if (Zeroable[i])
6757 continue;
6758
6759 SDValue Current = Op->getOperand(i);
6760 SDValue SrcVector = Current->getOperand(0);
6761 if (!V1.getNode())
6762 V1 = SrcVector;
6763 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6764 }
6765
6766 if (!CanFold)
6767 return SDValue();
6768
6769 assert(V1.getNode() && "Expected at least two non-zero elements!");
6770 if (V1.getSimpleValueType() != MVT::v4f32)
6771 V1 = DAG.getBitcast(MVT::v4f32, V1);
6772 if (V2.getSimpleValueType() != MVT::v4f32)
6773 V2 = DAG.getBitcast(MVT::v4f32, V2);
6774
6775 // Ok, we can emit an INSERTPS instruction.
6776 unsigned ZMask = Zeroable.to_ulong();
6777
6778 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6779 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6780 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6781 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6782 return DAG.getBitcast(VT, Result);
6783}
6784
6785/// Return a vector logical shift node.
6786static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6787 SelectionDAG &DAG, const TargetLowering &TLI,
6788 const SDLoc &dl) {
6789 assert(VT.is128BitVector() && "Unknown type for VShift");
6790 MVT ShVT = MVT::v16i8;
6791 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6792 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6793 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6794 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6795 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6796}
6797
6798 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6799                                       SelectionDAG &DAG) {
6800
6801   // Check if the scalar load can be widened into a vector load, and if
6802   // the address is "base + cst", see if the cst can be "absorbed" into
6803   // the shuffle mask.
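  // For example (illustrative): a 4-byte scalar load from FrameIndex + 8 can
  // be widened to a 16-byte vector load of the frame slot followed by a splat
  // of element 2.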
6804 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6805 SDValue Ptr = LD->getBasePtr();
6806 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6807 return SDValue();
6808 EVT PVT = LD->getValueType(0);
6809 if (PVT != MVT::i32 && PVT != MVT::f32)
6810 return SDValue();
6811
6812 int FI = -1;
6813 int64_t Offset = 0;
6814 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6815 FI = FINode->getIndex();
6816 Offset = 0;
6817 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6818 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6819 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6820 Offset = Ptr.getConstantOperandVal(1);
6821 Ptr = Ptr.getOperand(0);
6822 } else {
6823 return SDValue();
6824 }
6825
6826 // FIXME: 256-bit vector instructions don't require a strict alignment,
6827 // improve this code to support it better.
6828 Align RequiredAlign(VT.getSizeInBits() / 8);
6829 SDValue Chain = LD->getChain();
6830 // Make sure the stack object alignment is at least 16 or 32.
6831     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6832     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6833 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6834 if (MFI.isFixedObjectIndex(FI)) {
6835 // Can't change the alignment. FIXME: It's possible to compute
6836 // the exact stack offset and reference FI + adjust offset instead.
6837         // If someone *really* cares about this, that's the way to implement it.
6838 return SDValue();
6839 } else {
6840 MFI.setObjectAlignment(FI, RequiredAlign);
6841 }
6842 }
6843
6844     // (Offset % 16 or 32) must be a multiple of 4. The address is then
6845     // Ptr + (Offset & ~15).
6846 if (Offset < 0)
6847 return SDValue();
6848 if ((Offset % RequiredAlign.value()) & 3)
6849 return SDValue();
6850 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6851 if (StartOffset) {
6852 SDLoc DL(Ptr);
6853 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6854 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6855 }
6856
6857 int EltNo = (Offset - StartOffset) >> 2;
6858 unsigned NumElems = VT.getVectorNumElements();
6859
6860 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6861 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6862 LD->getPointerInfo().getWithOffset(StartOffset));
6863
6864 SmallVector<int, 8> Mask(NumElems, EltNo);
6865
6866 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6867 }
6868
6869 return SDValue();
6870}
6871
6872 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
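// For example (illustrative): (trunc (srl (load i64 %p), 32)) resolves to the
// i64 load with an accumulated ByteOffset of 4.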
6873static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6874 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6875 auto *BaseLd = cast<LoadSDNode>(Elt);
6876 if (!BaseLd->isSimple())
6877 return false;
6878 Ld = BaseLd;
6879 ByteOffset = 0;
6880 return true;
6881 }
6882
6883 switch (Elt.getOpcode()) {
6884 case ISD::BITCAST:
6885 case ISD::TRUNCATE:
6886   case ISD::SCALAR_TO_VECTOR:
6887     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6888 case ISD::SRL:
6889 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6890 uint64_t Amt = AmtC->getZExtValue();
6891 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6892 ByteOffset += Amt / 8;
6893 return true;
6894 }
6895 }
6896 break;
6897   case ISD::EXTRACT_VECTOR_ELT:
6898     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6899 SDValue Src = Elt.getOperand(0);
6900 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6901 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6902 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6903 findEltLoadSrc(Src, Ld, ByteOffset)) {
6904 uint64_t Idx = IdxC->getZExtValue();
6905 ByteOffset += Idx * (SrcSizeInBits / 8);
6906 return true;
6907 }
6908 }
6909 break;
6910 }
6911
6912 return false;
6913}
6914
6915/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6916/// elements can be replaced by a single large load which has the same value as
6917/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6918///
6919/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6920 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6921                                         const SDLoc &DL, SelectionDAG &DAG,
6922 const X86Subtarget &Subtarget,
6923 bool IsAfterLegalize) {
6924 if ((VT.getScalarSizeInBits() % 8) != 0)
6925 return SDValue();
6926
6927 unsigned NumElems = Elts.size();
6928
6929 int LastLoadedElt = -1;
6930 APInt LoadMask = APInt::getZero(NumElems);
6931 APInt ZeroMask = APInt::getZero(NumElems);
6932 APInt UndefMask = APInt::getZero(NumElems);
6933
6934 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6935 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6936
6937 // For each element in the initializer, see if we've found a load, zero or an
6938 // undef.
6939 for (unsigned i = 0; i < NumElems; ++i) {
6940 SDValue Elt = peekThroughBitcasts(Elts[i]);
6941 if (!Elt.getNode())
6942 return SDValue();
6943 if (Elt.isUndef()) {
6944 UndefMask.setBit(i);
6945 continue;
6946 }
6947     if (X86::isZeroNode(Elt)) {
6948       ZeroMask.setBit(i);
6949 continue;
6950 }
6951
6952 // Each loaded element must be the correct fractional portion of the
6953 // requested vector load.
6954 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6955 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6956 return SDValue();
6957
6958 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6959 return SDValue();
6960 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6961 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6962 return SDValue();
6963
6964 LoadMask.setBit(i);
6965 LastLoadedElt = i;
6966 }
6967 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6968 NumElems &&
6969 "Incomplete element masks");
6970
6971 // Handle Special Cases - all undef or undef/zero.
6972 if (UndefMask.popcount() == NumElems)
6973 return DAG.getUNDEF(VT);
6974 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6975 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6976 : DAG.getConstantFP(0.0, DL, VT);
6977
6978 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6979 int FirstLoadedElt = LoadMask.countr_zero();
6980 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6981 EVT EltBaseVT = EltBase.getValueType();
6982 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6983 "Register/Memory size mismatch");
6984 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6985 assert(LDBase && "Did not find base load for merging consecutive loads");
6986 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6987 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6988 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6989 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6990 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6991
6992 // TODO: Support offsetting the base load.
6993 if (ByteOffsets[FirstLoadedElt] != 0)
6994 return SDValue();
6995
6996 // Check to see if the element's load is consecutive to the base load
6997 // or offset from a previous (already checked) load.
6998 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6999 LoadSDNode *Ld = Loads[EltIdx];
7000 int64_t ByteOffset = ByteOffsets[EltIdx];
7001 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7002 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7003 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7004 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7005 }
7006 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7007 EltIdx - FirstLoadedElt);
7008 };
7009
7010   // Consecutive loads can contain UNDEFs but not ZERO elements.
7011   // Consecutive loads with UNDEF and ZERO elements require
7012   // an additional shuffle stage to clear the ZERO elements.
7013 bool IsConsecutiveLoad = true;
7014 bool IsConsecutiveLoadWithZeros = true;
7015 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7016 if (LoadMask[i]) {
7017 if (!CheckConsecutiveLoad(LDBase, i)) {
7018 IsConsecutiveLoad = false;
7019 IsConsecutiveLoadWithZeros = false;
7020 break;
7021 }
7022 } else if (ZeroMask[i]) {
7023 IsConsecutiveLoad = false;
7024 }
7025 }
7026
7027 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7028 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7029 assert(LDBase->isSimple() &&
7030 "Cannot merge volatile or atomic loads.");
7031 SDValue NewLd =
7032 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7033 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7034 MMOFlags);
7035 for (auto *LD : Loads)
7036 if (LD)
7037 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7038 return NewLd;
7039 };
7040
7041 // Check if the base load is entirely dereferenceable.
7042 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7043 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7044
7045 // LOAD - all consecutive load/undefs (must start/end with a load or be
7046 // entirely dereferenceable). If we have found an entire vector of loads and
7047 // undefs, then return a large load of the entire vector width starting at the
7048 // base pointer. If the vector contains zeros, then attempt to shuffle those
7049 // elements.
7050 if (FirstLoadedElt == 0 &&
7051 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7052 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7053 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7054 return SDValue();
7055
7056 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7057 // will lower to regular temporal loads and use the cache.
7058 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7059 VT.is256BitVector() && !Subtarget.hasInt256())
7060 return SDValue();
7061
7062 if (NumElems == 1)
7063 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7064
7065 if (!ZeroMask)
7066 return CreateLoad(VT, LDBase);
7067
7068 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7069 // vector and a zero vector to clear out the zero elements.
7070 if (!IsAfterLegalize && VT.isVector()) {
7071 unsigned NumMaskElts = VT.getVectorNumElements();
7072 if ((NumMaskElts % NumElems) == 0) {
7073 unsigned Scale = NumMaskElts / NumElems;
7074 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7075 for (unsigned i = 0; i < NumElems; ++i) {
7076 if (UndefMask[i])
7077 continue;
7078 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7079 for (unsigned j = 0; j != Scale; ++j)
7080 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7081 }
7082 SDValue V = CreateLoad(VT, LDBase);
7083 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7084 : DAG.getConstantFP(0.0, DL, VT);
7085 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7086 }
7087 }
7088 }
7089
7090 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7091 if (VT.is256BitVector() || VT.is512BitVector()) {
7092 unsigned HalfNumElems = NumElems / 2;
7093 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7094 EVT HalfVT =
7095 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7096 SDValue HalfLD =
7097 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7098 DAG, Subtarget, IsAfterLegalize);
7099 if (HalfLD)
7100 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7101 HalfLD, DAG.getIntPtrConstant(0, DL));
7102 }
7103 }
7104
7105 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7106 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7107 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7108 LoadSizeInBits == 64) &&
7109 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7110 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7111 : MVT::getIntegerVT(LoadSizeInBits);
7112 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7113 // Allow v4f32 on SSE1 only targets.
7114 // FIXME: Add more isel patterns so we can just use VT directly.
7115 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7116 VecVT = MVT::v4f32;
7117 if (TLI.isTypeLegal(VecVT)) {
7118 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7119 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7120 SDValue ResNode = DAG.getMemIntrinsicNode(
7121 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7122           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7123       for (auto *LD : Loads)
7124 if (LD)
7125 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7126 return DAG.getBitcast(VT, ResNode);
7127 }
7128 }
7129
7130 // BROADCAST - match the smallest possible repetition pattern, load that
7131 // scalar/subvector element and then broadcast to the entire vector.
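  // For example (illustrative): <a,b,a,b,a,b,a,b> with 32-bit loaded elements
  // is matched as a repeating 64-bit {a,b} pair, which is loaded once and then
  // broadcast to the full vector width.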
7132 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7133 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7134 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7135 unsigned RepeatSize = SubElems * BaseSizeInBits;
7136 unsigned ScalarSize = std::min(RepeatSize, 64u);
7137 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7138 continue;
7139
7140 // Don't attempt a 1:N subvector broadcast - it should be caught by
7141 // combineConcatVectorOps, else will cause infinite loops.
7142 if (RepeatSize > ScalarSize && SubElems == 1)
7143 continue;
7144
7145 bool Match = true;
7146 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7147 for (unsigned i = 0; i != NumElems && Match; ++i) {
7148 if (!LoadMask[i])
7149 continue;
7150 SDValue Elt = peekThroughBitcasts(Elts[i]);
7151 if (RepeatedLoads[i % SubElems].isUndef())
7152 RepeatedLoads[i % SubElems] = Elt;
7153 else
7154 Match &= (RepeatedLoads[i % SubElems] == Elt);
7155 }
7156
7157 // We must have loads at both ends of the repetition.
7158 Match &= !RepeatedLoads.front().isUndef();
7159 Match &= !RepeatedLoads.back().isUndef();
7160 if (!Match)
7161 continue;
7162
7163 EVT RepeatVT =
7164 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7165 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7166 : EVT::getFloatingPointVT(ScalarSize);
7167 if (RepeatSize > ScalarSize)
7168 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7169 RepeatSize / ScalarSize);
7170 EVT BroadcastVT =
7171 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7172 VT.getSizeInBits() / ScalarSize);
7173 if (TLI.isTypeLegal(BroadcastVT)) {
7174 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7175 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7176 SDValue Broadcast = RepeatLoad;
7177 if (RepeatSize > ScalarSize) {
7178 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7179 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7180 } else {
7181 if (!Subtarget.hasAVX2() &&
7182                 !X86::mayFoldLoadIntoBroadcastFromMem(
7183                     RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7184 Subtarget,
7185 /*AssumeSingleUse=*/true))
7186 return SDValue();
7187 Broadcast =
7188 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7189 }
7190 return DAG.getBitcast(VT, Broadcast);
7191 }
7192 }
7193 }
7194 }
7195
7196 return SDValue();
7197}
7198
7199 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7200// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7201// are consecutive, non-overlapping, and in the right order.
7202 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7203                                          SelectionDAG &DAG,
7204 const X86Subtarget &Subtarget,
7205 bool IsAfterLegalize) {
7206   SmallVector<SDValue, 64> Elts;
7207   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7208 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7209 Elts.push_back(Elt);
7210 continue;
7211 }
7212 return SDValue();
7213 }
7214 assert(Elts.size() == VT.getVectorNumElements());
7215 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7216 IsAfterLegalize);
7217}
7218
7219 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7220                                    const APInt &Undefs, LLVMContext &C) {
7221 unsigned ScalarSize = VT.getScalarSizeInBits();
7222 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7223
7224 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7225 if (VT.isFloatingPoint()) {
7226 if (ScalarSize == 16)
7227 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7228 if (ScalarSize == 32)
7229 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7230 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7231 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7232 }
7233 return Constant::getIntegerValue(Ty, Val);
7234 };
7235
7236 SmallVector<Constant *, 32> ConstantVec;
7237 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7238 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7239 : getConstantScalar(Bits[I]));
7240
7241 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7242}
7243
7244static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7245 unsigned SplatBitSize, LLVMContext &C) {
7246 unsigned ScalarSize = VT.getScalarSizeInBits();
7247
7248 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7249 if (VT.isFloatingPoint()) {
7250 if (ScalarSize == 16)
7251 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7252 if (ScalarSize == 32)
7253 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7254 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7255 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7256 }
7257 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7258 };
7259
7260 if (ScalarSize == SplatBitSize)
7261 return getConstantScalar(SplatValue);
7262
7263 unsigned NumElm = SplatBitSize / ScalarSize;
7264 SmallVector<Constant *, 32> ConstantVec;
7265 for (unsigned I = 0; I != NumElm; ++I) {
7266 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7267 ConstantVec.push_back(getConstantScalar(Val));
7268 }
7269 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7270}
7271
7272 static bool isFoldableUseOfShuffle(SDNode *N) {
7273   for (auto *U : N->uses()) {
7274 unsigned Opc = U->getOpcode();
7275 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7276 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7277 return false;
7278 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7279 return false;
7280 if (isTargetShuffle(Opc))
7281 return true;
7282 if (Opc == ISD::BITCAST) // Ignore bitcasts
7283 return isFoldableUseOfShuffle(U);
7284 if (N->hasOneUse()) {
7285       // TODO: There may be some general way to know if an SDNode can
7286       // be folded. For now we only know whether an MI is foldable.
7287 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7288 return false;
7289 return true;
7290 }
7291 }
7292 return false;
7293}
7294
7295/// Attempt to use the vbroadcast instruction to generate a splat value
7296/// from a splat BUILD_VECTOR which uses:
7297/// a. A single scalar load, or a constant.
7298/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7299///
7300/// The VBROADCAST node is returned when a pattern is found,
7301/// or SDValue() otherwise.
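/// For example (illustrative sketch only), with AVX2 a splat fed by a single
/// scalar load:
///   t0 = load i32, %p
///   v  = build_vector t0, t0, t0, t0, t0, t0, t0, t0
/// can be lowered to a broadcast from memory:
///   v  = X86ISD::VBROADCAST_LOAD %p   (vpbroadcastd (%p), %ymm)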
7302 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7303                                            const SDLoc &dl,
7304 const X86Subtarget &Subtarget,
7305 SelectionDAG &DAG) {
7306 // VBROADCAST requires AVX.
7307 // TODO: Splats could be generated for non-AVX CPUs using SSE
7308 // instructions, but there's less potential gain for only 128-bit vectors.
7309 if (!Subtarget.hasAVX())
7310 return SDValue();
7311
7312 MVT VT = BVOp->getSimpleValueType(0);
7313 unsigned NumElts = VT.getVectorNumElements();
7314 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7315 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7316 "Unsupported vector type for broadcast.");
7317
7318 // See if the build vector is a repeating sequence of scalars (inc. splat).
7319 SDValue Ld;
7320 BitVector UndefElements;
7321 SmallVector<SDValue, 16> Sequence;
7322 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7323 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7324 if (Sequence.size() == 1)
7325 Ld = Sequence[0];
7326 }
7327
7328 // Attempt to use VBROADCASTM
7329 // From this pattern:
7330 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7331 // b. t1 = (build_vector t0 t0)
7332 //
7333 // Create (VBROADCASTM v2i1 X)
7334 if (!Sequence.empty() && Subtarget.hasCDI()) {
7335 // If not a splat, are the upper sequence values zeroable?
7336 unsigned SeqLen = Sequence.size();
7337 bool UpperZeroOrUndef =
7338 SeqLen == 1 ||
7339 llvm::all_of(ArrayRef(Sequence).drop_front(),
7340 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7341 SDValue Op0 = Sequence[0];
7342 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7343 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7344 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7345 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7346 ? Op0.getOperand(0)
7347 : Op0.getOperand(0).getOperand(0);
7348 MVT MaskVT = BOperand.getSimpleValueType();
7349 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7350 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7351 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7352 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7353 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7354 unsigned Scale = 512 / VT.getSizeInBits();
7355 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7356 }
7357 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7358 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7359 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7360 return DAG.getBitcast(VT, Bcst);
7361 }
7362 }
7363 }
7364
7365 unsigned NumUndefElts = UndefElements.count();
7366 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7367 APInt SplatValue, Undef;
7368 unsigned SplatBitSize;
7369 bool HasUndef;
7370 // Check if this is a repeated constant pattern suitable for broadcasting.
7371 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7372 SplatBitSize > VT.getScalarSizeInBits() &&
7373 SplatBitSize < VT.getSizeInBits()) {
7374 // Avoid replacing with broadcast when it's a use of a shuffle
7375 // instruction to preserve the present custom lowering of shuffles.
7376 if (isFoldableUseOfShuffle(BVOp))
7377 return SDValue();
7378       // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
7379 LLVMContext *Ctx = DAG.getContext();
7380 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7381 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7382 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7383 // Load the constant scalar/subvector and broadcast it.
7384 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7385 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7386 SDValue CP = DAG.getConstantPool(C, PVT);
7387 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7388
7389 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7390 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7391 SDValue Ops[] = {DAG.getEntryNode(), CP};
7392         MachinePointerInfo MPI =
7393             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7394         SDValue Brdcst =
7395 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7396 MPI, Alignment, MachineMemOperand::MOLoad);
7397 return DAG.getBitcast(VT, Brdcst);
7398 }
7399 if (SplatBitSize > 64) {
7400 // Load the vector of constants and broadcast it.
7401 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7402 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7403 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7404 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7405 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7406 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7407 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7408         MachinePointerInfo MPI =
7409             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7410         return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7411                                        Ops, VVT, MPI, Alignment,
7412                                        MachineMemOperand::MOLoad);
7413       }
7414 }
7415
7416 // If we are moving a scalar into a vector (Ld must be set and all elements
7417 // but 1 are undef) and that operation is not obviously supported by
7418 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7419 // That's better than general shuffling and may eliminate a load to GPR and
7420 // move from scalar to vector register.
7421 if (!Ld || NumElts - NumUndefElts != 1)
7422 return SDValue();
7423 unsigned ScalarSize = Ld.getValueSizeInBits();
7424 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7425 return SDValue();
7426 }
7427
7428 bool ConstSplatVal =
7429 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7430 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7431
7432 // TODO: Handle broadcasts of non-constant sequences.
7433
7434 // Make sure that all of the users of a non-constant load are from the
7435 // BUILD_VECTOR node.
7436 // FIXME: Is the use count needed for non-constant, non-load case?
7437 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7438 return SDValue();
7439
7440 unsigned ScalarSize = Ld.getValueSizeInBits();
7441 bool IsGE256 = (VT.getSizeInBits() >= 256);
7442
7443 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7444 // instruction to save 8 or more bytes of constant pool data.
7445 // TODO: If multiple splats are generated to load the same constant,
7446 // it may be detrimental to overall size. There needs to be a way to detect
7447 // that condition to know if this is truly a size win.
7448 bool OptForSize = DAG.shouldOptForSize();
7449
7450 // Handle broadcasting a single constant scalar from the constant pool
7451 // into a vector.
7452 // On Sandybridge (no AVX2), it is still better to load a constant vector
7453 // from the constant pool and not to broadcast it from a scalar.
7454 // But override that restriction when optimizing for size.
7455 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7456 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7457 EVT CVT = Ld.getValueType();
7458 assert(!CVT.isVector() && "Must not broadcast a vector type");
7459
7460 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7461 // For size optimization, also splat v2f64 and v2i64, and for size opt
7462 // with AVX2, also splat i8 and i16.
7463 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7464 if (ScalarSize == 32 ||
7465 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7466 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7467 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7468 const Constant *C = nullptr;
7469 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7470 C = CI->getConstantIntValue();
7471 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7472 C = CF->getConstantFPValue();
7473
7474 assert(C && "Invalid constant type");
7475
7476       SDValue CP =
7477           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7478       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7479
7480 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7481 SDValue Ops[] = {DAG.getEntryNode(), CP};
7482       MachinePointerInfo MPI =
7483           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7484 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7485 MPI, Alignment, MachineMemOperand::MOLoad);
7486 }
7487 }
7488
7489 // Handle AVX2 in-register broadcasts.
7490 if (!IsLoad && Subtarget.hasInt256() &&
7491 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7492 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7493
7494 // The scalar source must be a normal load.
7495 if (!IsLoad)
7496 return SDValue();
7497
7498 // Make sure the non-chain result is only used by this build vector.
7499 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7500 return SDValue();
7501
7502 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7503 (Subtarget.hasVLX() && ScalarSize == 64)) {
7504 auto *LN = cast<LoadSDNode>(Ld);
7505 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7506 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7507     SDValue BCast =
7508         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7509                                 LN->getMemoryVT(), LN->getMemOperand());
7510 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7511 return BCast;
7512 }
7513
7514   // The integer check is needed for the 64-bit into 128-bit case, so that it
7515   // doesn't match double, since there is no vbroadcastsd xmm.
7516 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7517 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7518 auto *LN = cast<LoadSDNode>(Ld);
7519 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7520 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7521     SDValue BCast =
7522         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7523                                 LN->getMemoryVT(), LN->getMemOperand());
7524 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7525 return BCast;
7526 }
7527
7528 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7529 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7530
7531 // Unsupported broadcast.
7532 return SDValue();
7533}
7534
7535/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7536/// underlying vector and index.
7537///
7538/// Modifies \p ExtractedFromVec to the real vector and returns the real
7539/// index.
7540static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7541 SDValue ExtIdx) {
7542 int Idx = ExtIdx->getAsZExtVal();
7543 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7544 return Idx;
7545
7546 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7547 // lowered this:
7548 // (extract_vector_elt (v8f32 %1), Constant<6>)
7549 // to:
7550 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7551 // (extract_subvector (v8f32 %0), Constant<4>),
7552 // undef)
7553 // Constant<0>)
7554 // In this case the vector is the extract_subvector expression and the index
7555 // is 2, as specified by the shuffle.
7556 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7557 SDValue ShuffleVec = SVOp->getOperand(0);
7558 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7559 assert(ShuffleVecVT.getVectorElementType() ==
7560 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7561
7562 int ShuffleIdx = SVOp->getMaskElt(Idx);
7563 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7564 ExtractedFromVec = ShuffleVec;
7565 return ShuffleIdx;
7566 }
7567 return Idx;
7568}
7569
7570 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7571                                       SelectionDAG &DAG) {
7572   MVT VT = Op.getSimpleValueType();
7573 
7574   // Skip if insert_vec_elt is not supported.
7575   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7576   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7577     return SDValue();
7578
7579 unsigned NumElems = Op.getNumOperands();
7580 SDValue VecIn1;
7581 SDValue VecIn2;
7582 SmallVector<unsigned, 4> InsertIndices;
7583 SmallVector<int, 8> Mask(NumElems, -1);
7584
7585 for (unsigned i = 0; i != NumElems; ++i) {
7586 unsigned Opc = Op.getOperand(i).getOpcode();
7587
7588 if (Opc == ISD::UNDEF)
7589 continue;
7590
7591 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7592       // Quit if more than 1 element needs inserting.
7593 if (InsertIndices.size() > 1)
7594 return SDValue();
7595
7596 InsertIndices.push_back(i);
7597 continue;
7598 }
7599
7600 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7601 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7602
7603 // Quit if non-constant index.
7604 if (!isa<ConstantSDNode>(ExtIdx))
7605 return SDValue();
7606 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7607
7608 // Quit if extracted from vector of different type.
7609 if (ExtractedFromVec.getValueType() != VT)
7610 return SDValue();
7611
7612 if (!VecIn1.getNode())
7613 VecIn1 = ExtractedFromVec;
7614 else if (VecIn1 != ExtractedFromVec) {
7615 if (!VecIn2.getNode())
7616 VecIn2 = ExtractedFromVec;
7617 else if (VecIn2 != ExtractedFromVec)
7618         // Quit if there are more than 2 vectors to shuffle.
7619 return SDValue();
7620 }
7621
7622 if (ExtractedFromVec == VecIn1)
7623 Mask[i] = Idx;
7624 else if (ExtractedFromVec == VecIn2)
7625 Mask[i] = Idx + NumElems;
7626 }
7627
7628 if (!VecIn1.getNode())
7629 return SDValue();
7630
7631 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7632 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7633
7634 for (unsigned Idx : InsertIndices)
7635 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7636 DAG.getIntPtrConstant(Idx, DL));
7637
7638 return NV;
7639}
7640
7641// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
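// For example (illustrative sketch only), a v8bf16 build_vector is rebuilt in
// the i16 domain (or f16 when FP16 is available) and bitcast back:
//   build_vector (bf16 a0), ..., (bf16 a7)
//     -> bitcast v8bf16 (build_vector (bitcast i16 a0), ..., (bitcast i16 a7))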
7642 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7643                                        const X86Subtarget &Subtarget) {
7644 MVT VT = Op.getSimpleValueType();
7645 MVT IVT =
7646 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7647   SmallVector<SDValue, 32> NewOps;
7648   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7649 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7650 Op.getOperand(I)));
7651 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7652 return DAG.getBitcast(VT, Res);
7653}
7654
7655// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
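// For example (illustrative sketch only), the constant v8i1 build_vector
//   build_vector 1, 0, 1, 1, 0, 0, 0, 0
// collects its bits into the immediate 0b00001101, which is then moved into a
// GPR and bitcast to the mask type.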
7656 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7657                                      SelectionDAG &DAG,
7658 const X86Subtarget &Subtarget) {
7659
7660 MVT VT = Op.getSimpleValueType();
7661 assert((VT.getVectorElementType() == MVT::i1) &&
7662 "Unexpected type in LowerBUILD_VECTORvXi1!");
7663 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7664 ISD::isBuildVectorAllOnes(Op.getNode()))
7665 return Op;
7666
7667 uint64_t Immediate = 0;
7668 SmallVector<unsigned, 16> NonConstIdx;
7669 bool IsSplat = true;
7670 bool HasConstElts = false;
7671 int SplatIdx = -1;
7672 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7673 SDValue In = Op.getOperand(idx);
7674 if (In.isUndef())
7675 continue;
7676 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7677 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7678 HasConstElts = true;
7679 } else {
7680 NonConstIdx.push_back(idx);
7681 }
7682 if (SplatIdx < 0)
7683 SplatIdx = idx;
7684 else if (In != Op.getOperand(SplatIdx))
7685 IsSplat = false;
7686 }
7687
7688   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7689 if (IsSplat) {
7690 // The build_vector allows the scalar element to be larger than the vector
7691 // element type. We need to mask it to use as a condition unless we know
7692 // the upper bits are zero.
7693 // FIXME: Use computeKnownBits instead of checking specific opcode?
7694 SDValue Cond = Op.getOperand(SplatIdx);
7695 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7696 if (Cond.getOpcode() != ISD::SETCC)
7697 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7698 DAG.getConstant(1, dl, MVT::i8));
7699
7700 // Perform the select in the scalar domain so we can use cmov.
7701 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7702 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7703 DAG.getAllOnesConstant(dl, MVT::i32),
7704 DAG.getConstant(0, dl, MVT::i32));
7705 Select = DAG.getBitcast(MVT::v32i1, Select);
7706 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7707 } else {
7708 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7709 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7710 DAG.getAllOnesConstant(dl, ImmVT),
7711 DAG.getConstant(0, dl, ImmVT));
7712 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7713 Select = DAG.getBitcast(VecVT, Select);
7714 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7715 DAG.getIntPtrConstant(0, dl));
7716 }
7717 }
7718
7719   // Insert the non-constant elements one by one.
7720 SDValue DstVec;
7721 if (HasConstElts) {
7722 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7723 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7724 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7725 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7726 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7727 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7728 } else {
7729 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7730 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7731 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7732 DstVec = DAG.getBitcast(VecVT, Imm);
7733 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7734 DAG.getIntPtrConstant(0, dl));
7735 }
7736 } else
7737 DstVec = DAG.getUNDEF(VT);
7738
7739 for (unsigned InsertIdx : NonConstIdx) {
7740 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7741 Op.getOperand(InsertIdx),
7742 DAG.getIntPtrConstant(InsertIdx, dl));
7743 }
7744 return DstVec;
7745}
7746
7747LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7748 switch (Opcode) {
7749 case X86ISD::PACKSS:
7750 case X86ISD::PACKUS:
7751 case X86ISD::FHADD:
7752 case X86ISD::FHSUB:
7753 case X86ISD::HADD:
7754 case X86ISD::HSUB:
7755 return true;
7756 }
7757 return false;
7758}
7759
7760/// This is a helper function of LowerToHorizontalOp().
7761 /// This function checks that the input build_vector \p N implements a
7762/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7763/// may not match the layout of an x86 256-bit horizontal instruction.
7764/// In other words, if this returns true, then some extraction/insertion will
7765/// be required to produce a valid horizontal instruction.
7766///
7767/// Parameter \p Opcode defines the kind of horizontal operation to match.
7768/// For example, if \p Opcode is equal to ISD::ADD, then this function
7769/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7770/// is equal to ISD::SUB, then this function checks if this is a horizontal
7771/// arithmetic sub.
7772///
7773/// This function only analyzes elements of \p N whose indices are
7774/// in range [BaseIdx, LastIdx).
7775///
7776/// TODO: This function was originally used to match both real and fake partial
7777/// horizontal operations, but the index-matching logic is incorrect for that.
7778/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7779/// code because it is only used for partial h-op matching now?
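/// For example (illustrative sketch only), with \p Opcode == ISD::ADD the
/// elements
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// describe a partial horizontal add whose inputs both come from vector A.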
7780static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7781 const SDLoc &DL, SelectionDAG &DAG,
7782 unsigned BaseIdx, unsigned LastIdx,
7783 SDValue &V0, SDValue &V1) {
7784 EVT VT = N->getValueType(0);
7785 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7786 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7787 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7788 "Invalid Vector in input!");
7789
7790 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7791 bool CanFold = true;
7792 unsigned ExpectedVExtractIdx = BaseIdx;
7793 unsigned NumElts = LastIdx - BaseIdx;
7794 V0 = DAG.getUNDEF(VT);
7795 V1 = DAG.getUNDEF(VT);
7796
7797 // Check if N implements a horizontal binop.
7798 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7799 SDValue Op = N->getOperand(i + BaseIdx);
7800
7801 // Skip UNDEFs.
7802 if (Op->isUndef()) {
7803 // Update the expected vector extract index.
7804 if (i * 2 == NumElts)
7805 ExpectedVExtractIdx = BaseIdx;
7806 ExpectedVExtractIdx += 2;
7807 continue;
7808 }
7809
7810 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7811
7812 if (!CanFold)
7813 break;
7814
7815 SDValue Op0 = Op.getOperand(0);
7816 SDValue Op1 = Op.getOperand(1);
7817
7818 // Try to match the following pattern:
7819 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7820     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7821                Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7822                Op0.getOperand(0) == Op1.getOperand(0) &&
7823 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7824 isa<ConstantSDNode>(Op1.getOperand(1)));
7825 if (!CanFold)
7826 break;
7827
7828 unsigned I0 = Op0.getConstantOperandVal(1);
7829 unsigned I1 = Op1.getConstantOperandVal(1);
7830
7831 if (i * 2 < NumElts) {
7832 if (V0.isUndef()) {
7833 V0 = Op0.getOperand(0);
7834 if (V0.getValueType() != VT)
7835 return false;
7836 }
7837 } else {
7838 if (V1.isUndef()) {
7839 V1 = Op0.getOperand(0);
7840 if (V1.getValueType() != VT)
7841 return false;
7842 }
7843 if (i * 2 == NumElts)
7844 ExpectedVExtractIdx = BaseIdx;
7845 }
7846
7847 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7848 if (I0 == ExpectedVExtractIdx)
7849 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7850 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7851 // Try to match the following dag sequence:
7852 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7853 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7854 } else
7855 CanFold = false;
7856
7857 ExpectedVExtractIdx += 2;
7858 }
7859
7860 return CanFold;
7861}
7862
7863/// Emit a sequence of two 128-bit horizontal add/sub followed by
7864/// a concat_vector.
7865///
7866/// This is a helper function of LowerToHorizontalOp().
7867/// This function expects two 256-bit vectors called V0 and V1.
7868/// At first, each vector is split into two separate 128-bit vectors.
7869/// Then, the resulting 128-bit vectors are used to implement two
7870/// horizontal binary operations.
7871///
7872/// The kind of horizontal binary operation is defined by \p X86Opcode.
7873///
7874 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7875 /// the two new horizontal binops.
7876/// When Mode is set, the first horizontal binop dag node would take as input
7877/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7878/// horizontal binop dag node would take as input the lower 128-bit of V1
7879/// and the upper 128-bit of V1.
7880/// Example:
7881/// HADD V0_LO, V0_HI
7882/// HADD V1_LO, V1_HI
7883///
7884/// Otherwise, the first horizontal binop dag node takes as input the lower
7885/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7886/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7887/// Example:
7888/// HADD V0_LO, V1_LO
7889/// HADD V0_HI, V1_HI
7890///
7891/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7892/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7893/// the upper 128-bits of the result.
7894static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7895 const SDLoc &DL, SelectionDAG &DAG,
7896 unsigned X86Opcode, bool Mode,
7897 bool isUndefLO, bool isUndefHI) {
7898 MVT VT = V0.getSimpleValueType();
7899 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7900 "Invalid nodes in input!");
7901
7902 unsigned NumElts = VT.getVectorNumElements();
7903 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7904 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7905 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7906 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7907 MVT NewVT = V0_LO.getSimpleValueType();
7908
7909 SDValue LO = DAG.getUNDEF(NewVT);
7910 SDValue HI = DAG.getUNDEF(NewVT);
7911
7912 if (Mode) {
7913 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7914 if (!isUndefLO && !V0->isUndef())
7915 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7916 if (!isUndefHI && !V1->isUndef())
7917 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7918 } else {
7919 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7920 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7921 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7922
7923 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7924 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7925 }
7926
7927 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7928}
7929
7930/// Returns true iff \p BV builds a vector with the result equivalent to
7931/// the result of ADDSUB/SUBADD operation.
7932/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7933/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7934/// \p Opnd0 and \p Opnd1.
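/// For example (illustrative sketch only), the build_vector
///   (fsub (extractelt A, 0), (extractelt B, 0)),
///   (fadd (extractelt A, 1), (extractelt B, 1)),
///   (fsub (extractelt A, 2), (extractelt B, 2)),
///   (fadd (extractelt A, 3), (extractelt B, 3))
/// is an ADDSUB of A and B: even lanes subtract, odd lanes add.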
7935 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7936                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
7937 SDValue &Opnd0, SDValue &Opnd1,
7938 unsigned &NumExtracts,
7939 bool &IsSubAdd) {
7940
7941 MVT VT = BV->getSimpleValueType(0);
7942 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7943 return false;
7944
7945 unsigned NumElts = VT.getVectorNumElements();
7946 SDValue InVec0 = DAG.getUNDEF(VT);
7947 SDValue InVec1 = DAG.getUNDEF(VT);
7948
7949 NumExtracts = 0;
7950
7951 // Odd-numbered elements in the input build vector are obtained from
7952 // adding/subtracting two integer/float elements.
7953 // Even-numbered elements in the input build vector are obtained from
7954 // subtracting/adding two integer/float elements.
7955 unsigned Opc[2] = {0, 0};
7956 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7957 SDValue Op = BV->getOperand(i);
7958
7959 // Skip 'undef' values.
7960 unsigned Opcode = Op.getOpcode();
7961 if (Opcode == ISD::UNDEF)
7962 continue;
7963
7964 // Early exit if we found an unexpected opcode.
7965 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7966 return false;
7967
7968 SDValue Op0 = Op.getOperand(0);
7969 SDValue Op1 = Op.getOperand(1);
7970
7971 // Try to match the following pattern:
7972 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7973 // Early exit if we cannot match that sequence.
7974     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7975         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7976         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7977 Op0.getOperand(1) != Op1.getOperand(1))
7978 return false;
7979
7980 unsigned I0 = Op0.getConstantOperandVal(1);
7981 if (I0 != i)
7982 return false;
7983
7984     // We found a valid add/sub node; make sure it's the same opcode as previous
7985     // elements for this parity.
7986 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7987 return false;
7988 Opc[i % 2] = Opcode;
7989
7990 // Update InVec0 and InVec1.
7991 if (InVec0.isUndef()) {
7992 InVec0 = Op0.getOperand(0);
7993 if (InVec0.getSimpleValueType() != VT)
7994 return false;
7995 }
7996 if (InVec1.isUndef()) {
7997 InVec1 = Op1.getOperand(0);
7998 if (InVec1.getSimpleValueType() != VT)
7999 return false;
8000 }
8001
8002     // Make sure that the input operands to each add/sub node always
8003     // come from the same pair of vectors.
8004 if (InVec0 != Op0.getOperand(0)) {
8005 if (Opcode == ISD::FSUB)
8006 return false;
8007
8008 // FADD is commutable. Try to commute the operands
8009 // and then test again.
8010 std::swap(Op0, Op1);
8011 if (InVec0 != Op0.getOperand(0))
8012 return false;
8013 }
8014
8015 if (InVec1 != Op1.getOperand(0))
8016 return false;
8017
8018 // Increment the number of extractions done.
8019 ++NumExtracts;
8020 }
8021
8022 // Ensure we have found an opcode for both parities and that they are
8023 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8024 // inputs are undef.
8025 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8026 InVec0.isUndef() || InVec1.isUndef())
8027 return false;
8028
8029 IsSubAdd = Opc[0] == ISD::FADD;
8030
8031 Opnd0 = InVec0;
8032 Opnd1 = InVec1;
8033 return true;
8034}
8035
8036 /// Returns true if it is possible to fold MUL and an idiom that has already been
8037/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8038/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8039/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8040///
8041/// Prior to calling this function it should be known that there is some
8042/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8043/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8044/// before replacement of such SDNode with ADDSUB operation. Thus the number
8045/// of \p Opnd0 uses is expected to be equal to 2.
8046/// For example, this function may be called for the following IR:
8047/// %AB = fmul fast <2 x double> %A, %B
8048/// %Sub = fsub fast <2 x double> %AB, %C
8049/// %Add = fadd fast <2 x double> %AB, %C
8050/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8051/// <2 x i32> <i32 0, i32 3>
8052/// There is a def for %Addsub here, which potentially can be replaced by
8053/// X86ISD::ADDSUB operation:
8054/// %Addsub = X86ISD::ADDSUB %AB, %C
8055/// and such ADDSUB can further be replaced with FMADDSUB:
8056/// %Addsub = FMADDSUB %A, %B, %C.
8057///
8058/// The main reason why this method is called before the replacement of the
8059/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8060/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8061/// FMADDSUB is.
8062static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8063 SelectionDAG &DAG,
8064 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8065 unsigned ExpectedUses) {
8066 if (Opnd0.getOpcode() != ISD::FMUL ||
8067 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8068 return false;
8069
8070 // FIXME: These checks must match the similar ones in
8071 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8072 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8073 // or MUL + ADDSUB to FMADDSUB.
8074 const TargetOptions &Options = DAG.getTarget().Options;
8075 bool AllowFusion =
8076 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8077 if (!AllowFusion)
8078 return false;
8079
8080 Opnd2 = Opnd1;
8081 Opnd1 = Opnd0.getOperand(1);
8082 Opnd0 = Opnd0.getOperand(0);
8083
8084 return true;
8085}
8086
8087/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8088/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8089/// X86ISD::FMSUBADD node.
8090 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8091                                        const SDLoc &DL,
8092 const X86Subtarget &Subtarget,
8093 SelectionDAG &DAG) {
8094 SDValue Opnd0, Opnd1;
8095 unsigned NumExtracts;
8096 bool IsSubAdd;
8097 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8098 IsSubAdd))
8099 return SDValue();
8100
8101 MVT VT = BV->getSimpleValueType(0);
8102
8103 // Try to generate X86ISD::FMADDSUB node here.
8104 SDValue Opnd2;
8105 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8106 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8107 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8108 }
8109
8110 // We only support ADDSUB.
8111 if (IsSubAdd)
8112 return SDValue();
8113
8114 // There are no known X86 targets with 512-bit ADDSUB instructions!
8115 // Convert to blend(fsub,fadd).
8116 if (VT.is512BitVector()) {
8117 SmallVector<int> Mask;
8118 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8119 Mask.push_back(I);
8120 Mask.push_back(I + E + 1);
8121 }
8122 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8123 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8124 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8125 }
8126
8127 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8128}
8129
8130 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8131                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8132 // Initialize outputs to known values.
8133 MVT VT = BV->getSimpleValueType(0);
8134 HOpcode = ISD::DELETED_NODE;
8135 V0 = DAG.getUNDEF(VT);
8136 V1 = DAG.getUNDEF(VT);
8137
8138 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8139 // half of the result is calculated independently from the 128-bit halves of
8140 // the inputs, so that makes the index-checking logic below more complicated.
8141 unsigned NumElts = VT.getVectorNumElements();
8142 unsigned GenericOpcode = ISD::DELETED_NODE;
8143 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8144 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8145 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8146 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8147 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8148 // Ignore undef elements.
8149 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8150 if (Op.isUndef())
8151 continue;
8152
8153 // If there's an opcode mismatch, we're done.
8154 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8155 return false;
8156
8157 // Initialize horizontal opcode.
8158 if (HOpcode == ISD::DELETED_NODE) {
8159 GenericOpcode = Op.getOpcode();
8160 switch (GenericOpcode) {
8161 // clang-format off
8162 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8163 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8164 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8165 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8166 default: return false;
8167 // clang-format on
8168 }
8169 }
8170
8171 SDValue Op0 = Op.getOperand(0);
8172 SDValue Op1 = Op.getOperand(1);
8173       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8174           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8175           Op0.getOperand(0) != Op1.getOperand(0) ||
8176 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8177 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8178 return false;
8179
8180 // The source vector is chosen based on which 64-bit half of the
8181 // destination vector is being calculated.
8182 if (j < NumEltsIn64Bits) {
8183 if (V0.isUndef())
8184 V0 = Op0.getOperand(0);
8185 } else {
8186 if (V1.isUndef())
8187 V1 = Op0.getOperand(0);
8188 }
8189
8190 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8191 if (SourceVec != Op0.getOperand(0))
8192 return false;
8193
8194 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8195 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8196 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8197 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8198 (j % NumEltsIn64Bits) * 2;
8199 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8200 continue;
8201
8202 // If this is not a commutative op, this does not match.
8203 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8204 return false;
8205
8206 // Addition is commutative, so try swapping the extract indexes.
8207 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8208 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8209 continue;
8210
8211 // Extract indexes do not match horizontal requirement.
8212 return false;
8213 }
8214 }
8215 // We matched. Opcode and operands are returned by reference as arguments.
8216 return true;
8217}
8218
8219 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8220                                     const SDLoc &DL, SelectionDAG &DAG,
8221 unsigned HOpcode, SDValue V0, SDValue V1) {
8222 // If either input vector is not the same size as the build vector,
8223 // extract/insert the low bits to the correct size.
8224 // This is free (examples: zmm --> xmm, xmm --> ymm).
8225 MVT VT = BV->getSimpleValueType(0);
8226 unsigned Width = VT.getSizeInBits();
8227 if (V0.getValueSizeInBits() > Width)
8228 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8229 else if (V0.getValueSizeInBits() < Width)
8230 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8231
8232 if (V1.getValueSizeInBits() > Width)
8233 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8234 else if (V1.getValueSizeInBits() < Width)
8235 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8236
8237 unsigned NumElts = VT.getVectorNumElements();
8238 APInt DemandedElts = APInt::getAllOnes(NumElts);
8239 for (unsigned i = 0; i != NumElts; ++i)
8240 if (BV->getOperand(i).isUndef())
8241 DemandedElts.clearBit(i);
8242
8243 // If we don't need the upper xmm, then perform as a xmm hop.
8244 unsigned HalfNumElts = NumElts / 2;
8245 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8246 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8247 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8248 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8249 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8250 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8251 }
8252
8253 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8254}
8255
8256/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
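/// For example (illustrative sketch only), the v4f32 build_vector
///   (fadd (extractelt A, 0), (extractelt A, 1)),
///   (fadd (extractelt A, 2), (extractelt A, 3)),
///   (fadd (extractelt B, 0), (extractelt B, 1)),
///   (fadd (extractelt B, 2), (extractelt B, 3))
/// lowers to (X86ISD::FHADD A, B), i.e. a single haddps.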
8257 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8258                                    const X86Subtarget &Subtarget,
8259 SelectionDAG &DAG) {
8260 // We need at least 2 non-undef elements to make this worthwhile by default.
8261 unsigned NumNonUndefs =
8262 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8263 if (NumNonUndefs < 2)
8264 return SDValue();
8265
8266 // There are 4 sets of horizontal math operations distinguished by type:
8267 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8268 // subtarget feature. Try to match those "native" patterns first.
8269 MVT VT = BV->getSimpleValueType(0);
8270 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8271 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8272 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8273 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8274 unsigned HOpcode;
8275 SDValue V0, V1;
8276 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8277 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8278 }
8279
8280 // Try harder to match 256-bit ops by using extract/concat.
8281 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8282 return SDValue();
8283
8284   // Count the number of UNDEF operands in the input build_vector.
8285 unsigned NumElts = VT.getVectorNumElements();
8286 unsigned Half = NumElts / 2;
8287 unsigned NumUndefsLO = 0;
8288 unsigned NumUndefsHI = 0;
8289 for (unsigned i = 0, e = Half; i != e; ++i)
8290 if (BV->getOperand(i)->isUndef())
8291 NumUndefsLO++;
8292
8293 for (unsigned i = Half, e = NumElts; i != e; ++i)
8294 if (BV->getOperand(i)->isUndef())
8295 NumUndefsHI++;
8296
8297 SDValue InVec0, InVec1;
8298 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8299 SDValue InVec2, InVec3;
8300 unsigned X86Opcode;
8301 bool CanFold = true;
8302
8303 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8304 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8305 InVec3) &&
8306 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8307 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8308 X86Opcode = X86ISD::HADD;
8309 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8310 InVec1) &&
8311 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8312 InVec3) &&
8313 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8314 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8315 X86Opcode = X86ISD::HSUB;
8316 else
8317 CanFold = false;
8318
8319 if (CanFold) {
8320 // Do not try to expand this build_vector into a pair of horizontal
8321 // add/sub if we can emit a pair of scalar add/sub.
8322 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8323 return SDValue();
8324
8325 // Convert this build_vector into a pair of horizontal binops followed by
8326 // a concat vector. We must adjust the outputs from the partial horizontal
8327 // matching calls above to account for undefined vector halves.
8328 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8329 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8330 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8331 bool isUndefLO = NumUndefsLO == Half;
8332 bool isUndefHI = NumUndefsHI == Half;
8333 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8334 isUndefHI);
8335 }
8336 }
8337
8338 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8339 VT == MVT::v16i16) {
8340 unsigned X86Opcode;
8341 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8342 InVec1))
8343 X86Opcode = X86ISD::HADD;
8344 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8345 InVec1))
8346 X86Opcode = X86ISD::HSUB;
8347 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8348 InVec1))
8349 X86Opcode = X86ISD::FHADD;
8350 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8351 InVec1))
8352 X86Opcode = X86ISD::FHSUB;
8353 else
8354 return SDValue();
8355
8356 // Don't try to expand this build_vector into a pair of horizontal add/sub
8357 // if we can simply emit a pair of scalar add/sub.
8358 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8359 return SDValue();
8360
8361 // Convert this build_vector into two horizontal add/sub followed by
8362 // a concat vector.
8363 bool isUndefLO = NumUndefsLO == Half;
8364 bool isUndefHI = NumUndefsHI == Half;
8365 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8366 isUndefLO, isUndefHI);
8367 }
8368
8369 return SDValue();
8370}
8371
8372static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8373 SelectionDAG &DAG);
8374
8375/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8376/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8377/// just apply the bit to the vectors.
8378 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8379 /// out of this, but enough scalar bit operations are created by the later
8380 /// legalization + scalarization stages to need basic support.
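/// For example (illustrative sketch only), the build_vector
///   (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8)
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))
/// so a single vector AND is emitted instead of four scalar ones.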
8381 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8382                                        const X86Subtarget &Subtarget,
8383 SelectionDAG &DAG) {
8384 MVT VT = Op->getSimpleValueType(0);
8385 unsigned NumElems = VT.getVectorNumElements();
8386 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8387
8388 // Check that all elements have the same opcode.
8389 // TODO: Should we allow UNDEFS and if so how many?
8390 unsigned Opcode = Op->getOperand(0).getOpcode();
8391 for (unsigned i = 1; i < NumElems; ++i)
8392 if (Opcode != Op->getOperand(i).getOpcode())
8393 return SDValue();
8394
8395 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8396 bool IsShift = false;
8397 switch (Opcode) {
8398 default:
8399 return SDValue();
8400 case ISD::SHL:
8401 case ISD::SRL:
8402 case ISD::SRA:
8403 IsShift = true;
8404 break;
8405 case ISD::AND:
8406 case ISD::XOR:
8407 case ISD::OR:
8408 // Don't do this if the buildvector is a splat - we'd replace one
8409 // constant with an entire vector.
8410 if (Op->getSplatValue())
8411 return SDValue();
8412 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8413 return SDValue();
8414 break;
8415 }
8416
8417 SmallVector<SDValue, 4> LHSElts, RHSElts;
8418 for (SDValue Elt : Op->ops()) {
8419 SDValue LHS = Elt.getOperand(0);
8420 SDValue RHS = Elt.getOperand(1);
8421
8422 // We expect the canonicalized RHS operand to be the constant.
8423 if (!isa<ConstantSDNode>(RHS))
8424 return SDValue();
8425
8426 // Extend shift amounts.
8427 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8428 if (!IsShift)
8429 return SDValue();
8430 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8431 }
8432
8433 LHSElts.push_back(LHS);
8434 RHSElts.push_back(RHS);
8435 }
8436
8437 // Limit to shifts by uniform immediates.
8438 // TODO: Only accept vXi8/vXi64 special cases?
8439 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8440 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8441 return SDValue();
8442
8443 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8444 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8445 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8446
8447 if (!IsShift)
8448 return Res;
8449
8450 // Immediately lower the shift to ensure the constant build vector doesn't
8451 // get converted to a constant pool before the shift is lowered.
8452 return LowerShift(Res, Subtarget, DAG);
8453}
8454
8455/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8456/// functionality to do this, so it's all zeros, all ones, or some derivation
8457/// that is cheap to calculate.
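/// For example (illustrative sketch only), an all-ones v4i32 constant can be
/// materialized with (pcmpeqd %xmm, %xmm) instead of loading it from memory.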
8458 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8459                                          SelectionDAG &DAG,
8460 const X86Subtarget &Subtarget) {
8461 MVT VT = Op.getSimpleValueType();
8462
8463 // Vectors containing all zeros can be matched by pxor and xorps.
8464 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8465 return Op;
8466
8467 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8468 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8469 // vpcmpeqd on 256-bit vectors.
8470 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8471 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8472 return Op;
8473
8474 return getOnesVector(VT, DAG, DL);
8475 }
8476
8477 return SDValue();
8478}
8479
8480/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8481/// from a vector of source values and a vector of extraction indices.
8482/// The vectors might be manipulated to match the type of the permute op.
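/// For example (illustrative sketch only), a v4i32 source with a v4i32 index
/// vector can be lowered on AVX as
///   (X86ISD::VPERMILPV (bitcast v4f32 SrcVec), IndicesVec)
/// since variable VPERMILPS selects each lane from the low index bits.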
8483static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8484 const SDLoc &DL, SelectionDAG &DAG,
8485 const X86Subtarget &Subtarget) {
8486 MVT ShuffleVT = VT;
8487 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8488 unsigned NumElts = VT.getVectorNumElements();
8489 unsigned SizeInBits = VT.getSizeInBits();
8490
8491 // Adjust IndicesVec to match VT size.
8492 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8493 "Illegal variable permute mask size");
8494 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8495 // Narrow/widen the indices vector to the correct size.
8496 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8497 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8498 NumElts * VT.getScalarSizeInBits());
8499 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8500 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8501 SDLoc(IndicesVec), SizeInBits);
8502 // Zero-extend the index elements within the vector.
8503 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8504 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8505 IndicesVT, IndicesVec);
8506 }
8507 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8508
8509   // Handle a SrcVec that doesn't match the VT type.
8510 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8511 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8512 // Handle larger SrcVec by treating it as a larger permute.
8513 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8514 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8515 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8516 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8517 Subtarget, DAG, SDLoc(IndicesVec));
8518 SDValue NewSrcVec =
8519 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8520 if (NewSrcVec)
8521 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8522 return SDValue();
8523 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8524 // Widen smaller SrcVec to match VT.
8525 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8526 } else
8527 return SDValue();
8528 }
8529
8530 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8531 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8532 EVT SrcVT = Idx.getValueType();
8533 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8534 uint64_t IndexScale = 0;
8535 uint64_t IndexOffset = 0;
8536
8537 // If we're scaling a smaller permute op, then we need to repeat the
8538 // indices, scaling and offsetting them as well.
8539 // e.g. v4i32 -> v16i8 (Scale = 4)
8540 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8541 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8542 for (uint64_t i = 0; i != Scale; ++i) {
8543 IndexScale |= Scale << (i * NumDstBits);
8544 IndexOffset |= i << (i * NumDstBits);
8545 }
8546
8547 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8548 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8549 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8550 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8551 return Idx;
8552 };
8553
8554 unsigned Opcode = 0;
8555 switch (VT.SimpleTy) {
8556 default:
8557 break;
8558 case MVT::v16i8:
8559 if (Subtarget.hasSSSE3())
8560 Opcode = X86ISD::PSHUFB;
8561 break;
8562 case MVT::v8i16:
8563 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8564 Opcode = X86ISD::VPERMV;
8565 else if (Subtarget.hasSSSE3()) {
8566 Opcode = X86ISD::PSHUFB;
8567 ShuffleVT = MVT::v16i8;
8568 }
8569 break;
8570 case MVT::v4f32:
8571 case MVT::v4i32:
8572 if (Subtarget.hasAVX()) {
8573 Opcode = X86ISD::VPERMILPV;
8574 ShuffleVT = MVT::v4f32;
8575 } else if (Subtarget.hasSSSE3()) {
8576 Opcode = X86ISD::PSHUFB;
8577 ShuffleVT = MVT::v16i8;
8578 }
8579 break;
8580 case MVT::v2f64:
8581 case MVT::v2i64:
8582 if (Subtarget.hasAVX()) {
8583 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8584 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8585 Opcode = X86ISD::VPERMILPV;
8586 ShuffleVT = MVT::v2f64;
8587 } else if (Subtarget.hasSSE41()) {
8588 // SSE41 can compare v2i64 - select between indices 0 and 1.
8589 return DAG.getSelectCC(
8590 DL, IndicesVec,
8591 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8592 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8593           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8594           ISD::CondCode::SETGT);
8595     }
8596 break;
8597 case MVT::v32i8:
8598 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8599 Opcode = X86ISD::VPERMV;
8600 else if (Subtarget.hasXOP()) {
8601 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8602 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8603 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8604 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8605       return DAG.getNode(
8606           ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8607           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8608 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8609 } else if (Subtarget.hasAVX()) {
8610 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8611 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8612 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8613 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8614 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8615 ArrayRef<SDValue> Ops) {
8616 // Permute Lo and Hi and then select based on index range.
8617         // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8618         // care about bit[7] as it's just an index vector.
8619 SDValue Idx = Ops[2];
8620 EVT VT = Idx.getValueType();
8621 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8622 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8623                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8624                                ISD::CondCode::SETGT);
8625       };
8626 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8627 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8628 PSHUFBBuilder);
8629 }
8630 break;
8631 case MVT::v16i16:
8632 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8633 Opcode = X86ISD::VPERMV;
8634 else if (Subtarget.hasAVX()) {
8635 // Scale to v32i8 and perform as v32i8.
8636 IndicesVec = ScaleIndices(IndicesVec, 2);
8637       return DAG.getBitcast(
8638           VT, createVariablePermute(
8639                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8640 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8641 }
8642 break;
8643 case MVT::v8f32:
8644 case MVT::v8i32:
8645 if (Subtarget.hasAVX2())
8646 Opcode = X86ISD::VPERMV;
8647 else if (Subtarget.hasAVX()) {
8648 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8649 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8650 {0, 1, 2, 3, 0, 1, 2, 3});
8651 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8652 {4, 5, 6, 7, 4, 5, 6, 7});
8653 if (Subtarget.hasXOP())
8654 return DAG.getBitcast(
8655 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8656 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8657 // Permute Lo and Hi and then select based on index range.
8658 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8659 SDValue Res = DAG.getSelectCC(
8660 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8661 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8662           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8663           ISD::CondCode::SETGT);
8664       return DAG.getBitcast(VT, Res);
8665 }
8666 break;
8667 case MVT::v4i64:
8668 case MVT::v4f64:
8669 if (Subtarget.hasAVX512()) {
8670 if (!Subtarget.hasVLX()) {
8671 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8672 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8673 SDLoc(SrcVec));
8674 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8675 DAG, SDLoc(IndicesVec));
8676 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8677 DAG, Subtarget);
8678 return extract256BitVector(Res, 0, DAG, DL);
8679 }
8680 Opcode = X86ISD::VPERMV;
8681 } else if (Subtarget.hasAVX()) {
8682 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8683 SDValue LoLo =
8684 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8685 SDValue HiHi =
8686 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8687 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8688 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8689 if (Subtarget.hasXOP())
8690 return DAG.getBitcast(
8691 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8692 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8693 // Permute Lo and Hi and then select based on index range.
8694 // This works as VPERMILPD only uses index bit[1] to permute elements.
8695 SDValue Res = DAG.getSelectCC(
8696 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8697 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8698           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8699           ISD::CondCode::SETGT);
8700       return DAG.getBitcast(VT, Res);
8701 }
8702 break;
8703 case MVT::v64i8:
8704 if (Subtarget.hasVBMI())
8705 Opcode = X86ISD::VPERMV;
8706 break;
8707 case MVT::v32i16:
8708 if (Subtarget.hasBWI())
8709 Opcode = X86ISD::VPERMV;
8710 break;
8711 case MVT::v16f32:
8712 case MVT::v16i32:
8713 case MVT::v8f64:
8714 case MVT::v8i64:
8715 if (Subtarget.hasAVX512())
8716 Opcode = X86ISD::VPERMV;
8717 break;
8718 }
8719 if (!Opcode)
8720 return SDValue();
8721
8722 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8723 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8724 "Illegal variable permute shuffle type");
8725
8726 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8727 if (Scale > 1)
8728 IndicesVec = ScaleIndices(IndicesVec, Scale);
8729
8730 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8731 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8732
8733 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8734 SDValue Res = Opcode == X86ISD::VPERMV
8735 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8736 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8737 return DAG.getBitcast(VT, Res);
8738}
8739
8740// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8741// reasoned to be a permutation of a vector by indices in a non-constant vector.
8742// (build_vector (extract_elt V, (extract_elt I, 0)),
8743// (extract_elt V, (extract_elt I, 1)),
8744// ...
8745// ->
8746// (vpermv I, V)
8747//
8748// TODO: Handle undefs
8749// TODO: Utilize pshufb and zero mask blending to support more efficient
8750// construction of vectors with constant-0 elements.
8751static SDValue
8752 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8753 SelectionDAG &DAG,
8754 const X86Subtarget &Subtarget) {
8755 SDValue SrcVec, IndicesVec;
8756 // Check for a match of the permute source vector and permute index elements.
8757 // This is done by checking that the i-th build_vector operand is of the form:
8758 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8759 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8760 SDValue Op = V.getOperand(Idx);
8761 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8762 return SDValue();
8763
8764 // If this is the first extract encountered in V, set the source vector,
8765 // otherwise verify the extract is from the previously defined source
8766 // vector.
8767 if (!SrcVec)
8768 SrcVec = Op.getOperand(0);
8769 else if (SrcVec != Op.getOperand(0))
8770 return SDValue();
8771 SDValue ExtractedIndex = Op->getOperand(1);
8772 // Peek through extends.
8773 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8774 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8775 ExtractedIndex = ExtractedIndex.getOperand(0);
8776 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8777 return SDValue();
8778
8779 // If this is the first extract from the index vector candidate, set the
8780 // indices vector, otherwise verify the extract is from the previously
8781 // defined indices vector.
8782 if (!IndicesVec)
8783 IndicesVec = ExtractedIndex.getOperand(0);
8784 else if (IndicesVec != ExtractedIndex.getOperand(0))
8785 return SDValue();
8786
8787 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8788 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8789 return SDValue();
8790 }
8791
8792 MVT VT = V.getSimpleValueType();
8793 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8794}
8795
8796SDValue
8797X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8798 SDLoc dl(Op);
8799
8800 MVT VT = Op.getSimpleValueType();
8801 MVT EltVT = VT.getVectorElementType();
8802 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8803 unsigned NumElems = Op.getNumOperands();
8804
8805 // Generate vectors for predicate vectors.
8806 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8807 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8808
8809 if (VT.getVectorElementType() == MVT::bf16 &&
8810 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8811 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8812
8813 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8814 return VectorCst;
8815
8816 unsigned EVTBits = EltVT.getSizeInBits();
8817 APInt UndefMask = APInt::getZero(NumElems);
8818 APInt FrozenUndefMask = APInt::getZero(NumElems);
8819 APInt ZeroMask = APInt::getZero(NumElems);
8820 APInt NonZeroMask = APInt::getZero(NumElems);
8821 bool IsAllConstants = true;
8822 bool OneUseFrozenUndefs = true;
8823 SmallSet<SDValue, 8> Values;
8824 unsigned NumConstants = NumElems;
8825 for (unsigned i = 0; i < NumElems; ++i) {
8826 SDValue Elt = Op.getOperand(i);
8827 if (Elt.isUndef()) {
8828 UndefMask.setBit(i);
8829 continue;
8830 }
8831 if (ISD::isFreezeUndef(Elt.getNode())) {
8832 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8833 FrozenUndefMask.setBit(i);
8834 continue;
8835 }
8836 Values.insert(Elt);
8837 if (!isIntOrFPConstant(Elt)) {
8838 IsAllConstants = false;
8839 NumConstants--;
8840 }
8841 if (X86::isZeroNode(Elt)) {
8842 ZeroMask.setBit(i);
8843 } else {
8844 NonZeroMask.setBit(i);
8845 }
8846 }
8847
8848 // All undef vector. Return an UNDEF.
8849 if (UndefMask.isAllOnes())
8850 return DAG.getUNDEF(VT);
8851
8852 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8853 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8854 return DAG.getFreeze(DAG.getUNDEF(VT));
8855
8856 // All undef/freeze(undef)/zero vector. Return a zero vector.
8857 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8858 return getZeroVector(VT, Subtarget, DAG, dl);
8859
8860 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8861 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8862 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8863 // and blend the FREEZE-UNDEF operands back in.
8864 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
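// Illustrative sketch (not in the original source): for
// <x, freeze(undef), y, freeze(undef)> this builds EltsBV = <x, undef, y, undef>,
// a FrozenUndefBV splat of a single freeze(undef), and BlendMask = <0, 5, 2, 7>,
// so the freeze(undef) lanes are taken from the splat operand of the shuffle.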
8865 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8866 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8867 SmallVector<int, 16> BlendMask(NumElems, -1);
8868 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8869 for (unsigned i = 0; i < NumElems; ++i) {
8870 if (UndefMask[i]) {
8871 BlendMask[i] = -1;
8872 continue;
8873 }
8874 BlendMask[i] = i;
8875 if (!FrozenUndefMask[i])
8876 Elts[i] = Op.getOperand(i);
8877 else
8878 BlendMask[i] += NumElems;
8879 }
8880 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8881 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8882 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8883 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8884 }
8885
8886 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8887
8888 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8889 // be better off lowering to a smaller build vector and padding with
8890 // undef/zero.
8891 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8892 !isFoldableUseOfShuffle(BV)) {
8893 unsigned UpperElems = NumElems / 2;
8894 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8895 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8896 if (NumUpperUndefsOrZeros >= UpperElems) {
8897 if (VT.is512BitVector() &&
8898 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8899 UpperElems = NumElems - (NumElems / 4);
8900 // If freeze(undef) is in any upper elements, force to zero.
8901 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8902 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8903 SDValue NewBV =
8904 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8905 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8906 }
8907 }
8908
8909 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8910 return AddSub;
8911 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8912 return HorizontalOp;
8913 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8914 return Broadcast;
8915 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8916 return BitOp;
8917
8918 unsigned NumZero = ZeroMask.popcount();
8919 unsigned NumNonZero = NonZeroMask.popcount();
8920
8921 // If we are inserting one variable into a vector of non-zero constants, try
8922 // to avoid loading each constant element as a scalar. Load the constants as a
8923 // vector and then insert the variable scalar element. If insertion is not
8924 // supported, fall back to a shuffle to get the scalar blended with the
8925 // constants. Insertion into a zero vector is handled as a special-case
8926 // somewhere below here.
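// Illustrative sketch (not in the original source): build_vector
// <i32 1, i32 %x, i32 3, i32 4> becomes a constant-pool load of <1, undef, 3, 4>
// followed by an INSERT_VECTOR_ELT of %x at index 1; indices beyond the low
// 128 bits of a wider vector use the shuffle fallback below instead.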
8927 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8928 FrozenUndefMask.isZero() &&
8929 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8930 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8931 // Create an all-constant vector. The variable element in the old
8932 // build vector is replaced by undef in the constant vector. Save the
8933 // variable scalar element and its index for use in the insertelement.
8934 LLVMContext &Context = *DAG.getContext();
8935 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8936 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8937 SDValue VarElt;
8938 SDValue InsIndex;
8939 for (unsigned i = 0; i != NumElems; ++i) {
8940 SDValue Elt = Op.getOperand(i);
8941 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8942 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8943 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8944 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8945 else if (!Elt.isUndef()) {
8946 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8947 "Expected one variable element in this vector");
8948 VarElt = Elt;
8949 InsIndex = DAG.getVectorIdxConstant(i, dl);
8950 }
8951 }
8952 Constant *CV = ConstantVector::get(ConstVecOps);
8953 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8954
8955 // The constants we just created may not be legal (eg, floating point). We
8956 // must lower the vector right here because we can not guarantee that we'll
8957 // legalize it before loading it. This is also why we could not just create
8958 // a new build vector here. If the build vector contains illegal constants,
8959 // it could get split back up into a series of insert elements.
8960 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8961 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8962 MachinePointerInfo MPI =
8963 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8964 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8965 unsigned InsertC = InsIndex->getAsZExtVal();
8966 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8967 if (InsertC < NumEltsInLow128Bits)
8968 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8969
8970 // There's no good way to insert into the high elements of a >128-bit
8971 // vector, so use shuffles to avoid an extract/insert sequence.
8972 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8973 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8974 SmallVector<int, 8> ShuffleMask;
8975 unsigned NumElts = VT.getVectorNumElements();
8976 for (unsigned i = 0; i != NumElts; ++i)
8977 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8978 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8979 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8980 }
8981
8982 // Special case for a single non-zero, non-undef element.
8983 if (NumNonZero == 1) {
8984 unsigned Idx = NonZeroMask.countr_zero();
8985 SDValue Item = Op.getOperand(Idx);
8986
8987 // If we have a constant or non-constant insertion into the low element of
8988 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8989 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8990 // depending on what the source datatype is.
8991 if (Idx == 0) {
8992 if (NumZero == 0)
8993 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8994
8995 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8996 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8997 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8998 assert((VT.is128BitVector() || VT.is256BitVector() ||
8999 VT.is512BitVector()) &&
9000 "Expected an SSE value type!");
9001 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9002 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9003 // zero vector.
9004 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9005 }
9006
9007 // We can't directly insert an i8 or i16 into a vector, so zero extend
9008 // it to i32 first.
9009 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9010 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9011 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9012 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9013 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9014 return DAG.getBitcast(VT, Item);
9015 }
9016 }
9017
9018 // Is it a vector logical left shift?
9019 if (NumElems == 2 && Idx == 1 &&
9020 X86::isZeroNode(Op.getOperand(0)) &&
9021 !X86::isZeroNode(Op.getOperand(1))) {
9022 unsigned NumBits = VT.getSizeInBits();
9023 return getVShift(true, VT,
9024 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9025 VT, Op.getOperand(1)),
9026 NumBits/2, DAG, *this, dl);
9027 }
9028
9029 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9030 return SDValue();
9031
9032 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9033 // is a non-constant being inserted into an element other than the low one,
9034 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9035 // movd/movss) to move this into the low element, then shuffle it into
9036 // place.
9037 if (EVTBits == 32) {
9038 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9039 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9040 }
9041 }
9042
9043 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9044 if (Values.size() == 1) {
9045 if (EVTBits == 32) {
9046 // Instead of a shuffle like this:
9047 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9048 // Check if it's possible to issue this instead.
9049 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9050 unsigned Idx = NonZeroMask.countr_zero();
9051 SDValue Item = Op.getOperand(Idx);
9052 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9053 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9054 }
9055 return SDValue();
9056 }
9057
9058 // A vector full of immediates; various special cases are already
9059 // handled, so this is best done with a single constant-pool load.
9060 if (IsAllConstants)
9061 return SDValue();
9062
9063 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9064 return V;
9065
9066 // See if we can use a vector load to get all of the elements.
9067 {
9068 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9069 if (SDValue LD =
9070 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9071 return LD;
9072 }
9073
9074 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9075 // build_vector and broadcast it.
9076 // TODO: We could probably generalize this more.
9077 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9078 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9079 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9080 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9081 // Make sure all the even/odd operands match.
9082 for (unsigned i = 2; i != NumElems; ++i)
9083 if (Ops[i % 2] != Op.getOperand(i))
9084 return false;
9085 return true;
9086 };
9087 if (CanSplat(Op, NumElems, Ops)) {
9088 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9089 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9090 // Create a new build vector and cast to v2i64/v2f64.
9091 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9092 DAG.getBuildVector(NarrowVT, dl, Ops));
9093 // Broadcast from v2i64/v2f64 and cast to final VT.
9094 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9095 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9096 NewBV));
9097 }
9098 }
9099
9100 // For AVX-length vectors, build the individual 128-bit pieces and use
9101 // shuffles to put them in place.
9102 if (VT.getSizeInBits() > 128) {
9103 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9104
9105 // Build both the lower and upper subvector.
9106 SDValue Lower =
9107 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9108 SDValue Upper = DAG.getBuildVector(
9109 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9110
9111 // Recreate the wider vector with the lower and upper part.
9112 return concatSubVectors(Lower, Upper, DAG, dl);
9113 }
9114
9115 // Let legalizer expand 2-wide build_vectors.
9116 if (EVTBits == 64) {
9117 if (NumNonZero == 1) {
9118 // One half is zero or undef.
9119 unsigned Idx = NonZeroMask.countr_zero();
9120 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9121 Op.getOperand(Idx));
9122 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9123 }
9124 return SDValue();
9125 }
9126
9127 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9128 if (EVTBits == 8 && NumElems == 16)
9129 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9130 NumZero, DAG, Subtarget))
9131 return V;
9132
9133 if (EltVT == MVT::i16 && NumElems == 8)
9134 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9135 NumZero, DAG, Subtarget))
9136 return V;
9137
9138 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9139 if (EVTBits == 32 && NumElems == 4)
9140 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9141 return V;
9142
9143 // If element VT is == 32 bits, turn it into a number of shuffles.
9144 if (NumElems == 4 && NumZero > 0) {
9145 SmallVector<SDValue, 8> Ops(NumElems);
9146 for (unsigned i = 0; i < 4; ++i) {
9147 bool isZero = !NonZeroMask[i];
9148 if (isZero)
9149 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9150 else
9151 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9152 }
9153
9154 for (unsigned i = 0; i < 2; ++i) {
9155 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9156 default: llvm_unreachable("Unexpected NonZero count");
9157 case 0:
9158 Ops[i] = Ops[i*2]; // Must be a zero vector.
9159 break;
9160 case 1:
9161 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9162 break;
9163 case 2:
9164 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9165 break;
9166 case 3:
9167 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9168 break;
9169 }
9170 }
9171
9172 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9173 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9174 int MaskVec[] = {
9175 Reverse1 ? 1 : 0,
9176 Reverse1 ? 0 : 1,
9177 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9178 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9179 };
9180 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9181 }
9182
9183 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9184
9185 // Check for a build vector from mostly shuffle plus few inserting.
9186 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9187 return Sh;
9188
9189 // For SSE 4.1, use insertps to put the high elements into the low element.
9190 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9191 SDValue Result;
9192 if (!Op.getOperand(0).isUndef())
9193 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9194 else
9195 Result = DAG.getUNDEF(VT);
9196
9197 for (unsigned i = 1; i < NumElems; ++i) {
9198 if (Op.getOperand(i).isUndef()) continue;
9199 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9200 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9201 }
9202 return Result;
9203 }
9204
9205 // Otherwise, expand into a number of unpckl*, start by extending each of
9206 // our (non-undef) elements to the full vector width with the element in the
9207 // bottom slot of the vector (which generates no code for SSE).
9208 SmallVector<SDValue, 8> Ops(NumElems);
9209 for (unsigned i = 0; i < NumElems; ++i) {
9210 if (!Op.getOperand(i).isUndef())
9211 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9212 else
9213 Ops[i] = DAG.getUNDEF(VT);
9214 }
9215
9216 // Next, we iteratively mix elements, e.g. for v4f32:
9217 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9218 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9219 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9220 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9221 // Generate scaled UNPCKL shuffle mask.
9222 SmallVector<int, 16> Mask;
9223 for (unsigned i = 0; i != Scale; ++i)
9224 Mask.push_back(i);
9225 for (unsigned i = 0; i != Scale; ++i)
9226 Mask.push_back(NumElems+i);
9227 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9228
9229 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9230 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9231 }
9232 return Ops[0];
9233}
9234
9235// 256-bit AVX can use the vinsertf128 instruction
9236// to create 256-bit vectors from two other 128-bit ones.
9237// TODO: Detect subvector broadcast here instead of DAG combine?
9238 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9239 const X86Subtarget &Subtarget) {
9240 SDLoc dl(Op);
9241 MVT ResVT = Op.getSimpleValueType();
9242
9243 assert((ResVT.is256BitVector() ||
9244 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9245
9246 unsigned NumOperands = Op.getNumOperands();
9247 unsigned NumFreezeUndef = 0;
9248 unsigned NumZero = 0;
9249 unsigned NumNonZero = 0;
9250 unsigned NonZeros = 0;
9251 for (unsigned i = 0; i != NumOperands; ++i) {
9252 SDValue SubVec = Op.getOperand(i);
9253 if (SubVec.isUndef())
9254 continue;
9255 if (ISD::isFreezeUndef(SubVec.getNode())) {
9256 // If the freeze(undef) has multiple uses then we must fold to zero.
9257 if (SubVec.hasOneUse())
9258 ++NumFreezeUndef;
9259 else
9260 ++NumZero;
9261 }
9262 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9263 ++NumZero;
9264 else {
9265 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9266 NonZeros |= 1 << i;
9267 ++NumNonZero;
9268 }
9269 }
9270
9271 // If we have more than 2 non-zeros, build each half separately.
9272 if (NumNonZero > 2) {
9273 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9274 ArrayRef<SDUse> Ops = Op->ops();
9275 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9276 Ops.slice(0, NumOperands/2));
9277 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9278 Ops.slice(NumOperands/2));
9279 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9280 }
9281
9282 // Otherwise, build it up through insert_subvectors.
9283 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9284 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9285 : DAG.getUNDEF(ResVT));
9286
9287 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9288 unsigned NumSubElems = SubVT.getVectorNumElements();
9289 for (unsigned i = 0; i != NumOperands; ++i) {
9290 if ((NonZeros & (1 << i)) == 0)
9291 continue;
9292
9293 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9294 Op.getOperand(i),
9295 DAG.getIntPtrConstant(i * NumSubElems, dl));
9296 }
9297
9298 return Vec;
9299}
9300
9301 // Lowers a CONCAT_VECTORS of vXi1 (k-register) subvectors by inserting the
9302 // non-zero operands into a zero/undef result vector, using KSHIFTL or KUNPCK
9303 // where that avoids the extra kshifts of the generic insert_subvector lowering.
9304// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9305 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9306 const X86Subtarget &Subtarget,
9307 SelectionDAG &DAG) {
9308 SDLoc dl(Op);
9309 MVT ResVT = Op.getSimpleValueType();
9310 unsigned NumOperands = Op.getNumOperands();
9311
9312 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9313 "Unexpected number of operands in CONCAT_VECTORS");
9314
9315 uint64_t Zeros = 0;
9316 uint64_t NonZeros = 0;
9317 for (unsigned i = 0; i != NumOperands; ++i) {
9318 SDValue SubVec = Op.getOperand(i);
9319 if (SubVec.isUndef())
9320 continue;
9321 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9322 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9323 Zeros |= (uint64_t)1 << i;
9324 else
9325 NonZeros |= (uint64_t)1 << i;
9326 }
9327
9328 unsigned NumElems = ResVT.getVectorNumElements();
9329
9330 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9331 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9332 // insert_subvector will give us two kshifts.
9333 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9334 Log2_64(NonZeros) != NumOperands - 1) {
9335 unsigned Idx = Log2_64(NonZeros);
9336 SDValue SubVec = Op.getOperand(Idx);
9337 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9338 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9339 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9340 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9341 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9342 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9343 DAG.getIntPtrConstant(0, dl));
9344 }
9345
9346 // If there are zero or one non-zeros we can handle this very simply.
9347 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9348 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9349 if (!NonZeros)
9350 return Vec;
9351 unsigned Idx = Log2_64(NonZeros);
9352 SDValue SubVec = Op.getOperand(Idx);
9353 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9354 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9355 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9356 }
9357
9358 if (NumOperands > 2) {
9359 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9360 ArrayRef<SDUse> Ops = Op->ops();
9361 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9362 Ops.slice(0, NumOperands/2));
9363 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9364 Ops.slice(NumOperands/2));
9365 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9366 }
9367
9368 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9369
9370 if (ResVT.getVectorNumElements() >= 16)
9371 return Op; // The operation is legal with KUNPCK
9372
9373 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9374 DAG.getUNDEF(ResVT), Op.getOperand(0),
9375 DAG.getIntPtrConstant(0, dl));
9376 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9377 DAG.getIntPtrConstant(NumElems/2, dl));
9378}
9379
9380 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9381 const X86Subtarget &Subtarget,
9382 SelectionDAG &DAG) {
9383 MVT VT = Op.getSimpleValueType();
9384 if (VT.getVectorElementType() == MVT::i1)
9385 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9386
9387 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9388 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9389 Op.getNumOperands() == 4)));
9390
9391 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9392 // from two other 128-bit ones.
9393
9394 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9395 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9396}
9397
9398//===----------------------------------------------------------------------===//
9399// Vector shuffle lowering
9400//
9401// This is an experimental code path for lowering vector shuffles on x86. It is
9402// designed to handle arbitrary vector shuffles and blends, gracefully
9403// degrading performance as necessary. It works hard to recognize idiomatic
9404// shuffles and lower them to optimal instruction patterns without leaving
9405// a framework that allows reasonably efficient handling of all vector shuffle
9406// patterns.
9407//===----------------------------------------------------------------------===//
9408
9409/// Tiny helper function to identify a no-op mask.
9410///
9411/// This is a somewhat boring predicate function. It checks whether the mask
9412/// array input, which is assumed to be a single-input shuffle mask of the kind
9413/// used by the X86 shuffle instructions (not a fully general
9414/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9415/// in-place shuffle are 'no-op's.
9416 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9417 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9418 assert(Mask[i] >= -1 && "Out of bound mask element!");
9419 if (Mask[i] >= 0 && Mask[i] != i)
9420 return false;
9421 }
9422 return true;
9423}
9424
9425/// Test whether there are elements crossing LaneSizeInBits lanes in this
9426/// shuffle mask.
9427///
9428/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9429/// and we routinely test for these.
9430static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9431 unsigned ScalarSizeInBits,
9432 ArrayRef<int> Mask) {
9433 assert(LaneSizeInBits && ScalarSizeInBits &&
9434 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9435 "Illegal shuffle lane size");
9436 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9437 int Size = Mask.size();
9438 for (int i = 0; i < Size; ++i)
9439 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9440 return true;
9441 return false;
9442}
9443
9444/// Test whether there are elements crossing 128-bit lanes in this
9445/// shuffle mask.
9446 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9447 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9448}
9449
9450/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9451/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9452/// better support 'repeated mask + lane permute' style shuffles.
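// Illustrative sketch (not in the original source): the v8i32 mask
// <4,5,6,7,0,1,2,3> is lane-crossing but not multi-lane (each 128-bit
// destination lane reads from a single source lane), so it suits a
// repeated-mask + lane-permute lowering, whereas <0,4,1,5,2,6,3,7> mixes two
// source lanes inside one destination lane.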
9453static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9454 unsigned ScalarSizeInBits,
9455 ArrayRef<int> Mask) {
9456 assert(LaneSizeInBits && ScalarSizeInBits &&
9457 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9458 "Illegal shuffle lane size");
9459 int NumElts = Mask.size();
9460 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9461 int NumLanes = NumElts / NumEltsPerLane;
9462 if (NumLanes > 1) {
9463 for (int i = 0; i != NumLanes; ++i) {
9464 int SrcLane = -1;
9465 for (int j = 0; j != NumEltsPerLane; ++j) {
9466 int M = Mask[(i * NumEltsPerLane) + j];
9467 if (M < 0)
9468 continue;
9469 int Lane = (M % NumElts) / NumEltsPerLane;
9470 if (SrcLane >= 0 && SrcLane != Lane)
9471 return true;
9472 SrcLane = Lane;
9473 }
9474 }
9475 }
9476 return false;
9477}
9478
9479/// Test whether a shuffle mask is equivalent within each sub-lane.
9480///
9481/// This checks a shuffle mask to see if it is performing the same
9482/// lane-relative shuffle in each sub-lane. This trivially implies
9483/// that it is also not lane-crossing. It may however involve a blend from the
9484/// same lane of a second vector.
9485///
9486/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9487/// non-trivial to compute in the face of undef lanes. The representation is
9488/// suitable for use with existing 128-bit shuffles as entries from the second
9489/// vector have been remapped to [LaneSize, 2*LaneSize).
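/// Illustrative sketch (not in the original source): for v8i32, the two-input
/// mask <0,9,2,11,4,13,6,15> repeats per 128-bit lane and yields
/// RepeatedMask = <0,5,2,7>, with the second-vector entries remapped to [4,8).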
9490static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9491 ArrayRef<int> Mask,
9492 SmallVectorImpl<int> &RepeatedMask) {
9493 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9494 RepeatedMask.assign(LaneSize, -1);
9495 int Size = Mask.size();
9496 for (int i = 0; i < Size; ++i) {
9497 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9498 if (Mask[i] < 0)
9499 continue;
9500 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9501 // This entry crosses lanes, so there is no way to model this shuffle.
9502 return false;
9503
9504 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9505 // Adjust second vector indices to start at LaneSize instead of Size.
9506 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9507 : Mask[i] % LaneSize + LaneSize;
9508 if (RepeatedMask[i % LaneSize] < 0)
9509 // This is the first non-undef entry in this slot of a 128-bit lane.
9510 RepeatedMask[i % LaneSize] = LocalM;
9511 else if (RepeatedMask[i % LaneSize] != LocalM)
9512 // Found a mismatch with the repeated mask.
9513 return false;
9514 }
9515 return true;
9516}
9517
9518/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9519static bool
9520 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9521 SmallVectorImpl<int> &RepeatedMask) {
9522 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9523}
9524
9525static bool
9526 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9527 SmallVector<int, 32> RepeatedMask;
9528 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9529}
9530
9531/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9532static bool
9533 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9534 SmallVectorImpl<int> &RepeatedMask) {
9535 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9536}
9537
9538/// Test whether a target shuffle mask is equivalent within each sub-lane.
9539/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9540static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9541 unsigned EltSizeInBits,
9542 ArrayRef<int> Mask,
9543 SmallVectorImpl<int> &RepeatedMask) {
9544 int LaneSize = LaneSizeInBits / EltSizeInBits;
9545 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9546 int Size = Mask.size();
9547 for (int i = 0; i < Size; ++i) {
9548 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9549 if (Mask[i] == SM_SentinelUndef)
9550 continue;
9551 if (Mask[i] == SM_SentinelZero) {
9552 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9553 return false;
9554 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9555 continue;
9556 }
9557 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9558 // This entry crosses lanes, so there is no way to model this shuffle.
9559 return false;
9560
9561 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9562 // later vector indices to start at multiples of LaneSize instead of Size.
9563 int LaneM = Mask[i] / Size;
9564 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9565 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9566 // This is the first non-undef entry in this slot of a 128-bit lane.
9567 RepeatedMask[i % LaneSize] = LocalM;
9568 else if (RepeatedMask[i % LaneSize] != LocalM)
9569 // Found a mismatch with the repeated mask.
9570 return false;
9571 }
9572 return true;
9573}
9574
9575/// Test whether a target shuffle mask is equivalent within each sub-lane.
9576/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9577static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9578 ArrayRef<int> Mask,
9579 SmallVectorImpl<int> &RepeatedMask) {
9580 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9581 Mask, RepeatedMask);
9582}
9583
9584/// Checks whether the vector elements referenced by two shuffle masks are
9585/// equivalent.
9586static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9587 int Idx, int ExpectedIdx) {
9588 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9589 ExpectedIdx < MaskSize && "Out of range element index");
9590 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9591 return false;
9592
9593 switch (Op.getOpcode()) {
9594 case ISD::BUILD_VECTOR:
9595 // If the values are build vectors, we can look through them to find
9596 // equivalent inputs that make the shuffles equivalent.
9597 // TODO: Handle MaskSize != Op.getNumOperands()?
9598 if (MaskSize == (int)Op.getNumOperands() &&
9599 MaskSize == (int)ExpectedOp.getNumOperands())
9600 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9601 break;
9602 case X86ISD::VBROADCAST:
9603 case X86ISD::VBROADCAST_LOAD:
9604 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9605 return (Op == ExpectedOp &&
9606 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9607 case X86ISD::HADD:
9608 case X86ISD::HSUB:
9609 case X86ISD::FHADD:
9610 case X86ISD::FHSUB:
9611 case X86ISD::PACKSS:
9612 case X86ISD::PACKUS:
9613 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9614 // TODO: Handle MaskSize != NumElts?
9615 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
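// e.g. (illustrative sketch) v4i32 HADD(X,X) = <x0+x1, x2+x3, x0+x1, x2+x3>,
// so result elements 0 and 2 (and likewise 1 and 3) are interchangeable for
// shuffle purposes.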
9616 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9617 MVT VT = Op.getSimpleValueType();
9618 int NumElts = VT.getVectorNumElements();
9619 if (MaskSize == NumElts) {
9620 int NumLanes = VT.getSizeInBits() / 128;
9621 int NumEltsPerLane = NumElts / NumLanes;
9622 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9623 bool SameLane =
9624 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9625 bool SameElt =
9626 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9627 return SameLane && SameElt;
9628 }
9629 }
9630 break;
9631 }
9632
9633 return false;
9634}
9635
9636/// Checks whether a shuffle mask is equivalent to an explicit list of
9637/// arguments.
9638///
9639/// This is a fast way to test a shuffle mask against a fixed pattern:
9640///
9641 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9642///
9643/// It returns true if the mask is exactly as wide as the argument list, and
9644/// each element of the mask is either -1 (signifying undef) or the value given
9645/// in the argument.
9646static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9647 SDValue V1 = SDValue(),
9648 SDValue V2 = SDValue()) {
9649 int Size = Mask.size();
9650 if (Size != (int)ExpectedMask.size())
9651 return false;
9652
9653 for (int i = 0; i < Size; ++i) {
9654 assert(Mask[i] >= -1 && "Out of bound mask element!");
9655 int MaskIdx = Mask[i];
9656 int ExpectedIdx = ExpectedMask[i];
9657 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9658 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9659 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9660 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9661 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9662 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9663 return false;
9664 }
9665 }
9666 return true;
9667}
9668
9669/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9670///
9671/// The masks must be exactly the same width.
9672///
9673/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9674/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9675///
9676/// SM_SentinelZero is accepted as a valid negative index but must match in
9677/// both, or via a known bits test.
9678 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9679 ArrayRef<int> ExpectedMask,
9680 const SelectionDAG &DAG,
9681 SDValue V1 = SDValue(),
9682 SDValue V2 = SDValue()) {
9683 int Size = Mask.size();
9684 if (Size != (int)ExpectedMask.size())
9685 return false;
9686 assert(llvm::all_of(ExpectedMask,
9687 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9688 "Illegal target shuffle mask");
9689
9690 // Check for out-of-range target shuffle mask indices.
9691 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9692 return false;
9693
9694 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9695 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9696 !V1.getValueType().isVector()))
9697 V1 = SDValue();
9698 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9699 !V2.getValueType().isVector()))
9700 V2 = SDValue();
9701
9702 APInt ZeroV1 = APInt::getZero(Size);
9703 APInt ZeroV2 = APInt::getZero(Size);
9704
9705 for (int i = 0; i < Size; ++i) {
9706 int MaskIdx = Mask[i];
9707 int ExpectedIdx = ExpectedMask[i];
9708 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9709 continue;
9710 if (MaskIdx == SM_SentinelZero) {
9711 // If we need this expected index to be a zero element, then update the
9712 // relevant zero mask and perform the known bits at the end to minimize
9713 // repeated computes.
9714 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9715 if (ExpectedV &&
9716 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9717 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9718 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9719 ZeroMask.setBit(BitIdx);
9720 continue;
9721 }
9722 }
9723 if (MaskIdx >= 0) {
9724 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9725 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9726 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9727 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9728 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9729 continue;
9730 }
9731 return false;
9732 }
9733 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9734 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9735}
9736
9737// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9738// instructions.
9739 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9740 const SelectionDAG &DAG) {
9741 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9742 return false;
9743
9744 SmallVector<int, 8> Unpcklwd;
9745 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9746 /* Unary = */ false);
9747 SmallVector<int, 8> Unpckhwd;
9748 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9749 /* Unary = */ false);
9750 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9751 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9752 return IsUnpackwdMask;
9753}
9754
9755 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9756 const SelectionDAG &DAG) {
9757 // Create 128-bit vector type based on mask size.
9758 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9759 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9760
9761 // We can't assume a canonical shuffle mask, so try the commuted version too.
9762 SmallVector<int, 4> CommutedMask(Mask);
9763 ShuffleVectorSDNode::commuteMask(CommutedMask);
9764
9765 // Match any of unary/binary or low/high.
9766 for (unsigned i = 0; i != 4; ++i) {
9767 SmallVector<int, 16> UnpackMask;
9768 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9769 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9770 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9771 return true;
9772 }
9773 return false;
9774}
9775
9776/// Return true if a shuffle mask chooses elements identically in its top and
9777/// bottom halves. For example, any splat mask has the same top and bottom
9778/// halves. If an element is undefined in only one half of the mask, the halves
9779/// are not considered identical.
9780 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9781 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9782 unsigned HalfSize = Mask.size() / 2;
9783 for (unsigned i = 0; i != HalfSize; ++i) {
9784 if (Mask[i] != Mask[i + HalfSize])
9785 return false;
9786 }
9787 return true;
9788}
9789
9790/// Get a 4-lane 8-bit shuffle immediate for a mask.
9791///
9792/// This helper function produces an 8-bit shuffle immediate corresponding to
9793/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9794/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9795/// example.
9796///
9797/// NB: We rely heavily on "undef" masks preserving the input lane.
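/// For example (illustrative): the reversal mask <3,2,1,0> encodes as
/// (0 << 6) | (1 << 4) | (2 << 2) | 3 == 0x1B, the familiar pshufd/shufps
/// "reverse" immediate.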
9798static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9799 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9800 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9801 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9802 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9803 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9804
9805 // If the mask only uses one non-undef element, then fully 'splat' it to
9806 // improve later broadcast matching.
9807 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9808 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9809
9810 int FirstElt = Mask[FirstIndex];
9811 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9812 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9813
9814 unsigned Imm = 0;
9815 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9816 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9817 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9818 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9819 return Imm;
9820}
9821
9822 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9823 SelectionDAG &DAG) {
9824 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9825}
9826
9827 // The shuffle result has the form 0*a[0] 0*a[1] ... 0*a[n], n >= 0, i.e.
9828 // each a[i] may be preceded by any number of zero elements, and the a[]
9829 // elements appear in ascending order.
9830 // Each element of Zeroable corresponds to a particular element of Mask, as
9831 // described in the computeZeroableShuffleElements function.
9832 // The function looks for a sub-mask whose nonzero elements are in
9833 // increasing order; if such a sub-mask exists, it returns true.
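// Illustrative sketch (not in the original source): with V2 an all-zeros
// vector, the v8i32 mask <0,8,1,9,2,10,3,11> has Zeroable set for the odd
// elements and its nonzero elements 0,1,2,3 in increasing order, so
// lowerShuffleToEXPAND below can emit a VEXPAND of V1 under the 0b01010101
// element mask.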
9834static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9835 ArrayRef<int> Mask, const EVT &VectorType,
9836 bool &IsZeroSideLeft) {
9837 int NextElement = -1;
9838 // Check if the Mask's nonzero elements are in increasing order.
9839 for (int i = 0, e = Mask.size(); i < e; i++) {
9840 // Checks if the mask's zeros elements are built from only zeros.
9841 assert(Mask[i] >= -1 && "Out of bound mask element!");
9842 if (Mask[i] < 0)
9843 return false;
9844 if (Zeroable[i])
9845 continue;
9846 // Find the lowest non zero element
9847 if (NextElement < 0) {
9848 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9849 IsZeroSideLeft = NextElement != 0;
9850 }
9851 // Exit if the mask's non zero elements are not in increasing order.
9852 if (NextElement != Mask[i])
9853 return false;
9854 NextElement++;
9855 }
9856 return true;
9857}
9858
9859/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
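/// Illustrative sketch (not in the original source): a v4i32 single-input
/// shuffle <1,u,2,z> (u = undef, z = zeroable) becomes a v16i8 PSHUFB with
/// byte mask <4,5,6,7, u,u,u,u, 8,9,10,11, 0x80,0x80,0x80,0x80>, where 0x80
/// zeroes the destination byte.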
9860 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9861 ArrayRef<int> Mask, SDValue V1,
9862 SDValue V2, const APInt &Zeroable,
9863 const X86Subtarget &Subtarget,
9864 SelectionDAG &DAG) {
9865 int Size = Mask.size();
9866 int LaneSize = 128 / VT.getScalarSizeInBits();
9867 const int NumBytes = VT.getSizeInBits() / 8;
9868 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9869
9870 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9871 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9872 (Subtarget.hasBWI() && VT.is512BitVector()));
9873
9874 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9875 // Sign bit set in i8 mask means zero element.
9876 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9877
9878 SDValue V;
9879 for (int i = 0; i < NumBytes; ++i) {
9880 int M = Mask[i / NumEltBytes];
9881 if (M < 0) {
9882 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9883 continue;
9884 }
9885 if (Zeroable[i / NumEltBytes]) {
9886 PSHUFBMask[i] = ZeroMask;
9887 continue;
9888 }
9889
9890 // We can only use a single input of V1 or V2.
9891 SDValue SrcV = (M >= Size ? V2 : V1);
9892 if (V && V != SrcV)
9893 return SDValue();
9894 V = SrcV;
9895 M %= Size;
9896
9897 // PSHUFB can't cross lanes, ensure this doesn't happen.
9898 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9899 return SDValue();
9900
9901 M = M % LaneSize;
9902 M = M * NumEltBytes + (i % NumEltBytes);
9903 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9904 }
9905 assert(V && "Failed to find a source input");
9906
9907 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9908 return DAG.getBitcast(
9909 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9910 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9911}
9912
9913static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9914 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9915 const SDLoc &dl);
9916
9917// X86 has dedicated shuffle that can be lowered to VEXPAND
9918 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9919 const APInt &Zeroable,
9920 ArrayRef<int> Mask, SDValue &V1,
9921 SDValue &V2, SelectionDAG &DAG,
9922 const X86Subtarget &Subtarget) {
9923 bool IsLeftZeroSide = true;
9924 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9925 IsLeftZeroSide))
9926 return SDValue();
9927 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9928 MVT IntegerType =
9929 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9930 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9931 unsigned NumElts = VT.getVectorNumElements();
9932 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9933 "Unexpected number of vector elements");
9934 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9935 Subtarget, DAG, DL);
9936 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9937 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9938 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9939}
9940
9941static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9942 unsigned &UnpackOpcode, bool IsUnary,
9943 ArrayRef<int> TargetMask, const SDLoc &DL,
9944 SelectionDAG &DAG,
9945 const X86Subtarget &Subtarget) {
9946 int NumElts = VT.getVectorNumElements();
9947
9948 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9949 for (int i = 0; i != NumElts; i += 2) {
9950 int M1 = TargetMask[i + 0];
9951 int M2 = TargetMask[i + 1];
9952 Undef1 &= (SM_SentinelUndef == M1);
9953 Undef2 &= (SM_SentinelUndef == M2);
9954 Zero1 &= isUndefOrZero(M1);
9955 Zero2 &= isUndefOrZero(M2);
9956 }
9957 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9958 "Zeroable shuffle detected");
9959
9960 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9961 SmallVector<int, 64> Unpckl, Unpckh;
9962 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9963 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9964 (IsUnary ? V1 : V2))) {
9965 UnpackOpcode = X86ISD::UNPCKL;
9966 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9967 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9968 return true;
9969 }
9970
9971 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9972 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9973 (IsUnary ? V1 : V2))) {
9974 UnpackOpcode = X86ISD::UNPCKH;
9975 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9976 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9977 return true;
9978 }
9979
9980 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9981 if (IsUnary && (Zero1 || Zero2)) {
9982 // Don't bother if we can blend instead.
9983 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9984 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9985 return false;
9986
9987 bool MatchLo = true, MatchHi = true;
9988 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9989 int M = TargetMask[i];
9990
9991 // Ignore if the input is known to be zero or the index is undef.
9992 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9993 (M == SM_SentinelUndef))
9994 continue;
9995
9996 MatchLo &= (M == Unpckl[i]);
9997 MatchHi &= (M == Unpckh[i]);
9998 }
9999
10000 if (MatchLo || MatchHi) {
10001 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10002 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10003 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10004 return true;
10005 }
10006 }
10007
10008 // If a binary shuffle, commute and try again.
10009 if (!IsUnary) {
10010 ShuffleVectorSDNode::commuteMask(Unpckl);
10011 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10012 UnpackOpcode = X86ISD::UNPCKL;
10013 std::swap(V1, V2);
10014 return true;
10015 }
10016
10017 ShuffleVectorSDNode::commuteMask(Unpckh);
10018 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10019 UnpackOpcode = X86ISD::UNPCKH;
10020 std::swap(V1, V2);
10021 return true;
10022 }
10023 }
10024
10025 return false;
10026}
10027
10028// X86 has dedicated unpack instructions that can handle specific blend
10029// operations: UNPCKH and UNPCKL.
10030 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10031 ArrayRef<int> Mask, SDValue V1, SDValue V2,
10032 SelectionDAG &DAG) {
10033 SmallVector<int, 8> Unpckl;
10034 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10035 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10036 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10037
10038 SmallVector<int, 8> Unpckh;
10039 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10040 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10041 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10042
10043 // Commute and try again.
10044 ShuffleVectorSDNode::commuteMask(Unpckl);
10045 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10046 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10047
10048 ShuffleVectorSDNode::commuteMask(Unpckh);
10049 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10050 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10051
10052 return SDValue();
10053}
10054
10055/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10056/// followed by unpack 256-bit.
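/// Illustrative sketch (not in the original source): for v8i32, the "splat2"
/// lo mask <0,0,1,1,2,2,3,3> crosses 128-bit lanes, but pre-permuting the
/// 64-bit chunks as <0,2,1,3> leaves lanes {0,1,4,5} and {2,3,6,7}, whose
/// in-lane UNPCKL with itself produces <0,0,1,1,2,2,3,3> as required.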
10057 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10058 ArrayRef<int> Mask, SDValue V1,
10059 SDValue V2, SelectionDAG &DAG) {
10060 SmallVector<int, 32> Unpckl, Unpckh;
10061 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10062 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10063
10064 unsigned UnpackOpcode;
10065 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10066 UnpackOpcode = X86ISD::UNPCKL;
10067 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10068 UnpackOpcode = X86ISD::UNPCKH;
10069 else
10070 return SDValue();
10071
10072 // This is a "natural" unpack operation (rather than the 128-bit sectored
10073 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10074 // input in order to use the x86 instruction.
10075 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10076 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10077 V1 = DAG.getBitcast(VT, V1);
10078 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10079}
10080
10081// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10082// source into the lower elements and zeroing the upper elements.
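// Illustrative sketch (not in the original source, assuming VLX and BWI are
// available): a v16i8 mask <0,2,4,6,8,10,12,14,z,z,z,z,z,z,z,z> with the upper
// half zeroable matches at Scale == 2, giving SrcVT = v8i16; since only 64
// bits of data remain, DstVT becomes the 128-bit v16i8 used with X86ISD::VTRUNC.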
10083static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10084 ArrayRef<int> Mask, const APInt &Zeroable,
10085 const X86Subtarget &Subtarget) {
10086 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10087 return false;
10088
10089 unsigned NumElts = Mask.size();
10090 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10091 unsigned MaxScale = 64 / EltSizeInBits;
10092
10093 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10094 unsigned SrcEltBits = EltSizeInBits * Scale;
10095 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10096 continue;
10097 unsigned NumSrcElts = NumElts / Scale;
10098 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10099 continue;
10100 unsigned UpperElts = NumElts - NumSrcElts;
10101 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10102 continue;
10103 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10104 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10105 DstVT = MVT::getIntegerVT(EltSizeInBits);
10106 if ((NumSrcElts * EltSizeInBits) >= 128) {
10107 // ISD::TRUNCATE
10108 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10109 } else {
10110 // X86ISD::VTRUNC
10111 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10112 }
10113 return true;
10114 }
10115
10116 return false;
10117}
10118
10119// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10120// element padding to the final DstVT.
10121static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10122 const X86Subtarget &Subtarget,
10123 SelectionDAG &DAG, bool ZeroUppers) {
10124 MVT SrcVT = Src.getSimpleValueType();
10125 MVT DstSVT = DstVT.getScalarType();
10126 unsigned NumDstElts = DstVT.getVectorNumElements();
10127 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10128 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10129
10130 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10131 return SDValue();
10132
10133 // Perform a direct ISD::TRUNCATE if possible.
10134 if (NumSrcElts == NumDstElts)
10135 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10136
10137 if (NumSrcElts > NumDstElts) {
10138 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10139 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10140 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10141 }
10142
10143 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10144 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10145 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10146 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10147 DstVT.getSizeInBits());
10148 }
10149
10150 // Non-VLX targets must truncate from a 512-bit type, so we need to
10151 // widen, truncate and then possibly extract the original subvector.
10152 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10153 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10154 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10155 }
10156
10157 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10158 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10159 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10160 if (DstVT != TruncVT)
10161 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10162 DstVT.getSizeInBits());
10163 return Trunc;
10164}
10165
10166// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10167//
10168// An example is the following:
10169//
10170// t0: ch = EntryToken
10171// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10172// t25: v4i32 = truncate t2
10173// t41: v8i16 = bitcast t25
10174// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10175// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10176// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10177// t18: v2i64 = bitcast t51
10178//
10179// One can just use a single vpmovdw instruction; without avx512vl we need to
10180// use the zmm variant and extract the lower subvector, padding with zeroes.
10181// TODO: Merge with lowerShuffleAsVTRUNC.
10182static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10183                                     SDValue V2, ArrayRef<int> Mask,
10184 const APInt &Zeroable,
10185 const X86Subtarget &Subtarget,
10186 SelectionDAG &DAG) {
10187 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10188 if (!Subtarget.hasAVX512())
10189 return SDValue();
10190
10191 unsigned NumElts = VT.getVectorNumElements();
10192 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10193 unsigned MaxScale = 64 / EltSizeInBits;
10194 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10195 unsigned SrcEltBits = EltSizeInBits * Scale;
10196 unsigned NumSrcElts = NumElts / Scale;
10197 unsigned UpperElts = NumElts - NumSrcElts;
10198 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10199 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10200 continue;
10201
10202    // Attempt to find a matching source truncation, but as a fallback VLX
10203    // cases can use the VPMOV directly.
10204 SDValue Src = peekThroughBitcasts(V1);
10205 if (Src.getOpcode() == ISD::TRUNCATE &&
10206 Src.getScalarValueSizeInBits() == SrcEltBits) {
10207 Src = Src.getOperand(0);
10208 } else if (Subtarget.hasVLX()) {
10209 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10210 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10211 Src = DAG.getBitcast(SrcVT, Src);
10212 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10213 if (Scale == 2 &&
10214 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10215 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10216 return SDValue();
10217 } else
10218 return SDValue();
10219
10220 // VPMOVWB is only available with avx512bw.
10221 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10222 return SDValue();
10223
10224 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10225 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10226 }
10227
10228 return SDValue();
10229}
10230
10231// Attempt to match binary shuffle patterns as a truncate.
10232static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10233                                    SDValue V2, ArrayRef<int> Mask,
10234 const APInt &Zeroable,
10235 const X86Subtarget &Subtarget,
10236 SelectionDAG &DAG) {
10237 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10238 "Unexpected VTRUNC type");
10239 if (!Subtarget.hasAVX512())
10240 return SDValue();
10241
10242 unsigned NumElts = VT.getVectorNumElements();
10243 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10244 unsigned MaxScale = 64 / EltSizeInBits;
10245 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10246 // TODO: Support non-BWI VPMOVWB truncations?
10247 unsigned SrcEltBits = EltSizeInBits * Scale;
10248 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10249 continue;
10250
10251 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10252 // Bail if the V2 elements are undef.
10253 unsigned NumHalfSrcElts = NumElts / Scale;
10254 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10255 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10256 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10257 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10258 continue;
10259
10260 // The elements beyond the truncation must be undef/zero.
10261 unsigned UpperElts = NumElts - NumSrcElts;
10262 if (UpperElts > 0 &&
10263 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10264 continue;
10265 bool UndefUppers =
10266 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10267
10268 // For offset truncations, ensure that the concat is cheap.
10269 if (Offset) {
10270 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10271 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10272 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10273 return Lo.getOperand(0) == Hi.getOperand(0);
10274 if (ISD::isNormalLoad(Lo.getNode()) &&
10275 ISD::isNormalLoad(Hi.getNode())) {
10276 auto *LDLo = cast<LoadSDNode>(Lo);
10277 auto *LDHi = cast<LoadSDNode>(Hi);
10278          return DAG.areNonVolatileConsecutiveLoads(
10279              LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10280 }
10281 return false;
10282 };
10283 if (!IsCheapConcat(V1, V2))
10284 continue;
10285 }
10286
10287      // As we're using both sources, we need to concat them together
10288 // and truncate from the double-sized src.
10289 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10290 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10291
10292 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10293 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10294 Src = DAG.getBitcast(SrcVT, Src);
10295
10296 // Shift the offset'd elements into place for the truncation.
10297 // TODO: Use getTargetVShiftByConstNode.
10298 if (Offset)
10299 Src = DAG.getNode(
10300 X86ISD::VSRLI, DL, SrcVT, Src,
10301 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10302
10303 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10304 }
10305 }
10306
10307 return SDValue();
10308}
10309
10310/// Check whether a compaction lowering can be done by dropping even/odd
10311/// elements and compute how many times even/odd elements must be dropped.
10312///
10313/// This handles shuffles which take every Nth element where N is a power of
10314/// two. Example shuffle masks:
10315///
10316/// (even)
10317/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10318/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10319/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10320/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10321/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10322/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10323///
10324/// (odd)
10325/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10326/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10327///
10328/// Any of these lanes can of course be undef.
10329///
10330/// This routine only supports N <= 3.
10331/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
10332/// for larger N.
10333///
10334/// \returns N above, or the number of times even/odd elements must be dropped
10335/// if there is such a number. Otherwise returns zero.
10336static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10337 bool IsSingleInput) {
10338 // The modulus for the shuffle vector entries is based on whether this is
10339 // a single input or not.
10340 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10341 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10342 "We should only be called with masks with a power-of-2 size!");
10343
10344 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10345 int Offset = MatchEven ? 0 : 1;
10346
10347 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10348 // and 2^3 simultaneously. This is because we may have ambiguity with
10349 // partially undef inputs.
10350 bool ViableForN[3] = {true, true, true};
10351
10352 for (int i = 0, e = Mask.size(); i < e; ++i) {
10353 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10354 // want.
10355 if (Mask[i] < 0)
10356 continue;
10357
10358 bool IsAnyViable = false;
10359 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10360 if (ViableForN[j]) {
10361 uint64_t N = j + 1;
10362
10363 // The shuffle mask must be equal to (i * 2^N) % M.
10364 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10365 IsAnyViable = true;
10366 else
10367 ViableForN[j] = false;
10368 }
10369 // Early exit if we exhaust the possible powers of two.
10370 if (!IsAnyViable)
10371 break;
10372 }
10373
10374 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10375 if (ViableForN[j])
10376 return j + 1;
10377
10378 // Return 0 as there is no viable power of two.
10379 return 0;
10380}
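// Illustrative sketch (not part of the original source): the stride test
// performed per candidate N above, written out for a single N on raw arrays.
// For a power-of-2 modulus M, (i << N) % M is the same as (i << N) & (M - 1),
// which is exactly the comparison used in the loop. The helper name is
// hypothetical.
static bool sketchMatchesDropStride(const int *Mask, int Size, int N,
                                    bool MatchEven, bool IsSingleInput) {
  int ShuffleModulus = Size * (IsSingleInput ? 1 : 2);
  int Offset = MatchEven ? 0 : 1;
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes may take any value.
    if (Mask[i] - Offset != ((i << N) & (ShuffleModulus - 1)))
      return false;
  }
  return true;
}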
10381
10382// X86 has dedicated pack instructions that can handle specific truncation
10383// operations: PACKSS and PACKUS.
10384// Checks for compaction shuffle masks if MaxStages > 1.
10385// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10386static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10387 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10388 const SelectionDAG &DAG,
10389 const X86Subtarget &Subtarget,
10390 unsigned MaxStages = 1) {
10391 unsigned NumElts = VT.getVectorNumElements();
10392 unsigned BitSize = VT.getScalarSizeInBits();
10393 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10394 "Illegal maximum compaction");
10395
10396 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10397 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10398 unsigned NumPackedBits = NumSrcBits - BitSize;
10399 N1 = peekThroughBitcasts(N1);
10400 N2 = peekThroughBitcasts(N2);
10401 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10402 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10403 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10404 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10405 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10406 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10407 return false;
10408 if (Subtarget.hasSSE41() || BitSize == 8) {
10409 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10410 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10411 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10412 V1 = N1;
10413 V2 = N2;
10414 SrcVT = PackVT;
10415 PackOpcode = X86ISD::PACKUS;
10416 return true;
10417 }
10418 }
10419 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10420 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10421 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10422 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10423 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10424 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10425 V1 = N1;
10426 V2 = N2;
10427 SrcVT = PackVT;
10428 PackOpcode = X86ISD::PACKSS;
10429 return true;
10430 }
10431 return false;
10432 };
10433
10434 // Attempt to match against wider and wider compaction patterns.
10435 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10436 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10437 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10438
10439 // Try binary shuffle.
10440 SmallVector<int, 32> BinaryMask;
10441 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10442 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10443 if (MatchPACK(V1, V2, PackVT))
10444 return true;
10445
10446 // Try unary shuffle.
10447 SmallVector<int, 32> UnaryMask;
10448 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10449 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10450 if (MatchPACK(V1, V1, PackVT))
10451 return true;
10452 }
10453
10454 return false;
10455}
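// Illustrative sketch (not part of the original source): the legality rule
// behind MatchPACK above, reduced to scalars. PACKUS/PACKSS saturate, so they
// only implement a plain truncation when saturation is a no-op: PACKUS needs
// the discarded high bits known zero, PACKSS needs more sign bits than are
// discarded. The enum/function names and the Known* parameters (standing in
// for the MaskedValueIsZero / ComputeNumSignBits queries) are hypothetical;
// the extra SSE4.1 restriction on 32->16 PACKUS is omitted.
enum SketchPackKind { SketchNoPack, SketchPackUS, SketchPackSS };
static SketchPackKind sketchChoosePack(unsigned SrcBits, unsigned DstBits,
                                       unsigned KnownLeadingZeros,
                                       unsigned KnownSignBits) {
  unsigned PackedBits = SrcBits - DstBits; // Bits discarded by the pack.
  if (KnownLeadingZeros >= PackedBits)
    return SketchPackUS;                   // Unsigned saturation cannot trigger.
  if (KnownSignBits > PackedBits)
    return SketchPackSS;                   // Signed saturation cannot trigger.
  return SketchNoPack;
}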
10456
10457static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10458                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
10459 const X86Subtarget &Subtarget) {
10460 MVT PackVT;
10461 unsigned PackOpcode;
10462 unsigned SizeBits = VT.getSizeInBits();
10463 unsigned EltBits = VT.getScalarSizeInBits();
10464 unsigned MaxStages = Log2_32(64 / EltBits);
10465 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10466 Subtarget, MaxStages))
10467 return SDValue();
10468
10469 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10470 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10471
10472 // Don't lower multi-stage packs on AVX512, truncation is better.
10473 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10474 return SDValue();
10475
10476 // Pack to the largest type possible:
10477 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10478 unsigned MaxPackBits = 16;
10479 if (CurrentEltBits > 16 &&
10480 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10481 MaxPackBits = 32;
10482
10483 // Repeatedly pack down to the target size.
10484 SDValue Res;
10485 for (unsigned i = 0; i != NumStages; ++i) {
10486 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10487 unsigned NumSrcElts = SizeBits / SrcEltBits;
10488 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10489 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10490 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10491 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10492 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10493 DAG.getBitcast(SrcVT, V2));
10494 V1 = V2 = Res;
10495 CurrentEltBits /= 2;
10496 }
10497 assert(Res && Res.getValueType() == VT &&
10498 "Failed to lower compaction shuffle");
10499 return Res;
10500}
10501
10502/// Try to emit a bitmask instruction for a shuffle.
10503///
10504/// This handles cases where we can model a blend exactly as a bitmask due to
10505/// one of the inputs being zeroable.
10506static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10507                                     SDValue V2, ArrayRef<int> Mask,
10508 const APInt &Zeroable,
10509 const X86Subtarget &Subtarget,
10510 SelectionDAG &DAG) {
10511 MVT MaskVT = VT;
10512 MVT EltVT = VT.getVectorElementType();
10513 SDValue Zero, AllOnes;
10514 // Use f64 if i64 isn't legal.
10515 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10516 EltVT = MVT::f64;
10517 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10518 }
10519
10520 MVT LogicVT = VT;
10521 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10522 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10523 APFloat AllOnesValue =
10524        APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10525    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10526 LogicVT =
10527 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10528 } else {
10529 Zero = DAG.getConstant(0, DL, EltVT);
10530 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10531 }
10532
10533 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10534 SDValue V;
10535 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10536 if (Zeroable[i])
10537 continue;
10538 if (Mask[i] % Size != i)
10539 return SDValue(); // Not a blend.
10540 if (!V)
10541 V = Mask[i] < Size ? V1 : V2;
10542 else if (V != (Mask[i] < Size ? V1 : V2))
10543 return SDValue(); // Can only let one input through the mask.
10544
10545 VMaskOps[i] = AllOnes;
10546 }
10547 if (!V)
10548 return SDValue(); // No non-zeroable elements!
10549
10550 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10551 VMask = DAG.getBitcast(LogicVT, VMask);
10552 V = DAG.getBitcast(LogicVT, V);
10553 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10554 return DAG.getBitcast(VT, And);
10555}
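// Illustrative sketch (not part of the original source): the mask-building
// loop above on raw arrays. Result lane i keeps its input (mask element -1,
// i.e. all-ones) only if the shuffle leaves it in place and all kept lanes
// come from the same input; every other lane is zeroed by the AND. The helper
// name is hypothetical and, as in the callers above, undef lanes are assumed
// to be marked zeroable.
static bool sketchBuildBitMask(const int *Mask, const bool *Zeroable, int Size,
                               int *MaskOut, int &WhichInput) {
  WhichInput = -1;
  for (int i = 0; i != Size; ++i) {
    MaskOut[i] = 0;            // Default: this lane becomes zero.
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return false;            // Element moves position: not a plain blend.
    int Input = Mask[i] < Size ? 0 : 1;
    if (WhichInput >= 0 && WhichInput != Input)
      return false;            // Mixes both inputs: a single AND can't do it.
    WhichInput = Input;
    MaskOut[i] = -1;           // Keep this lane (all-ones mask element).
  }
  return WhichInput >= 0;
}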
10556
10557/// Try to emit a blend instruction for a shuffle using bit math.
10558///
10559/// This is used as a fallback approach when first class blend instructions are
10560/// unavailable. Currently it is only suitable for integer vectors, but could
10561/// be generalized for floating point vectors if desirable.
10562static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10563                                      SDValue V2, ArrayRef<int> Mask,
10564 SelectionDAG &DAG) {
10565 assert(VT.isInteger() && "Only supports integer vector types!");
10566 MVT EltVT = VT.getVectorElementType();
10567 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10568 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10569  SmallVector<SDValue, 16> MaskOps;
10570  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10571 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10572 return SDValue(); // Shuffled input!
10573 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10574 }
10575
10576 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10577 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10578}
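// Illustrative sketch (not part of the original source): per element, the bit
// blend built above reduces to a classic bit-select, taking V1's bits where
// the mask is all-ones and V2's bits elsewhere. The helper name is
// hypothetical.
static unsigned long long sketchBitSelect(unsigned long long MaskBits,
                                          unsigned long long V1Bits,
                                          unsigned long long V2Bits) {
  return (V1Bits & MaskBits) | (V2Bits & ~MaskBits);
}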
10579
10580static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10581                                    SDValue PreservedSrc,
10582 const X86Subtarget &Subtarget,
10583 SelectionDAG &DAG);
10584
10585static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10586                                MutableArrayRef<int> Mask,
10587                                const APInt &Zeroable, bool &ForceV1Zero,
10588 bool &ForceV2Zero, uint64_t &BlendMask) {
10589 bool V1IsZeroOrUndef =
10590      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10591  bool V2IsZeroOrUndef =
10592 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10593
10594 BlendMask = 0;
10595 ForceV1Zero = false, ForceV2Zero = false;
10596 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10597
10598 int NumElts = Mask.size();
10599 int NumLanes = VT.getSizeInBits() / 128;
10600 int NumEltsPerLane = NumElts / NumLanes;
10601 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10602
10603 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10604 // then ensure the blend mask part for that lane just references that input.
10605 bool ForceWholeLaneMasks =
10606 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10607
10608 // Attempt to generate the binary blend mask. If an input is zero then
10609 // we can use any lane.
10610 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10611 // Keep track of the inputs used per lane.
10612 bool LaneV1InUse = false;
10613 bool LaneV2InUse = false;
10614 uint64_t LaneBlendMask = 0;
10615 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10616 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10617 int M = Mask[Elt];
10618 if (M == SM_SentinelUndef)
10619 continue;
10620 if (M == Elt || (0 <= M && M < NumElts &&
10621 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10622 Mask[Elt] = Elt;
10623 LaneV1InUse = true;
10624 continue;
10625 }
10626 if (M == (Elt + NumElts) ||
10627 (NumElts <= M &&
10628 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10629 LaneBlendMask |= 1ull << LaneElt;
10630 Mask[Elt] = Elt + NumElts;
10631 LaneV2InUse = true;
10632 continue;
10633 }
10634 if (Zeroable[Elt]) {
10635 if (V1IsZeroOrUndef) {
10636 ForceV1Zero = true;
10637 Mask[Elt] = Elt;
10638 LaneV1InUse = true;
10639 continue;
10640 }
10641 if (V2IsZeroOrUndef) {
10642 ForceV2Zero = true;
10643 LaneBlendMask |= 1ull << LaneElt;
10644 Mask[Elt] = Elt + NumElts;
10645 LaneV2InUse = true;
10646 continue;
10647 }
10648 }
10649 return false;
10650 }
10651
10652 // If we only used V2 then splat the lane blend mask to avoid any demanded
10653 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10654 // blend mask bit).
10655 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10656 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10657
10658 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10659 }
10660 return true;
10661}
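// Illustrative sketch (not part of the original source): a simplified version
// of the matching above. Bit i of the immediate is set when result lane i is
// taken from V2; the match fails as soon as an element would have to change
// lanes, which a BLEND immediate cannot express. The per-lane widening,
// element-equivalence and forced-zero-input handling of the real code are
// omitted, and the helper name is hypothetical.
static bool sketchBuildBlendImmediate(const int *Mask, const bool *Zeroable,
                                      int NumElts,
                                      unsigned long long &BlendMask) {
  BlendMask = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || Zeroable[i])
      continue;                  // Undef/zeroable lanes can go either way.
    if (M == i)
      continue;                  // From V1: bit stays clear.
    if (M == i + NumElts) {
      BlendMask |= 1ull << i;    // From V2: set the bit.
      continue;
    }
    return false;                // Element changes lanes: not a blend.
  }
  return true;
}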
10662
10663/// Try to emit a blend instruction for a shuffle.
10664///
10665/// This doesn't do any checks for the availability of instructions for blending
10666/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10667/// be matched in the backend with the type given. What it does check for is
10668/// that the shuffle mask is a blend, or convertible into a blend with zero.
10669static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10670                                   SDValue V2, ArrayRef<int> Original,
10671 const APInt &Zeroable,
10672 const X86Subtarget &Subtarget,
10673 SelectionDAG &DAG) {
10674 uint64_t BlendMask = 0;
10675 bool ForceV1Zero = false, ForceV2Zero = false;
10676 SmallVector<int, 64> Mask(Original);
10677 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10678 BlendMask))
10679 return SDValue();
10680
10681 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10682 if (ForceV1Zero)
10683 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10684 if (ForceV2Zero)
10685 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10686
10687 unsigned NumElts = VT.getVectorNumElements();
10688
10689 switch (VT.SimpleTy) {
10690 case MVT::v4i64:
10691 case MVT::v8i32:
10692 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10693 [[fallthrough]];
10694 case MVT::v4f64:
10695 case MVT::v8f32:
10696 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10697 [[fallthrough]];
10698 case MVT::v2f64:
10699 case MVT::v2i64:
10700 case MVT::v4f32:
10701 case MVT::v4i32:
10702 case MVT::v8i16:
10703 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10704 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10705 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10706 case MVT::v16i16: {
10707 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10708 SmallVector<int, 8> RepeatedMask;
10709 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10710 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10711 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10712 BlendMask = 0;
10713 for (int i = 0; i < 8; ++i)
10714 if (RepeatedMask[i] >= 8)
10715 BlendMask |= 1ull << i;
10716 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10717 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10718 }
10719 // Use PBLENDW for lower/upper lanes and then blend lanes.
10720 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10721 // merge to VSELECT where useful.
10722 uint64_t LoMask = BlendMask & 0xFF;
10723 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10724 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10725 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10726 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10727 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10728 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10729 return DAG.getVectorShuffle(
10730 MVT::v16i16, DL, Lo, Hi,
10731 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10732 }
10733 [[fallthrough]];
10734 }
10735 case MVT::v32i8:
10736 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10737 [[fallthrough]];
10738 case MVT::v16i8: {
10739 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10740
10741 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10742 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10743 Subtarget, DAG))
10744 return Masked;
10745
10746 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10747 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10748 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10749 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10750 }
10751
10752 // If we have VPTERNLOG, we can use that as a bit blend.
10753 if (Subtarget.hasVLX())
10754 if (SDValue BitBlend =
10755 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10756 return BitBlend;
10757
10758 // Scale the blend by the number of bytes per element.
10759 int Scale = VT.getScalarSizeInBits() / 8;
10760
10761 // This form of blend is always done on bytes. Compute the byte vector
10762 // type.
10763 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10764
10765 // x86 allows load folding with blendvb from the 2nd source operand. But
10766 // we are still using LLVM select here (see comment below), so that's V1.
10767 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10768 // allow that load-folding possibility.
10769 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10770      ShuffleVectorSDNode::commuteMask(Mask);
10771      std::swap(V1, V2);
10772 }
10773
10774 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10775 // mix of LLVM's code generator and the x86 backend. We tell the code
10776 // generator that boolean values in the elements of an x86 vector register
10777 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10778 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10779 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10780 // of the element (the remaining are ignored) and 0 in that high bit would
10781 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10782 // the LLVM model for boolean values in vector elements gets the relevant
10783 // bit set, it is set backwards and over constrained relative to x86's
10784 // actual model.
10785 SmallVector<SDValue, 32> VSELECTMask;
10786 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10787 for (int j = 0; j < Scale; ++j)
10788 VSELECTMask.push_back(
10789 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10790 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10791 MVT::i8));
10792
10793 V1 = DAG.getBitcast(BlendVT, V1);
10794 V2 = DAG.getBitcast(BlendVT, V2);
10795 return DAG.getBitcast(
10796 VT,
10797 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10798 V1, V2));
10799 }
10800 case MVT::v16f32:
10801 case MVT::v8f64:
10802 case MVT::v8i64:
10803 case MVT::v16i32:
10804 case MVT::v32i16:
10805 case MVT::v64i8: {
10806 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10807 bool OptForSize = DAG.shouldOptForSize();
10808 if (!OptForSize) {
10809 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10810 Subtarget, DAG))
10811 return Masked;
10812 }
10813
10814 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10815 // masked move.
10816 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10817 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10818 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10819 }
10820 default:
10821 llvm_unreachable("Not a supported integer vector type!");
10822 }
10823}
10824
10825/// Try to lower as a blend of elements from two inputs followed by
10826/// a single-input permutation.
10827///
10828/// This matches the pattern where we can blend elements from two inputs and
10829/// then reduce the shuffle to a single-input permutation.
10830static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10831                                             SDValue V1, SDValue V2,
10832 ArrayRef<int> Mask,
10833 SelectionDAG &DAG,
10834 bool ImmBlends = false) {
10835 // We build up the blend mask while checking whether a blend is a viable way
10836 // to reduce the shuffle.
10837 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10838 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10839
10840 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10841 if (Mask[i] < 0)
10842 continue;
10843
10844 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10845
10846 if (BlendMask[Mask[i] % Size] < 0)
10847 BlendMask[Mask[i] % Size] = Mask[i];
10848 else if (BlendMask[Mask[i] % Size] != Mask[i])
10849 return SDValue(); // Can't blend in the needed input!
10850
10851 PermuteMask[i] = Mask[i] % Size;
10852 }
10853
10854 // If only immediate blends, then bail if the blend mask can't be widened to
10855 // i16.
10856 unsigned EltSize = VT.getScalarSizeInBits();
10857 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10858 return SDValue();
10859
10860 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10861 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10862}
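// Illustrative sketch (not part of the original source): the decomposition
// performed above on raw arrays. The blend keeps element Mask[i] in slot
// Mask[i] % Size, and the follow-up single-input permute then moves it to
// slot i; the match fails if both inputs claim the same blend slot. The
// helper name and the -1 "unset" convention are hypothetical.
static bool sketchSplitBlendPermute(const int *Mask, int Size,
                                    int *BlendMask, int *PermuteMask) {
  for (int i = 0; i != Size; ++i)
    BlendMask[i] = PermuteMask[i] = -1;
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;   // Where the blend keeps this element.
    if (BlendMask[Slot] >= 0 && BlendMask[Slot] != Mask[i])
      return false;              // Slot already claimed by the other input.
    BlendMask[Slot] = Mask[i];
    PermuteMask[i] = Slot;       // The permute moves it to its final lane.
  }
  return true;
}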
10863
10864/// Try to lower as an unpack of elements from two inputs followed by
10865/// a single-input permutation.
10866///
10867/// This matches the pattern where we can unpack elements from two inputs and
10868/// then reduce the shuffle to a single-input (wider) permutation.
10869static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10870                                             SDValue V1, SDValue V2,
10871 ArrayRef<int> Mask,
10872 SelectionDAG &DAG) {
10873 int NumElts = Mask.size();
10874 int NumLanes = VT.getSizeInBits() / 128;
10875 int NumLaneElts = NumElts / NumLanes;
10876 int NumHalfLaneElts = NumLaneElts / 2;
10877
10878 bool MatchLo = true, MatchHi = true;
10879 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10880
10881 // Determine UNPCKL/UNPCKH type and operand order.
10882 for (int Elt = 0; Elt != NumElts; ++Elt) {
10883 int M = Mask[Elt];
10884 if (M < 0)
10885 continue;
10886
10887 // Normalize the mask value depending on whether it's V1 or V2.
10888 int NormM = M;
10889 SDValue &Op = Ops[Elt & 1];
10890 if (M < NumElts && (Op.isUndef() || Op == V1))
10891 Op = V1;
10892 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10893 Op = V2;
10894 NormM -= NumElts;
10895 } else
10896 return SDValue();
10897
10898 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10899 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10900 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10901 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10902 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10903 if (MatchLoAnyLane || MatchHiAnyLane) {
10904 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10905 "Failed to match UNPCKLO/UNPCKHI");
10906 break;
10907 }
10908 }
10909 MatchLo &= MatchLoAnyLane;
10910 MatchHi &= MatchHiAnyLane;
10911 if (!MatchLo && !MatchHi)
10912 return SDValue();
10913 }
10914 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10915
10916 // Element indices have changed after unpacking. Calculate permute mask
10917 // so that they will be put back to the position as dictated by the
10918 // original shuffle mask indices.
10919 SmallVector<int, 32> PermuteMask(NumElts, -1);
10920 for (int Elt = 0; Elt != NumElts; ++Elt) {
10921 int M = Mask[Elt];
10922 if (M < 0)
10923 continue;
10924 int NormM = M;
10925 if (NumElts <= M)
10926 NormM -= NumElts;
10927 bool IsFirstOp = M < NumElts;
10928 int BaseMaskElt =
10929 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10930 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10931 PermuteMask[Elt] = BaseMaskElt;
10932 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10933 PermuteMask[Elt] = BaseMaskElt + 1;
10934 assert(PermuteMask[Elt] != -1 &&
10935 "Input mask element is defined but failed to assign permute mask");
10936 }
10937
10938 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10939 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10940 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10941}
10942
10943/// Try to lower a shuffle as a permute of the inputs followed by an
10944/// UNPCK instruction.
10945///
10946/// This specifically targets cases where we end up with alternating between
10947/// the two inputs, and so can permute them into something that feeds a single
10948/// UNPCK instruction. Note that this routine only targets integer vectors
10949/// because for floating point vectors we have a generalized SHUFPS lowering
10950/// strategy that handles everything that doesn't *exactly* match an unpack,
10951/// making this clever lowering unnecessary.
10952static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10953                                              SDValue V1, SDValue V2,
10954 ArrayRef<int> Mask,
10955 const X86Subtarget &Subtarget,
10956 SelectionDAG &DAG) {
10957 int Size = Mask.size();
10958 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10959
10960 // This routine only supports 128-bit integer dual input vectors.
10961 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10962 return SDValue();
10963
10964 int NumLoInputs =
10965 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10966 int NumHiInputs =
10967 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10968
10969 bool UnpackLo = NumLoInputs >= NumHiInputs;
10970
10971 auto TryUnpack = [&](int ScalarSize, int Scale) {
10972 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10973 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10974
10975 for (int i = 0; i < Size; ++i) {
10976 if (Mask[i] < 0)
10977 continue;
10978
10979 // Each element of the unpack contains Scale elements from this mask.
10980 int UnpackIdx = i / Scale;
10981
10982 // We only handle the case where V1 feeds the first slots of the unpack.
10983 // We rely on canonicalization to ensure this is the case.
10984 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10985 return SDValue();
10986
10987 // Setup the mask for this input. The indexing is tricky as we have to
10988 // handle the unpack stride.
10989 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10990 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10991 Mask[i] % Size;
10992 }
10993
10994 // If we will have to shuffle both inputs to use the unpack, check whether
10995 // we can just unpack first and shuffle the result. If so, skip this unpack.
10996 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10997 !isNoopShuffleMask(V2Mask))
10998 return SDValue();
10999
11000 // Shuffle the inputs into place.
11001 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11002 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11003
11004 // Cast the inputs to the type we will use to unpack them.
11005 MVT UnpackVT =
11006 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11007 V1 = DAG.getBitcast(UnpackVT, V1);
11008 V2 = DAG.getBitcast(UnpackVT, V2);
11009
11010 // Unpack the inputs and cast the result back to the desired type.
11011 return DAG.getBitcast(
11012 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11013 UnpackVT, V1, V2));
11014 };
11015
11016 // We try each unpack from the largest to the smallest to try and find one
11017 // that fits this mask.
11018 int OrigScalarSize = VT.getScalarSizeInBits();
11019 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11020 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11021 return Unpack;
11022
11023 // If we're shuffling with a zero vector then we're better off not doing
11024 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11025  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11026      ISD::isBuildVectorAllZeros(V2.getNode()))
11027 return SDValue();
11028
11029 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11030 // initial unpack.
11031 if (NumLoInputs == 0 || NumHiInputs == 0) {
11032 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11033 "We have to have *some* inputs!");
11034 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11035
11036 // FIXME: We could consider the total complexity of the permute of each
11037 // possible unpacking. Or at the least we should consider how many
11038 // half-crossings are created.
11039 // FIXME: We could consider commuting the unpacks.
11040
11041 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11042 for (int i = 0; i < Size; ++i) {
11043 if (Mask[i] < 0)
11044 continue;
11045
11046 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11047
11048 PermMask[i] =
11049 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11050 }
11051 return DAG.getVectorShuffle(
11052 VT, DL,
11053 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11054 V1, V2),
11055 DAG.getUNDEF(VT), PermMask);
11056 }
11057
11058 return SDValue();
11059}
11060
11061/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11062/// permuting the elements of the result in place.
11063static SDValue lowerShuffleAsByteRotateAndPermute(
11064    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11065 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11066 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11067 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11068 (VT.is512BitVector() && !Subtarget.hasBWI()))
11069 return SDValue();
11070
11071 // We don't currently support lane crossing permutes.
11072 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11073 return SDValue();
11074
11075 int Scale = VT.getScalarSizeInBits() / 8;
11076 int NumLanes = VT.getSizeInBits() / 128;
11077 int NumElts = VT.getVectorNumElements();
11078 int NumEltsPerLane = NumElts / NumLanes;
11079
11080 // Determine range of mask elts.
11081 bool Blend1 = true;
11082 bool Blend2 = true;
11083 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11084 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11085 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11086 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11087 int M = Mask[Lane + Elt];
11088 if (M < 0)
11089 continue;
11090 if (M < NumElts) {
11091 Blend1 &= (M == (Lane + Elt));
11092 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11093 M = M % NumEltsPerLane;
11094 Range1.first = std::min(Range1.first, M);
11095 Range1.second = std::max(Range1.second, M);
11096 } else {
11097 M -= NumElts;
11098 Blend2 &= (M == (Lane + Elt));
11099 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11100 M = M % NumEltsPerLane;
11101 Range2.first = std::min(Range2.first, M);
11102 Range2.second = std::max(Range2.second, M);
11103 }
11104 }
11105 }
11106
11107 // Bail if we don't need both elements.
11108 // TODO - it might be worth doing this for unary shuffles if the permute
11109 // can be widened.
11110 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11111 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11112 return SDValue();
11113
11114 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11115 return SDValue();
11116
11117 // Rotate the 2 ops so we can access both ranges, then permute the result.
11118 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11119 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11120 SDValue Rotate = DAG.getBitcast(
11121 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11122 DAG.getBitcast(ByteVT, Lo),
11123 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11124 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11125 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11126 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11127 int M = Mask[Lane + Elt];
11128 if (M < 0)
11129 continue;
11130 if (M < NumElts)
11131 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11132 else
11133 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11134 }
11135 }
11136 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11137 };
11138
11139 // Check if the ranges are small enough to rotate from either direction.
11140 if (Range2.second < Range1.first)
11141 return RotateAndPermute(V1, V2, Range1.first, 0);
11142 if (Range1.second < Range2.first)
11143 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11144 return SDValue();
11145}
11146
11147static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11148  return isUndefOrEqual(Mask, 0);
11149}
11150
11151static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11152  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11153}
11154
11155/// Check if the Mask consists of the same element repeated multiple times.
11156static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11157  size_t NumUndefs = 0;
11158 std::optional<int> UniqueElt;
11159 for (int Elt : Mask) {
11160 if (Elt == SM_SentinelUndef) {
11161 NumUndefs++;
11162 continue;
11163 }
11164 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11165 return false;
11166 UniqueElt = Elt;
11167 }
11168 // Make sure the element is repeated enough times by checking the number of
11169 // undefs is small.
11170 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11171}
11172
11173/// Generic routine to decompose a shuffle and blend into independent
11174/// blends and permutes.
11175///
11176/// This matches the extremely common pattern for handling combined
11177/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11178/// operations. It will try to pick the best arrangement of shuffles and
11179/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11180static SDValue lowerShuffleAsDecomposedShuffleMerge(
11181    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11182 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11183 int NumElts = Mask.size();
11184 int NumLanes = VT.getSizeInBits() / 128;
11185 int NumEltsPerLane = NumElts / NumLanes;
11186
11187 // Shuffle the input elements into the desired positions in V1 and V2 and
11188 // unpack/blend them together.
11189 bool IsAlternating = true;
11190 SmallVector<int, 32> V1Mask(NumElts, -1);
11191 SmallVector<int, 32> V2Mask(NumElts, -1);
11192 SmallVector<int, 32> FinalMask(NumElts, -1);
11193 for (int i = 0; i < NumElts; ++i) {
11194 int M = Mask[i];
11195 if (M >= 0 && M < NumElts) {
11196 V1Mask[i] = M;
11197 FinalMask[i] = i;
11198 IsAlternating &= (i & 1) == 0;
11199 } else if (M >= NumElts) {
11200 V2Mask[i] = M - NumElts;
11201 FinalMask[i] = i + NumElts;
11202 IsAlternating &= (i & 1) == 1;
11203 }
11204 }
11205
11206  // If we effectively demand only the 0'th element of \p Input (though not
11207  // necessarily as the 0'th element of the result), then broadcast said input,
11208  // and change \p InputMask to be a no-op (identity) mask.
11209 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11210 &DAG](SDValue &Input,
11211 MutableArrayRef<int> InputMask) {
11212 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11213 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11214 !X86::mayFoldLoad(Input, Subtarget)))
11215 return;
11216 if (isNoopShuffleMask(InputMask))
11217 return;
11218 assert(isBroadcastShuffleMask(InputMask) &&
11219 "Expected to demand only the 0'th element.");
11220 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11221 for (auto I : enumerate(InputMask)) {
11222 int &InputMaskElt = I.value();
11223 if (InputMaskElt >= 0)
11224 InputMaskElt = I.index();
11225 }
11226 };
11227
11228 // Currently, we may need to produce one shuffle per input, and blend results.
11229 // It is possible that the shuffle for one of the inputs is already a no-op.
11230 // See if we can simplify non-no-op shuffles into broadcasts,
11231 // which we consider to be strictly better than an arbitrary shuffle.
11232 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11233      isNoopOrBroadcastShuffleMask(V2Mask)) {
11234    canonicalizeBroadcastableInput(V1, V1Mask);
11235 canonicalizeBroadcastableInput(V2, V2Mask);
11236 }
11237
11238 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11239 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11240 // the shuffle may be able to fold with a load or other benefit. However, when
11241 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11242 // pre-shuffle first is a better strategy.
11243 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11244 // Only prefer immediate blends to unpack/rotate.
11245 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11246 DAG, true))
11247 return BlendPerm;
11248 // If either input vector provides only a single element which is repeated
11249 // multiple times, unpacking from both input vectors would generate worse
11250 // code. e.g. for
11251 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11252 // it is better to process t4 first to create a vector of t4[0], then unpack
11253 // that vector with t2.
11254 if (!isSingleElementRepeatedMask(V1Mask) &&
11255        !isSingleElementRepeatedMask(V2Mask))
11256      if (SDValue UnpackPerm =
11257 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11258 return UnpackPerm;
11259    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11260            DL, VT, V1, V2, Mask, Subtarget, DAG))
11261 return RotatePerm;
11262 // Unpack/rotate failed - try again with variable blends.
11263 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11264 DAG))
11265 return BlendPerm;
11266 if (VT.getScalarSizeInBits() >= 32)
11267 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11268 DL, VT, V1, V2, Mask, Subtarget, DAG))
11269 return PermUnpack;
11270 }
11271
11272 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11273 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11274 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11275 // than half the elements coming from each source.
11276 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11277 V1Mask.assign(NumElts, -1);
11278 V2Mask.assign(NumElts, -1);
11279 FinalMask.assign(NumElts, -1);
11280 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11281 for (int j = 0; j != NumEltsPerLane; ++j) {
11282 int M = Mask[i + j];
11283 if (M >= 0 && M < NumElts) {
11284 V1Mask[i + (j / 2)] = M;
11285 FinalMask[i + j] = i + (j / 2);
11286 } else if (M >= NumElts) {
11287 V2Mask[i + (j / 2)] = M - NumElts;
11288 FinalMask[i + j] = i + (j / 2) + NumElts;
11289 }
11290 }
11291 }
11292
11293 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11294 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11295 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11296}
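// Illustrative sketch (not part of the original source): the basic
// decomposition computed at the top of the function above. Each input is
// pre-shuffled so that its contribution already sits in the right lane, and
// the final mask is then a pure per-lane select between the two pre-shuffled
// vectors. The broadcast canonicalization and the alternating vXi8/vXi16
// unpack refinement are omitted; the helper name is hypothetical.
static void sketchDecomposeShuffle(const int *Mask, int NumElts,
                                   int *V1Mask, int *V2Mask, int *FinalMask) {
  for (int i = 0; i != NumElts; ++i) {
    V1Mask[i] = V2Mask[i] = FinalMask[i] = -1;
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;               // Bring V1's element into lane i.
      FinalMask[i] = i;            // Then keep lane i from shuffled V1.
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;     // Bring V2's element into lane i.
      FinalMask[i] = i + NumElts;  // Then take lane i from shuffled V2.
    }
  }
}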
11297
11298static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11299 const X86Subtarget &Subtarget,
11300 ArrayRef<int> Mask) {
11301 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11302 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11303
11304 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11305 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11306 int MaxSubElts = 64 / EltSizeInBits;
11307 unsigned RotateAmt, NumSubElts;
11308 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11309 MaxSubElts, NumSubElts, RotateAmt))
11310 return -1;
11311 unsigned NumElts = Mask.size();
11312 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11313 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11314 return RotateAmt;
11315}
11316
11317/// Lower shuffle using X86ISD::VROTLI rotations.
11318static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11319                                       ArrayRef<int> Mask,
11320 const X86Subtarget &Subtarget,
11321 SelectionDAG &DAG) {
11322 // Only XOP + AVX512 targets have bit rotation instructions.
11323 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11324 bool IsLegal =
11325 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11326 if (!IsLegal && Subtarget.hasSSE3())
11327 return SDValue();
11328
11329 MVT RotateVT;
11330 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11331 Subtarget, Mask);
11332 if (RotateAmt < 0)
11333 return SDValue();
11334
11335 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11336 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11337  // widen to vXi16 or more then the existing lowering will be better.
11338 if (!IsLegal) {
11339 if ((RotateAmt % 16) == 0)
11340 return SDValue();
11341 // TODO: Use getTargetVShiftByConstNode.
11342 unsigned ShlAmt = RotateAmt;
11343 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11344 V1 = DAG.getBitcast(RotateVT, V1);
11345 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11346 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11347 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11348 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11349 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11350 return DAG.getBitcast(VT, Rot);
11351 }
11352
11353 SDValue Rot =
11354 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11355 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11356 return DAG.getBitcast(VT, Rot);
11357}
11358
11359/// Try to match a vector shuffle as an element rotation.
11360///
11361/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11362static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11363                                       ArrayRef<int> Mask) {
11364 int NumElts = Mask.size();
11365
11366 // We need to detect various ways of spelling a rotation:
11367 // [11, 12, 13, 14, 15, 0, 1, 2]
11368 // [-1, 12, 13, 14, -1, -1, 1, -1]
11369 // [-1, -1, -1, -1, -1, -1, 1, 2]
11370 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11371 // [-1, 4, 5, 6, -1, -1, 9, -1]
11372 // [-1, 4, 5, 6, -1, -1, -1, -1]
11373 int Rotation = 0;
11374 SDValue Lo, Hi;
11375 for (int i = 0; i < NumElts; ++i) {
11376 int M = Mask[i];
11377 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11378 "Unexpected mask index.");
11379 if (M < 0)
11380 continue;
11381
11382 // Determine where a rotated vector would have started.
11383 int StartIdx = i - (M % NumElts);
11384 if (StartIdx == 0)
11385 // The identity rotation isn't interesting, stop.
11386 return -1;
11387
11388 // If we found the tail of a vector the rotation must be the missing
11389 // front. If we found the head of a vector, it must be how much of the
11390 // head.
11391 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11392
11393 if (Rotation == 0)
11394 Rotation = CandidateRotation;
11395 else if (Rotation != CandidateRotation)
11396 // The rotations don't match, so we can't match this mask.
11397 return -1;
11398
11399 // Compute which value this mask is pointing at.
11400 SDValue MaskV = M < NumElts ? V1 : V2;
11401
11402 // Compute which of the two target values this index should be assigned
11403 // to. This reflects whether the high elements are remaining or the low
11404 // elements are remaining.
11405 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11406
11407 // Either set up this value if we've not encountered it before, or check
11408 // that it remains consistent.
11409 if (!TargetV)
11410 TargetV = MaskV;
11411 else if (TargetV != MaskV)
11412 // This may be a rotation, but it pulls from the inputs in some
11413 // unsupported interleaving.
11414 return -1;
11415 }
11416
11417 // Check that we successfully analyzed the mask, and normalize the results.
11418 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11419 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11420 if (!Lo)
11421 Lo = Hi;
11422 else if (!Hi)
11423 Hi = Lo;
11424
11425 V1 = Lo;
11426 V2 = Hi;
11427
11428 return Rotation;
11429}
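// Illustrative sketch (not part of the original source): the rotation
// detection above with the input-tracking stripped out. For NumElts == 8 the
// mask [11,12,13,14,15,0,1,2] yields 3: rotate the two-vector concatenation
// by three lanes and keep the low half. This variant returns 0 for the
// identity and -1 on inconsistent elements; the helper name is hypothetical.
static int sketchMatchRotation(const int *Mask, int NumElts) {
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts); // Where a rotated vector began.
    if (StartIdx == 0)
      return 0;                             // Identity rotation.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                            // Elements disagree on the amount.
  }
  return Rotation;
}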
11430
11431/// Try to lower a vector shuffle as a byte rotation.
11432///
11433/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11434/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11435/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11436/// try to generically lower a vector shuffle through such a pattern. It
11437/// does not check for the profitability of lowering either as PALIGNR or
11438/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11439/// This matches shuffle vectors that look like:
11440///
11441/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11442///
11443/// Essentially it concatenates V1 and V2, shifts right by some number of
11444/// elements, and takes the low elements as the result. Note that while this is
11445/// specified as a *right shift* because x86 is little-endian, it is a *left
11446/// rotate* of the vector lanes.
11447static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11448                                    ArrayRef<int> Mask) {
11449 // Don't accept any shuffles with zero elements.
11450 if (isAnyZero(Mask))
11451 return -1;
11452
11453 // PALIGNR works on 128-bit lanes.
11454 SmallVector<int, 16> RepeatedMask;
11455 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11456 return -1;
11457
11458 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11459 if (Rotation <= 0)
11460 return -1;
11461
11462 // PALIGNR rotates bytes, so we need to scale the
11463 // rotation based on how many bytes are in the vector lane.
11464 int NumElts = RepeatedMask.size();
11465 int Scale = 16 / NumElts;
11466 return Rotation * Scale;
11467}
11468
11469static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11470                                        SDValue V2, ArrayRef<int> Mask,
11471 const X86Subtarget &Subtarget,
11472 SelectionDAG &DAG) {
11473 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11474
11475 SDValue Lo = V1, Hi = V2;
11476 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11477 if (ByteRotation <= 0)
11478 return SDValue();
11479
11480 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11481 // PSLLDQ/PSRLDQ.
11482 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11483 Lo = DAG.getBitcast(ByteVT, Lo);
11484 Hi = DAG.getBitcast(ByteVT, Hi);
11485
11486 // SSSE3 targets can use the palignr instruction.
11487 if (Subtarget.hasSSSE3()) {
11488 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11489 "512-bit PALIGNR requires BWI instructions");
11490 return DAG.getBitcast(
11491 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11492 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11493 }
11494
11495 assert(VT.is128BitVector() &&
11496 "Rotate-based lowering only supports 128-bit lowering!");
11497 assert(Mask.size() <= 16 &&
11498 "Can shuffle at most 16 bytes in a 128-bit vector!");
11499 assert(ByteVT == MVT::v16i8 &&
11500 "SSE2 rotate lowering only needed for v16i8!");
11501
11502 // Default SSE2 implementation
11503 int LoByteShift = 16 - ByteRotation;
11504 int HiByteShift = ByteRotation;
11505
11506 SDValue LoShift =
11507 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11508 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11509 SDValue HiShift =
11510 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11511 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11512 return DAG.getBitcast(VT,
11513 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11514}
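// Illustrative sketch (not part of the original source): a byte-level model
// of the SSE2 fallback above. Treating Hi as bytes 0..15 and Lo as bytes
// 16..31 of one 32-byte sequence, the result is bytes ByteRotation through
// ByteRotation+15 of that sequence, which is what OR'ing (Lo shifted left by
// 16-ByteRotation bytes) with (Hi shifted right by ByteRotation bytes)
// produces. The helper name is hypothetical.
static void sketchByteRotate(const unsigned char Lo[16],
                             const unsigned char Hi[16], int ByteRotation,
                             unsigned char Out[16]) {
  for (int i = 0; i != 16; ++i) {
    int Src = i + ByteRotation;              // Index into the Hi,Lo sequence.
    Out[i] = Src < 16 ? Hi[Src] : Lo[Src - 16];
  }
}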
11515
11516/// Try to lower a vector shuffle as a dword/qword rotation.
11517///
11518/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11519/// rotation of the concatenation of two vectors; This routine will
11520/// try to generically lower a vector shuffle through such a pattern.
11521///
11522/// Essentially it concatenates V1 and V2, shifts right by some number of
11523/// elements, and takes the low elements as the result. Note that while this is
11524/// specified as a *right shift* because x86 is little-endian, it is a *left
11525/// rotate* of the vector lanes.
11526static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11527                                    SDValue V2, ArrayRef<int> Mask,
11528 const APInt &Zeroable,
11529 const X86Subtarget &Subtarget,
11530 SelectionDAG &DAG) {
11531 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11532 "Only 32-bit and 64-bit elements are supported!");
11533
11534 // 128/256-bit vectors are only supported with VLX.
11535 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11536 && "VLX required for 128/256-bit vectors");
11537
11538 SDValue Lo = V1, Hi = V2;
11539 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11540 if (0 < Rotation)
11541 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11542 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11543
11544 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11545 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11546 // TODO: We can probably make this more aggressive and use shift-pairs like
11547 // lowerShuffleAsByteShiftMask.
11548 unsigned NumElts = Mask.size();
11549 unsigned ZeroLo = Zeroable.countr_one();
11550 unsigned ZeroHi = Zeroable.countl_one();
11551 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11552 if (!ZeroLo && !ZeroHi)
11553 return SDValue();
11554
11555 if (ZeroLo) {
11556 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11557 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11558 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11559 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11560 getZeroVector(VT, Subtarget, DAG, DL),
11561 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11562 }
11563
11564 if (ZeroHi) {
11565 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11566 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11567 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11568 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11569 getZeroVector(VT, Subtarget, DAG, DL), Src,
11570 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11571 }
11572
11573 return SDValue();
11574}
11575
11576/// Try to lower a vector shuffle as a byte shift sequence.
11577static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11578                                           SDValue V2, ArrayRef<int> Mask,
11579 const APInt &Zeroable,
11580 const X86Subtarget &Subtarget,
11581 SelectionDAG &DAG) {
11582 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11583 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11584
11585 // We need a shuffle that has zeros at one/both ends and a sequential
11586 // shuffle from one source within.
11587 unsigned ZeroLo = Zeroable.countr_one();
11588 unsigned ZeroHi = Zeroable.countl_one();
11589 if (!ZeroLo && !ZeroHi)
11590 return SDValue();
11591
11592 unsigned NumElts = Mask.size();
11593 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11594 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11595 return SDValue();
11596
11597 unsigned Scale = VT.getScalarSizeInBits() / 8;
11598 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11599 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11600 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11601 return SDValue();
11602
11603 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11604 Res = DAG.getBitcast(MVT::v16i8, Res);
11605
11606 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11607 // inner sequential set of elements, possibly offset:
11608 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11609 // 01234567 --> 4567zzzz --> zzzzz456
11610 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11611 if (ZeroLo == 0) {
11612 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11613 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11614 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11615 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11616 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11617 } else if (ZeroHi == 0) {
11618 unsigned Shift = Mask[ZeroLo] % NumElts;
11619 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11620 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11621 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11622 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11623 } else if (!Subtarget.hasSSSE3()) {
11624 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11625 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11626 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11627 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11628 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11629 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11630 Shift += Mask[ZeroLo] % NumElts;
11631 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11632 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11633 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11634 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11635 } else
11636 return SDValue();
11637
11638 return DAG.getBitcast(VT, Res);
11639}
11640
11641/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11642///
11643/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11644/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11645/// matches elements from one of the input vectors shuffled to the left or
11646/// right with zeroable elements 'shifted in'. It handles both the strictly
11647/// bit-wise element shifts and the byte shift across an entire 128-bit double
11648/// quad word lane.
11649///
11650/// PSHL : (little-endian) left bit shift.
11651/// [ zz, 0, zz, 2 ]
11652/// [ -1, 4, zz, -1 ]
11653/// PSRL : (little-endian) right bit shift.
11654/// [ 1, zz, 3, zz]
11655/// [ -1, -1, 7, zz]
11656/// PSLLDQ : (little-endian) left byte shift
11657/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11658/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11659/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11660/// PSRLDQ : (little-endian) right byte shift
11661/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11662/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11663/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11664static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11665 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11666 int MaskOffset, const APInt &Zeroable,
11667 const X86Subtarget &Subtarget) {
11668 int Size = Mask.size();
11669 unsigned SizeInBits = Size * ScalarSizeInBits;
11670
11671 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11672 for (int i = 0; i < Size; i += Scale)
11673 for (int j = 0; j < Shift; ++j)
11674 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11675 return false;
11676
11677 return true;
11678 };
11679
11680 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11681 for (int i = 0; i != Size; i += Scale) {
11682 unsigned Pos = Left ? i + Shift : i;
11683 unsigned Low = Left ? i : i + Shift;
11684 unsigned Len = Scale - Shift;
11685 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11686 return -1;
11687 }
11688
11689 int ShiftEltBits = ScalarSizeInBits * Scale;
11690 bool ByteShift = ShiftEltBits > 64;
11691 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11692 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11693 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11694
11695 // Normalize the scale for byte shifts to still produce an i64 element
11696 // type.
11697 Scale = ByteShift ? Scale / 2 : Scale;
11698
11699 // We need to round trip through the appropriate type for the shift.
11700 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11701 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11702 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11703 return (int)ShiftAmt;
11704 };
11705
11706 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11707 // keep doubling the size of the integer elements up to that. We can
11708 // then shift the elements of the integer vector by whole multiples of
11709 // their width within the elements of the larger integer vector. Test each
11710 // multiple to see if we can find a match with the moved element indices
11711 // and that the shifted in elements are all zeroable.
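// Illustrative example: a v8i16 mask <z,0,z,2,z,4,z,6> (z = zeroable) matches
// with Scale = 2, Shift = 1, Left = true, yielding X86ISD::VSHLI on v4i32 with
// a 16-bit shift amount (i.e. PSLLD $16). For byte shifts (Scale * EltBits > 64)
// the returned amount is in bytes rather than bits.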
11712 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11713 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11714 for (int Shift = 1; Shift != Scale; ++Shift)
11715 for (bool Left : {true, false})
11716 if (CheckZeros(Shift, Scale, Left)) {
11717 int ShiftAmt = MatchShift(Shift, Scale, Left);
11718 if (0 < ShiftAmt)
11719 return ShiftAmt;
11720 }
11721
11722 // no match
11723 return -1;
11724}
11725
11726static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11727 SDValue V2, ArrayRef<int> Mask,
11728 const APInt &Zeroable,
11729 const X86Subtarget &Subtarget,
11730 SelectionDAG &DAG, bool BitwiseOnly) {
11731 int Size = Mask.size();
11732 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11733
11734 MVT ShiftVT;
11735 SDValue V = V1;
11736 unsigned Opcode;
11737
11738 // Try to match shuffle against V1 shift.
11739 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11740 Mask, 0, Zeroable, Subtarget);
11741
11742 // If V1 failed, try to match shuffle against V2 shift.
11743 if (ShiftAmt < 0) {
11744 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11745 Mask, Size, Zeroable, Subtarget);
11746 V = V2;
11747 }
11748
11749 if (ShiftAmt < 0)
11750 return SDValue();
11751
11752 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11753 return SDValue();
11754
11755 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11756 "Illegal integer vector type");
11757 V = DAG.getBitcast(ShiftVT, V);
11758 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11759 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11760 return DAG.getBitcast(VT, V);
11761}
11762
11763// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11764// Remainder of lower half result is zero and upper half is all undef.
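// Illustrative example: for v8i16 with mask <2,3,z,z,u,u,u,u> (z = zeroable,
// u = undef), the match succeeds with Len = 2 and Idx = 2, i.e. BitLen = 32
// and BitIdx = 32 for the EXTRQI node built by the caller.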
11765static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11766 ArrayRef<int> Mask, uint64_t &BitLen,
11767 uint64_t &BitIdx, const APInt &Zeroable) {
11768 int Size = Mask.size();
11769 int HalfSize = Size / 2;
11770 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11771 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11772
11773 // Upper half must be undefined.
11774 if (!isUndefUpperHalf(Mask))
11775 return false;
11776
11777 // Determine the extraction length from the part of the
11778 // lower half that isn't zeroable.
11779 int Len = HalfSize;
11780 for (; Len > 0; --Len)
11781 if (!Zeroable[Len - 1])
11782 break;
11783 assert(Len > 0 && "Zeroable shuffle mask");
11784
11785 // Attempt to match first Len sequential elements from the lower half.
11786 SDValue Src;
11787 int Idx = -1;
11788 for (int i = 0; i != Len; ++i) {
11789 int M = Mask[i];
11790 if (M == SM_SentinelUndef)
11791 continue;
11792 SDValue &V = (M < Size ? V1 : V2);
11793 M = M % Size;
11794
11795 // The extracted elements must start at a valid index and all mask
11796 // elements must be in the lower half.
11797 if (i > M || M >= HalfSize)
11798 return false;
11799
11800 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11801 Src = V;
11802 Idx = M - i;
11803 continue;
11804 }
11805 return false;
11806 }
11807
11808 if (!Src || Idx < 0)
11809 return false;
11810
11811 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11812 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11813 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11814 V1 = Src;
11815 return true;
11816}
11817
11818// INSERTQ: Extract lowest Len elements from lower half of second source and
11819// insert over first source, starting at Idx.
11820// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
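// Illustrative example: for v8i16 with mask <0,8,9,3,u,u,u,u>, the match
// succeeds with Idx = 1 and Len = 2 (insert B[0..1] over A starting at lane 1),
// i.e. BitLen = 32 and BitIdx = 16.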
11821static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11822 ArrayRef<int> Mask, uint64_t &BitLen,
11823 uint64_t &BitIdx) {
11824 int Size = Mask.size();
11825 int HalfSize = Size / 2;
11826 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11827
11828 // Upper half must be undefined.
11829 if (!isUndefUpperHalf(Mask))
11830 return false;
11831
11832 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11833 SDValue Base;
11834
11835 // Attempt to match first source from mask before insertion point.
11836 if (isUndefInRange(Mask, 0, Idx)) {
11837 /* EMPTY */
11838 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11839 Base = V1;
11840 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11841 Base = V2;
11842 } else {
11843 continue;
11844 }
11845
11846 // Extend the extraction length looking to match both the insertion of
11847 // the second source and the remaining elements of the first.
11848 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11849 SDValue Insert;
11850 int Len = Hi - Idx;
11851
11852 // Match insertion.
11853 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11854 Insert = V1;
11855 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11856 Insert = V2;
11857 } else {
11858 continue;
11859 }
11860
11861 // Match the remaining elements of the lower half.
11862 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11863 /* EMPTY */
11864 } else if ((!Base || (Base == V1)) &&
11865 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11866 Base = V1;
11867 } else if ((!Base || (Base == V2)) &&
11868 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11869 Size + Hi)) {
11870 Base = V2;
11871 } else {
11872 continue;
11873 }
11874
11875 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11876 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11877 V1 = Base;
11878 V2 = Insert;
11879 return true;
11880 }
11881 }
11882
11883 return false;
11884}
11885
11886/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11887static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11888 SDValue V2, ArrayRef<int> Mask,
11889 const APInt &Zeroable, SelectionDAG &DAG) {
11890 uint64_t BitLen, BitIdx;
11891 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11892 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11893 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11894 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11895
11896 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11897 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11898 V2 ? V2 : DAG.getUNDEF(VT),
11899 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11900 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11901
11902 return SDValue();
11903}
11904
11905/// Lower a vector shuffle as a zero or any extension.
11906///
11907/// Given a specific number of elements, element bit width, and extension
11908/// stride, produce either a zero or any extension based on the available
11909/// features of the subtarget. The extended elements are consecutive and
11910/// can start from an offset element index in the input; to avoid
11911/// excess shuffling, the offset must either be in the bottom lane or at
11912/// the start of a higher lane. All extended elements must come from the
11913/// same lane.
11914static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11915 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11916 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11917 assert(Scale > 1 && "Need a scale to extend.");
11918 int EltBits = VT.getScalarSizeInBits();
11919 int NumElements = VT.getVectorNumElements();
11920 int NumEltsPerLane = 128 / EltBits;
11921 int OffsetLane = Offset / NumEltsPerLane;
11922 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11923 "Only 8, 16, and 32 bit elements can be extended.");
11924 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11925 assert(0 <= Offset && "Extension offset must be positive.");
11926 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11927 "Extension offset must be in the first lane or start an upper lane.");
11928
11929 // Check that an index is in same lane as the base offset.
11930 auto SafeOffset = [&](int Idx) {
11931 return OffsetLane == (Idx / NumEltsPerLane);
11932 };
11933
11934 // Shift along an input so that the offset base moves to the first element.
11935 auto ShuffleOffset = [&](SDValue V) {
11936 if (!Offset)
11937 return V;
11938
11939 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11940 for (int i = 0; i * Scale < NumElements; ++i) {
11941 int SrcIdx = i + Offset;
11942 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11943 }
11944 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11945 };
11946
11947 // Found a valid a/zext mask! Try various lowering strategies based on the
11948 // input type and available ISA extensions.
11949 if (Subtarget.hasSSE41()) {
11950 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11951 // PUNPCK will catch this in a later shuffle match.
11952 if (Offset && Scale == 2 && VT.is128BitVector())
11953 return SDValue();
11954 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11955 NumElements / Scale);
11956 InputV = DAG.getBitcast(VT, InputV);
11957 InputV = ShuffleOffset(InputV);
11958 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11959 DL, ExtVT, InputV, DAG);
11960 return DAG.getBitcast(VT, InputV);
11961 }
11962
11963 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11964 InputV = DAG.getBitcast(VT, InputV);
11965
11966 // For any extends we can cheat for larger element sizes and use shuffle
11967 // instructions that can fold with a load and/or copy.
11968 if (AnyExt && EltBits == 32) {
11969 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11970 -1};
11971 return DAG.getBitcast(
11972 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11973 DAG.getBitcast(MVT::v4i32, InputV),
11974 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11975 }
11976 if (AnyExt && EltBits == 16 && Scale > 2) {
11977 int PSHUFDMask[4] = {Offset / 2, -1,
11978 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11979 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11980 DAG.getBitcast(MVT::v4i32, InputV),
11981 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11982 int PSHUFWMask[4] = {1, -1, -1, -1};
11983 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11984 return DAG.getBitcast(
11985 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11986 DAG.getBitcast(MVT::v8i16, InputV),
11987 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11988 }
11989
11990 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11991 // to 64-bits.
11992 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11993 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11994 assert(VT.is128BitVector() && "Unexpected vector width!");
11995
11996 int LoIdx = Offset * EltBits;
11997 SDValue Lo = DAG.getBitcast(
11998 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11999 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12000 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12001
12002 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12003 return DAG.getBitcast(VT, Lo);
12004
12005 int HiIdx = (Offset + 1) * EltBits;
12006 SDValue Hi = DAG.getBitcast(
12007 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12008 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12009 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12010 return DAG.getBitcast(VT,
12011 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12012 }
12013
12014 // If this would require more than 2 unpack instructions to expand, use
12015 // pshufb when available. We can only use more than 2 unpack instructions
12016 // when zero extending i8 elements which also makes it easier to use pshufb.
12017 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12018 assert(NumElements == 16 && "Unexpected byte vector width!");
12019 SDValue PSHUFBMask[16];
12020 for (int i = 0; i < 16; ++i) {
12021 int Idx = Offset + (i / Scale);
12022 if ((i % Scale == 0 && SafeOffset(Idx))) {
12023 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12024 continue;
12025 }
12026 PSHUFBMask[i] =
12027 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12028 }
12029 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12030 return DAG.getBitcast(
12031 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12032 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12033 }
12034
12035 // If we are extending from an offset, ensure we start on a boundary that
12036 // we can unpack from.
12037 int AlignToUnpack = Offset % (NumElements / Scale);
12038 if (AlignToUnpack) {
12039 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12040 for (int i = AlignToUnpack; i < NumElements; ++i)
12041 ShMask[i - AlignToUnpack] = i;
12042 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12043 Offset -= AlignToUnpack;
12044 }
12045
12046 // Otherwise emit a sequence of unpacks.
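// Illustrative example: zero-extending the low four i8 elements of a v16i8 to
// i32 (Scale == 4) takes two rounds here: an i8 unpack against zero followed by
// an i16 unpack against zero (PUNPCKLBW + PUNPCKLWD with a zero register, or
// with UNDEF for an any-extend).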
12047 do {
12048 unsigned UnpackLoHi = X86ISD::UNPCKL;
12049 if (Offset >= (NumElements / 2)) {
12050 UnpackLoHi = X86ISD::UNPCKH;
12051 Offset -= (NumElements / 2);
12052 }
12053
12054 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12055 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12056 : getZeroVector(InputVT, Subtarget, DAG, DL);
12057 InputV = DAG.getBitcast(InputVT, InputV);
12058 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12059 Scale /= 2;
12060 EltBits *= 2;
12061 NumElements /= 2;
12062 } while (Scale > 1);
12063 return DAG.getBitcast(VT, InputV);
12064}
12065
12066/// Try to lower a vector shuffle as a zero extension on any microarch.
12067///
12068/// This routine will try to do everything in its power to cleverly lower
12069/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12070/// check for the profitability of this lowering; it tries to aggressively
12071/// match this pattern. It will use all of the micro-architectural details it
12072/// can to emit an efficient lowering. It handles both blends with all-zero
12073/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12074/// masking out later).
12075///
12076/// The reason we have dedicated lowering for zext-style shuffles is that they
12077/// are both incredibly common and often quite performance sensitive.
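/// Illustrative example: a v4i32 shuffle with mask <0,z,1,z> (z = zeroable) is
/// recognized here as a zero extension of the two low i32 elements of V1 to
/// i64, which on SSE4.1 typically becomes a single PMOVZXDQ.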
12078static SDValue lowerShuffleAsZeroOrAnyExtend(
12079 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12080 const APInt &Zeroable, const X86Subtarget &Subtarget,
12081 SelectionDAG &DAG) {
12082 int Bits = VT.getSizeInBits();
12083 int NumLanes = Bits / 128;
12084 int NumElements = VT.getVectorNumElements();
12085 int NumEltsPerLane = NumElements / NumLanes;
12086 assert(VT.getScalarSizeInBits() <= 32 &&
12087 "Exceeds 32-bit integer zero extension limit");
12088 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12089
12090 // Define a helper function to check a particular ext-scale and lower to it if
12091 // valid.
12092 auto Lower = [&](int Scale) -> SDValue {
12093 SDValue InputV;
12094 bool AnyExt = true;
12095 int Offset = 0;
12096 int Matches = 0;
12097 for (int i = 0; i < NumElements; ++i) {
12098 int M = Mask[i];
12099 if (M < 0)
12100 continue; // Valid anywhere but doesn't tell us anything.
12101 if (i % Scale != 0) {
12102 // Each of the extended elements needs to be zeroable.
12103 if (!Zeroable[i])
12104 return SDValue();
12105
12106 // We no longer are in the anyext case.
12107 AnyExt = false;
12108 continue;
12109 }
12110
12111 // Each of the base elements needs to be consecutive indices into the
12112 // same input vector.
12113 SDValue V = M < NumElements ? V1 : V2;
12114 M = M % NumElements;
12115 if (!InputV) {
12116 InputV = V;
12117 Offset = M - (i / Scale);
12118 } else if (InputV != V)
12119 return SDValue(); // Flip-flopping inputs.
12120
12121 // Offset must start in the lowest 128-bit lane or at the start of an
12122 // upper lane.
12123 // FIXME: Is it ever worth allowing a negative base offset?
12124 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12125 (Offset % NumEltsPerLane) == 0))
12126 return SDValue();
12127
12128 // If we are offsetting, all referenced entries must come from the same
12129 // lane.
12130 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12131 return SDValue();
12132
12133 if ((M % NumElements) != (Offset + (i / Scale)))
12134 return SDValue(); // Non-consecutive strided elements.
12135 Matches++;
12136 }
12137
12138 // If we fail to find an input, we have a zero-shuffle which should always
12139 // have already been handled.
12140 // FIXME: Maybe handle this here in case during blending we end up with one?
12141 if (!InputV)
12142 return SDValue();
12143
12144 // If we are offsetting, don't extend if we only match a single input; we
12145 // can always do better by using a basic PSHUF or PUNPCK.
12146 if (Offset != 0 && Matches < 2)
12147 return SDValue();
12148
12149 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12150 InputV, Mask, Subtarget, DAG);
12151 };
12152
12153 // The widest scale possible for extending is to a 64-bit integer.
12154 assert(Bits % 64 == 0 &&
12155 "The number of bits in a vector must be divisible by 64 on x86!");
12156 int NumExtElements = Bits / 64;
12157
12158 // Each iteration, try extending the elements half as much, but into twice as
12159 // many elements.
12160 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12161 assert(NumElements % NumExtElements == 0 &&
12162 "The input vector size must be divisible by the extended size.");
12163 if (SDValue V = Lower(NumElements / NumExtElements))
12164 return V;
12165 }
12166
12167 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12168 if (Bits != 128)
12169 return SDValue();
12170
12171 // Returns one of the source operands if the shuffle can be reduced to a
12172 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12173 auto CanZExtLowHalf = [&]() {
12174 for (int i = NumElements / 2; i != NumElements; ++i)
12175 if (!Zeroable[i])
12176 return SDValue();
12177 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12178 return V1;
12179 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12180 return V2;
12181 return SDValue();
12182 };
12183
12184 if (SDValue V = CanZExtLowHalf()) {
12185 V = DAG.getBitcast(MVT::v2i64, V);
12186 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12187 return DAG.getBitcast(VT, V);
12188 }
12189
12190 // No viable ext lowering found.
12191 return SDValue();
12192}
12193
12194/// Try to get a scalar value for a specific element of a vector.
12195///
12196/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12197static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12198 SelectionDAG &DAG) {
12199 MVT VT = V.getSimpleValueType();
12200 MVT EltVT = VT.getVectorElementType();
12201 V = peekThroughBitcasts(V);
12202
12203 // If the bitcasts change the element size, we can't extract an equivalent
12204 // element from them.
12205 MVT NewVT = V.getSimpleValueType();
12206 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12207 return SDValue();
12208
12209 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12210 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12211 // Ensure the scalar operand is the same size as the destination.
12212 // FIXME: Add support for scalar truncation where possible.
12213 SDValue S = V.getOperand(Idx);
12214 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12215 return DAG.getBitcast(EltVT, S);
12216 }
12217
12218 return SDValue();
12219}
12220
12221/// Helper to test for a load that can be folded with x86 shuffles.
12222///
12223/// This is particularly important because the set of instructions varies
12224/// significantly based on whether the operand is a load or not.
12225static bool isShuffleFoldableLoad(SDValue V) {
12226 return V->hasOneUse() &&
12227 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12228}
12229
12230template<typename T>
12231static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12232 T EltVT = VT.getScalarType();
12233 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12234}
12235
12236/// Try to lower insertion of a single element into a zero vector.
12237///
12238/// This is a common pattern for which we have especially efficient lowering
12239/// patterns across all subtarget feature sets.
12240static SDValue lowerShuffleAsElementInsertion(
12241 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12242 const APInt &Zeroable, const X86Subtarget &Subtarget,
12243 SelectionDAG &DAG) {
12244 MVT ExtVT = VT;
12245 MVT EltVT = VT.getVectorElementType();
12246 unsigned NumElts = VT.getVectorNumElements();
12247 unsigned EltBits = VT.getScalarSizeInBits();
12248
12249 if (isSoftF16(EltVT, Subtarget))
12250 return SDValue();
12251
12252 int V2Index =
12253 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12254 Mask.begin();
12255 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12256 bool IsV1Zeroable = true;
12257 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12258 if (i != V2Index && !Zeroable[i]) {
12259 IsV1Zeroable = false;
12260 break;
12261 }
12262
12263 // Bail if a non-zero V1 isn't used in place.
12264 if (!IsV1Zeroable) {
12265 SmallVector<int, 8> V1Mask(Mask);
12266 V1Mask[V2Index] = -1;
12267 if (!isNoopShuffleMask(V1Mask))
12268 return SDValue();
12269 }
12270
12271 // Check for a single input from a SCALAR_TO_VECTOR node.
12272 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12273 // all the smarts here sunk into that routine. However, the current
12274 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12275 // vector shuffle lowering is dead.
12276 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12277 DAG);
12278 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12279 // We need to zext the scalar if it is smaller than an i32.
12280 V2S = DAG.getBitcast(EltVT, V2S);
12281 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12282 // Using zext to expand a narrow element won't work for non-zero
12283 // insertions. But we can use a masked constant vector if we're
12284 // inserting V2 into the bottom of V1.
12285 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12286 return SDValue();
12287
12288 // Zero-extend directly to i32.
12289 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12290 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12291
12292 // If we're inserting into a constant, mask off the inserted index
12293 // and OR with the zero-extended scalar.
12294 if (!IsV1Zeroable) {
12295 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12296 Bits[V2Index] = APInt::getZero(EltBits);
12297 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12298 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12299 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12300 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12301 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12302 }
12303 }
12304 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12305 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12306 EltVT == MVT::i16) {
12307 // Either not inserting from the low element of the input or the input
12308 // element size is too small to use VZEXT_MOVL to clear the high bits.
12309 return SDValue();
12310 }
12311
12312 if (!IsV1Zeroable) {
12313 // If V1 can't be treated as a zero vector we have fewer options to lower
12314 // this. We can't support integer vectors or non-zero targets cheaply.
12315 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12316 if (!VT.isFloatingPoint() || V2Index != 0)
12317 return SDValue();
12318 if (!VT.is128BitVector())
12319 return SDValue();
12320
12321 // Otherwise, use MOVSD, MOVSS or MOVSH.
12322 unsigned MovOpc = 0;
12323 if (EltVT == MVT::f16)
12324 MovOpc = X86ISD::MOVSH;
12325 else if (EltVT == MVT::f32)
12326 MovOpc = X86ISD::MOVSS;
12327 else if (EltVT == MVT::f64)
12328 MovOpc = X86ISD::MOVSD;
12329 else
12330 llvm_unreachable("Unsupported floating point element type to handle!");
12331 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12332 }
12333
12334 // This lowering only works for the low element with floating point vectors.
12335 if (VT.isFloatingPoint() && V2Index != 0)
12336 return SDValue();
12337
12338 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12339 if (ExtVT != VT)
12340 V2 = DAG.getBitcast(VT, V2);
12341
12342 if (V2Index != 0) {
12343 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12344 // the desired position. Otherwise it is more efficient to do a vector
12345 // shift left. We know that we can do a vector shift left because all
12346 // the inputs are zero.
12347 if (VT.isFloatingPoint() || NumElts <= 4) {
12348 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12349 V2Shuffle[V2Index] = 0;
12350 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12351 } else {
12352 V2 = DAG.getBitcast(MVT::v16i8, V2);
12353 V2 = DAG.getNode(
12354 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12355 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12356 V2 = DAG.getBitcast(VT, V2);
12357 }
12358 }
12359 return V2;
12360}
12361
12362/// Try to lower broadcast of a single - truncated - integer element,
12363/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12364///
12365/// This assumes we have AVX2.
12366static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12367 int BroadcastIdx,
12368 const X86Subtarget &Subtarget,
12369 SelectionDAG &DAG) {
12370 assert(Subtarget.hasAVX2() &&
12371 "We can only lower integer broadcasts with AVX2!");
12372
12373 MVT EltVT = VT.getVectorElementType();
12374 MVT V0VT = V0.getSimpleValueType();
12375
12376 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12377 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12378
12379 MVT V0EltVT = V0VT.getVectorElementType();
12380 if (!V0EltVT.isInteger())
12381 return SDValue();
12382
12383 const unsigned EltSize = EltVT.getSizeInBits();
12384 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12385
12386 // This is only a truncation if the original element type is larger.
12387 if (V0EltSize <= EltSize)
12388 return SDValue();
12389
12390 assert(((V0EltSize % EltSize) == 0) &&
12391 "Scalar type sizes must all be powers of 2 on x86!");
12392
12393 const unsigned V0Opc = V0.getOpcode();
12394 const unsigned Scale = V0EltSize / EltSize;
12395 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12396
12397 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12398 V0Opc != ISD::BUILD_VECTOR)
12399 return SDValue();
12400
12401 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12402
12403 // If we're extracting non-least-significant bits, shift so we can truncate.
12404 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12405 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12406 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12407 if (const int OffsetIdx = BroadcastIdx % Scale)
12408 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12409 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12410
12411 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12412 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12413}
12414
12415/// Test whether this can be lowered with a single SHUFPS instruction.
12416///
12417/// This is used to disable more specialized lowerings when the shufps lowering
12418/// will happen to be efficient.
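/// Illustrative examples: <0,1,4,5> can be a single SHUFPS (low half from V1,
/// high half from V2), while <0,4,1,5> cannot, since each half would need
/// elements from both inputs.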
12419static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12420 // This routine only handles 128-bit shufps.
12421 assert(Mask.size() == 4 && "Unsupported mask size!");
12422 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12423 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12424 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12425 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12426
12427 // To lower with a single SHUFPS we need to have the low half and high half
12428 // each requiring a single input.
12429 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12430 return false;
12431 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12432 return false;
12433
12434 return true;
12435}
12436
12437/// Test whether the specified input (0 or 1) is in-place blended by the
12438/// given mask.
12439///
12440/// This returns true if the elements from a particular input are already in the
12441/// slot required by the given mask and require no permutation.
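/// Illustrative example: for mask <0,1,6,7> both inputs are in place; for
/// <1,0,6,7> input 0 is not, because element 1 sits in slot 0.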
12442static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12443 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12444 int Size = Mask.size();
12445 for (int i = 0; i < Size; ++i)
12446 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12447 return false;
12448
12449 return true;
12450}
12451
12452/// If we are extracting two 128-bit halves of a vector and shuffling the
12453/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12454/// multi-shuffle lowering.
12455static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12456 SDValue N1, ArrayRef<int> Mask,
12457 SelectionDAG &DAG) {
12458 MVT VT = N0.getSimpleValueType();
12459 assert((VT.is128BitVector() &&
12460 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12461 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12462
12463 // Check that both sources are extracts of the same source vector.
12464 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12465 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12466 N0.getOperand(0) != N1.getOperand(0) ||
12467 !N0.hasOneUse() || !N1.hasOneUse())
12468 return SDValue();
12469
12470 SDValue WideVec = N0.getOperand(0);
12471 MVT WideVT = WideVec.getSimpleValueType();
12472 if (!WideVT.is256BitVector())
12473 return SDValue();
12474
12475 // Match extracts of each half of the wide source vector. Commute the shuffle
12476 // if the extract of the low half is N1.
12477 unsigned NumElts = VT.getVectorNumElements();
12478 SmallVector<int, 4> NewMask(Mask);
12479 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12480 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12481 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12482 ShuffleVectorSDNode::commuteMask(NewMask);
12483 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12484 return SDValue();
12485
12486 // Final bailout: if the mask is simple, we are better off using an extract
12487 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12488 // because that avoids a constant load from memory.
12489 if (NumElts == 4 &&
12490 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12491 return SDValue();
12492
12493 // Extend the shuffle mask with undef elements.
12494 NewMask.append(NumElts, -1);
12495
12496 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12497 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12498 NewMask);
12499 // This is free: ymm -> xmm.
12500 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12501 DAG.getIntPtrConstant(0, DL));
12502}
12503
12504/// Try to lower broadcast of a single element.
12505///
12506/// For convenience, this code also bundles all of the subtarget feature set
12507/// filtering. While a little annoying to re-dispatch on type here, there isn't
12508/// a convenient way to factor it out.
12509static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12510 SDValue V2, ArrayRef<int> Mask,
12511 const X86Subtarget &Subtarget,
12512 SelectionDAG &DAG) {
12513 MVT EltVT = VT.getVectorElementType();
12514 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12515 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12516 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12517 return SDValue();
12518
12519 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12520 // we can only broadcast from a register with AVX2.
12521 unsigned NumEltBits = VT.getScalarSizeInBits();
12522 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12523 ? X86ISD::MOVDDUP
12524 : X86ISD::VBROADCAST;
12525 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12526
12527 // Check that the mask is a broadcast.
12528 int BroadcastIdx = getSplatIndex(Mask);
12529 if (BroadcastIdx < 0)
12530 return SDValue();
12531 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12532 "a sorted mask where the broadcast "
12533 "comes from V1.");
12534
12535 // Go up the chain of (vector) values to find a scalar load that we can
12536 // combine with the broadcast.
12537 // TODO: Combine this logic with findEltLoadSrc() used by
12538 // EltsFromConsecutiveLoads().
12539 int BitOffset = BroadcastIdx * NumEltBits;
12540 SDValue V = V1;
12541 for (;;) {
12542 switch (V.getOpcode()) {
12543 case ISD::BITCAST: {
12544 V = V.getOperand(0);
12545 continue;
12546 }
12547 case ISD::CONCAT_VECTORS: {
12548 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12549 int OpIdx = BitOffset / OpBitWidth;
12550 V = V.getOperand(OpIdx);
12551 BitOffset %= OpBitWidth;
12552 continue;
12553 }
12554 case ISD::EXTRACT_SUBVECTOR: {
12555 // The extraction index adds to the existing offset.
12556 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12557 unsigned Idx = V.getConstantOperandVal(1);
12558 unsigned BeginOffset = Idx * EltBitWidth;
12559 BitOffset += BeginOffset;
12560 V = V.getOperand(0);
12561 continue;
12562 }
12563 case ISD::INSERT_SUBVECTOR: {
12564 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12565 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12566 int Idx = (int)V.getConstantOperandVal(2);
12567 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12568 int BeginOffset = Idx * EltBitWidth;
12569 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12570 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12571 BitOffset -= BeginOffset;
12572 V = VInner;
12573 } else {
12574 V = VOuter;
12575 }
12576 continue;
12577 }
12578 }
12579 break;
12580 }
12581 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12582 BroadcastIdx = BitOffset / NumEltBits;
12583
12584 // Do we need to bitcast the source to retrieve the original broadcast index?
12585 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12586
12587 // Check if this is a broadcast of a scalar. We special case lowering
12588 // for scalars so that we can more effectively fold with loads.
12589 // If the original value has a larger element type than the shuffle, the
12590 // broadcast element is in essence truncated. Make that explicit to ease
12591 // folding.
12592 if (BitCastSrc && VT.isInteger())
12593 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12594 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12595 return TruncBroadcast;
12596
12597 // Also check the simpler case, where we can directly reuse the scalar.
12598 if (!BitCastSrc &&
12599 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12600 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12601 V = V.getOperand(BroadcastIdx);
12602
12603 // If we can't broadcast from a register, check that the input is a load.
12604 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12605 return SDValue();
12606 } else if (ISD::isNormalLoad(V.getNode()) &&
12607 cast<LoadSDNode>(V)->isSimple()) {
12608 // We do not check for one-use of the vector load because a broadcast load
12609 // is expected to be a win for code size, register pressure, and possibly
12610 // uops even if the original vector load is not eliminated.
12611
12612 // Reduce the vector load and shuffle to a broadcasted scalar load.
12613 LoadSDNode *Ld = cast<LoadSDNode>(V);
12614 SDValue BaseAddr = Ld->getOperand(1);
12615 MVT SVT = VT.getScalarType();
12616 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12617 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12618 SDValue NewAddr =
12619 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12620
12621 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12622 // than MOVDDUP.
12623 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12624 if (Opcode == X86ISD::VBROADCAST) {
12625 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12626 SDValue Ops[] = {Ld->getChain(), NewAddr};
12627 V = DAG.getMemIntrinsicNode(
12628 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12629 DAG.getMachineFunction().getMachineMemOperand(
12630 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12631 DAG.makeEquivalentMemoryOrdering(Ld, V);
12632 return DAG.getBitcast(VT, V);
12633 }
12634 assert(SVT == MVT::f64 && "Unexpected VT!");
12635 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12636 DAG.getMachineFunction().getMachineMemOperand(
12637 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12638 DAG.makeEquivalentMemoryOrdering(Ld, V);
12639 } else if (!BroadcastFromReg) {
12640 // We can't broadcast from a vector register.
12641 return SDValue();
12642 } else if (BitOffset != 0) {
12643 // We can only broadcast from the zero-element of a vector register,
12644 // but it can be advantageous to broadcast from the zero-element of a
12645 // subvector.
12646 if (!VT.is256BitVector() && !VT.is512BitVector())
12647 return SDValue();
12648
12649 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12650 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12651 return SDValue();
12652
12653 // Only broadcast the zero-element of a 128-bit subvector.
12654 if ((BitOffset % 128) != 0)
12655 return SDValue();
12656
12657 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12658 "Unexpected bit-offset");
12659 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12660 "Unexpected vector size");
12661 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12662 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12663 }
12664
12665 // On AVX we can use VBROADCAST directly for scalar sources.
12666 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12667 V = DAG.getBitcast(MVT::f64, V);
12668 if (Subtarget.hasAVX()) {
12669 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12670 return DAG.getBitcast(VT, V);
12671 }
12672 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12673 }
12674
12675 // If this is a scalar, do the broadcast on this type and bitcast.
12676 if (!V.getValueType().isVector()) {
12677 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12678 "Unexpected scalar size");
12679 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12680 VT.getVectorNumElements());
12681 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12682 }
12683
12684 // We only support broadcasting from 128-bit vectors to minimize the
12685 // number of patterns we need to deal with in isel. So extract down to
12686 // 128-bits, removing as many bitcasts as possible.
12687 if (V.getValueSizeInBits() > 128)
12688 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12689
12690 // Otherwise cast V to a vector with the same element type as VT, but
12691 // possibly narrower than VT. Then perform the broadcast.
12692 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12693 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12694 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12695}
12696
12697// Check for whether we can use INSERTPS to perform the shuffle. We only use
12698// INSERTPS when the V1 elements are already in the correct locations
12699// because otherwise we can just always use two SHUFPS instructions which
12700// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12701// perform INSERTPS if a single V1 element is out of place and all V2
12702// elements are zeroable.
12703static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12704 unsigned &InsertPSMask,
12705 const APInt &Zeroable,
12706 ArrayRef<int> Mask, SelectionDAG &DAG) {
12707 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12708 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12709 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12710
12711 // Attempt to match INSERTPS with one element from VA or VB being
12712 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12713 // are updated.
12714 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12715 ArrayRef<int> CandidateMask) {
12716 unsigned ZMask = 0;
12717 int VADstIndex = -1;
12718 int VBDstIndex = -1;
12719 bool VAUsedInPlace = false;
12720
12721 for (int i = 0; i < 4; ++i) {
12722 // Synthesize a zero mask from the zeroable elements (includes undefs).
12723 if (Zeroable[i]) {
12724 ZMask |= 1 << i;
12725 continue;
12726 }
12727
12728 // Flag if we use any VA inputs in place.
12729 if (i == CandidateMask[i]) {
12730 VAUsedInPlace = true;
12731 continue;
12732 }
12733
12734 // We can only insert a single non-zeroable element.
12735 if (VADstIndex >= 0 || VBDstIndex >= 0)
12736 return false;
12737
12738 if (CandidateMask[i] < 4) {
12739 // VA input out of place for insertion.
12740 VADstIndex = i;
12741 } else {
12742 // VB input for insertion.
12743 VBDstIndex = i;
12744 }
12745 }
12746
12747 // Don't bother if we have no (non-zeroable) element for insertion.
12748 if (VADstIndex < 0 && VBDstIndex < 0)
12749 return false;
12750
12751 // Determine element insertion src/dst indices. The src index is from the
12752 // start of the inserted vector, not the start of the concatenated vector.
12753 unsigned VBSrcIndex = 0;
12754 if (VADstIndex >= 0) {
12755 // If we have a VA input out of place, we use VA as the V2 element
12756 // insertion and don't use the original V2 at all.
12757 VBSrcIndex = CandidateMask[VADstIndex];
12758 VBDstIndex = VADstIndex;
12759 VB = VA;
12760 } else {
12761 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12762 }
12763
12764 // If no V1 inputs are used in place, then the result is created only from
12765 // the zero mask and the V2 insertion - so remove V1 dependency.
12766 if (!VAUsedInPlace)
12767 VA = DAG.getUNDEF(MVT::v4f32);
12768
12769 // Update V1, V2 and InsertPSMask accordingly.
12770 V1 = VA;
12771 V2 = VB;
12772
12773 // Insert the V2 element into the desired position.
12774 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
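// Illustrative example: inserting source element 2 (VBSrcIndex = 2) into
// destination lane 1 (VBDstIndex = 1) while zeroing lane 3 (ZMask = 0b1000)
// encodes as (2 << 6) | (1 << 4) | (1 << 3) = 0x98.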
12775 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12776 return true;
12777 };
12778
12779 if (matchAsInsertPS(V1, V2, Mask))
12780 return true;
12781
12782 // Commute and try again.
12783 SmallVector<int, 4> CommutedMask(Mask);
12784 ShuffleVectorSDNode::commuteMask(CommutedMask);
12785 if (matchAsInsertPS(V2, V1, CommutedMask))
12786 return true;
12787
12788 return false;
12789}
12790
12791static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12792 ArrayRef<int> Mask, const APInt &Zeroable,
12793 SelectionDAG &DAG) {
12794 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12795 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12796
12797 // Attempt to match the insertps pattern.
12798 unsigned InsertPSMask = 0;
12799 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12800 return SDValue();
12801
12802 // Insert the V2 element into the desired position.
12803 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12804 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12805}
12806
12807/// Handle lowering of 2-lane 64-bit floating point shuffles.
12808///
12809/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12810/// support for floating point shuffles but not integer shuffles. These
12811/// instructions will incur a domain crossing penalty on some chips though so
12812/// it is better to avoid lowering through this for integer vectors where
12813/// possible.
12813static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12814 const APInt &Zeroable, SDValue V1, SDValue V2,
12816 const X86Subtarget &Subtarget,
12817 SelectionDAG &DAG) {
12818 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12819 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12820 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12821
12822 if (V2.isUndef()) {
12823 // Check for being able to broadcast a single element.
12824 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12825 Mask, Subtarget, DAG))
12826 return Broadcast;
12827
12828 // Straight shuffle of a single input vector. Simulate this by using the
12829 // single input as both of the "inputs" to this instruction.
12830 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12831
12832 if (Subtarget.hasAVX()) {
12833 // If we have AVX, we can use VPERMILPS which will allow folding a load
12834 // into the shuffle.
12835 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12836 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12837 }
12838
12839 return DAG.getNode(
12840 X86ISD::SHUFP, DL, MVT::v2f64,
12841 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12842 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12843 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12844 }
12845 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12846 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12847 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12848 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12849
12850 if (Subtarget.hasAVX2())
12851 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12852 return Extract;
12853
12854 // When loading a scalar and then shuffling it into a vector we can often do
12855 // the insertion cheaply.
12856 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12857 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12858 return Insertion;
12859 // Try inverting the insertion since for v2 masks it is easy to do and we
12860 // can't reliably sort the mask one way or the other.
12861 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12862 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12863 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12864 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12865 return Insertion;
12866
12867 // Try to use one of the special instruction patterns to handle two common
12868 // blend patterns if a zero-blend above didn't work.
12869 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12870 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12871 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12872 // We can either use a special instruction to load over the low double or
12873 // to move just the low double.
12874 return DAG.getNode(
12875 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12876 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12877
12878 if (Subtarget.hasSSE41())
12879 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12880 Zeroable, Subtarget, DAG))
12881 return Blend;
12882
12883 // Use dedicated unpack instructions for masks that match their pattern.
12884 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12885 return V;
12886
12887 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
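// Illustrative example: mask <0,3> gives SHUFPDMask = 0b10, i.e. SHUFPD
// selecting V1[0] into the low lane and V2[1] into the high lane.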
12888 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12889 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12890}
12891
12892/// Handle lowering of 2-lane 64-bit integer shuffles.
12893///
12894/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12895/// the integer unit to minimize domain crossing penalties. However, for blends
12896/// it falls back to the floating point shuffle operation with appropriate bit
12897/// casting.
12898static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12899 const APInt &Zeroable, SDValue V1, SDValue V2,
12900 const X86Subtarget &Subtarget,
12901 SelectionDAG &DAG) {
12902 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12903 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12904 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12905
12906 if (V2.isUndef()) {
12907 // Check for being able to broadcast a single element.
12908 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12909 Mask, Subtarget, DAG))
12910 return Broadcast;
12911
12912 // Straight shuffle of a single input vector. For everything from SSE2
12913 // onward this has a single fast instruction with no scary immediates.
12914 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12915 V1 = DAG.getBitcast(MVT::v4i32, V1);
12916 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12917 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12918 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12919 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
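// Illustrative example: a v2i64 mask <1,0> widens to the v4i32 mask
// <2,3,0,1>, i.e. a single PSHUFD with immediate 0x4e.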
12920 return DAG.getBitcast(
12921 MVT::v2i64,
12922 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12923 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12924 }
12925 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12926 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12927 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12928 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12929
12930 if (Subtarget.hasAVX2())
12931 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12932 return Extract;
12933
12934 // Try to use shift instructions.
12935 if (SDValue Shift =
12936 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12937 DAG, /*BitwiseOnly*/ false))
12938 return Shift;
12939
12940 // When loading a scalar and then shuffling it into a vector we can often do
12941 // the insertion cheaply.
12942 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12943 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12944 return Insertion;
12945 // Try inverting the insertion since for v2 masks it is easy to do and we
12946 // can't reliably sort the mask one way or the other.
12947 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12948 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12949 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12950 return Insertion;
12951
12952 // We have different paths for blend lowering, but they all must use the
12953 // *exact* same predicate.
12954 bool IsBlendSupported = Subtarget.hasSSE41();
12955 if (IsBlendSupported)
12956 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12957 Zeroable, Subtarget, DAG))
12958 return Blend;
12959
12960 // Use dedicated unpack instructions for masks that match their pattern.
12961 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12962 return V;
12963
12964 // Try to use byte rotation instructions.
12965 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12966 if (Subtarget.hasSSSE3()) {
12967 if (Subtarget.hasVLX())
12968 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12969 Zeroable, Subtarget, DAG))
12970 return Rotate;
12971
12972 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12973 Subtarget, DAG))
12974 return Rotate;
12975 }
12976
12977 // If we have direct support for blends, we should lower by decomposing into
12978 // a permute. That will be faster than the domain cross.
12979 if (IsBlendSupported)
12980 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12981 Subtarget, DAG);
12982
12983 // We implement this with SHUFPD which is pretty lame because it will likely
12984 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12985 // However, all the alternatives are still more cycles and newer chips don't
12986 // have this problem. It would be really nice if x86 had better shuffles here.
12987 V1 = DAG.getBitcast(MVT::v2f64, V1);
12988 V2 = DAG.getBitcast(MVT::v2f64, V2);
12989 return DAG.getBitcast(MVT::v2i64,
12990 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12991}
12992
12993/// Lower a vector shuffle using the SHUFPS instruction.
12994///
12995/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12996/// It makes no assumptions about whether this is the *best* lowering; it simply
12997/// uses it.
12998static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12999 ArrayRef<int> Mask, SDValue V1,
13000 SDValue V2, SelectionDAG &DAG) {
13001 SDValue LowV = V1, HighV = V2;
13002 SmallVector<int, 4> NewMask(Mask);
13003 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13004
13005 if (NumV2Elements == 1) {
13006 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13007
13008 // Compute the index adjacent to V2Index and in the same half by toggling
13009 // the low bit.
13010 int V2AdjIndex = V2Index ^ 1;
13011
13012 if (Mask[V2AdjIndex] < 0) {
13013 // Handles all the cases where we have a single V2 element and an undef.
13014 // This will only ever happen in the high lanes because we commute the
13015 // vector otherwise.
13016 if (V2Index < 2)
13017 std::swap(LowV, HighV);
13018 NewMask[V2Index] -= 4;
13019 } else {
13020 // Handle the case where the V2 element ends up adjacent to a V1 element.
13021 // To make this work, blend them together as the first step.
13022 int V1Index = V2AdjIndex;
13023 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13024 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13025 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13026
13027 // Now proceed to reconstruct the final blend as we have the necessary
13028 // high or low half formed.
13029 if (V2Index < 2) {
13030 LowV = V2;
13031 HighV = V1;
13032 } else {
13033 HighV = V2;
13034 }
13035 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13036 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13037 }
13038 } else if (NumV2Elements == 2) {
13039 if (Mask[0] < 4 && Mask[1] < 4) {
13040 // Handle the easy case where we have V1 in the low lanes and V2 in the
13041 // high lanes.
13042 NewMask[2] -= 4;
13043 NewMask[3] -= 4;
13044 } else if (Mask[2] < 4 && Mask[3] < 4) {
13045 // We also handle the reversed case because this utility may get called
13046 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13047 // arrange things in the right direction.
13048 NewMask[0] -= 4;
13049 NewMask[1] -= 4;
13050 HighV = V1;
13051 LowV = V2;
13052 } else {
13053 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13054 // trying to place elements directly, just blend them and set up the final
13055 // shuffle to place them.
13056
13057 // The first two blend mask elements are for V1, the second two are for
13058 // V2.
13059 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13060 Mask[2] < 4 ? Mask[2] : Mask[3],
13061 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13062 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13063 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13064 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13065
13066 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13067 // a blend.
13068 LowV = HighV = V1;
13069 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13070 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13071 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13072 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13073 }
13074 } else if (NumV2Elements == 3) {
13075 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13076 // we can get here due to other paths (e.g. repeated mask matching) where we
13077 // don't want to do another round of lowerVECTOR_SHUFFLE.
13078 ShuffleVectorSDNode::commuteMask(NewMask);
13079 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13080 }
13081 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13082 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13083}
13084
13085/// Lower 4-lane 32-bit floating point shuffles.
13086///
13087/// Uses instructions exclusively from the floating point unit to minimize
13088/// domain crossing penalties, as these are sufficient to implement all v4f32
13089/// shuffles.
13090static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13091 const APInt &Zeroable, SDValue V1, SDValue V2,
13092 const X86Subtarget &Subtarget,
13093 SelectionDAG &DAG) {
13094 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13095 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13096 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13097
13098 if (Subtarget.hasSSE41())
13099 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13100 Zeroable, Subtarget, DAG))
13101 return Blend;
13102
13103 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13104
13105 if (NumV2Elements == 0) {
13106 // Check for being able to broadcast a single element.
13107 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13108 Mask, Subtarget, DAG))
13109 return Broadcast;
13110
13111 // Use even/odd duplicate instructions for masks that match their pattern.
13112 if (Subtarget.hasSSE3()) {
13113 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13114 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13115 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13116 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13117 }
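// Illustrative note (added commentary): the {0, 0, 2, 2} case above is
// exactly MOVSLDUP, e.g. _mm_moveldup_ps(v) == { v[0], v[0], v[2], v[2] },
// and {1, 1, 3, 3} is MOVSHDUP (_mm_movehdup_ps); both avoid a shuffle
// immediate entirely.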
13118
13119 if (Subtarget.hasAVX()) {
13120 // If we have AVX, we can use VPERMILPS which will allow folding a load
13121 // into the shuffle.
13122 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13123 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13124 }
13125
13126 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13127 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13128 if (!Subtarget.hasSSE2()) {
13129 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13130 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13131 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13132 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13133 }
13134
13135 // Otherwise, use a straight shuffle of a single input vector. We pass the
13136 // input vector to both operands to simulate this with a SHUFPS.
13137 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13138 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13139 }
13140
13141 if (Subtarget.hasSSE2())
13142 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13143 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13144 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13145 return ZExt;
13146 }
13147
13148 if (Subtarget.hasAVX2())
13149 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13150 return Extract;
13151
13152 // There are special ways we can lower some single-element blends. However, we
13153 // have custom ways we can lower more complex single-element blends below that
13154 // we defer to if both this and BLENDPS fail to match, so restrict this to
13155 // when the V2 input is targeting element 0 of the mask -- that is the fast
13156 // case here.
13157 if (NumV2Elements == 1 && Mask[0] >= 4)
13158 if (SDValue V = lowerShuffleAsElementInsertion(
13159 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13160 return V;
13161
13162 if (Subtarget.hasSSE41()) {
13163 // Use INSERTPS if we can complete the shuffle efficiently.
13164 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13165 return V;
13166
13167 if (!isSingleSHUFPSMask(Mask))
13168 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13169 V2, Mask, DAG))
13170 return BlendPerm;
13171 }
13172
13173 // Use low/high mov instructions. These are only valid in SSE1 because
13174 // otherwise they are widened to v2f64 and never get here.
13175 if (!Subtarget.hasSSE2()) {
13176 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13177 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13178 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13179 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13180 }
13181
13182 // Use dedicated unpack instructions for masks that match their pattern.
13183 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13184 return V;
13185
13186 // Otherwise fall back to a SHUFPS lowering strategy.
13187 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13188}
13189
13190/// Lower 4-lane i32 vector shuffles.
13191///
13192/// We try to handle these with integer-domain shuffles where we can, but for
13193/// blends we use the floating point domain blend instructions.
13194static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13195 const APInt &Zeroable, SDValue V1, SDValue V2,
13196 const X86Subtarget &Subtarget,
13197 SelectionDAG &DAG) {
13198 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13199 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13200 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13201
13202 // Whenever we can lower this as a zext, that instruction is strictly faster
13203 // than any alternative. It also allows us to fold memory operands into the
13204 // shuffle in many cases.
13205 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13206 Zeroable, Subtarget, DAG))
13207 return ZExt;
13208
13209 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13210
13211 // Try to use shift instructions if fast.
13212 if (Subtarget.preferLowerShuffleAsShift()) {
13213 if (SDValue Shift =
13214 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13215 Subtarget, DAG, /*BitwiseOnly*/ true))
13216 return Shift;
13217 if (NumV2Elements == 0)
13218 if (SDValue Rotate =
13219 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13220 return Rotate;
13221 }
13222
13223 if (NumV2Elements == 0) {
13224 // Try to use broadcast unless the mask only has one non-undef element.
13225 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13226 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13227 Mask, Subtarget, DAG))
13228 return Broadcast;
13229 }
13230
13231 // Straight shuffle of a single input vector. For everything from SSE2
13232 // onward this has a single fast instruction with no scary immediates.
13233 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13234 // but we aren't actually going to use the UNPCK instruction because doing
13235 // so prevents folding a load into this instruction or making a copy.
13236 const int UnpackLoMask[] = {0, 0, 1, 1};
13237 const int UnpackHiMask[] = {2, 2, 3, 3};
13238 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13239 Mask = UnpackLoMask;
13240 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13241 Mask = UnpackHiMask;
13242
13243 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13244 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13245 }
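// Illustrative note (added, not upstream text): the single-input path above
// emits PSHUFD even for the unpack-like mask {0, 0, 1, 1}, i.e.
// _mm_shuffle_epi32(v, 0x50) with 0x50 = 0 | (0 << 2) | (1 << 4) | (1 << 6),
// rather than PUNPCKLDQ, so a load of V1 can still be folded into it.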
13246
13247 if (Subtarget.hasAVX2())
13248 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13249 return Extract;
13250
13251 // Try to use shift instructions.
13252 if (SDValue Shift =
13253 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13254 DAG, /*BitwiseOnly*/ false))
13255 return Shift;
13256
13257 // There are special ways we can lower some single-element blends.
13258 if (NumV2Elements == 1)
13259 if (SDValue V = lowerShuffleAsElementInsertion(
13260 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13261 return V;
13262
13263 // We have different paths for blend lowering, but they all must use the
13264 // *exact* same predicate.
13265 bool IsBlendSupported = Subtarget.hasSSE41();
13266 if (IsBlendSupported)
13267 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13268 Zeroable, Subtarget, DAG))
13269 return Blend;
13270
13271 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13272 Zeroable, Subtarget, DAG))
13273 return Masked;
13274
13275 // Use dedicated unpack instructions for masks that match their pattern.
13276 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13277 return V;
13278
13279 // Try to use byte rotation instructions.
13280 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13281 if (Subtarget.hasSSSE3()) {
13282 if (Subtarget.hasVLX())
13283 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13284 Zeroable, Subtarget, DAG))
13285 return Rotate;
13286
13287 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13288 Subtarget, DAG))
13289 return Rotate;
13290 }
13291
13292 // Assume that a single SHUFPS is faster than an alternative sequence of
13293 // multiple instructions (even if the CPU has a domain penalty).
13294 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13295 if (!isSingleSHUFPSMask(Mask)) {
13296 // If we have direct support for blends, we should lower by decomposing into
13297 // a permute. That will be faster than the domain cross.
13298 if (IsBlendSupported)
13299 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13300 Subtarget, DAG);
13301
13302 // Try to lower by permuting the inputs into an unpack instruction.
13303 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13304 Mask, Subtarget, DAG))
13305 return Unpack;
13306 }
13307
13308 // We implement this with SHUFPS because it can blend from two vectors.
13309 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13310 // up the inputs, bypassing domain shift penalties that we would incur if we
13311 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13312 // relevant.
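// Illustrative note (added commentary): for a mask such as <1, 0, 6, 7> the
// bitcast fallback below yields a single SHUFPS, roughly
// _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), 0xE1) with a/b
// standing for the two v4i32 operands, accepting a possible integer/FP
// domain-crossing penalty on older cores in exchange for one instruction.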
13313 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13314 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13315 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13316 return DAG.getBitcast(MVT::v4i32, ShufPS);
13317}
13318
13319/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13320/// shuffle lowering, and the most complex part.
13321///
13322/// The lowering strategy is to try to form pairs of input lanes which are
13323/// targeted at the same half of the final vector, and then use a dword shuffle
13324/// to place them onto the right half, and finally unpack the paired lanes into
13325/// their final position.
13326///
13327/// The exact breakdown of how to form these dword pairs and align them on the
13328/// correct sides is really tricky. See the comments within the function for
13329/// more of the details.
13330///
13331/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13332/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13333/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13334/// vector, form the analogous 128-bit 8-element Mask.
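// Added illustration (not upstream text): for example, if the result's low
// half needs input words 5 and 7, a PSHUFHW first packs words 5 and 7 into
// one dword of the high half, the PSHUFD then moves that dword down into a
// free dword of the low half, and a final PSHUFLW places the two words at
// their exact positions.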
13335static SDValue lowerV8I16GeneralSingleInputShuffle(
13336 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13337 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13338 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13339 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13340
13341 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13342 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13343 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13344
13345 // Attempt to directly match PSHUFLW or PSHUFHW.
13346 if (isUndefOrInRange(LoMask, 0, 4) &&
13347 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13348 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13349 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13350 }
13351 if (isUndefOrInRange(HiMask, 4, 8) &&
13352 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13353 for (int i = 0; i != 4; ++i)
13354 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13355 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13356 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13357 }
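// Worked example (added, illustrative): Mask = <2, 0, 3, 1, -1, -1, 6, 7>
// passes the first test above, so it lowers to a single PSHUFLW whose
// immediate is 2 | (0 << 2) | (3 << 4) | (1 << 6) = 0x72, leaving the high
// four words untouched.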
13358
13359 SmallVector<int, 4> LoInputs;
13360 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13361 array_pod_sort(LoInputs.begin(), LoInputs.end());
13362 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13363 SmallVector<int, 4> HiInputs;
13364 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13365 array_pod_sort(HiInputs.begin(), HiInputs.end());
13366 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13367 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13368 int NumHToL = LoInputs.size() - NumLToL;
13369 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13370 int NumHToH = HiInputs.size() - NumLToH;
13371 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13372 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13373 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13374 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13375
13376 // If we are shuffling values from one half - check how many different DWORD
13377 // pairs we need to create. If only 1 or 2 then we can perform this as a
13378 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13379 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13380 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13381 V = DAG.getNode(ShufWOp, DL, VT, V,
13382 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13383 V = DAG.getBitcast(PSHUFDVT, V);
13384 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13385 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13386 return DAG.getBitcast(VT, V);
13387 };
13388
13389 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13390 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13391 SmallVector<std::pair<int, int>, 4> DWordPairs;
13392 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13393
13394 // Collect the different DWORD pairs.
13395 for (int DWord = 0; DWord != 4; ++DWord) {
13396 int M0 = Mask[2 * DWord + 0];
13397 int M1 = Mask[2 * DWord + 1];
13398 M0 = (M0 >= 0 ? M0 % 4 : M0);
13399 M1 = (M1 >= 0 ? M1 % 4 : M1);
13400 if (M0 < 0 && M1 < 0)
13401 continue;
13402
13403 bool Match = false;
13404 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13405 auto &DWordPair = DWordPairs[j];
13406 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13407 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13408 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13409 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13410 PSHUFDMask[DWord] = DOffset + j;
13411 Match = true;
13412 break;
13413 }
13414 }
13415 if (!Match) {
13416 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13417 DWordPairs.push_back(std::make_pair(M0, M1));
13418 }
13419 }
13420
13421 if (DWordPairs.size() <= 2) {
13422 DWordPairs.resize(2, std::make_pair(-1, -1));
13423 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13424 DWordPairs[1].first, DWordPairs[1].second};
13425 if ((NumHToL + NumHToH) == 0)
13426 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13427 if ((NumLToL + NumLToH) == 0)
13428 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13429 }
13430 }
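// Worked example (added, illustrative): Mask = <1, 0, 3, 2, 1, 0, 3, 2>
// draws only on the low half and needs just the two dword pairs (1,0) and
// (3,2), so the block above emits PSHUFLW<1, 0, 3, 2> to form the pairs and
// PSHUFD<0, 1, 0, 1> to place them, instead of the longer
// PSHUFD+PSHUFLW+PSHUFHW chain.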
13431
13432 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13433 // such inputs we can swap two of the dwords across the half mark and end up
13434 // with <=2 inputs to each half in each half. Once there, we can fall through
13435 // to the generic code below. For example:
13436 //
13437 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13438 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13439 //
13440 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13441 // and an existing 2-into-2 on the other half. In this case we may have to
13442 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13443 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13444 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13445 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13446 // half than the one we target for fixing) will be fixed when we re-enter this
13447 // path. We will also combine any resulting sequence of PSHUFD instructions
13448 // into a single instruction. Here is an example of the tricky case:
13449 //
13450 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13451 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13452 //
13453 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13454 //
13455 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13456 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13457 //
13458 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13459 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13460 //
13461 // The result is fine to be handled by the generic logic.
13462 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13463 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13464 int AOffset, int BOffset) {
13465 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13466 "Must call this with A having 3 or 1 inputs from the A half.");
13467 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13468 "Must call this with B having 1 or 3 inputs from the B half.");
13469 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13470 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13471
13472 bool ThreeAInputs = AToAInputs.size() == 3;
13473
13474 // Compute the index of the dword with only one word among the three inputs in
13475 // a half by taking the sum of the half with three inputs and subtracting
13476 // the sum of the actual three inputs. The difference is the remaining
13477 // slot.
13478 int ADWord = 0, BDWord = 0;
13479 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13480 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13481 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13482 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13483 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13484 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13485 int TripleNonInputIdx =
13486 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13487 TripleDWord = TripleNonInputIdx / 2;
13488
13489 // We use xor with one to compute the adjacent DWord to whichever one the
13490 // OneInput is in.
13491 OneInputDWord = (OneInput / 2) ^ 1;
13492
13493 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13494 // and BToA inputs. If there is also such a problem with the BToB and AToB
13495 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13496 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13497 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13498 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13499 // Compute how many inputs will be flipped by swapping these DWords. We
13500 // need to balance this to ensure we don't form a 3-1 shuffle in the
13501 // other half.
13502
13503 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13504 llvm::count(AToBInputs, 2 * ADWord + 1);
13505 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13506 llvm::count(BToBInputs, 2 * BDWord + 1);
13507 if ((NumFlippedAToBInputs == 1 &&
13508 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13509 (NumFlippedBToBInputs == 1 &&
13510 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13511 // We choose whether to fix the A half or B half based on whether that
13512 // half has zero flipped inputs. At zero, we may not be able to fix it
13513 // with that half. We also bias towards fixing the B half because that
13514 // will more commonly be the high half, and we have to bias one way.
13515 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13516 ArrayRef<int> Inputs) {
13517 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13518 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13519 // Determine whether the free index is in the flipped dword or the
13520 // unflipped dword based on where the pinned index is. We use this bit
13521 // in an xor to conditionally select the adjacent dword.
13522 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13523 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13524 if (IsFixIdxInput == IsFixFreeIdxInput)
13525 FixFreeIdx += 1;
13526 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13527 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13528 "We need to be changing the number of flipped inputs!");
13529 int PSHUFHalfMask[] = {0, 1, 2, 3};
13530 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13531 V = DAG.getNode(
13532 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13533 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13534 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13535
13536 for (int &M : Mask)
13537 if (M >= 0 && M == FixIdx)
13538 M = FixFreeIdx;
13539 else if (M >= 0 && M == FixFreeIdx)
13540 M = FixIdx;
13541 };
13542 if (NumFlippedBToBInputs != 0) {
13543 int BPinnedIdx =
13544 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13545 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13546 } else {
13547 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13548 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13549 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13550 }
13551 }
13552 }
13553
13554 int PSHUFDMask[] = {0, 1, 2, 3};
13555 PSHUFDMask[ADWord] = BDWord;
13556 PSHUFDMask[BDWord] = ADWord;
13557 V = DAG.getBitcast(
13558 VT,
13559 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13560 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13561
13562 // Adjust the mask to match the new locations of A and B.
13563 for (int &M : Mask)
13564 if (M >= 0 && M/2 == ADWord)
13565 M = 2 * BDWord + M % 2;
13566 else if (M >= 0 && M/2 == BDWord)
13567 M = 2 * ADWord + M % 2;
13568
13569 // Recurse back into this routine to re-compute state now that this isn't
13570 // a 3 and 1 problem.
13571 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13572 };
13573 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13574 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13575 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13576 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13577
13578 // At this point there are at most two inputs to the low and high halves from
13579 // each half. That means the inputs can always be grouped into dwords and
13580 // those dwords can then be moved to the correct half with a dword shuffle.
13581 // We use at most one low and one high word shuffle to collect these paired
13582 // inputs into dwords, and finally a dword shuffle to place them.
13583 int PSHUFLMask[4] = {-1, -1, -1, -1};
13584 int PSHUFHMask[4] = {-1, -1, -1, -1};
13585 int PSHUFDMask[4] = {-1, -1, -1, -1};
13586
13587 // First fix the masks for all the inputs that are staying in their
13588 // original halves. This will then dictate the targets of the cross-half
13589 // shuffles.
13590 auto fixInPlaceInputs =
13591 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13592 MutableArrayRef<int> SourceHalfMask,
13593 MutableArrayRef<int> HalfMask, int HalfOffset) {
13594 if (InPlaceInputs.empty())
13595 return;
13596 if (InPlaceInputs.size() == 1) {
13597 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13598 InPlaceInputs[0] - HalfOffset;
13599 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13600 return;
13601 }
13602 if (IncomingInputs.empty()) {
13603 // Just fix all of the in place inputs.
13604 for (int Input : InPlaceInputs) {
13605 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13606 PSHUFDMask[Input / 2] = Input / 2;
13607 }
13608 return;
13609 }
13610
13611 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13612 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13613 InPlaceInputs[0] - HalfOffset;
13614 // Put the second input next to the first so that they are packed into
13615 // a dword. We find the adjacent index by toggling the low bit.
13616 int AdjIndex = InPlaceInputs[0] ^ 1;
13617 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13618 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13619 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13620 };
13621 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13622 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13623
13624 // Now gather the cross-half inputs and place them into a free dword of
13625 // their target half.
13626 // FIXME: This operation could almost certainly be simplified dramatically to
13627 // look more like the 3-1 fixing operation.
13628 auto moveInputsToRightHalf = [&PSHUFDMask](
13629 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13630 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13631 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13632 int DestOffset) {
13633 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13634 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13635 };
13636 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13637 int Word) {
13638 int LowWord = Word & ~1;
13639 int HighWord = Word | 1;
13640 return isWordClobbered(SourceHalfMask, LowWord) ||
13641 isWordClobbered(SourceHalfMask, HighWord);
13642 };
13643
13644 if (IncomingInputs.empty())
13645 return;
13646
13647 if (ExistingInputs.empty()) {
13648 // Map any dwords with inputs from them into the right half.
13649 for (int Input : IncomingInputs) {
13650 // If the source half mask maps over the inputs, turn those into
13651 // swaps and use the swapped lane.
13652 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13653 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13654 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13655 Input - SourceOffset;
13656 // We have to swap the uses in our half mask in one sweep.
13657 for (int &M : HalfMask)
13658 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13659 M = Input;
13660 else if (M == Input)
13661 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13662 } else {
13663 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13664 Input - SourceOffset &&
13665 "Previous placement doesn't match!");
13666 }
13667 // Note that this correctly re-maps both when we do a swap and when
13668 // we observe the other side of the swap above. We rely on that to
13669 // avoid swapping the members of the input list directly.
13670 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13671 }
13672
13673 // Map the input's dword into the correct half.
13674 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13675 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13676 else
13677 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13678 Input / 2 &&
13679 "Previous placement doesn't match!");
13680 }
13681
13682 // And just directly shift any other-half mask elements to be same-half
13683 // as we will have mirrored the dword containing the element into the
13684 // same position within that half.
13685 for (int &M : HalfMask)
13686 if (M >= SourceOffset && M < SourceOffset + 4) {
13687 M = M - SourceOffset + DestOffset;
13688 assert(M >= 0 && "This should never wrap below zero!");
13689 }
13690 return;
13691 }
13692
13693 // Ensure we have the input in a viable dword of its current half. This
13694 // is particularly tricky because the original position may be clobbered
13695 // by inputs being moved and *staying* in that half.
13696 if (IncomingInputs.size() == 1) {
13697 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13698 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13699 SourceOffset;
13700 SourceHalfMask[InputFixed - SourceOffset] =
13701 IncomingInputs[0] - SourceOffset;
13702 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13703 InputFixed);
13704 IncomingInputs[0] = InputFixed;
13705 }
13706 } else if (IncomingInputs.size() == 2) {
13707 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13708 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13709 // We have two non-adjacent or clobbered inputs we need to extract from
13710 // the source half. To do this, we need to map them into some adjacent
13711 // dword slot in the source mask.
13712 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13713 IncomingInputs[1] - SourceOffset};
13714
13715 // If there is a free slot in the source half mask adjacent to one of
13716 // the inputs, place the other input in it. We use (Index XOR 1) to
13717 // compute an adjacent index.
13718 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13719 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13720 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13721 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13722 InputsFixed[1] = InputsFixed[0] ^ 1;
13723 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13724 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13725 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13726 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13727 InputsFixed[0] = InputsFixed[1] ^ 1;
13728 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13729 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13730 // The two inputs are in the same DWord but it is clobbered and the
13731 // adjacent DWord isn't used at all. Move both inputs to the free
13732 // slot.
13733 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13734 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13735 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13736 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13737 } else {
13738 // The only way we hit this point is if there is no clobbering
13739 // (because there are no off-half inputs to this half) and there is no
13740 // free slot adjacent to one of the inputs. In this case, we have to
13741 // swap an input with a non-input.
13742 for (int i = 0; i < 4; ++i)
13743 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13744 "We can't handle any clobbers here!");
13745 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13746 "Cannot have adjacent inputs here!");
13747
13748 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13749 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13750
13751 // We also have to update the final source mask in this case because
13752 // it may need to undo the above swap.
13753 for (int &M : FinalSourceHalfMask)
13754 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13755 M = InputsFixed[1] + SourceOffset;
13756 else if (M == InputsFixed[1] + SourceOffset)
13757 M = (InputsFixed[0] ^ 1) + SourceOffset;
13758
13759 InputsFixed[1] = InputsFixed[0] ^ 1;
13760 }
13761
13762 // Point everything at the fixed inputs.
13763 for (int &M : HalfMask)
13764 if (M == IncomingInputs[0])
13765 M = InputsFixed[0] + SourceOffset;
13766 else if (M == IncomingInputs[1])
13767 M = InputsFixed[1] + SourceOffset;
13768
13769 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13770 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13771 }
13772 } else {
13773 llvm_unreachable("Unhandled input size!");
13774 }
13775
13776 // Now hoist the DWord down to the right half.
13777 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13778 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13779 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13780 for (int &M : HalfMask)
13781 for (int Input : IncomingInputs)
13782 if (M == Input)
13783 M = FreeDWord * 2 + Input % 2;
13784 };
13785 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13786 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13787 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13788 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13789
13790 // Now enact all the shuffles we've computed to move the inputs into their
13791 // target half.
13792 if (!isNoopShuffleMask(PSHUFLMask))
13793 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13794 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13795 if (!isNoopShuffleMask(PSHUFHMask))
13796 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13797 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13798 if (!isNoopShuffleMask(PSHUFDMask))
13799 V = DAG.getBitcast(
13800 VT,
13801 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13802 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13803
13804 // At this point, each half should contain all its inputs, and we can then
13805 // just shuffle them into their final position.
13806 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13807 "Failed to lift all the high half inputs to the low mask!");
13808 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13809 "Failed to lift all the low half inputs to the high mask!");
13810
13811 // Do a half shuffle for the low mask.
13812 if (!isNoopShuffleMask(LoMask))
13813 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13814 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13815
13816 // Do a half shuffle with the high mask after shifting its values down.
13817 for (int &M : HiMask)
13818 if (M >= 0)
13819 M -= 4;
13820 if (!isNoopShuffleMask(HiMask))
13821 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13822 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13823
13824 return V;
13825}
13826
13827/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13828/// blend if only one input is used.
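// Added note (not upstream text): PSHUFB zeroes any destination byte whose
// control byte has its top bit set, which is why the code below uses 0x80
// for lanes sourced from the other operand; at the intrinsic level the
// two-input case is roughly
// _mm_or_si128(_mm_shuffle_epi8(V1, M1), _mm_shuffle_epi8(V2, M2)), where
// M1/M2 hold 0x80 wherever the byte comes from the other source.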
13829static SDValue lowerShuffleAsBlendOfPSHUFBs(
13830 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13831 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13832 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13833 "Lane crossing shuffle masks not supported");
13834
13835 int NumBytes = VT.getSizeInBits() / 8;
13836 int Size = Mask.size();
13837 int Scale = NumBytes / Size;
13838
13839 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13840 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13841 V1InUse = false;
13842 V2InUse = false;
13843
13844 for (int i = 0; i < NumBytes; ++i) {
13845 int M = Mask[i / Scale];
13846 if (M < 0)
13847 continue;
13848
13849 const int ZeroMask = 0x80;
13850 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13851 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13852 if (Zeroable[i / Scale])
13853 V1Idx = V2Idx = ZeroMask;
13854
13855 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13856 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13857 V1InUse |= (ZeroMask != V1Idx);
13858 V2InUse |= (ZeroMask != V2Idx);
13859 }
13860
13861 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13862 if (V1InUse)
13863 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13864 DAG.getBuildVector(ShufVT, DL, V1Mask));
13865 if (V2InUse)
13866 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13867 DAG.getBuildVector(ShufVT, DL, V2Mask));
13868
13869 // If we need shuffled inputs from both, blend the two.
13870 SDValue V;
13871 if (V1InUse && V2InUse)
13872 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13873 else
13874 V = V1InUse ? V1 : V2;
13875
13876 // Cast the result back to the correct type.
13877 return DAG.getBitcast(VT, V);
13878}
13879
13880/// Generic lowering of 8-lane i16 shuffles.
13881///
13882/// This handles both single-input shuffles and combined shuffle/blends with
13883/// two inputs. The single input shuffles are immediately delegated to
13884/// a dedicated lowering routine.
13885///
13886/// The blends are lowered in one of three fundamental ways. If there are few
13887/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13888/// of the input is significantly cheaper when lowered as an interleaving of
13889/// the two inputs, try to interleave them. Otherwise, blend the low and high
13890/// halves of the inputs separately (making them have relatively few inputs)
13891/// and then concatenate them.
13892static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13893 const APInt &Zeroable, SDValue V1, SDValue V2,
13894 const X86Subtarget &Subtarget,
13895 SelectionDAG &DAG) {
13896 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13897 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13898 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13899
13900 // Whenever we can lower this as a zext, that instruction is strictly faster
13901 // than any alternative.
13902 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13903 Zeroable, Subtarget, DAG))
13904 return ZExt;
13905
13906 // Try to lower using a truncation.
13907 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13908 Subtarget, DAG))
13909 return V;
13910
13911 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13912
13913 if (NumV2Inputs == 0) {
13914 // Try to use shift instructions.
13915 if (SDValue Shift =
13916 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13917 Subtarget, DAG, /*BitwiseOnly*/ false))
13918 return Shift;
13919
13920 // Check for being able to broadcast a single element.
13921 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13922 Mask, Subtarget, DAG))
13923 return Broadcast;
13924
13925 // Try to use bit rotation instructions.
13926 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13927 Subtarget, DAG))
13928 return Rotate;
13929
13930 // Use dedicated unpack instructions for masks that match their pattern.
13931 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13932 return V;
13933
13934 // Use dedicated pack instructions for masks that match their pattern.
13935 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13936 Subtarget))
13937 return V;
13938
13939 // Try to use byte rotation instructions.
13940 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13941 Subtarget, DAG))
13942 return Rotate;
13943
13944 // Make a copy of the mask so it can be modified.
13945 SmallVector<int, 8> MutableMask(Mask);
13946 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13947 Subtarget, DAG);
13948 }
13949
13950 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13951 "All single-input shuffles should be canonicalized to be V1-input "
13952 "shuffles.");
13953
13954 // Try to use shift instructions.
13955 if (SDValue Shift =
13956 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13957 DAG, /*BitwiseOnly*/ false))
13958 return Shift;
13959
13960 // See if we can use SSE4A Extraction / Insertion.
13961 if (Subtarget.hasSSE4A())
13962 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13963 Zeroable, DAG))
13964 return V;
13965
13966 // There are special ways we can lower some single-element blends.
13967 if (NumV2Inputs == 1)
13968 if (SDValue V = lowerShuffleAsElementInsertion(
13969 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13970 return V;
13971
13972 // We have different paths for blend lowering, but they all must use the
13973 // *exact* same predicate.
13974 bool IsBlendSupported = Subtarget.hasSSE41();
13975 if (IsBlendSupported)
13976 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13977 Zeroable, Subtarget, DAG))
13978 return Blend;
13979
13980 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13981 Zeroable, Subtarget, DAG))
13982 return Masked;
13983
13984 // Use dedicated unpack instructions for masks that match their pattern.
13985 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13986 return V;
13987
13988 // Use dedicated pack instructions for masks that match their pattern.
13989 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13990 Subtarget))
13991 return V;
13992
13993 // Try to lower using a truncation.
13994 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13995 Subtarget, DAG))
13996 return V;
13997
13998 // Try to use byte rotation instructions.
13999 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14000 Subtarget, DAG))
14001 return Rotate;
14002
14003 if (SDValue BitBlend =
14004 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14005 return BitBlend;
14006
14007 // Try to use byte shift instructions to mask.
14008 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14009 Zeroable, Subtarget, DAG))
14010 return V;
14011
14012 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14013 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14014 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14015 !Subtarget.hasVLX()) {
14016 // Check if this is part of a 256-bit vector truncation.
14017 unsigned PackOpc = 0;
14018 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14021 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14022 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14023 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14024 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14025 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14026 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14027 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14028 PackOpc = X86ISD::PACKUS;
14029 } else if (Subtarget.hasSSE41()) {
14030 SmallVector<SDValue, 4> DWordClearOps(4,
14031 DAG.getConstant(0, DL, MVT::i32));
14032 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14033 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14034 SDValue DWordClearMask =
14035 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14036 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14037 DWordClearMask);
14038 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14039 DWordClearMask);
14040 PackOpc = X86ISD::PACKUS;
14041 } else if (!Subtarget.hasSSSE3()) {
14042 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14043 V1 = DAG.getBitcast(MVT::v4i32, V1);
14044 V2 = DAG.getBitcast(MVT::v4i32, V2);
14045 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14046 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14047 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14048 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14049 PackOpc = X86ISD::PACKSS;
14050 }
14051 if (PackOpc) {
14052 // Now pack things back together.
14053 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14054 if (NumEvenDrops == 2) {
14055 Result = DAG.getBitcast(MVT::v4i32, Result);
14056 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14057 }
14058 return Result;
14059 }
14060 }
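// Worked example (added, illustrative): the even-compaction mask
// <0, 2, 4, 6, 8, 10, 12, 14> gives NumEvenDrops == 1, so on SSE4.1 both
// inputs are ANDed with a {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF} dword mask to
// clear the odd words and PACKUSDW concatenates the survivors, roughly
// _mm_packus_epi32(_mm_and_si128(a, m), _mm_and_si128(b, m)).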
14061
14062 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14063 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14064 if (NumOddDrops == 1) {
14065 bool HasSSE41 = Subtarget.hasSSE41();
14066 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14067 DAG.getBitcast(MVT::v4i32, V1),
14068 DAG.getTargetConstant(16, DL, MVT::i8));
14069 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14070 DAG.getBitcast(MVT::v4i32, V2),
14071 DAG.getTargetConstant(16, DL, MVT::i8));
14072 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14073 MVT::v8i16, V1, V2);
14074 }
14075
14076 // Try to lower by permuting the inputs into an unpack instruction.
14077 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14078 Mask, Subtarget, DAG))
14079 return Unpack;
14080
14081 // If we can't directly blend but can use PSHUFB, that will be better as it
14082 // can both shuffle and set up the inefficient blend.
14083 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14084 bool V1InUse, V2InUse;
14085 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14086 Zeroable, DAG, V1InUse, V2InUse);
14087 }
14088
14089 // We can always bit-blend if we have to so the fallback strategy is to
14090 // decompose into single-input permutes and blends/unpacks.
14091 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14092 Mask, Subtarget, DAG);
14093}
14094
14095/// Lower 8-lane 16-bit floating point shuffles.
14096static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14097 const APInt &Zeroable, SDValue V1, SDValue V2,
14098 const X86Subtarget &Subtarget,
14099 SelectionDAG &DAG) {
14100 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14101 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14102 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14103 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14104
14105 if (Subtarget.hasFP16()) {
14106 if (NumV2Elements == 0) {
14107 // Check for being able to broadcast a single element.
14108 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14109 Mask, Subtarget, DAG))
14110 return Broadcast;
14111 }
14112 if (NumV2Elements == 1 && Mask[0] >= 8)
14113 if (SDValue V = lowerShuffleAsElementInsertion(
14114 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14115 return V;
14116 }
14117
14118 V1 = DAG.getBitcast(MVT::v8i16, V1);
14119 V2 = DAG.getBitcast(MVT::v8i16, V2);
14120 return DAG.getBitcast(MVT::v8f16,
14121 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14122}
14123
14124 // Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
14125 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14126// the active subvector is extracted.
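// Added note (illustrative): e.g. a unary v16i8 shuffle on an AVX512VBMI
// target without VLX is widened here to v64i8, emitted as one VPERMB
// (X86ISD::VPERMV), and the low 128 bits are extracted back out; for
// two-input shuffles the V2 indices are rebased below so they still point
// at the second operand of the widened VPERMV3.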
14127static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14128 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14129 const X86Subtarget &Subtarget,
14130 SelectionDAG &DAG) {
14131 MVT MaskVT = VT.changeTypeToInteger();
14132 SDValue MaskNode;
14133 MVT ShuffleVT = VT;
14134 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14135 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14136 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14137 ShuffleVT = V1.getSimpleValueType();
14138
14139 // Adjust mask to correct indices for the second input.
14140 int NumElts = VT.getVectorNumElements();
14141 unsigned Scale = 512 / VT.getSizeInBits();
14142 SmallVector<int, 32> AdjustedMask(Mask);
14143 for (int &M : AdjustedMask)
14144 if (NumElts <= M)
14145 M += (Scale - 1) * NumElts;
14146 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14147 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14148 } else {
14149 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14150 }
14151
14152 SDValue Result;
14153 if (V2.isUndef())
14154 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14155 else
14156 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14157
14158 if (VT != ShuffleVT)
14159 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14160
14161 return Result;
14162}
14163
14164/// Generic lowering of v16i8 shuffles.
14165///
14166/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14167/// detect any complexity reducing interleaving. If that doesn't help, it uses
14168/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14169/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14170/// back together.
14171static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14172 const APInt &Zeroable, SDValue V1, SDValue V2,
14173 const X86Subtarget &Subtarget,
14174 SelectionDAG &DAG) {
14175 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14176 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14177 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14178
14179 // Try to use shift instructions.
14180 if (SDValue Shift =
14181 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14182 DAG, /*BitwiseOnly*/ false))
14183 return Shift;
14184
14185 // Try to use byte rotation instructions.
14186 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14187 Subtarget, DAG))
14188 return Rotate;
14189
14190 // Use dedicated pack instructions for masks that match their pattern.
14191 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14192 Subtarget))
14193 return V;
14194
14195 // Try to use a zext lowering.
14196 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14197 Zeroable, Subtarget, DAG))
14198 return ZExt;
14199
14200 // Try to lower using a truncation.
14201 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14202 Subtarget, DAG))
14203 return V;
14204
14205 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14206 Subtarget, DAG))
14207 return V;
14208
14209 // See if we can use SSE4A Extraction / Insertion.
14210 if (Subtarget.hasSSE4A())
14211 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14212 Zeroable, DAG))
14213 return V;
14214
14215 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14216
14217 // For single-input shuffles, there are some nicer lowering tricks we can use.
14218 if (NumV2Elements == 0) {
14219 // Check for being able to broadcast a single element.
14220 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14221 Mask, Subtarget, DAG))
14222 return Broadcast;
14223
14224 // Try to use bit rotation instructions.
14225 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14226 Subtarget, DAG))
14227 return Rotate;
14228
14229 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14230 return V;
14231
14232 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14233 // Notably, this handles splat and partial-splat shuffles more efficiently.
14234 // However, it only makes sense if the pre-duplication shuffle simplifies
14235 // things significantly. Currently, this means we need to be able to
14236 // express the pre-duplication shuffle as an i16 shuffle.
14237 //
14238 // FIXME: We should check for other patterns which can be widened into an
14239 // i16 shuffle as well.
14240 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14241 for (int i = 0; i < 16; i += 2)
14242 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14243 return false;
14244
14245 return true;
14246 };
14247 auto tryToWidenViaDuplication = [&]() -> SDValue {
14248 if (!canWidenViaDuplication(Mask))
14249 return SDValue();
14250 SmallVector<int, 4> LoInputs;
14251 copy_if(Mask, std::back_inserter(LoInputs),
14252 [](int M) { return M >= 0 && M < 8; });
14253 array_pod_sort(LoInputs.begin(), LoInputs.end());
14254 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14255 SmallVector<int, 4> HiInputs;
14256 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14257 array_pod_sort(HiInputs.begin(), HiInputs.end());
14258 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14259
14260 bool TargetLo = LoInputs.size() >= HiInputs.size();
14261 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14262 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14263
14264 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14266 for (int I : InPlaceInputs) {
14267 PreDupI16Shuffle[I/2] = I/2;
14268 LaneMap[I] = I;
14269 }
14270 int j = TargetLo ? 0 : 4, je = j + 4;
14271 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14272 // Check if j is already a shuffle of this input. This happens when
14273 // there are two adjacent bytes after we move the low one.
14274 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14275 // If we haven't yet mapped the input, search for a slot into which
14276 // we can map it.
14277 while (j < je && PreDupI16Shuffle[j] >= 0)
14278 ++j;
14279
14280 if (j == je)
14281 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14282 return SDValue();
14283
14284 // Map this input with the i16 shuffle.
14285 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14286 }
14287
14288 // Update the lane map based on the mapping we ended up with.
14289 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14290 }
14291 V1 = DAG.getBitcast(
14292 MVT::v16i8,
14293 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14294 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14295
14296 // Unpack the bytes to form the i16s that will be shuffled into place.
14297 bool EvenInUse = false, OddInUse = false;
14298 for (int i = 0; i < 16; i += 2) {
14299 EvenInUse |= (Mask[i + 0] >= 0);
14300 OddInUse |= (Mask[i + 1] >= 0);
14301 if (EvenInUse && OddInUse)
14302 break;
14303 }
14304 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14305 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14306 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14307
14308 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14309 for (int i = 0; i < 16; ++i)
14310 if (Mask[i] >= 0) {
14311 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14312 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14313 if (PostDupI16Shuffle[i / 2] < 0)
14314 PostDupI16Shuffle[i / 2] = MappedMask;
14315 else
14316 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14317 "Conflicting entries in the original shuffle!");
14318 }
14319 return DAG.getBitcast(
14320 MVT::v16i8,
14321 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14322 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14323 };
14324 if (SDValue V = tryToWidenViaDuplication())
14325 return V;
14326 }
14327
14328 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14329 Zeroable, Subtarget, DAG))
14330 return Masked;
14331
14332 // Use dedicated unpack instructions for masks that match their pattern.
14333 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14334 return V;
14335
14336 // Try to use byte shift instructions to mask.
14337 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14338 Zeroable, Subtarget, DAG))
14339 return V;
14340
14341 // Check for compaction patterns.
14342 bool IsSingleInput = V2.isUndef();
14343 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14344
14345 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14346 // with PSHUFB. It is important to do this before we attempt to generate any
14347 // blends but after all of the single-input lowerings. If the single input
14348 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14349 // want to preserve that and we can DAG combine any longer sequences into
14350 // a PSHUFB in the end. But once we start blending from multiple inputs,
14351 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14352 // and there are *very* few patterns that would actually be faster than the
14353 // PSHUFB approach because of its ability to zero lanes.
14354 //
14355 // If the mask is a binary compaction, we can more efficiently perform this
14356 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14357 //
14358 // FIXME: The only exceptions to the above are blends which are exact
14359 // interleavings with direct instructions supporting them. We currently don't
14360 // handle those well here.
14361 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14362 bool V1InUse = false;
14363 bool V2InUse = false;
14364
14365     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14366         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14367
14368 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14369 // do so. This avoids using them to handle blends-with-zero which is
14370 // important as a single pshufb is significantly faster for that.
14371 if (V1InUse && V2InUse) {
14372 if (Subtarget.hasSSE41())
14373 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14374 Zeroable, Subtarget, DAG))
14375 return Blend;
14376
14377       // We can use an unpack to do the blending rather than an OR in some
14378       // cases. Even though the OR may be (very slightly) more efficient, we
14379       // prefer this lowering because there are common cases where part of
14380 // the complexity of the shuffles goes away when we do the final blend as
14381 // an unpack.
14382 // FIXME: It might be worth trying to detect if the unpack-feeding
14383 // shuffles will both be pshufb, in which case we shouldn't bother with
14384 // this.
14385       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14386               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14387 return Unpack;
14388
14389 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14390 if (Subtarget.hasVBMI())
14391 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14392 DAG);
14393
14394 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14395 if (Subtarget.hasXOP()) {
14396 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14397 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14398 }
14399
14400 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14401 // PALIGNR will be cheaper than the second PSHUFB+OR.
14402     if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14403             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14404 return V;
14405 }
14406
14407 return PSHUFB;
14408 }
14409
14410 // There are special ways we can lower some single-element blends.
14411 if (NumV2Elements == 1)
14412   if (SDValue V = lowerShuffleAsElementInsertion(
14413           DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14414 return V;
14415
14416 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14417 return Blend;
14418
14419 // Check whether a compaction lowering can be done. This handles shuffles
14420 // which take every Nth element for some even N. See the helper function for
14421 // details.
14422 //
14423 // We special case these as they can be particularly efficiently handled with
14424   // the PACKUSWB instruction on x86 and they show up in common patterns of
14425 // rearranging bytes to truncate wide elements.
14426 if (NumEvenDrops) {
14427     // NumEvenDrops is the log2 of the element stride. Another way of
14428     // thinking about it is that we need to drop the even elements this many
14429     // times to get the original input.
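    // e.g. for the binary compaction mask <0,2,4,...,30> (NumEvenDrops == 1),
    // clearing the high byte of every i16 lane of both inputs and emitting a
    // single PACKUSWB produces all sixteen even bytes in order.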
14430
14431 // First we need to zero all the dropped bytes.
14432 assert(NumEvenDrops <= 3 &&
14433 "No support for dropping even elements more than 3 times.");
14434 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14435 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14436 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14437 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14438 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14439 WordClearMask);
14440 if (!IsSingleInput)
14441 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14442 WordClearMask);
14443
14444 // Now pack things back together.
14445 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14446 IsSingleInput ? V1 : V2);
14447 for (int i = 1; i < NumEvenDrops; ++i) {
14448 Result = DAG.getBitcast(MVT::v8i16, Result);
14449 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14450 }
14451 return Result;
14452 }
14453
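  // For an odd compaction such as <1,3,5,...,31>, a logical right shift of
  // each i16 lane by 8 moves the odd bytes into the low byte position so a
  // single PACKUSWB can pack them together.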
14454 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14455 if (NumOddDrops == 1) {
14456 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14457 DAG.getBitcast(MVT::v8i16, V1),
14458 DAG.getTargetConstant(8, DL, MVT::i8));
14459 if (!IsSingleInput)
14460 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14461 DAG.getBitcast(MVT::v8i16, V2),
14462 DAG.getTargetConstant(8, DL, MVT::i8));
14463 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14464 IsSingleInput ? V1 : V2);
14465 }
14466
14467 // Handle multi-input cases by blending/unpacking single-input shuffles.
14468 if (NumV2Elements > 0)
14469 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14470 Subtarget, DAG);
14471
14472 // The fallback path for single-input shuffles widens this into two v8i16
14473 // vectors with unpacks, shuffles those, and then pulls them back together
14474 // with a pack.
14475 SDValue V = V1;
14476
14477 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14478 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14479 for (int i = 0; i < 16; ++i)
14480 if (Mask[i] >= 0)
14481 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14482
14483 SDValue VLoHalf, VHiHalf;
14484 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14485 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14486 // i16s.
14487 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14488 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14489 // Use a mask to drop the high bytes.
14490 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14491 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14492 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14493
14494 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14495 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14496
14497 // Squash the masks to point directly into VLoHalf.
14498 for (int &M : LoBlendMask)
14499 if (M >= 0)
14500 M /= 2;
14501 for (int &M : HiBlendMask)
14502 if (M >= 0)
14503 M /= 2;
14504 } else {
14505 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14506 // VHiHalf so that we can blend them as i16s.
14507 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14508
14509 VLoHalf = DAG.getBitcast(
14510 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14511 VHiHalf = DAG.getBitcast(
14512 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14513 }
14514
14515 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14516 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14517
14518 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14519}
14520
14521/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14522///
14523/// This routine breaks down the specific type of 128-bit shuffle and
14524/// dispatches to the lowering routines accordingly.
14525 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14526                                   MVT VT, SDValue V1, SDValue V2,
14527 const APInt &Zeroable,
14528 const X86Subtarget &Subtarget,
14529 SelectionDAG &DAG) {
14530 if (VT == MVT::v8bf16) {
14531 V1 = DAG.getBitcast(MVT::v8i16, V1);
14532 V2 = DAG.getBitcast(MVT::v8i16, V2);
14533 return DAG.getBitcast(VT,
14534 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14535 }
14536
14537 switch (VT.SimpleTy) {
14538 case MVT::v2i64:
14539 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14540 case MVT::v2f64:
14541 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14542 case MVT::v4i32:
14543 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14544 case MVT::v4f32:
14545 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14546 case MVT::v8i16:
14547 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14548 case MVT::v8f16:
14549 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14550 case MVT::v16i8:
14551 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14552
14553 default:
14554 llvm_unreachable("Unimplemented!");
14555 }
14556}
14557
14558/// Generic routine to split vector shuffle into half-sized shuffles.
14559///
14560/// This routine just extracts two subvectors, shuffles them independently, and
14561/// then concatenates them back together. This should work effectively with all
14562/// AVX vector shuffle types.
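/// e.g. a v8i32 shuffle is split into two v4i32 shuffles, each built as a
/// blend of the low/high halves of V1 and V2, and the two results are
/// concatenated back together.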
14563 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14564                                     SDValue V2, ArrayRef<int> Mask,
14565 SelectionDAG &DAG, bool SimpleOnly) {
14566 assert(VT.getSizeInBits() >= 256 &&
14567 "Only for 256-bit or wider vector shuffles!");
14568 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14569 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14570
14571 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14572 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14573
14574 int NumElements = VT.getVectorNumElements();
14575 int SplitNumElements = NumElements / 2;
14576 MVT ScalarVT = VT.getVectorElementType();
14577 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14578
14579 // Use splitVector/extractSubVector so that split build-vectors just build two
14580 // narrower build vectors. This helps shuffling with splats and zeros.
14581 auto SplitVector = [&](SDValue V) {
14582 SDValue LoV, HiV;
14583 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14584 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14585 DAG.getBitcast(SplitVT, HiV));
14586 };
14587
14588 SDValue LoV1, HiV1, LoV2, HiV2;
14589 std::tie(LoV1, HiV1) = SplitVector(V1);
14590 std::tie(LoV2, HiV2) = SplitVector(V2);
14591
14592 // Now create two 4-way blends of these half-width vectors.
14593 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14594 bool &UseHiV1, bool &UseLoV2,
14595 bool &UseHiV2) {
14596 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14597 for (int i = 0; i < SplitNumElements; ++i) {
14598 int M = HalfMask[i];
14599 if (M >= NumElements) {
14600 if (M >= NumElements + SplitNumElements)
14601 UseHiV2 = true;
14602 else
14603 UseLoV2 = true;
14604 } else if (M >= 0) {
14605 if (M >= SplitNumElements)
14606 UseHiV1 = true;
14607 else
14608 UseLoV1 = true;
14609 }
14610 }
14611 };
14612
14613 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14614 if (!SimpleOnly)
14615 return true;
14616
14617 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14618 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14619
14620 return !(UseHiV1 || UseHiV2);
14621 };
14622
14623 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14624 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14625 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14626 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14627 for (int i = 0; i < SplitNumElements; ++i) {
14628 int M = HalfMask[i];
14629 if (M >= NumElements) {
14630 V2BlendMask[i] = M - NumElements;
14631 BlendMask[i] = SplitNumElements + i;
14632 } else if (M >= 0) {
14633 V1BlendMask[i] = M;
14634 BlendMask[i] = i;
14635 }
14636 }
14637
14638 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14639 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14640
14641 // Because the lowering happens after all combining takes place, we need to
14642 // manually combine these blend masks as much as possible so that we create
14643 // a minimal number of high-level vector shuffle nodes.
14644 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14645
14646 // First try just blending the halves of V1 or V2.
14647 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14648 return DAG.getUNDEF(SplitVT);
14649 if (!UseLoV2 && !UseHiV2)
14650 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14651 if (!UseLoV1 && !UseHiV1)
14652 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14653
14654 SDValue V1Blend, V2Blend;
14655 if (UseLoV1 && UseHiV1) {
14656 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14657 } else {
14658 // We only use half of V1 so map the usage down into the final blend mask.
14659 V1Blend = UseLoV1 ? LoV1 : HiV1;
14660 for (int i = 0; i < SplitNumElements; ++i)
14661 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14662 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14663 }
14664 if (UseLoV2 && UseHiV2) {
14665 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14666 } else {
14667 // We only use half of V2 so map the usage down into the final blend mask.
14668 V2Blend = UseLoV2 ? LoV2 : HiV2;
14669 for (int i = 0; i < SplitNumElements; ++i)
14670 if (BlendMask[i] >= SplitNumElements)
14671 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14672 }
14673 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14674 };
14675
14676 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14677 return SDValue();
14678
14679 SDValue Lo = HalfBlend(LoMask);
14680 SDValue Hi = HalfBlend(HiMask);
14681 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14682}
14683
14684/// Either split a vector in halves or decompose the shuffles and the
14685/// blend/unpack.
14686///
14687/// This is provided as a good fallback for many lowerings of non-single-input
14688/// shuffles with more than one 128-bit lane. In those cases, we want to select
14689/// between splitting the shuffle into 128-bit components and stitching those
14690/// back together vs. extracting the single-input shuffles and blending those
14691/// results.
14692 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14693                                           SDValue V2, ArrayRef<int> Mask,
14694 const X86Subtarget &Subtarget,
14695 SelectionDAG &DAG) {
14696 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14697 "shuffles as it could then recurse on itself.");
14698 int Size = Mask.size();
14699
14700 // If this can be modeled as a broadcast of two elements followed by a blend,
14701 // prefer that lowering. This is especially important because broadcasts can
14702 // often fold with memory operands.
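  // e.g. a v8i32 mask <0,8,0,8,0,8,0,8> is a broadcast of V1[0] and V2[0]
  // followed by an alternating blend.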
14703 auto DoBothBroadcast = [&] {
14704 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14705 for (int M : Mask)
14706 if (M >= Size) {
14707 if (V2BroadcastIdx < 0)
14708 V2BroadcastIdx = M - Size;
14709 else if (M - Size != V2BroadcastIdx)
14710 return false;
14711 } else if (M >= 0) {
14712 if (V1BroadcastIdx < 0)
14713 V1BroadcastIdx = M;
14714 else if (M != V1BroadcastIdx)
14715 return false;
14716 }
14717 return true;
14718 };
14719 if (DoBothBroadcast())
14720 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14721 DAG);
14722
14723 // If the inputs all stem from a single 128-bit lane of each input, then we
14724 // split them rather than blending because the split will decompose to
14725 // unusually few instructions.
14726 int LaneCount = VT.getSizeInBits() / 128;
14727 int LaneSize = Size / LaneCount;
14728 SmallBitVector LaneInputs[2];
14729 LaneInputs[0].resize(LaneCount, false);
14730 LaneInputs[1].resize(LaneCount, false);
14731 for (int i = 0; i < Size; ++i)
14732 if (Mask[i] >= 0)
14733 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14734 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14735 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14736 /*SimpleOnly*/ false);
14737
14738 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14739 // requires that the decomposed single-input shuffles don't end up here.
14740 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14741 DAG);
14742}
14743
14744// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14745// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14746 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14747                                                  SDValue V1, SDValue V2,
14748 ArrayRef<int> Mask,
14749 SelectionDAG &DAG) {
14750 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14751
14752 int LHSMask[4] = {-1, -1, -1, -1};
14753 int RHSMask[4] = {-1, -1, -1, -1};
14754 unsigned SHUFPMask = 0;
14755
14756 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14757 // perform the shuffle once the lanes have been shuffled in place.
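  // e.g. a v4f64 mask <2,4,1,7> becomes LHSMask <2,u,u,1>, RHSMask <4,u,u,7>
  // and a SHUFPD immediate of 0b1100.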
14758 for (int i = 0; i != 4; ++i) {
14759 int M = Mask[i];
14760 if (M < 0)
14761 continue;
14762 int LaneBase = i & ~1;
14763 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14764 LaneMask[LaneBase + (M & 1)] = M;
14765 SHUFPMask |= (M & 1) << i;
14766 }
14767
14768 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14769 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14770 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14771 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14772}
14773
14774/// Lower a vector shuffle crossing multiple 128-bit lanes as
14775/// a lane permutation followed by a per-lane permutation.
14776///
14777/// This is mainly for cases where we can have non-repeating permutes
14778/// in each lane.
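/// e.g. a v8f32 reverse <7,6,5,4,3,2,1,0> becomes a swap of the two 128-bit
/// lanes followed by an in-lane reverse of each lane.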
14779///
14780/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14781/// we should investigate merging them.
14782 static SDValue lowerShuffleAsLanePermuteAndPermute(
14783     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14784 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14785 int NumElts = VT.getVectorNumElements();
14786 int NumLanes = VT.getSizeInBits() / 128;
14787 int NumEltsPerLane = NumElts / NumLanes;
14788 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14789
14790 /// Attempts to find a sublane permute with the given size
14791 /// that gets all elements into their target lanes.
14792 ///
14793 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14794 /// If unsuccessful, returns false and may overwrite InLaneMask.
14795 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14796 int NumSublanesPerLane = NumSublanes / NumLanes;
14797 int NumEltsPerSublane = NumElts / NumSublanes;
14798
14799 SmallVector<int, 16> CrossLaneMask;
14800 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14801 // CrossLaneMask but one entry == one sublane.
14802 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14803
14804 for (int i = 0; i != NumElts; ++i) {
14805 int M = Mask[i];
14806 if (M < 0)
14807 continue;
14808
14809 int SrcSublane = M / NumEltsPerSublane;
14810 int DstLane = i / NumEltsPerLane;
14811
14812 // We only need to get the elements into the right lane, not sublane.
14813 // So search all sublanes that make up the destination lane.
14814 bool Found = false;
14815 int DstSubStart = DstLane * NumSublanesPerLane;
14816 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14817 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14818 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14819 continue;
14820
14821 Found = true;
14822 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14823 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14824 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14825 break;
14826 }
14827 if (!Found)
14828 return SDValue();
14829 }
14830
14831 // Fill CrossLaneMask using CrossLaneMaskLarge.
14832 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14833
14834 if (!CanUseSublanes) {
14835 // If we're only shuffling a single lowest lane and the rest are identity
14836 // then don't bother.
14837 // TODO - isShuffleMaskInputInPlace could be extended to something like
14838 // this.
14839 int NumIdentityLanes = 0;
14840 bool OnlyShuffleLowestLane = true;
14841 for (int i = 0; i != NumLanes; ++i) {
14842 int LaneOffset = i * NumEltsPerLane;
14843 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14844 i * NumEltsPerLane))
14845 NumIdentityLanes++;
14846 else if (CrossLaneMask[LaneOffset] != 0)
14847 OnlyShuffleLowestLane = false;
14848 }
14849 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14850 return SDValue();
14851 }
14852
14853 // Avoid returning the same shuffle operation. For example,
14854 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14855 // undef:v16i16
14856 if (CrossLaneMask == Mask || InLaneMask == Mask)
14857 return SDValue();
14858
14859 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14860 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14861 InLaneMask);
14862 };
14863
14864 // First attempt a solution with full lanes.
14865 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14866 return V;
14867
14868 // The rest of the solutions use sublanes.
14869 if (!CanUseSublanes)
14870 return SDValue();
14871
14872 // Then attempt a solution with 64-bit sublanes (vpermq).
14873 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14874 return V;
14875
14876 // If that doesn't work and we have fast variable cross-lane shuffle,
14877 // attempt 32-bit sublanes (vpermd).
14878 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14879 return SDValue();
14880
14881 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14882}
14883
14884 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
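/// e.g. for a v8i32 mask <4,5,0,1,2,3,6,7> with LaneSize 4 this produces
/// <8,9,0,1,14,15,6,7>: lane-crossing elements are redirected to the matching
/// in-lane slot of the (flipped) second operand.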
14885static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14886 SmallVector<int> &InLaneMask) {
14887 int Size = Mask.size();
14888 InLaneMask.assign(Mask.begin(), Mask.end());
14889 for (int i = 0; i < Size; ++i) {
14890 int &M = InLaneMask[i];
14891 if (M < 0)
14892 continue;
14893 if (((M % Size) / LaneSize) != (i / LaneSize))
14894 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14895 }
14896}
14897
14898/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14899/// source with a lane permutation.
14900///
14901/// This lowering strategy results in four instructions in the worst case for a
14902/// single-input cross lane shuffle which is lower than any other fully general
14903/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14904/// shuffle pattern should be handled prior to trying this lowering.
14905 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14906     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14907 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14908 // FIXME: This should probably be generalized for 512-bit vectors as well.
14909 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14910 int Size = Mask.size();
14911 int LaneSize = Size / 2;
14912
14913 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14914 // Only do this if the elements aren't all from the lower lane,
14915 // otherwise we're (probably) better off doing a split.
14916 if (VT == MVT::v4f64 &&
14917 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14918 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14919
14920 // If there are only inputs from one 128-bit lane, splitting will in fact be
14921 // less expensive. The flags track whether the given lane contains an element
14922 // that crosses to another lane.
14923 bool AllLanes;
14924 if (!Subtarget.hasAVX2()) {
14925 bool LaneCrossing[2] = {false, false};
14926 for (int i = 0; i < Size; ++i)
14927 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14928 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14929 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14930 } else {
14931 bool LaneUsed[2] = {false, false};
14932 for (int i = 0; i < Size; ++i)
14933 if (Mask[i] >= 0)
14934 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14935 AllLanes = LaneUsed[0] && LaneUsed[1];
14936 }
14937
14938 // TODO - we could support shuffling V2 in the Flipped input.
14939 assert(V2.isUndef() &&
14940 "This last part of this routine only works on single input shuffles");
14941
14942 SmallVector<int> InLaneMask;
14943 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14944
14945 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14946 "In-lane shuffle mask expected");
14947
14948   // If we're not using both 128-bit lanes and the in-lane mask is not
14949   // repeating, then we're better off splitting.
14950 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14951 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14952 /*SimpleOnly*/ false);
14953
14954 // Flip the lanes, and shuffle the results which should now be in-lane.
14955 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14956 SDValue Flipped = DAG.getBitcast(PVT, V1);
14957 Flipped =
14958 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14959 Flipped = DAG.getBitcast(VT, Flipped);
14960 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14961}
14962
14963/// Handle lowering 2-lane 128-bit shuffles.
14964 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14965                                   SDValue V2, ArrayRef<int> Mask,
14966 const APInt &Zeroable,
14967 const X86Subtarget &Subtarget,
14968 SelectionDAG &DAG) {
14969 if (V2.isUndef()) {
14970 // Attempt to match VBROADCAST*128 subvector broadcast load.
14971 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14972 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14973 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14974         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14975       MVT MemVT = VT.getHalfNumVectorElementsVT();
14976 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14977 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14978       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14979                                              VT, MemVT, Ld, Ofs, DAG))
14980 return BcstLd;
14981 }
14982
14983 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14984 if (Subtarget.hasAVX2())
14985 return SDValue();
14986 }
14987
14988 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14989
14990 SmallVector<int, 4> WidenedMask;
14991 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14992 return SDValue();
14993
14994 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14995 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14996
14997 // Try to use an insert into a zero vector.
14998 if (WidenedMask[0] == 0 && IsHighZero) {
14999 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15000 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15001 DAG.getIntPtrConstant(0, DL));
15002 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15003 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15004 DAG.getIntPtrConstant(0, DL));
15005 }
15006
15007 // TODO: If minimizing size and one of the inputs is a zero vector and the
15008   // zero vector has only one use, we could use a VPERM2X128 to save the
15009 // instruction bytes needed to explicitly generate the zero vector.
15010
15011 // Blends are faster and handle all the non-lane-crossing cases.
15012 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15013 Subtarget, DAG))
15014 return Blend;
15015
15016 // If either input operand is a zero vector, use VPERM2X128 because its mask
15017 // allows us to replace the zero input with an implicit zero.
15018 if (!IsLowZero && !IsHighZero) {
15019 // Check for patterns which can be matched with a single insert of a 128-bit
15020 // subvector.
15021 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15022 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15023
15024 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15025 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15026 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15027 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15028 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15029 OnlyUsesV1 ? V1 : V2,
15030 DAG.getIntPtrConstant(0, DL));
15031 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15032 DAG.getIntPtrConstant(2, DL));
15033 }
15034 }
15035
15036 // Try to use SHUF128 if possible.
15037 if (Subtarget.hasVLX()) {
15038 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15039 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15040 ((WidenedMask[1] % 2) << 1);
15041 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15042 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15043 }
15044 }
15045 }
15046
15047 // Otherwise form a 128-bit permutation. After accounting for undefs,
15048 // convert the 64-bit shuffle mask selection values into 128-bit
15049 // selection bits by dividing the indexes by 2 and shifting into positions
15050 // defined by a vperm2*128 instruction's immediate control byte.
15051
15052 // The immediate permute control byte looks like this:
15053 // [1:0] - select 128 bits from sources for low half of destination
15054 // [2] - ignore
15055 // [3] - zero low half of destination
15056 // [5:4] - select 128 bits from sources for high half of destination
15057 // [6] - ignore
15058 // [7] - zero high half of destination
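  // e.g. WidenedMask <0,3> (low half of V1, high half of V2) encodes as 0x30,
  // while a zeroable high half sets bit 7, e.g. giving 0x81 for <1,zero>.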
15059
15060 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15061 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15062
15063 unsigned PermMask = 0;
15064 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15065 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15066
15067 // Check the immediate mask and replace unused sources with undef.
15068 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15069 V1 = DAG.getUNDEF(VT);
15070 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15071 V2 = DAG.getUNDEF(VT);
15072
15073 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15074 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15075}
15076
15077/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15078/// shuffling each lane.
15079///
15080/// This attempts to create a repeated lane shuffle where each lane uses one
15081/// or two of the lanes of the inputs. The lanes of the input vectors are
15082/// shuffled in one or two independent shuffles to get the lanes into the
15083/// position needed by the final shuffle.
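/// e.g. for a v8f32 mask <0,12,1,13,4,8,5,9>, the two 128-bit lanes of V2 are
/// swapped in place and the per-lane repeating mask <0,8,1,9> (an unpcklps
/// pattern) is then applied to V1 and the swapped V2.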
15084 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15085     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15086 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15087 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15088
15089 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15090 return SDValue();
15091
15092 int NumElts = Mask.size();
15093 int NumLanes = VT.getSizeInBits() / 128;
15094 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15095 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15096 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15097
15098 // First pass will try to fill in the RepeatMask from lanes that need two
15099 // sources.
15100 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15101 int Srcs[2] = {-1, -1};
15102 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15103 for (int i = 0; i != NumLaneElts; ++i) {
15104 int M = Mask[(Lane * NumLaneElts) + i];
15105 if (M < 0)
15106 continue;
15107 // Determine which of the possible input lanes (NumLanes from each source)
15108 // this element comes from. Assign that as one of the sources for this
15109       // lane. We can assign up to 2 sources for this lane. If we run out of
15110       // sources we can't do anything.
15111 int LaneSrc = M / NumLaneElts;
15112 int Src;
15113 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15114 Src = 0;
15115 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15116 Src = 1;
15117 else
15118 return SDValue();
15119
15120 Srcs[Src] = LaneSrc;
15121 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15122 }
15123
15124 // If this lane has two sources, see if it fits with the repeat mask so far.
15125 if (Srcs[1] < 0)
15126 continue;
15127
15128 LaneSrcs[Lane][0] = Srcs[0];
15129 LaneSrcs[Lane][1] = Srcs[1];
15130
15131 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15132 assert(M1.size() == M2.size() && "Unexpected mask size");
15133 for (int i = 0, e = M1.size(); i != e; ++i)
15134 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15135 return false;
15136 return true;
15137 };
15138
15139 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15140 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15141 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15142 int M = Mask[i];
15143 if (M < 0)
15144 continue;
15145 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15146 "Unexpected mask element");
15147 MergedMask[i] = M;
15148 }
15149 };
15150
15151 if (MatchMasks(InLaneMask, RepeatMask)) {
15152 // Merge this lane mask into the final repeat mask.
15153 MergeMasks(InLaneMask, RepeatMask);
15154 continue;
15155 }
15156
15157 // Didn't find a match. Swap the operands and try again.
15158 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15159     ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15160
15161 if (MatchMasks(InLaneMask, RepeatMask)) {
15162 // Merge this lane mask into the final repeat mask.
15163 MergeMasks(InLaneMask, RepeatMask);
15164 continue;
15165 }
15166
15167 // Couldn't find a match with the operands in either order.
15168 return SDValue();
15169 }
15170
15171 // Now handle any lanes with only one source.
15172 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15173 // If this lane has already been processed, skip it.
15174 if (LaneSrcs[Lane][0] >= 0)
15175 continue;
15176
15177 for (int i = 0; i != NumLaneElts; ++i) {
15178 int M = Mask[(Lane * NumLaneElts) + i];
15179 if (M < 0)
15180 continue;
15181
15182 // If RepeatMask isn't defined yet we can define it ourself.
15183 if (RepeatMask[i] < 0)
15184 RepeatMask[i] = M % NumLaneElts;
15185
15186 if (RepeatMask[i] < NumElts) {
15187 if (RepeatMask[i] != M % NumLaneElts)
15188 return SDValue();
15189 LaneSrcs[Lane][0] = M / NumLaneElts;
15190 } else {
15191 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15192 return SDValue();
15193 LaneSrcs[Lane][1] = M / NumLaneElts;
15194 }
15195 }
15196
15197 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15198 return SDValue();
15199 }
15200
15201 SmallVector<int, 16> NewMask(NumElts, -1);
15202 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15203 int Src = LaneSrcs[Lane][0];
15204 for (int i = 0; i != NumLaneElts; ++i) {
15205 int M = -1;
15206 if (Src >= 0)
15207 M = Src * NumLaneElts + i;
15208 NewMask[Lane * NumLaneElts + i] = M;
15209 }
15210 }
15211 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15212 // Ensure we didn't get back the shuffle we started with.
15213 // FIXME: This is a hack to make up for some splat handling code in
15214 // getVectorShuffle.
15215 if (isa<ShuffleVectorSDNode>(NewV1) &&
15216 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15217 return SDValue();
15218
15219 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15220 int Src = LaneSrcs[Lane][1];
15221 for (int i = 0; i != NumLaneElts; ++i) {
15222 int M = -1;
15223 if (Src >= 0)
15224 M = Src * NumLaneElts + i;
15225 NewMask[Lane * NumLaneElts + i] = M;
15226 }
15227 }
15228 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15229 // Ensure we didn't get back the shuffle we started with.
15230 // FIXME: This is a hack to make up for some splat handling code in
15231 // getVectorShuffle.
15232 if (isa<ShuffleVectorSDNode>(NewV2) &&
15233 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15234 return SDValue();
15235
15236 for (int i = 0; i != NumElts; ++i) {
15237 if (Mask[i] < 0) {
15238 NewMask[i] = -1;
15239 continue;
15240 }
15241 NewMask[i] = RepeatMask[i % NumLaneElts];
15242 if (NewMask[i] < 0)
15243 continue;
15244
15245 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15246 }
15247 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15248}
15249
15250/// If the input shuffle mask results in a vector that is undefined in all upper
15251/// or lower half elements and that mask accesses only 2 halves of the
15252/// shuffle's operands, return true. A mask of half the width with mask indexes
15253/// adjusted to access the extracted halves of the original shuffle operands is
15254 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
15255 /// upper) of each input operand is accessed.
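/// e.g. for a v8i32 mask <u,u,u,u,2,3,8,9>, HalfMask becomes <2,3,4,5> with
/// HalfIdx1 == 0 (lower half of V1) and HalfIdx2 == 2 (lower half of V2).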
15256static bool
15257 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15258                    int &HalfIdx1, int &HalfIdx2) {
15259 assert((Mask.size() == HalfMask.size() * 2) &&
15260 "Expected input mask to be twice as long as output");
15261
15262 // Exactly one half of the result must be undef to allow narrowing.
15263 bool UndefLower = isUndefLowerHalf(Mask);
15264 bool UndefUpper = isUndefUpperHalf(Mask);
15265 if (UndefLower == UndefUpper)
15266 return false;
15267
15268 unsigned HalfNumElts = HalfMask.size();
15269 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15270 HalfIdx1 = -1;
15271 HalfIdx2 = -1;
15272 for (unsigned i = 0; i != HalfNumElts; ++i) {
15273 int M = Mask[i + MaskIndexOffset];
15274 if (M < 0) {
15275 HalfMask[i] = M;
15276 continue;
15277 }
15278
15279 // Determine which of the 4 half vectors this element is from.
15280 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15281 int HalfIdx = M / HalfNumElts;
15282
15283 // Determine the element index into its half vector source.
15284 int HalfElt = M % HalfNumElts;
15285
15286 // We can shuffle with up to 2 half vectors, set the new 'half'
15287 // shuffle mask accordingly.
15288 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15289 HalfMask[i] = HalfElt;
15290 HalfIdx1 = HalfIdx;
15291 continue;
15292 }
15293 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15294 HalfMask[i] = HalfElt + HalfNumElts;
15295 HalfIdx2 = HalfIdx;
15296 continue;
15297 }
15298
15299 // Too many half vectors referenced.
15300 return false;
15301 }
15302
15303 return true;
15304}
15305
15306/// Given the output values from getHalfShuffleMask(), create a half width
15307/// shuffle of extracted vectors followed by an insert back to full width.
15308 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15309                                      ArrayRef<int> HalfMask, int HalfIdx1,
15310 int HalfIdx2, bool UndefLower,
15311 SelectionDAG &DAG, bool UseConcat = false) {
15312 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15313 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15314
15315 MVT VT = V1.getSimpleValueType();
15316 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15317 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15318
15319 auto getHalfVector = [&](int HalfIdx) {
15320 if (HalfIdx < 0)
15321 return DAG.getUNDEF(HalfVT);
15322 SDValue V = (HalfIdx < 2 ? V1 : V2);
15323 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15325 DAG.getIntPtrConstant(HalfIdx, DL));
15326 };
15327
15328 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15329 SDValue Half1 = getHalfVector(HalfIdx1);
15330 SDValue Half2 = getHalfVector(HalfIdx2);
15331 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15332 if (UseConcat) {
15333 SDValue Op0 = V;
15334 SDValue Op1 = DAG.getUNDEF(HalfVT);
15335 if (UndefLower)
15336 std::swap(Op0, Op1);
15337 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15338 }
15339
15340 unsigned Offset = UndefLower ? HalfNumElts : 0;
15341 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15342                      DAG.getIntPtrConstant(Offset, DL));
15343 }
15344
15345/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15346/// This allows for fast cases such as subvector extraction/insertion
15347/// or shuffling smaller vector types which can lower more efficiently.
15348 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15349                                          SDValue V2, ArrayRef<int> Mask,
15350 const X86Subtarget &Subtarget,
15351 SelectionDAG &DAG) {
15352 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15353 "Expected 256-bit or 512-bit vector");
15354
15355 bool UndefLower = isUndefLowerHalf(Mask);
15356 if (!UndefLower && !isUndefUpperHalf(Mask))
15357 return SDValue();
15358
15359 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15360 "Completely undef shuffle mask should have been simplified already");
15361
15362 // Upper half is undef and lower half is whole upper subvector.
15363 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15364 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15365 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15366 if (!UndefLower &&
15367 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15368 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15369 DAG.getIntPtrConstant(HalfNumElts, DL));
15370 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15371 DAG.getIntPtrConstant(0, DL));
15372 }
15373
15374 // Lower half is undef and upper half is whole lower subvector.
15375 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15376 if (UndefLower &&
15377 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15378 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15379 DAG.getIntPtrConstant(0, DL));
15380 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15381 DAG.getIntPtrConstant(HalfNumElts, DL));
15382 }
15383
15384 int HalfIdx1, HalfIdx2;
15385 SmallVector<int, 8> HalfMask(HalfNumElts);
15386 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15387 return SDValue();
15388
15389 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15390
15391 // Only shuffle the halves of the inputs when useful.
15392 unsigned NumLowerHalves =
15393 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15394 unsigned NumUpperHalves =
15395 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15396 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15397
15398 // Determine the larger pattern of undef/halves, then decide if it's worth
15399 // splitting the shuffle based on subtarget capabilities and types.
15400 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15401 if (!UndefLower) {
15402 // XXXXuuuu: no insert is needed.
15403 // Always extract lowers when setting lower - these are all free subreg ops.
15404 if (NumUpperHalves == 0)
15405 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15406 UndefLower, DAG);
15407
15408 if (NumUpperHalves == 1) {
15409 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15410 if (Subtarget.hasAVX2()) {
15411         // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15412 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15413 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15414 (!isSingleSHUFPSMask(HalfMask) ||
15415 Subtarget.hasFastVariableCrossLaneShuffle()))
15416 return SDValue();
15417 // If this is a unary shuffle (assume that the 2nd operand is
15418 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15419 // are better off extracting the upper half of 1 operand and using a
15420 // narrow shuffle.
15421 if (EltWidth == 64 && V2.isUndef())
15422 return SDValue();
15423 }
15424 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15425 if (Subtarget.hasAVX512() && VT.is512BitVector())
15426 return SDValue();
15427 // Extract + narrow shuffle is better than the wide alternative.
15428 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15429 UndefLower, DAG);
15430 }
15431
15432 // Don't extract both uppers, instead shuffle and then extract.
15433 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15434 return SDValue();
15435 }
15436
15437 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15438 if (NumUpperHalves == 0) {
15439 // AVX2 has efficient 64-bit element cross-lane shuffles.
15440 // TODO: Refine to account for unary shuffle, splat, and other masks?
15441 if (Subtarget.hasAVX2() && EltWidth == 64)
15442 return SDValue();
15443 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15444 if (Subtarget.hasAVX512() && VT.is512BitVector())
15445 return SDValue();
15446 // Narrow shuffle + insert is better than the wide alternative.
15447 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15448 UndefLower, DAG);
15449 }
15450
15451 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15452 return SDValue();
15453}
15454
15455/// Handle case where shuffle sources are coming from the same 128-bit lane and
15456/// every lane can be represented as the same repeating mask - allowing us to
15457/// shuffle the sources with the repeating shuffle and then permute the result
15458/// to the destination lanes.
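/// e.g. for a v8i32 mask <5,4,7,6,1,0,3,2>, the repeating in-lane shuffle
/// <1,0,3,2,5,4,7,6> is applied first and the two 128-bit lanes of the result
/// are then swapped.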
15459 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15460     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15461 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15462 int NumElts = VT.getVectorNumElements();
15463 int NumLanes = VT.getSizeInBits() / 128;
15464 int NumLaneElts = NumElts / NumLanes;
15465
15466 // On AVX2 we may be able to just shuffle the lowest elements and then
15467 // broadcast the result.
15468 if (Subtarget.hasAVX2()) {
15469 for (unsigned BroadcastSize : {16, 32, 64}) {
15470 if (BroadcastSize <= VT.getScalarSizeInBits())
15471 continue;
15472 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15473
15474       // Attempt to match a repeating pattern every NumBroadcastElts
15475       // (accounting for UNDEFs) that only references the lowest 128-bit
15476 // lane of the inputs.
15477 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15478 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15479 for (int j = 0; j != NumBroadcastElts; ++j) {
15480 int M = Mask[i + j];
15481 if (M < 0)
15482 continue;
15483 int &R = RepeatMask[j];
15484 if (0 != ((M % NumElts) / NumLaneElts))
15485 return false;
15486 if (0 <= R && R != M)
15487 return false;
15488 R = M;
15489 }
15490 return true;
15491 };
15492
15493 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15494 if (!FindRepeatingBroadcastMask(RepeatMask))
15495 continue;
15496
15497 // Shuffle the (lowest) repeated elements in place for broadcast.
15498 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15499
15500 // Shuffle the actual broadcast.
15501 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15502 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15503 for (int j = 0; j != NumBroadcastElts; ++j)
15504 BroadcastMask[i + j] = j;
15505
15506 // Avoid returning the same shuffle operation. For example,
15507 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15508 if (BroadcastMask == Mask)
15509 return SDValue();
15510
15511 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15512 BroadcastMask);
15513 }
15514 }
15515
15516 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15517 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15518 return SDValue();
15519
15520 // Bail if we already have a repeated lane shuffle mask.
15521 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15522 return SDValue();
15523
15524   // Helper to look for a repeated mask in each split sublane, checking that
15525   // those sublanes can then be permuted into place.
15526 auto ShuffleSubLanes = [&](int SubLaneScale) {
15527 int NumSubLanes = NumLanes * SubLaneScale;
15528 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15529
15530 // Check that all the sources are coming from the same lane and see if we
15531 // can form a repeating shuffle mask (local to each sub-lane). At the same
15532 // time, determine the source sub-lane for each destination sub-lane.
15533 int TopSrcSubLane = -1;
15534 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15535 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15536 SubLaneScale,
15537 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15538
15539 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15540 // Extract the sub-lane mask, check that it all comes from the same lane
15541 // and normalize the mask entries to come from the first lane.
15542 int SrcLane = -1;
15543 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15544 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15545 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15546 if (M < 0)
15547 continue;
15548 int Lane = (M % NumElts) / NumLaneElts;
15549 if ((0 <= SrcLane) && (SrcLane != Lane))
15550 return SDValue();
15551 SrcLane = Lane;
15552 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15553 SubLaneMask[Elt] = LocalM;
15554 }
15555
15556 // Whole sub-lane is UNDEF.
15557 if (SrcLane < 0)
15558 continue;
15559
15560 // Attempt to match against the candidate repeated sub-lane masks.
15561 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15562 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15563 for (int i = 0; i != NumSubLaneElts; ++i) {
15564 if (M1[i] < 0 || M2[i] < 0)
15565 continue;
15566 if (M1[i] != M2[i])
15567 return false;
15568 }
15569 return true;
15570 };
15571
15572 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15573 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15574 continue;
15575
15576 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15577 for (int i = 0; i != NumSubLaneElts; ++i) {
15578 int M = SubLaneMask[i];
15579 if (M < 0)
15580 continue;
15581 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15582 "Unexpected mask element");
15583 RepeatedSubLaneMask[i] = M;
15584 }
15585
15586 // Track the top most source sub-lane - by setting the remaining to
15587 // UNDEF we can greatly simplify shuffle matching.
15588 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15589 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15590 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15591 break;
15592 }
15593
15594 // Bail if we failed to find a matching repeated sub-lane mask.
15595 if (Dst2SrcSubLanes[DstSubLane] < 0)
15596 return SDValue();
15597 }
15598 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15599 "Unexpected source lane");
15600
15601 // Create a repeating shuffle mask for the entire vector.
15602 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15603 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15604 int Lane = SubLane / SubLaneScale;
15605 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15606 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15607 int M = RepeatedSubLaneMask[Elt];
15608 if (M < 0)
15609 continue;
15610 int Idx = (SubLane * NumSubLaneElts) + Elt;
15611 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15612 }
15613 }
15614
15615 // Shuffle each source sub-lane to its destination.
15616 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15617 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15618 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15619 if (SrcSubLane < 0)
15620 continue;
15621 for (int j = 0; j != NumSubLaneElts; ++j)
15622 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15623 }
15624
15625 // Avoid returning the same shuffle operation.
15626 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15627 if (RepeatedMask == Mask || SubLaneMask == Mask)
15628 return SDValue();
15629
15630 SDValue RepeatedShuffle =
15631 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15632
15633 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15634 SubLaneMask);
15635 };
15636
15637 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15638 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15639 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15640 // Otherwise we can only permute whole 128-bit lanes.
15641 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15642 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15643 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15644 MinSubLaneScale = 2;
15645 MaxSubLaneScale =
15646 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15647 }
15648 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15649 MinSubLaneScale = MaxSubLaneScale = 4;
15650
15651 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15652 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15653 return Shuffle;
15654
15655 return SDValue();
15656}
15657
15658 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15659                                    bool &ForceV1Zero, bool &ForceV2Zero,
15660 unsigned &ShuffleImm, ArrayRef<int> Mask,
15661 const APInt &Zeroable) {
15662 int NumElts = VT.getVectorNumElements();
15663 assert(VT.getScalarSizeInBits() == 64 &&
15664 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15665 "Unexpected data type for VSHUFPD");
15666 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15667 "Illegal shuffle mask");
15668
15669 bool ZeroLane[2] = { true, true };
15670 for (int i = 0; i < NumElts; ++i)
15671 ZeroLane[i & 1] &= Zeroable[i];
15672
15673 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15674   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
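  // e.g. a v4f64 mask <1,5,2,7> matches directly with ShuffleImm 0b1011.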
15675 ShuffleImm = 0;
15676 bool ShufpdMask = true;
15677 bool CommutableMask = true;
15678 for (int i = 0; i < NumElts; ++i) {
15679 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15680 continue;
15681 if (Mask[i] < 0)
15682 return false;
15683 int Val = (i & 6) + NumElts * (i & 1);
15684 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15685 if (Mask[i] < Val || Mask[i] > Val + 1)
15686 ShufpdMask = false;
15687 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15688 CommutableMask = false;
15689 ShuffleImm |= (Mask[i] % 2) << i;
15690 }
15691
15692 if (!ShufpdMask && !CommutableMask)
15693 return false;
15694
15695 if (!ShufpdMask && CommutableMask)
15696 std::swap(V1, V2);
15697
15698 ForceV1Zero = ZeroLane[0];
15699 ForceV2Zero = ZeroLane[1];
15700 return true;
15701}
15702
15703 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15704                                       SDValue V2, ArrayRef<int> Mask,
15705 const APInt &Zeroable,
15706 const X86Subtarget &Subtarget,
15707 SelectionDAG &DAG) {
15708 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15709 "Unexpected data type for VSHUFPD");
15710
15711 unsigned Immediate = 0;
15712 bool ForceV1Zero = false, ForceV2Zero = false;
15713 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15714 Mask, Zeroable))
15715 return SDValue();
15716
15717 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15718 if (ForceV1Zero)
15719 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15720 if (ForceV2Zero)
15721 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15722
15723 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15724 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15725}
15726
15727 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15728 // by zeroable elements in the remaining 24 elements. Turn this into two
15729// vmovqb instructions shuffled together.
15730 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15731                                              SDValue V1, SDValue V2,
15732 ArrayRef<int> Mask,
15733 const APInt &Zeroable,
15734 SelectionDAG &DAG) {
15735 assert(VT == MVT::v32i8 && "Unexpected type!");
15736
15737 // The first 8 indices should be every 8th element.
15738 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15739 return SDValue();
15740
15741 // Remaining elements need to be zeroable.
15742 if (Zeroable.countl_one() < (Mask.size() - 8))
15743 return SDValue();
15744
15745 V1 = DAG.getBitcast(MVT::v4i64, V1);
15746 V2 = DAG.getBitcast(MVT::v4i64, V2);
15747
15748 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15749 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15750
15751 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15752 // the upper bits of the result using an unpckldq.
15753 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15754 { 0, 1, 2, 3, 16, 17, 18, 19,
15755 4, 5, 6, 7, 20, 21, 22, 23 });
15756 // Insert the unpckldq into a zero vector to widen to v32i8.
15757 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15758 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15759 DAG.getIntPtrConstant(0, DL));
15760}
15761
15762// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15763// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15764// =>
15765// ul = unpckl v1, v2
15766// uh = unpckh v1, v2
15767// a = vperm ul, uh
15768// b = vperm ul, uh
15769//
15770// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15771// and permute. We cannot directly match v3 because it is split into two
15772// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15773// pair of 256-bit shuffles and makes sure the masks are consecutive.
15774//
15775// Once unpck and permute nodes are created, the permute corresponding to this
15776// shuffle is returned, while the other permute replaces the other half of the
15777// shuffle in the selection dag.
15778static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15779                                                 SDValue V1, SDValue V2,
15780 ArrayRef<int> Mask,
15781 SelectionDAG &DAG) {
15782 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15783 VT != MVT::v32i8)
15784 return SDValue();
15785 // <B0, B1, B0+1, B1+1, ..., >
15786 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15787 unsigned Begin1) {
15788 size_t Size = Mask.size();
15789 assert(Size % 2 == 0 && "Expected even mask size");
15790 for (unsigned I = 0; I < Size; I += 2) {
15791 if (Mask[I] != (int)(Begin0 + I / 2) ||
15792 Mask[I + 1] != (int)(Begin1 + I / 2))
15793 return false;
15794 }
15795 return true;
15796 };
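  // Annotation (illustrative, not part of the original source): for v8i32 the
  // first-half interleave mask is <0, 8, 1, 9, 2, 10, 3, 11> (Begin0 = 0,
  // Begin1 = 8) and the second-half mask is <4, 12, 5, 13, 6, 14, 7, 15>.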
15797  // Check which half this shuffle node is.
15798 int NumElts = VT.getVectorNumElements();
15799 size_t FirstQtr = NumElts / 2;
15800 size_t ThirdQtr = NumElts + NumElts / 2;
15801 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15802 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15803 if (!IsFirstHalf && !IsSecondHalf)
15804 return SDValue();
15805
15806 // Find the intersection between shuffle users of V1 and V2.
15807 SmallVector<SDNode *, 2> Shuffles;
15808 for (SDNode *User : V1->uses())
15809 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15810 User->getOperand(1) == V2)
15811 Shuffles.push_back(User);
15812 // Limit user size to two for now.
15813 if (Shuffles.size() != 2)
15814 return SDValue();
15815  // Find out which half of the 512-bit shuffle each smaller shuffle is.
15816 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15817 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15818 SDNode *FirstHalf;
15819 SDNode *SecondHalf;
15820 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15821 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15822 FirstHalf = Shuffles[0];
15823 SecondHalf = Shuffles[1];
15824 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15825 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15826 FirstHalf = Shuffles[1];
15827 SecondHalf = Shuffles[0];
15828 } else {
15829 return SDValue();
15830 }
15831 // Lower into unpck and perm. Return the perm of this shuffle and replace
15832 // the other.
15833 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15834 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15835 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15836 DAG.getTargetConstant(0x20, DL, MVT::i8));
15837 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15838 DAG.getTargetConstant(0x31, DL, MVT::i8));
15839 if (IsFirstHalf) {
15840 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15841 return Perm1;
15842 }
15843 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15844 return Perm2;
15845}
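// Annotation (illustrative, not part of the original source): with 256-bit
// operands, VPERM2X128 immediate 0x20 selects the low 128-bit halves of
// (Unpckl, Unpckh) and 0x31 selects their high halves, which reproduces the two
// halves of the 512-bit interleave described above.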
15846
15847/// Handle lowering of 4-lane 64-bit floating point shuffles.
15848///
15849/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15850/// isn't available.
15851static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15852                                 const APInt &Zeroable, SDValue V1, SDValue V2,
15853 const X86Subtarget &Subtarget,
15854 SelectionDAG &DAG) {
15855 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15856 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15857 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15858
15859 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15860 Subtarget, DAG))
15861 return V;
15862
15863 if (V2.isUndef()) {
15864 // Check for being able to broadcast a single element.
15865 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15866 Mask, Subtarget, DAG))
15867 return Broadcast;
15868
15869 // Use low duplicate instructions for masks that match their pattern.
15870 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15871 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15872
15873 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15874 // Non-half-crossing single input shuffles can be lowered with an
15875 // interleaved permutation.
15876 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15877 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15878 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15879 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15880 }
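    // Annotation (illustrative, not part of the original source): e.g. the
    // in-lane v4f64 mask <1, 0, 2, 3> produces VPERMILPMask = 0b1001 in the
    // block above, i.e. a VPERMILPD that swaps the low lane and keeps the high
    // lane unchanged.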
15881
15882 // With AVX2 we have direct support for this permutation.
15883 if (Subtarget.hasAVX2())
15884 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15885 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15886
15887 // Try to create an in-lane repeating shuffle mask and then shuffle the
15888 // results into the target lanes.
15889    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15890            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15891 return V;
15892
15893 // Try to permute the lanes and then use a per-lane permute.
15894 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15895 Mask, DAG, Subtarget))
15896 return V;
15897
15898 // Otherwise, fall back.
15899 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15900 DAG, Subtarget);
15901 }
15902
15903 // Use dedicated unpack instructions for masks that match their pattern.
15904 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15905 return V;
15906
15907 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15908 Zeroable, Subtarget, DAG))
15909 return Blend;
15910
15911 // Check if the blend happens to exactly fit that of SHUFPD.
15912 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15913 Zeroable, Subtarget, DAG))
15914 return Op;
15915
15916 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15917 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15918
15919 // If we have lane crossing shuffles AND they don't all come from the lower
15920 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15921  // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15922  // canonicalizes to a blend of splats, which isn't necessary for this combine.
15923 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15924 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15925 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15926 (V2.getOpcode() != ISD::BUILD_VECTOR))
15927 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15928
15929 // If we have one input in place, then we can permute the other input and
15930 // blend the result.
15931 if (V1IsInPlace || V2IsInPlace)
15932 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15933 Subtarget, DAG);
15934
15935 // Try to create an in-lane repeating shuffle mask and then shuffle the
15936 // results into the target lanes.
15937  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15938          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15939 return V;
15940
15941 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15942  // shuffle. However, if we have AVX2 and either input is already in place,
15943  // we will be able to shuffle the other input even across lanes in a single
15944  // instruction, so skip this pattern.
15945 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15946    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15947            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15948 return V;
15949
15950 // If we have VLX support, we can use VEXPAND.
15951 if (Subtarget.hasVLX())
15952 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15953 DAG, Subtarget))
15954 return V;
15955
15956  // If we have AVX2 then we always want to lower with a blend because at v4 we
15957 // can fully permute the elements.
15958 if (Subtarget.hasAVX2())
15959 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15960 Subtarget, DAG);
15961
15962 // Otherwise fall back on generic lowering.
15963 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15964 Subtarget, DAG);
15965}
15966
15967/// Handle lowering of 4-lane 64-bit integer shuffles.
15968///
15969/// This routine is only called when we have AVX2 and thus a reasonable
15970/// instruction set for v4i64 shuffling.
15971static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15972                                 const APInt &Zeroable, SDValue V1, SDValue V2,
15973 const X86Subtarget &Subtarget,
15974 SelectionDAG &DAG) {
15975 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15976 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15977 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15978 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15979
15980 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15981 Subtarget, DAG))
15982 return V;
15983
15984 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15985 Zeroable, Subtarget, DAG))
15986 return Blend;
15987
15988 // Check for being able to broadcast a single element.
15989 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15990 Subtarget, DAG))
15991 return Broadcast;
15992
15993 // Try to use shift instructions if fast.
15994 if (Subtarget.preferLowerShuffleAsShift())
15995 if (SDValue Shift =
15996 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15997 Subtarget, DAG, /*BitwiseOnly*/ true))
15998 return Shift;
15999
16000 if (V2.isUndef()) {
16001 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16002 // can use lower latency instructions that will operate on both lanes.
16003 SmallVector<int, 2> RepeatedMask;
16004 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16005 SmallVector<int, 4> PSHUFDMask;
16006 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16007 return DAG.getBitcast(
16008 MVT::v4i64,
16009 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16010 DAG.getBitcast(MVT::v8i32, V1),
16011 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16012 }
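    // Annotation (illustrative, not part of the original source): e.g. the
    // repeated v4i64 mask <1, 0, 1, 0> narrows to the per-lane dword mask
    // <2, 3, 0, 1>, so a single PSHUFD with immediate 0x4E swaps the two i64
    // halves of each 128-bit lane.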
16013
16014 // AVX2 provides a direct instruction for permuting a single input across
16015 // lanes.
16016 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16017 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16018 }
16019
16020 // Try to use shift instructions.
16021 if (SDValue Shift =
16022 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16023 DAG, /*BitwiseOnly*/ false))
16024 return Shift;
16025
16026 // If we have VLX support, we can use VALIGN or VEXPAND.
16027 if (Subtarget.hasVLX()) {
16028 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16029 Zeroable, Subtarget, DAG))
16030 return Rotate;
16031
16032 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16033 DAG, Subtarget))
16034 return V;
16035 }
16036
16037 // Try to use PALIGNR.
16038 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16039 Subtarget, DAG))
16040 return Rotate;
16041
16042 // Use dedicated unpack instructions for masks that match their pattern.
16043 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16044 return V;
16045
16046 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16047 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16048
16049 // If we have one input in place, then we can permute the other input and
16050 // blend the result.
16051 if (V1IsInPlace || V2IsInPlace)
16052 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16053 Subtarget, DAG);
16054
16055 // Try to create an in-lane repeating shuffle mask and then shuffle the
16056 // results into the target lanes.
16057  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16058          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16059 return V;
16060
16061 // Try to lower to PERMQ(BLENDD(V1,V2)).
16062 if (SDValue V =
16063 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16064 return V;
16065
16066 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16067  // shuffle. However, if we have AVX2 and either input is already in place,
16068  // we will be able to shuffle the other input even across lanes in a single
16069  // instruction, so skip this pattern.
16070 if (!V1IsInPlace && !V2IsInPlace)
16071    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16072            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16073 return Result;
16074
16075 // Otherwise fall back on generic blend lowering.
16076 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16077 Subtarget, DAG);
16078}
16079
16080/// Handle lowering of 8-lane 32-bit floating point shuffles.
16081///
16082/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16083/// isn't available.
16084static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16085                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16086 const X86Subtarget &Subtarget,
16087 SelectionDAG &DAG) {
16088 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16089 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16090 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16091
16092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16093 Zeroable, Subtarget, DAG))
16094 return Blend;
16095
16096 // Check for being able to broadcast a single element.
16097 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16098 Subtarget, DAG))
16099 return Broadcast;
16100
16101 if (!Subtarget.hasAVX2()) {
16102 SmallVector<int> InLaneMask;
16103 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16104
16105 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16106 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16107 /*SimpleOnly*/ true))
16108 return R;
16109 }
16110 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16111 Zeroable, Subtarget, DAG))
16112 return DAG.getBitcast(MVT::v8f32, ZExt);
16113
16114 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16115 // options to efficiently lower the shuffle.
16116 SmallVector<int, 4> RepeatedMask;
16117 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16118 assert(RepeatedMask.size() == 4 &&
16119 "Repeated masks must be half the mask width!");
16120
16121 // Use even/odd duplicate instructions for masks that match their pattern.
16122 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16123 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16124 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16125 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16126
16127 if (V2.isUndef())
16128 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16129 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16130
16131 // Use dedicated unpack instructions for masks that match their pattern.
16132 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16133 return V;
16134
16135 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16136 // have already handled any direct blends.
16137 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16138 }
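  // Annotation (illustrative, not part of the original source): in the repeated
  // mask block above, {0, 0, 2, 2} duplicates the even elements of each lane
  // (MOVSLDUP) and {1, 1, 3, 3} duplicates the odd elements (MOVSHDUP), so no
  // shuffle immediate is needed at all.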
16139
16140 // Try to create an in-lane repeating shuffle mask and then shuffle the
16141 // results into the target lanes.
16142  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16143          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16144 return V;
16145
16146 // If we have a single input shuffle with different shuffle patterns in the
16147  // two 128-bit lanes, use a variable mask with VPERMILPS.
16148 if (V2.isUndef()) {
16149 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16150 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16151 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16152 }
16153 if (Subtarget.hasAVX2()) {
16154 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16155 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16156 }
16157 // Otherwise, fall back.
16158 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16159 DAG, Subtarget);
16160 }
16161
16162 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16163 // shuffle.
16164  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16165          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16166 return Result;
16167
16168 // If we have VLX support, we can use VEXPAND.
16169 if (Subtarget.hasVLX())
16170 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16171 DAG, Subtarget))
16172 return V;
16173
16174 // Try to match an interleave of two v8f32s and lower them as unpck and
16175 // permutes using ymms. This needs to go before we try to split the vectors.
16176 //
16177  // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
16178 // this path inadvertently.
16179 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16180 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16181 Mask, DAG))
16182 return V;
16183
16184  // For non-AVX512, if the mask is of 16-bit elements in-lane then try to
16185  // split, since after the split we get more efficient code using vpunpcklwd
16186  // and vpunpckhwd instructions than with vblend.
16187 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16188 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16189 DAG);
16190
16191 // If we have AVX2 then we always want to lower with a blend because at v8 we
16192 // can fully permute the elements.
16193 if (Subtarget.hasAVX2())
16194 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16195 Subtarget, DAG);
16196
16197 // Otherwise fall back on generic lowering.
16198 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16199 Subtarget, DAG);
16200}
16201
16202/// Handle lowering of 8-lane 32-bit integer shuffles.
16203///
16204/// This routine is only called when we have AVX2 and thus a reasonable
16205/// instruction set for v8i32 shuffling.
16206static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16207                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16208 const X86Subtarget &Subtarget,
16209 SelectionDAG &DAG) {
16210 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16211 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16212 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16213 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16214
16215 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16216
16217 // Whenever we can lower this as a zext, that instruction is strictly faster
16218 // than any alternative. It also allows us to fold memory operands into the
16219 // shuffle in many cases.
16220 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16221 Zeroable, Subtarget, DAG))
16222 return ZExt;
16223
16224 // Try to match an interleave of two v8i32s and lower them as unpck and
16225 // permutes using ymms. This needs to go before we try to split the vectors.
16226 if (!Subtarget.hasAVX512())
16227 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16228 Mask, DAG))
16229 return V;
16230
16231  // For non-AVX512, if the mask is of 16-bit elements in-lane then try to
16232  // split, since after the split we get more efficient code than vblend by
16233  // using vpunpcklwd and vpunpckhwd instructions.
16234 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16235 !Subtarget.hasAVX512())
16236 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16237 DAG);
16238
16239 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16240 Zeroable, Subtarget, DAG))
16241 return Blend;
16242
16243 // Check for being able to broadcast a single element.
16244 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16245 Subtarget, DAG))
16246 return Broadcast;
16247
16248 // Try to use shift instructions if fast.
16249 if (Subtarget.preferLowerShuffleAsShift()) {
16250 if (SDValue Shift =
16251 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16252 Subtarget, DAG, /*BitwiseOnly*/ true))
16253 return Shift;
16254 if (NumV2Elements == 0)
16255 if (SDValue Rotate =
16256 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16257 return Rotate;
16258 }
16259
16260 // If the shuffle mask is repeated in each 128-bit lane we can use more
16261 // efficient instructions that mirror the shuffles across the two 128-bit
16262 // lanes.
16263 SmallVector<int, 4> RepeatedMask;
16264 bool Is128BitLaneRepeatedShuffle =
16265 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16266 if (Is128BitLaneRepeatedShuffle) {
16267 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16268 if (V2.isUndef())
16269 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16270 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16271
16272 // Use dedicated unpack instructions for masks that match their pattern.
16273 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16274 return V;
16275 }
16276
16277 // Try to use shift instructions.
16278 if (SDValue Shift =
16279 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16280 DAG, /*BitwiseOnly*/ false))
16281 return Shift;
16282
16283 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16284 if (SDValue Rotate =
16285 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16286 return Rotate;
16287
16288 // If we have VLX support, we can use VALIGN or EXPAND.
16289 if (Subtarget.hasVLX()) {
16290 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16291 Zeroable, Subtarget, DAG))
16292 return Rotate;
16293
16294 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16295 DAG, Subtarget))
16296 return V;
16297 }
16298
16299 // Try to use byte rotation instructions.
16300 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16301 Subtarget, DAG))
16302 return Rotate;
16303
16304 // Try to create an in-lane repeating shuffle mask and then shuffle the
16305 // results into the target lanes.
16306  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16307          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16308 return V;
16309
16310 if (V2.isUndef()) {
16311 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16312 // because that should be faster than the variable permute alternatives.
16313 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16314 return V;
16315
16316 // If the shuffle patterns aren't repeated but it's a single input, directly
16317 // generate a cross-lane VPERMD instruction.
16318 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16319 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16320 }
16321
16322 // Assume that a single SHUFPS is faster than an alternative sequence of
16323 // multiple instructions (even if the CPU has a domain penalty).
16324 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16325 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16326 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16327 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16328 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16329 CastV1, CastV2, DAG);
16330 return DAG.getBitcast(MVT::v8i32, ShufPS);
16331 }
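  // Annotation (illustrative, not part of the original source): the bitcasts
  // above move the v8i32 operands into the floating-point domain so a single
  // SHUFPS immediate can be used, and the final bitcast returns the bits
  // unchanged to the integer domain.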
16332
16333 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16334 // shuffle.
16335  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16336          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16337 return Result;
16338
16339 // Otherwise fall back on generic blend lowering.
16340 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16341 Subtarget, DAG);
16342}
16343
16344/// Handle lowering of 16-lane 16-bit integer shuffles.
16345///
16346/// This routine is only called when we have AVX2 and thus a reasonable
16347/// instruction set for v16i16 shuffling.
16348static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16349                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16350 const X86Subtarget &Subtarget,
16351 SelectionDAG &DAG) {
16352 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16353 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16354 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16355 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16356
16357 // Whenever we can lower this as a zext, that instruction is strictly faster
16358 // than any alternative. It also allows us to fold memory operands into the
16359 // shuffle in many cases.
16360  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16361          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16362 return ZExt;
16363
16364 // Check for being able to broadcast a single element.
16365 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16366 Subtarget, DAG))
16367 return Broadcast;
16368
16369 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16370 Zeroable, Subtarget, DAG))
16371 return Blend;
16372
16373 // Use dedicated unpack instructions for masks that match their pattern.
16374 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16375 return V;
16376
16377 // Use dedicated pack instructions for masks that match their pattern.
16378 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16379 Subtarget))
16380 return V;
16381
16382  // Try to lower using a truncation.
16383 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16384 Subtarget, DAG))
16385 return V;
16386
16387 // Try to use shift instructions.
16388 if (SDValue Shift =
16389 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16390 Subtarget, DAG, /*BitwiseOnly*/ false))
16391 return Shift;
16392
16393 // Try to use byte rotation instructions.
16394 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16395 Subtarget, DAG))
16396 return Rotate;
16397
16398 // Try to create an in-lane repeating shuffle mask and then shuffle the
16399 // results into the target lanes.
16400  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16401          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16402 return V;
16403
16404 if (V2.isUndef()) {
16405 // Try to use bit rotation instructions.
16406 if (SDValue Rotate =
16407 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16408 return Rotate;
16409
16410 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16411 // because that should be faster than the variable permute alternatives.
16412 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16413 return V;
16414
16415 // There are no generalized cross-lane shuffle operations available on i16
16416 // element types.
16417 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16418      if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16419              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16420 return V;
16421
16422 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16423 DAG, Subtarget);
16424 }
16425
16426 SmallVector<int, 8> RepeatedMask;
16427 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16428 // As this is a single-input shuffle, the repeated mask should be
16429 // a strictly valid v8i16 mask that we can pass through to the v8i16
16430 // lowering to handle even the v16 case.
16431      return lowerV8I16GeneralSingleInputShuffle(
16432          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16433 }
16434 }
16435
16436 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16437 Zeroable, Subtarget, DAG))
16438 return PSHUFB;
16439
16440 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16441 if (Subtarget.hasBWI())
16442 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16443
16444 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16445 // shuffle.
16446  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16447          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16448 return Result;
16449
16450 // Try to permute the lanes and then use a per-lane permute.
16451  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16452          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16453 return V;
16454
16455 // Try to match an interleave of two v16i16s and lower them as unpck and
16456 // permutes using ymms.
16457 if (!Subtarget.hasAVX512())
16458 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16459 Mask, DAG))
16460 return V;
16461
16462 // Otherwise fall back on generic lowering.
16463 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16464 Subtarget, DAG);
16465}
16466
16467/// Handle lowering of 32-lane 8-bit integer shuffles.
16468///
16469/// This routine is only called when we have AVX2 and thus a reasonable
16470/// instruction set for v32i8 shuffling.
16471static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16472                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16473 const X86Subtarget &Subtarget,
16474 SelectionDAG &DAG) {
16475 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16476 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16477 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16478 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16479
16480 // Whenever we can lower this as a zext, that instruction is strictly faster
16481 // than any alternative. It also allows us to fold memory operands into the
16482 // shuffle in many cases.
16483 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16484 Zeroable, Subtarget, DAG))
16485 return ZExt;
16486
16487 // Check for being able to broadcast a single element.
16488 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16489 Subtarget, DAG))
16490 return Broadcast;
16491
16492 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16493 Zeroable, Subtarget, DAG))
16494 return Blend;
16495
16496 // Use dedicated unpack instructions for masks that match their pattern.
16497 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16498 return V;
16499
16500 // Use dedicated pack instructions for masks that match their pattern.
16501 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16502 Subtarget))
16503 return V;
16504
16505  // Try to lower using a truncation.
16506 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16507 Subtarget, DAG))
16508 return V;
16509
16510 // Try to use shift instructions.
16511 if (SDValue Shift =
16512 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16513 DAG, /*BitwiseOnly*/ false))
16514 return Shift;
16515
16516 // Try to use byte rotation instructions.
16517 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16518 Subtarget, DAG))
16519 return Rotate;
16520
16521 // Try to use bit rotation instructions.
16522 if (V2.isUndef())
16523 if (SDValue Rotate =
16524 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16525 return Rotate;
16526
16527 // Try to create an in-lane repeating shuffle mask and then shuffle the
16528 // results into the target lanes.
16529  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16530          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16531 return V;
16532
16533 // There are no generalized cross-lane shuffle operations available on i8
16534 // element types.
16535 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16536 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16537 // because that should be faster than the variable permute alternatives.
16538 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16539 return V;
16540
16541    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16542            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16543 return V;
16544
16545 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16546 DAG, Subtarget);
16547 }
16548
16549 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16550 Zeroable, Subtarget, DAG))
16551 return PSHUFB;
16552
16553 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16554 if (Subtarget.hasVBMI())
16555 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16556
16557 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16558 // shuffle.
16559  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16560          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16561 return Result;
16562
16563 // Try to permute the lanes and then use a per-lane permute.
16564  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16565          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16566 return V;
16567
16568  // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16569  // by zeroable elements in the remaining 24 elements. Turn this into two
16570 // vmovqb instructions shuffled together.
16571 if (Subtarget.hasVLX())
16572 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16573 Mask, Zeroable, DAG))
16574 return V;
16575
16576 // Try to match an interleave of two v32i8s and lower them as unpck and
16577 // permutes using ymms.
16578 if (!Subtarget.hasAVX512())
16579 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16580 Mask, DAG))
16581 return V;
16582
16583 // Otherwise fall back on generic lowering.
16584 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16585 Subtarget, DAG);
16586}
16587
16588/// High-level routine to lower various 256-bit x86 vector shuffles.
16589///
16590/// This routine either breaks down the specific type of a 256-bit x86 vector
16591/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16592/// together based on the available instructions.
16593static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16594                                  SDValue V1, SDValue V2, const APInt &Zeroable,
16595 const X86Subtarget &Subtarget,
16596 SelectionDAG &DAG) {
16597 // If we have a single input to the zero element, insert that into V1 if we
16598 // can do so cheaply.
16599 int NumElts = VT.getVectorNumElements();
16600 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16601
16602 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16603    if (SDValue Insertion = lowerShuffleAsElementInsertion(
16604            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16605 return Insertion;
16606
16607 // Handle special cases where the lower or upper half is UNDEF.
16608 if (SDValue V =
16609 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16610 return V;
16611
16612 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16613 // can check for those subtargets here and avoid much of the subtarget
16614 // querying in the per-vector-type lowering routines. With AVX1 we have
16615 // essentially *zero* ability to manipulate a 256-bit vector with integer
16616 // types. Since we'll use floating point types there eventually, just
16617 // immediately cast everything to a float and operate entirely in that domain.
16618 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16619 int ElementBits = VT.getScalarSizeInBits();
16620 if (ElementBits < 32) {
16621      // No floating point type available; if we can't use the bit operations
16622 // for masking/blending then decompose into 128-bit vectors.
16623 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16624 Subtarget, DAG))
16625 return V;
16626 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16627 return V;
16628 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16629 }
16630
16631 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16632                                VT.getVectorNumElements());
16633    V1 = DAG.getBitcast(FpVT, V1);
16634 V2 = DAG.getBitcast(FpVT, V2);
16635 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16636 }
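  // Annotation (illustrative, not part of the original source): e.g. on AVX1 a
  // v4i64 shuffle is rebuilt here as a v4f64 shuffle, so it is handled by
  // lowerV4F64Shuffle below instead of being split into two 128-bit halves.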
16637
16638 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16639 V1 = DAG.getBitcast(MVT::v16i16, V1);
16640 V2 = DAG.getBitcast(MVT::v16i16, V2);
16641 return DAG.getBitcast(VT,
16642 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16643 }
16644
16645 switch (VT.SimpleTy) {
16646 case MVT::v4f64:
16647 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16648 case MVT::v4i64:
16649 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16650 case MVT::v8f32:
16651 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16652 case MVT::v8i32:
16653 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16654 case MVT::v16i16:
16655 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16656 case MVT::v32i8:
16657 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16658
16659 default:
16660 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16661 }
16662}
16663
16664/// Try to lower a vector shuffle as 128-bit shuffles.
16665static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16666                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16667 const X86Subtarget &Subtarget,
16668 SelectionDAG &DAG) {
16669 assert(VT.getScalarSizeInBits() == 64 &&
16670 "Unexpected element type size for 128bit shuffle.");
16671
16672  // Handling a 256-bit vector requires VLX, and most probably the
16673  // function lowerV2X128VectorShuffle() is a better solution.
16674 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16675
16676 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16677 SmallVector<int, 4> Widened128Mask;
16678 if (!canWidenShuffleElements(Mask, Widened128Mask))
16679 return SDValue();
16680 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16681
16682 // Try to use an insert into a zero vector.
16683 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16684 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16685 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16686 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16687 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16688 DAG.getIntPtrConstant(0, DL));
16689 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16690 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16691 DAG.getIntPtrConstant(0, DL));
16692 }
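  // Annotation (illustrative, not part of the original source): in the block
  // above, (Zeroable & 0xf0) == 0xf0 means the upper four elements are all
  // zeroable, so the shuffle reduces to inserting the low 128 or 256 bits of V1
  // into a zero vector.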
16693
16694 // Check for patterns which can be matched with a single insert of a 256-bit
16695 // subvector.
16696 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16697 if (OnlyUsesV1 ||
16698 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16699 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16700 SDValue SubVec =
16701 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16702 DAG.getIntPtrConstant(0, DL));
16703 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16704 DAG.getIntPtrConstant(4, DL));
16705 }
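  // Annotation (illustrative, not part of the original source): e.g. the v8i64
  // mask <0, 1, 2, 3, 8, 9, 10, 11> keeps V1's low 256 bits and replaces its
  // upper 256 bits with V2's low 256 bits, a single 256-bit subvector insert.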
16706
16707 // See if this is an insertion of the lower 128-bits of V2 into V1.
16708 bool IsInsert = true;
16709 int V2Index = -1;
16710 for (int i = 0; i < 4; ++i) {
16711 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16712 if (Widened128Mask[i] < 0)
16713 continue;
16714
16715 // Make sure all V1 subvectors are in place.
16716 if (Widened128Mask[i] < 4) {
16717 if (Widened128Mask[i] != i) {
16718 IsInsert = false;
16719 break;
16720 }
16721 } else {
16722      // Make sure we only have a single V2 index and it's the lowest 128-bits.
16723 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16724 IsInsert = false;
16725 break;
16726 }
16727 V2Index = i;
16728 }
16729 }
16730 if (IsInsert && V2Index >= 0) {
16731 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16732 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16733 DAG.getIntPtrConstant(0, DL));
16734 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16735 }
16736
16737  // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
16738 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16739 // possible we at least ensure the lanes stay sequential to help later
16740 // combines.
16741 SmallVector<int, 2> Widened256Mask;
16742 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16743 Widened128Mask.clear();
16744 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16745 }
16746
16747 // Try to lower to vshuf64x2/vshuf32x4.
16748 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16749 int PermMask[4] = {-1, -1, -1, -1};
16750 // Ensure elements came from the same Op.
16751 for (int i = 0; i < 4; ++i) {
16752 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16753 if (Widened128Mask[i] < 0)
16754 continue;
16755
16756 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16757 unsigned OpIndex = i / 2;
16758 if (Ops[OpIndex].isUndef())
16759 Ops[OpIndex] = Op;
16760 else if (Ops[OpIndex] != Op)
16761 return SDValue();
16762
16763 PermMask[i] = Widened128Mask[i] % 4;
16764 }
16765
16766 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16767 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16768}
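// Annotation (illustrative example, not part of the original source): the v8i64
// mask <4, 5, 6, 7, 8, 9, 10, 11> widens to the 128-bit-lane mask <2, 3, 4, 5>,
// so the code above selects V1's upper 256 bits and V2's lower 256 bits with a
// single SHUF128 (vshuf64x2) using immediate 0x4E.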
16769
16770/// Handle lowering of 8-lane 64-bit floating point shuffles.
16771static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16772                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16773 const X86Subtarget &Subtarget,
16774 SelectionDAG &DAG) {
16775 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16776 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16777 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16778
16779 if (V2.isUndef()) {
16780 // Use low duplicate instructions for masks that match their pattern.
16781 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16782 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16783
16784 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16785 // Non-half-crossing single input shuffles can be lowered with an
16786 // interleaved permutation.
16787 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16788 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16789 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16790 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16791 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16792 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16793 }
16794
16795 SmallVector<int, 4> RepeatedMask;
16796 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16797 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16798 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16799 }
16800
16801 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16802 V2, Subtarget, DAG))
16803 return Shuf128;
16804
16805 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16806 return Unpck;
16807
16808 // Check if the blend happens to exactly fit that of SHUFPD.
16809 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16810 Zeroable, Subtarget, DAG))
16811 return Op;
16812
16813 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16814 DAG, Subtarget))
16815 return V;
16816
16817 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16818 Zeroable, Subtarget, DAG))
16819 return Blend;
16820
16821 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16822}
16823
16824/// Handle lowering of 16-lane 32-bit floating point shuffles.
16825static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16826                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16827 const X86Subtarget &Subtarget,
16828 SelectionDAG &DAG) {
16829 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16830 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16831 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16832
16833 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16834 // options to efficiently lower the shuffle.
16835 SmallVector<int, 4> RepeatedMask;
16836 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16837 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16838
16839 // Use even/odd duplicate instructions for masks that match their pattern.
16840 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16841 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16842 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16843 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16844
16845 if (V2.isUndef())
16846 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16847 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16848
16849 // Use dedicated unpack instructions for masks that match their pattern.
16850 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16851 return V;
16852
16853 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16854 Zeroable, Subtarget, DAG))
16855 return Blend;
16856
16857 // Otherwise, fall back to a SHUFPS sequence.
16858 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16859 }
16860
16861 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16862 Zeroable, Subtarget, DAG))
16863 return Blend;
16864
16865  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16866          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16867 return DAG.getBitcast(MVT::v16f32, ZExt);
16868
16869 // Try to create an in-lane repeating shuffle mask and then shuffle the
16870 // results into the target lanes.
16871  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16872          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16873 return V;
16874
16875 // If we have a single input shuffle with different shuffle patterns in the
16876  // 128-bit lanes and don't cross lanes, use a variable-mask VPERMILPS.
16877 if (V2.isUndef() &&
16878 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16879 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16880 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16881 }
16882
16883 // If we have AVX512F support, we can use VEXPAND.
16884 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16885 V1, V2, DAG, Subtarget))
16886 return V;
16887
16888 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16889}
16890
16891/// Handle lowering of 8-lane 64-bit integer shuffles.
16892static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16893                                 const APInt &Zeroable, SDValue V1, SDValue V2,
16894 const X86Subtarget &Subtarget,
16895 SelectionDAG &DAG) {
16896 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16897 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16898 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16899
16900 // Try to use shift instructions if fast.
16901 if (Subtarget.preferLowerShuffleAsShift())
16902 if (SDValue Shift =
16903 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16904 Subtarget, DAG, /*BitwiseOnly*/ true))
16905 return Shift;
16906
16907 if (V2.isUndef()) {
16908 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16909 // can use lower latency instructions that will operate on all four
16910 // 128-bit lanes.
16911 SmallVector<int, 2> Repeated128Mask;
16912 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16913 SmallVector<int, 4> PSHUFDMask;
16914 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16915 return DAG.getBitcast(
16916 MVT::v8i64,
16917 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16918 DAG.getBitcast(MVT::v16i32, V1),
16919 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16920 }
16921
16922 SmallVector<int, 4> Repeated256Mask;
16923 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16924 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16925 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16926 }
16927
16928 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16929 V2, Subtarget, DAG))
16930 return Shuf128;
16931
16932 // Try to use shift instructions.
16933 if (SDValue Shift =
16934 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16935 DAG, /*BitwiseOnly*/ false))
16936 return Shift;
16937
16938 // Try to use VALIGN.
16939 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16940 Zeroable, Subtarget, DAG))
16941 return Rotate;
16942
16943 // Try to use PALIGNR.
16944 if (Subtarget.hasBWI())
16945 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16946 Subtarget, DAG))
16947 return Rotate;
16948
16949 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16950 return Unpck;
16951
16952 // If we have AVX512F support, we can use VEXPAND.
16953 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16954 DAG, Subtarget))
16955 return V;
16956
16957 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16958 Zeroable, Subtarget, DAG))
16959 return Blend;
16960
16961 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16962}
16963
16964/// Handle lowering of 16-lane 32-bit integer shuffles.
16965static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16966                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16967 const X86Subtarget &Subtarget,
16968 SelectionDAG &DAG) {
16969 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16970 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16971 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16972
16973 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16974
16975 // Whenever we can lower this as a zext, that instruction is strictly faster
16976 // than any alternative. It also allows us to fold memory operands into the
16977 // shuffle in many cases.
16978  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16979          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16980 return ZExt;
16981
16982 // Try to use shift instructions if fast.
16983 if (Subtarget.preferLowerShuffleAsShift()) {
16984 if (SDValue Shift =
16985 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16986 Subtarget, DAG, /*BitwiseOnly*/ true))
16987 return Shift;
16988 if (NumV2Elements == 0)
16989 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16990 Subtarget, DAG))
16991 return Rotate;
16992 }
16993
16994 // If the shuffle mask is repeated in each 128-bit lane we can use more
16995 // efficient instructions that mirror the shuffles across the four 128-bit
16996 // lanes.
16997 SmallVector<int, 4> RepeatedMask;
16998 bool Is128BitLaneRepeatedShuffle =
16999 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17000 if (Is128BitLaneRepeatedShuffle) {
17001 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17002 if (V2.isUndef())
17003 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17004 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17005
17006 // Use dedicated unpack instructions for masks that match their pattern.
17007 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17008 return V;
17009 }
17010
17011 // Try to use shift instructions.
17012 if (SDValue Shift =
17013 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17014 Subtarget, DAG, /*BitwiseOnly*/ false))
17015 return Shift;
17016
17017 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17018 if (SDValue Rotate =
17019 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17020 return Rotate;
17021
17022 // Try to use VALIGN.
17023 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17024 Zeroable, Subtarget, DAG))
17025 return Rotate;
17026
17027 // Try to use byte rotation instructions.
17028 if (Subtarget.hasBWI())
17029 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17030 Subtarget, DAG))
17031 return Rotate;
17032
17033 // Assume that a single SHUFPS is faster than using a permv shuffle.
17034 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17035 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17036 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17037 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17038 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17039 CastV1, CastV2, DAG);
17040 return DAG.getBitcast(MVT::v16i32, ShufPS);
17041 }
17042
17043 // Try to create an in-lane repeating shuffle mask and then shuffle the
17044 // results into the target lanes.
17045  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17046          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17047 return V;
17048
17049 // If we have AVX512F support, we can use VEXPAND.
17050 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17051 DAG, Subtarget))
17052 return V;
17053
17054 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17055 Zeroable, Subtarget, DAG))
17056 return Blend;
17057
17058 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17059}
17060
17061/// Handle lowering of 32-lane 16-bit integer shuffles.
17062static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17063                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17064 const X86Subtarget &Subtarget,
17065 SelectionDAG &DAG) {
17066 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17067 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17068 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17069 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17070
17071 // Whenever we can lower this as a zext, that instruction is strictly faster
17072 // than any alternative. It also allows us to fold memory operands into the
17073 // shuffle in many cases.
17074  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17075          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17076 return ZExt;
17077
17078 // Use dedicated unpack instructions for masks that match their pattern.
17079 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17080 return V;
17081
17082 // Use dedicated pack instructions for masks that match their pattern.
17083 if (SDValue V =
17084 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17085 return V;
17086
17087 // Try to use shift instructions.
17088 if (SDValue Shift =
17089 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17090 Subtarget, DAG, /*BitwiseOnly*/ false))
17091 return Shift;
17092
17093 // Try to use byte rotation instructions.
17094 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17095 Subtarget, DAG))
17096 return Rotate;
17097
17098 if (V2.isUndef()) {
17099 // Try to use bit rotation instructions.
17100 if (SDValue Rotate =
17101 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17102 return Rotate;
17103
17104 SmallVector<int, 8> RepeatedMask;
17105 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17106 // As this is a single-input shuffle, the repeated mask should be
17107 // a strictly valid v8i16 mask that we can pass through to the v8i16
17108 // lowering to handle even the v32 case.
17109 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17110 RepeatedMask, Subtarget, DAG);
17111 }
17112 }
17113
17114 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17115 Zeroable, Subtarget, DAG))
17116 return Blend;
17117
17118 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17119 Zeroable, Subtarget, DAG))
17120 return PSHUFB;
17121
17122 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17123}
17124
17125/// Handle lowering of 64-lane 8-bit integer shuffles.
17126static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17127                                 const APInt &Zeroable, SDValue V1, SDValue V2,
17128 const X86Subtarget &Subtarget,
17129 SelectionDAG &DAG) {
17130 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17131 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17132 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17133 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17134
17135 // Whenever we can lower this as a zext, that instruction is strictly faster
17136 // than any alternative. It also allows us to fold memory operands into the
17137 // shuffle in many cases.
17138  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17139          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17140 return ZExt;
17141
17142 // Use dedicated unpack instructions for masks that match their pattern.
17143 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17144 return V;
17145
17146 // Use dedicated pack instructions for masks that match their pattern.
17147 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17148 Subtarget))
17149 return V;
17150
17151 // Try to use shift instructions.
17152 if (SDValue Shift =
17153 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17154 DAG, /*BitwiseOnly*/ false))
17155 return Shift;
17156
17157 // Try to use byte rotation instructions.
17158 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17159 Subtarget, DAG))
17160 return Rotate;
17161
17162 // Try to use bit rotation instructions.
17163 if (V2.isUndef())
17164 if (SDValue Rotate =
17165 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17166 return Rotate;
17167
17168 // Lower as AND if possible.
17169 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17170 Zeroable, Subtarget, DAG))
17171 return Masked;
17172
17173 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17174 Zeroable, Subtarget, DAG))
17175 return PSHUFB;
17176
17177 // Try to create an in-lane repeating shuffle mask and then shuffle the
17178 // results into the target lanes.
17179  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17180          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17181 return V;
17182
17183  if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17184          DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17185 return Result;
17186
17187 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17188 Zeroable, Subtarget, DAG))
17189 return Blend;
17190
17191 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17192 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17193 // PALIGNR will be cheaper than the second PSHUFB+OR.
17194 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17195 Mask, Subtarget, DAG))
17196 return V;
17197
17198 // If we can't directly blend but can use PSHUFB, that will be better as it
17199 // can both shuffle and set up the inefficient blend.
17200 bool V1InUse, V2InUse;
17201 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17202 DAG, V1InUse, V2InUse);
17203 }
17204
17205 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17206 // shuffle.
17207 if (!V2.isUndef())
17208    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17209            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17210 return Result;
17211
17212 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17213 if (Subtarget.hasVBMI())
17214 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17215
17216 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17217}
17218
17219/// High-level routine to lower various 512-bit x86 vector shuffles.
17220///
17221/// This routine either breaks down the specific type of a 512-bit x86 vector
17222/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17223/// together based on the available instructions.
17224static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17225                                  MVT VT, SDValue V1, SDValue V2,
17226 const APInt &Zeroable,
17227 const X86Subtarget &Subtarget,
17228 SelectionDAG &DAG) {
17229 assert(Subtarget.hasAVX512() &&
17230         "Cannot lower 512-bit vectors w/o basic ISA!");
17231
17232 // If we have a single input to the zero element, insert that into V1 if we
17233 // can do so cheaply.
17234 int NumElts = Mask.size();
17235 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17236
17237 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17238    if (SDValue Insertion = lowerShuffleAsElementInsertion(
17239            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17240 return Insertion;
17241
17242 // Handle special cases where the lower or upper half is UNDEF.
17243 if (SDValue V =
17244 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17245 return V;
17246
17247 // Check for being able to broadcast a single element.
17248 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17249 Subtarget, DAG))
17250 return Broadcast;
17251
17252 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17253 // Try using bit ops for masking and blending before falling back to
17254 // splitting.
17255 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17256 Subtarget, DAG))
17257 return V;
17258 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17259 return V;
17260
17261 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17262 }
17263
17264 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17265 if (!Subtarget.hasBWI())
17266 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17267 /*SimpleOnly*/ false);
17268
17269 V1 = DAG.getBitcast(MVT::v32i16, V1);
17270 V2 = DAG.getBitcast(MVT::v32i16, V2);
17271 return DAG.getBitcast(VT,
17272 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17273 }
17274
17275 // Dispatch to each element type for lowering. If we don't have support for
17276 // specific element type shuffles at 512 bits, immediately split them and
17277 // lower them. Each lowering routine of a given type is allowed to assume that
17278 // the requisite ISA extensions for that element type are available.
17279 switch (VT.SimpleTy) {
17280 case MVT::v8f64:
17281 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17282 case MVT::v16f32:
17283 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17284 case MVT::v8i64:
17285 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17286 case MVT::v16i32:
17287 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17288 case MVT::v32i16:
17289 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17290 case MVT::v64i8:
17291 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17292
17293 default:
17294 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17295 }
17296}
17297
17298static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17299                                         MVT VT, SDValue V1, SDValue V2,
17300 const X86Subtarget &Subtarget,
17301 SelectionDAG &DAG) {
17302 // Shuffle should be unary.
17303 if (!V2.isUndef())
17304 return SDValue();
17305
17306 int ShiftAmt = -1;
17307 int NumElts = Mask.size();
17308 for (int i = 0; i != NumElts; ++i) {
17309 int M = Mask[i];
17310 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17311 "Unexpected mask index.");
17312 if (M < 0)
17313 continue;
17314
17315 // The first non-undef element determines our shift amount.
17316 if (ShiftAmt < 0) {
17317 ShiftAmt = M - i;
17318 // Need to be shifting right.
17319 if (ShiftAmt <= 0)
17320 return SDValue();
17321 }
17322 // All non-undef elements must shift by the same amount.
17323 if (ShiftAmt != M - i)
17324 return SDValue();
17325 }
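  // Illustrative example (editorial addition, not in the original source): a
  // v8i1 mask <3,4,5,6,7,u,u,u> moves every defined element down by 3, so the
  // loop above computes ShiftAmt == 3 and the shuffle is emitted below as a
  // KSHIFTR by 3 on the widened mask register.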
17326 assert(ShiftAmt >= 0 && "All undef?");
17327
17328 // Great we found a shift right.
17329 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17330 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17331 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17332 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17333 DAG.getIntPtrConstant(0, DL));
17334}
17335
17336// Determine if this shuffle can be implemented with a KSHIFT instruction.
17337// Returns the shift amount if possible or -1 if not. This is a simplified
17338// version of matchShuffleAsShift.
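// Illustrative example (editorial addition, not in the original source): a
// v8i1 mask <2,3,4,5,6,7,Z,Z> (Z = zeroable, MaskOffset = 0) is a contiguous
// run starting at index 2 with zeros shifted in at the top, so it matches a
// KSHIFTR with a shift amount of 2.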
17339static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17340 int MaskOffset, const APInt &Zeroable) {
17341 int Size = Mask.size();
17342
17343 auto CheckZeros = [&](int Shift, bool Left) {
17344 for (int j = 0; j < Shift; ++j)
17345 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17346 return false;
17347
17348 return true;
17349 };
17350
17351 auto MatchShift = [&](int Shift, bool Left) {
17352 unsigned Pos = Left ? Shift : 0;
17353 unsigned Low = Left ? 0 : Shift;
17354 unsigned Len = Size - Shift;
17355 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17356 };
17357
17358 for (int Shift = 1; Shift != Size; ++Shift)
17359 for (bool Left : {true, false})
17360 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17361      Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17362      return Shift;
17363 }
17364
17365 return -1;
17366}
17367
17368
17369// Lower vXi1 vector shuffles.
17370// There is no dedicated instruction on AVX-512 that shuffles the masks.
17371// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17372// vector, shuffle it and then truncate it back.
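// Illustrative example (editorial addition, not in the original source): a
// v16i1 shuffle is sign-extended to a wide integer vector (e.g. v16i32),
// shuffled there, and converted back to v16i1 by comparing the shuffled
// result against zero.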
17373static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17374                                MVT VT, SDValue V1, SDValue V2,
17375 const APInt &Zeroable,
17376 const X86Subtarget &Subtarget,
17377 SelectionDAG &DAG) {
17378 assert(Subtarget.hasAVX512() &&
17379 "Cannot lower 512-bit vectors w/o basic ISA!");
17380
17381 int NumElts = Mask.size();
17382 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17383
17384 // Try to recognize shuffles that are just padding a subvector with zeros.
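  // Illustrative example (editorial addition, not in the original source): a
  // v8i1 mask <0,1,2,3,Z,Z,Z,Z> (Z = zeroable) copies the low v4i1 subvector
  // and pads it with zeros, so it is lowered below as an INSERT_SUBVECTOR of
  // the extracted v4i1 into an all-zero vector.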
17385 int SubvecElts = 0;
17386 int Src = -1;
17387 for (int i = 0; i != NumElts; ++i) {
17388 if (Mask[i] >= 0) {
17389 // Grab the source from the first valid mask. All subsequent elements need
17390 // to use this same source.
17391 if (Src < 0)
17392 Src = Mask[i] / NumElts;
17393 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17394 break;
17395 }
17396
17397 ++SubvecElts;
17398 }
17399 assert(SubvecElts != NumElts && "Identity shuffle?");
17400
17401  // Clip to a power of 2.
17402 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17403
17404 // Make sure the number of zeroable bits in the top at least covers the bits
17405 // not covered by the subvector.
17406 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17407 assert(Src >= 0 && "Expected a source!");
17408 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17409 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17410 Src == 0 ? V1 : V2,
17411 DAG.getIntPtrConstant(0, DL));
17412 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17413 DAG.getConstant(0, DL, VT),
17414 Extract, DAG.getIntPtrConstant(0, DL));
17415 }
17416
17417 // Try a simple shift right with undef elements. Later we'll try with zeros.
17418 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17419 DAG))
17420 return Shift;
17421
17422 // Try to match KSHIFTs.
17423 unsigned Offset = 0;
17424 for (SDValue V : { V1, V2 }) {
17425 unsigned Opcode;
17426 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17427 if (ShiftAmt >= 0) {
17428 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17429 MVT WideVT = Res.getSimpleValueType();
17430 // Widened right shifts need two shifts to ensure we shift in zeroes.
17431 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17432 int WideElts = WideVT.getVectorNumElements();
17433 // Shift left to put the original vector in the MSBs of the new size.
17434 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17435 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17436 // Increase the shift amount to account for the left shift.
17437 ShiftAmt += WideElts - NumElts;
17438 }
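      // Illustrative example (editorial addition, not in the original source):
      // a v8i1 right shift by 2 that was widened to v16i1 becomes KSHIFTL by 8
      // followed by KSHIFTR by 10, so only zero bits land in the low 8 lanes.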
17439
17440 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17441 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17443 DAG.getIntPtrConstant(0, DL));
17444 }
17445 Offset += NumElts; // Increment for next iteration.
17446 }
17447
17448  // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17449 // ops instead.
17450 // TODO: What other unary shuffles would benefit from this?
17451 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17452 SDValue Op0 = V1.getOperand(0);
17453 SDValue Op1 = V1.getOperand(1);
17454 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17455 EVT OpVT = Op0.getValueType();
17456 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17457 return DAG.getSetCC(
17458 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17459 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17460 }
17461
17462 MVT ExtVT;
17463 switch (VT.SimpleTy) {
17464 default:
17465 llvm_unreachable("Expected a vector of i1 elements");
17466 case MVT::v2i1:
17467 ExtVT = MVT::v2i64;
17468 break;
17469 case MVT::v4i1:
17470 ExtVT = MVT::v4i32;
17471 break;
17472 case MVT::v8i1:
17473 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17474 // shuffle.
17475 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17476 break;
17477 case MVT::v16i1:
17478 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17479 // 256-bit operation available.
17480 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17481 break;
17482 case MVT::v32i1:
17483 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17484 // 256-bit operation available.
17485 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17486 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17487 break;
17488 case MVT::v64i1:
17489 // Fall back to scalarization. FIXME: We can do better if the shuffle
17490 // can be partitioned cleanly.
17491 if (!Subtarget.useBWIRegs())
17492 return SDValue();
17493 ExtVT = MVT::v64i8;
17494 break;
17495 }
17496
17497 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17498 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17499
17500 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17501  // As i1 was sign-extended, we can use X86ISD::CVT2MASK.
17502 int NumElems = VT.getVectorNumElements();
17503 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17504 (Subtarget.hasDQI() && (NumElems < 32)))
17505 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17506 Shuffle, ISD::SETGT);
17507
17508 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17509}
17510
17511/// Helper function that returns true if the shuffle mask should be
17512/// commuted to improve canonicalization.
17513static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17514  int NumElements = Mask.size();
17515
17516 int NumV1Elements = 0, NumV2Elements = 0;
17517 for (int M : Mask)
17518 if (M < 0)
17519 continue;
17520 else if (M < NumElements)
17521 ++NumV1Elements;
17522 else
17523 ++NumV2Elements;
17524
17525 // Commute the shuffle as needed such that more elements come from V1 than
17526 // V2. This allows us to match the shuffle pattern strictly on how many
17527 // elements come from V1 without handling the symmetric cases.
17528 if (NumV2Elements > NumV1Elements)
17529 return true;
17530
17531 assert(NumV1Elements > 0 && "No V1 indices");
17532
17533 if (NumV2Elements == 0)
17534 return false;
17535
17536 // When the number of V1 and V2 elements are the same, try to minimize the
17537 // number of uses of V2 in the low half of the vector. When that is tied,
17538  // ensure that the sum of indices for V1 is equal to or lower than the sum of
17539  // indices for V2. When those are equal, try to ensure that the number of odd
17540 // indices for V1 is lower than the number of odd indices for V2.
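  // Illustrative example (editorial addition, not in the original source): for
  // a v4 mask <4,5,0,1>, V1 and V2 each contribute two elements but the low
  // half uses only V2, so the shuffle is commuted to <0,1,4,5>.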
17541 if (NumV1Elements == NumV2Elements) {
17542 int LowV1Elements = 0, LowV2Elements = 0;
17543 for (int M : Mask.slice(0, NumElements / 2))
17544 if (M >= NumElements)
17545 ++LowV2Elements;
17546 else if (M >= 0)
17547 ++LowV1Elements;
17548 if (LowV2Elements > LowV1Elements)
17549 return true;
17550 if (LowV2Elements == LowV1Elements) {
17551 int SumV1Indices = 0, SumV2Indices = 0;
17552 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17553 if (Mask[i] >= NumElements)
17554 SumV2Indices += i;
17555 else if (Mask[i] >= 0)
17556 SumV1Indices += i;
17557 if (SumV2Indices < SumV1Indices)
17558 return true;
17559 if (SumV2Indices == SumV1Indices) {
17560 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17561 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17562 if (Mask[i] >= NumElements)
17563 NumV2OddIndices += i % 2;
17564 else if (Mask[i] >= 0)
17565 NumV1OddIndices += i % 2;
17566 if (NumV2OddIndices < NumV1OddIndices)
17567 return true;
17568 }
17569 }
17570 }
17571
17572 return false;
17573}
17574
17575static bool canCombineAsMaskOperation(SDValue V,
17576                                      const X86Subtarget &Subtarget) {
17577 if (!Subtarget.hasAVX512())
17578 return false;
17579
17580 if (!V.getValueType().isSimple())
17581 return false;
17582
17583 MVT VT = V.getSimpleValueType().getScalarType();
17584 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17585 return false;
17586
17587 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17588 // are preferable to blendw/blendvb/masked-mov.
17589 if ((VT == MVT::i16 || VT == MVT::i8) &&
17590 V.getSimpleValueType().getSizeInBits() < 512)
17591 return false;
17592
17593 auto HasMaskOperation = [&](SDValue V) {
17594    // TODO: Currently we only check a limited set of opcodes. We could
17595    // probably extend it to all binary operations by checking TLI.isBinOp().
17596 switch (V->getOpcode()) {
17597 default:
17598 return false;
17599 case ISD::ADD:
17600 case ISD::SUB:
17601 case ISD::AND:
17602 case ISD::XOR:
17603 case ISD::OR:
17604 case ISD::SMAX:
17605 case ISD::SMIN:
17606 case ISD::UMAX:
17607 case ISD::UMIN:
17608 case ISD::ABS:
17609 case ISD::SHL:
17610 case ISD::SRL:
17611 case ISD::SRA:
17612 case ISD::MUL:
17613 break;
17614 }
17615 if (!V->hasOneUse())
17616 return false;
17617
17618 return true;
17619 };
17620
17621 if (HasMaskOperation(V))
17622 return true;
17623
17624 return false;
17625}
17626
17627// Forward declaration.
17628static SDValue canonicalizeShuffleMaskWithHorizOp(
17629    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17630    unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17631 const X86Subtarget &Subtarget);
17632
17633 /// Top-level lowering for x86 vector shuffles.
17634///
17635/// This handles decomposition, canonicalization, and lowering of all x86
17636/// vector shuffles. Most of the specific lowering strategies are encapsulated
17637/// above in helper routines. The canonicalization attempts to widen shuffles
17638/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17639/// s.t. only one of the two inputs needs to be tested, etc.
17640static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17641                                   SelectionDAG &DAG) {
17642 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17643 ArrayRef<int> OrigMask = SVOp->getMask();
17644 SDValue V1 = Op.getOperand(0);
17645 SDValue V2 = Op.getOperand(1);
17646 MVT VT = Op.getSimpleValueType();
17647 int NumElements = VT.getVectorNumElements();
17648 SDLoc DL(Op);
17649 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17650
17651 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17652 "Can't lower MMX shuffles");
17653
17654 bool V1IsUndef = V1.isUndef();
17655 bool V2IsUndef = V2.isUndef();
17656 if (V1IsUndef && V2IsUndef)
17657 return DAG.getUNDEF(VT);
17658
17659  // When we create a shuffle node we put the UNDEF node in the second operand,
17660 // but in some cases the first operand may be transformed to UNDEF.
17661 // In this case we should just commute the node.
17662 if (V1IsUndef)
17663 return DAG.getCommutedVectorShuffle(*SVOp);
17664
17665 // Check for non-undef masks pointing at an undef vector and make the masks
17666 // undef as well. This makes it easier to match the shuffle based solely on
17667 // the mask.
17668 if (V2IsUndef &&
17669 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17670 SmallVector<int, 8> NewMask(OrigMask);
17671 for (int &M : NewMask)
17672 if (M >= NumElements)
17673 M = -1;
17674 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17675 }
17676
17677 // Check for illegal shuffle mask element index values.
17678 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17679 (void)MaskUpperLimit;
17680 assert(llvm::all_of(OrigMask,
17681 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17682 "Out of bounds shuffle index");
17683
17684 // We actually see shuffles that are entirely re-arrangements of a set of
17685 // zero inputs. This mostly happens while decomposing complex shuffles into
17686 // simple ones. Directly lower these as a buildvector of zeros.
17687 APInt KnownUndef, KnownZero;
17688 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17689
17690 APInt Zeroable = KnownUndef | KnownZero;
17691 if (Zeroable.isAllOnes())
17692 return getZeroVector(VT, Subtarget, DAG, DL);
17693
17694 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17695
17696 // Try to collapse shuffles into using a vector type with fewer elements but
17697 // wider element types. We cap this to not form integers or floating point
17698 // elements wider than 64 bits. It does not seem beneficial to form i128
17699 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
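  // Illustrative example (editorial addition, not in the original source): a
  // v8i32 mask <0,1,2,3,8,9,10,11> moves adjacent pairs together, so it can be
  // widened to the v4i64 mask <0,1,4,5>, which is cheaper to match.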
17700 SmallVector<int, 16> WidenedMask;
17701 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17702 !canCombineAsMaskOperation(V1, Subtarget) &&
17703 !canCombineAsMaskOperation(V2, Subtarget) &&
17704 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17705 // Shuffle mask widening should not interfere with a broadcast opportunity
17706 // by obfuscating the operands with bitcasts.
17707 // TODO: Avoid lowering directly from this top-level function: make this
17708 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17709 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17710 Subtarget, DAG))
17711 return Broadcast;
17712
17713 MVT NewEltVT = VT.isFloatingPoint()
17714                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17715                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17716    int NewNumElts = NumElements / 2;
17717 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17718 // Make sure that the new vector type is legal. For example, v2f64 isn't
17719 // legal on SSE1.
17720 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17721 if (V2IsZero) {
17722 // Modify the new Mask to take all zeros from the all-zero vector.
17723 // Choose indices that are blend-friendly.
17724 bool UsedZeroVector = false;
17725 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17726 "V2's non-undef elements are used?!");
17727 for (int i = 0; i != NewNumElts; ++i)
17728 if (WidenedMask[i] == SM_SentinelZero) {
17729 WidenedMask[i] = i + NewNumElts;
17730 UsedZeroVector = true;
17731 }
17732 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17733 // some elements to be undef.
17734 if (UsedZeroVector)
17735 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17736 }
17737 V1 = DAG.getBitcast(NewVT, V1);
17738 V2 = DAG.getBitcast(NewVT, V2);
17739 return DAG.getBitcast(
17740 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17741 }
17742 }
17743
17744 SmallVector<SDValue> Ops = {V1, V2};
17745 SmallVector<int> Mask(OrigMask);
17746
17747 // Canonicalize the shuffle with any horizontal ops inputs.
17748 // NOTE: This may update Ops and Mask.
17749  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17750          Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17751 return DAG.getBitcast(VT, HOp);
17752
17753 V1 = DAG.getBitcast(VT, Ops[0]);
17754 V2 = DAG.getBitcast(VT, Ops[1]);
17755 assert(NumElements == (int)Mask.size() &&
17756 "canonicalizeShuffleMaskWithHorizOp "
17757 "shouldn't alter the shuffle mask size");
17758
17759 // Commute the shuffle if it will improve canonicalization.
17760  if (canonicalizeShuffleMaskWithCommute(Mask)) {
17761    ShuffleVectorSDNode::commuteMask(Mask);
17762    std::swap(V1, V2);
17763 }
17764
17765 // For each vector width, delegate to a specialized lowering routine.
17766 if (VT.is128BitVector())
17767 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17768
17769 if (VT.is256BitVector())
17770 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17771
17772 if (VT.is512BitVector())
17773 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17774
17775 if (Is1BitVector)
17776 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17777
17778 llvm_unreachable("Unimplemented!");
17779}
17780
17781/// Try to lower a VSELECT instruction to a vector shuffle.
17782static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17783                                           const X86Subtarget &Subtarget,
17784 SelectionDAG &DAG) {
17785 SDValue Cond = Op.getOperand(0);
17786 SDValue LHS = Op.getOperand(1);
17787 SDValue RHS = Op.getOperand(2);
17788 MVT VT = Op.getSimpleValueType();
17789
17790 // Only non-legal VSELECTs reach this lowering, convert those into generic
17791 // shuffles and re-use the shuffle lowering path for blends.
17792  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17793    SmallVector<int, 32> Mask;
17794    if (createShuffleMaskFromVSELECT(Mask, Cond))
17795      return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17796 }
17797
17798 return SDValue();
17799}
17800
17801SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17802 SDValue Cond = Op.getOperand(0);
17803 SDValue LHS = Op.getOperand(1);
17804 SDValue RHS = Op.getOperand(2);
17805
17806 SDLoc dl(Op);
17807 MVT VT = Op.getSimpleValueType();
17808 if (isSoftF16(VT, Subtarget)) {
17809    MVT NVT = VT.changeVectorElementTypeToInteger();
17810    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17811 DAG.getBitcast(NVT, LHS),
17812 DAG.getBitcast(NVT, RHS)));
17813 }
17814
17815 // A vselect where all conditions and data are constants can be optimized into
17816 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17817  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17818      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17819      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17820    return SDValue();
17821
17822 // Try to lower this to a blend-style vector shuffle. This can handle all
17823 // constant condition cases.
17824 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17825 return BlendOp;
17826
17827  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17828 // with patterns on the mask registers on AVX-512.
17829 MVT CondVT = Cond.getSimpleValueType();
17830 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17831 if (CondEltSize == 1)
17832 return Op;
17833
17834 // Variable blends are only legal from SSE4.1 onward.
17835 if (!Subtarget.hasSSE41())
17836 return SDValue();
17837
17838 unsigned EltSize = VT.getScalarSizeInBits();
17839 unsigned NumElts = VT.getVectorNumElements();
17840
17841 // Expand v32i16/v64i8 without BWI.
17842 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17843 return SDValue();
17844
17845 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17846 // into an i1 condition so that we can use the mask-based 512-bit blend
17847 // instructions.
17848 if (VT.getSizeInBits() == 512) {
17849 // Build a mask by testing the condition against zero.
17850 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17851 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17852 DAG.getConstant(0, dl, CondVT),
17853 ISD::SETNE);
17854 // Now return a new VSELECT using the mask.
17855 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17856 }
17857
17858 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17859 if (CondEltSize != EltSize) {
17860 // If we don't have a sign splat, rely on the expansion.
17861 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17862 return SDValue();
17863
17864 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17865 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17866 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17867 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17868 }
17869
17870  // For v16i16/v32i8 selects without AVX2, if the condition and another operand
17871  // are free to split, then it is better to split before expanding the
17872 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
17873 // TODO: This is very similar to narrowVectorSelect.
17874 // TODO: Add Load splitting to isFreeToSplitVector ?
17875 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
17876 !Subtarget.hasXOP()) {
17877 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
17878 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
17879 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
17880 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
17881 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
17882 if (FreeCond && (FreeLHS || FreeRHS))
17883 return splitVectorOp(Op, DAG, dl);
17884 }
17885
17886 // Only some types will be legal on some subtargets. If we can emit a legal
17887  // VSELECT-matching blend, return Op; but if we need to expand, return
17888 // a null value.
17889 switch (VT.SimpleTy) {
17890 default:
17891 // Most of the vector types have blends past SSE4.1.
17892 return Op;
17893
17894 case MVT::v32i8:
17895 // The byte blends for AVX vectors were introduced only in AVX2.
17896 if (Subtarget.hasAVX2())
17897 return Op;
17898
17899 return SDValue();
17900
17901 case MVT::v8i16:
17902 case MVT::v16i16: {
17903 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17904 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17905 Cond = DAG.getBitcast(CastVT, Cond);
17906 LHS = DAG.getBitcast(CastVT, LHS);
17907 RHS = DAG.getBitcast(CastVT, RHS);
17908 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17909 return DAG.getBitcast(VT, Select);
17910 }
17911 }
17912}
17913
17914static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17915  MVT VT = Op.getSimpleValueType();
17916 SDValue Vec = Op.getOperand(0);
17917 SDValue Idx = Op.getOperand(1);
17918 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17919 SDLoc dl(Op);
17920
17921  if (!Vec.getSimpleValueType().is128BitVector())
17922    return SDValue();
17923
17924 if (VT.getSizeInBits() == 8) {
17925 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17926 // we're going to zero extend the register or fold the store.
17927    if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17928        !X86::mayFoldIntoStore(Op))
17929      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17930 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17931 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17932
17933 unsigned IdxVal = Idx->getAsZExtVal();
17934 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17935 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17936 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17937 }
17938
17939 if (VT == MVT::f32) {
17940 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17941    // the result back to an FR32 register. It's only worth matching if the
17942 // result has a single use which is a store or a bitcast to i32. And in
17943 // the case of a store, it's not worth it if the index is a constant 0,
17944 // because a MOVSSmr can be used instead, which is smaller and faster.
17945 if (!Op.hasOneUse())
17946 return SDValue();
17947 SDNode *User = *Op.getNode()->use_begin();
17948 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17949 (User->getOpcode() != ISD::BITCAST ||
17950 User->getValueType(0) != MVT::i32))
17951 return SDValue();
17952 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17953 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17954 return DAG.getBitcast(MVT::f32, Extract);
17955 }
17956
17957 if (VT == MVT::i32 || VT == MVT::i64)
17958 return Op;
17959
17960 return SDValue();
17961}
17962
17963/// Extract one bit from mask vector, like v16i1 or v8i1.
17964/// AVX-512 feature.
17965static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17966                                        const X86Subtarget &Subtarget) {
17967 SDValue Vec = Op.getOperand(0);
17968 SDLoc dl(Vec);
17969 MVT VecVT = Vec.getSimpleValueType();
17970 SDValue Idx = Op.getOperand(1);
17971 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17972 MVT EltVT = Op.getSimpleValueType();
17973
17974 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17975 "Unexpected vector type in ExtractBitFromMaskVector");
17976
17977  // A variable index can't be handled in mask registers;
17978  // extend the vector to VR512/128.
17979 if (!IdxC) {
17980 unsigned NumElts = VecVT.getVectorNumElements();
17981    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17982    // than extending to 128/256-bit.
17983 if (NumElts == 1) {
17984 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17985      MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17986      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17987 }
17988 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17989 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17990 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17991 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17992 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17993 }
17994
17995 unsigned IdxVal = IdxC->getZExtValue();
17996 if (IdxVal == 0) // the operation is legal
17997 return Op;
17998
17999 // Extend to natively supported kshift.
18000 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18001
18002 // Use kshiftr instruction to move to the lower element.
18003 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18004 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18005
18006 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18007 DAG.getIntPtrConstant(0, dl));
18008}
18009
18010// Helper to find all the extracted elements from a vector.
18011static APInt getExtractedDemandedElts(SDNode *N) {
18012  MVT VT = N->getSimpleValueType(0);
18013 unsigned NumElts = VT.getVectorNumElements();
18014 APInt DemandedElts = APInt::getZero(NumElts);
18015 for (SDNode *User : N->uses()) {
18016 switch (User->getOpcode()) {
18017 case X86ISD::PEXTRB:
18018 case X86ISD::PEXTRW:
18019    case ISD::EXTRACT_VECTOR_ELT:
18020      if (!isa<ConstantSDNode>(User->getOperand(1))) {
18021 DemandedElts.setAllBits();
18022 return DemandedElts;
18023 }
18024 DemandedElts.setBit(User->getConstantOperandVal(1));
18025 break;
18026 case ISD::BITCAST: {
18027 if (!User->getValueType(0).isSimple() ||
18028 !User->getValueType(0).isVector()) {
18029 DemandedElts.setAllBits();
18030 return DemandedElts;
18031 }
18032 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18033 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18034 break;
18035 }
18036 default:
18037 DemandedElts.setAllBits();
18038 return DemandedElts;
18039 }
18040 }
18041 return DemandedElts;
18042}
18043
18044SDValue
18045X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18046 SelectionDAG &DAG) const {
18047 SDLoc dl(Op);
18048 SDValue Vec = Op.getOperand(0);
18049 MVT VecVT = Vec.getSimpleValueType();
18050 SDValue Idx = Op.getOperand(1);
18051 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18052
18053 if (VecVT.getVectorElementType() == MVT::i1)
18054 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18055
18056 if (!IdxC) {
18057    // It's more profitable to go through memory (1 cycle throughput)
18058    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18059    // The IACA tool was used to get the performance estimate
18060 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18061 //
18062 // example : extractelement <16 x i8> %a, i32 %i
18063 //
18064 // Block Throughput: 3.00 Cycles
18065 // Throughput Bottleneck: Port5
18066 //
18067 // | Num Of | Ports pressure in cycles | |
18068 // | Uops | 0 - DV | 5 | 6 | 7 | |
18069 // ---------------------------------------------
18070 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18071 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18072 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18073 // Total Num Of Uops: 4
18074 //
18075 //
18076 // Block Throughput: 1.00 Cycles
18077 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18078 //
18079 // | | Ports pressure in cycles | |
18080 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18081 // ---------------------------------------------------------
18082 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18083 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18084 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18085 // Total Num Of Uops: 4
18086
18087 return SDValue();
18088 }
18089
18090 unsigned IdxVal = IdxC->getZExtValue();
18091
18092 // If this is a 256-bit vector result, first extract the 128-bit vector and
18093 // then extract the element from the 128-bit vector.
18094 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18095 // Get the 128-bit vector.
18096 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18097 MVT EltVT = VecVT.getVectorElementType();
18098
18099 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18100 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18101
18102 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18103 // this can be done with a mask.
18104 IdxVal &= ElemsPerChunk - 1;
18105 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18106 DAG.getIntPtrConstant(IdxVal, dl));
18107 }
18108
18109 assert(VecVT.is128BitVector() && "Unexpected vector length");
18110
18111 MVT VT = Op.getSimpleValueType();
18112
18113 if (VT == MVT::i16) {
18114 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18115 // we're going to zero extend the register or fold the store (SSE41 only).
18116 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18117 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18118 if (Subtarget.hasFP16())
18119 return Op;
18120
18121 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18122 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18123 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18124 }
18125
18126 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18127 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18128 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18129 }
18130
18131 if (Subtarget.hasSSE41())
18132 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18133 return Res;
18134
18135 // Only extract a single element from a v16i8 source - determine the common
18136 // DWORD/WORD that all extractions share, and extract the sub-byte.
18137 // TODO: Add QWORD MOVQ extraction?
18138 if (VT == MVT::i8) {
18139 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18140 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18141
18142 // Extract either the lowest i32 or any i16, and extract the sub-byte.
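    // Illustrative example (editorial addition, not in the original source):
    // extracting byte 5 of a v16i8, when only that byte is demanded, becomes
    // an i16 extract of word 2 followed by a logical shift right by 8 and a
    // truncate to i8.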
18143 int DWordIdx = IdxVal / 4;
18144 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18145 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18146 DAG.getBitcast(MVT::v4i32, Vec),
18147 DAG.getIntPtrConstant(DWordIdx, dl));
18148 int ShiftVal = (IdxVal % 4) * 8;
18149 if (ShiftVal != 0)
18150 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18151 DAG.getConstant(ShiftVal, dl, MVT::i8));
18152 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18153 }
18154
18155 int WordIdx = IdxVal / 2;
18156 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18157 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18158 DAG.getBitcast(MVT::v8i16, Vec),
18159 DAG.getIntPtrConstant(WordIdx, dl));
18160 int ShiftVal = (IdxVal % 2) * 8;
18161 if (ShiftVal != 0)
18162 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18163 DAG.getConstant(ShiftVal, dl, MVT::i8));
18164 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18165 }
18166 }
18167
18168 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18169 if (IdxVal == 0)
18170 return Op;
18171
18172 // Shuffle the element to the lowest element, then movss or movsh.
18173    SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18174    Mask[0] = static_cast<int>(IdxVal);
18175 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18176 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18177 DAG.getIntPtrConstant(0, dl));
18178 }
18179
18180 if (VT.getSizeInBits() == 64) {
18181 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18182 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18183 // to match extract_elt for f64.
18184 if (IdxVal == 0)
18185 return Op;
18186
18187 // UNPCKHPD the element to the lowest double word, then movsd.
18188 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18189 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18190 int Mask[2] = { 1, -1 };
18191 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18193 DAG.getIntPtrConstant(0, dl));
18194 }
18195
18196 return SDValue();
18197}
18198
18199/// Insert one bit to mask vector, like v16i1 or v8i1.
18200/// AVX-512 feature.
18201static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18202                                     const X86Subtarget &Subtarget) {
18203 SDLoc dl(Op);
18204 SDValue Vec = Op.getOperand(0);
18205 SDValue Elt = Op.getOperand(1);
18206 SDValue Idx = Op.getOperand(2);
18207 MVT VecVT = Vec.getSimpleValueType();
18208
18209 if (!isa<ConstantSDNode>(Idx)) {
18210    // Non-constant index. Extend the source and destination,
18211    // insert the element and then truncate the result.
18212 unsigned NumElts = VecVT.getVectorNumElements();
18213 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18214 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18215 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18216 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18217 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18218 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18219 }
18220
18221 // Copy into a k-register, extract to v1i1 and insert_subvector.
18222 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18223 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18224}
18225
18226SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18227 SelectionDAG &DAG) const {
18228 MVT VT = Op.getSimpleValueType();
18229 MVT EltVT = VT.getVectorElementType();
18230 unsigned NumElts = VT.getVectorNumElements();
18231 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18232
18233 if (EltVT == MVT::i1)
18234 return InsertBitToMaskVector(Op, DAG, Subtarget);
18235
18236 SDLoc dl(Op);
18237 SDValue N0 = Op.getOperand(0);
18238 SDValue N1 = Op.getOperand(1);
18239 SDValue N2 = Op.getOperand(2);
18240 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18241
18242 if (EltVT == MVT::bf16) {
18243    MVT IVT = VT.changeVectorElementTypeToInteger();
18244    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18245 DAG.getBitcast(IVT, N0),
18246 DAG.getBitcast(MVT::i16, N1), N2);
18247 return DAG.getBitcast(VT, Res);
18248 }
18249
18250 if (!N2C) {
18251    // With variable insertion indices we're usually better off spilling to stack,
18252 // but AVX512 can use a variable compare+select by comparing against all
18253 // possible vector indices, and FP insertion has less gpr->simd traffic.
18254 if (!(Subtarget.hasBWI() ||
18255 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18256 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18257 return SDValue();
18258
18259 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18260 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18261 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18262 return SDValue();
18263
18264 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18265 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18266 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18267
18268 SmallVector<SDValue, 16> RawIndices;
18269 for (unsigned I = 0; I != NumElts; ++I)
18270 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18271 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18272
18273 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18274 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18275                           ISD::CondCode::SETEQ);
18276  }
18277
18278 if (N2C->getAPIntValue().uge(NumElts))
18279 return SDValue();
18280 uint64_t IdxVal = N2C->getZExtValue();
18281
18282 bool IsZeroElt = X86::isZeroNode(N1);
18283 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18284
18285 if (IsZeroElt || IsAllOnesElt) {
18286    // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18287 // We don't deal with i8 0 since it appears to be handled elsewhere.
18288 if (IsAllOnesElt &&
18289 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18290 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18291 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18292 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18293 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18294 CstVectorElts[IdxVal] = OnesCst;
18295 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18296 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18297 }
18298 // See if we can do this more efficiently with a blend shuffle with a
18299 // rematerializable vector.
18300 if (Subtarget.hasSSE41() &&
18301 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18302 SmallVector<int, 8> BlendMask;
18303 for (unsigned i = 0; i != NumElts; ++i)
18304 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18305 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18306 : getOnesVector(VT, DAG, dl);
18307 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18308 }
18309 }
18310
18311 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18312 // into that, and then insert the subvector back into the result.
18313 if (VT.is256BitVector() || VT.is512BitVector()) {
18314 // With a 256-bit vector, we can insert into the zero element efficiently
18315 // using a blend if we have AVX or AVX2 and the right data type.
18316 if (VT.is256BitVector() && IdxVal == 0) {
18317 // TODO: It is worthwhile to cast integer to floating point and back
18318 // and incur a domain crossing penalty if that's what we'll end up
18319 // doing anyway after extracting to a 128-bit vector.
18320 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18321 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18322 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18323 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18324 DAG.getTargetConstant(1, dl, MVT::i8));
18325 }
18326 }
18327
18328 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18329 assert(isPowerOf2_32(NumEltsIn128) &&
18330 "Vectors will always have power-of-two number of elements.");
18331
18332 // If we are not inserting into the low 128-bit vector chunk,
18333 // then prefer the broadcast+blend sequence.
18334 // FIXME: relax the profitability check iff all N1 uses are insertions.
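    // Illustrative example (editorial addition, not in the original source):
    // inserting into lane 9 of a v16i16 on AVX2 becomes a broadcast of the
    // scalar followed by a blend whose mask takes element 9 from the broadcast
    // and every other element from N0.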
18335 if (IdxVal >= NumEltsIn128 &&
18336 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18337 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18338 X86::mayFoldLoad(N1, Subtarget)))) {
18339 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18340 SmallVector<int, 8> BlendMask;
18341 for (unsigned i = 0; i != NumElts; ++i)
18342 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18343 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18344 }
18345
18346 // Get the desired 128-bit vector chunk.
18347 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18348
18349 // Insert the element into the desired chunk.
18350 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18351 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18352
18353 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18354 DAG.getIntPtrConstant(IdxIn128, dl));
18355
18356 // Insert the changed part back into the bigger vector
18357 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18358 }
18359 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18360
18361 // This will be just movw/movd/movq/movsh/movss/movsd.
18362 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18363 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18364 EltVT == MVT::f16 || EltVT == MVT::i64) {
18365 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18366 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18367 }
18368
18369 // We can't directly insert an i8 or i16 into a vector, so zero extend
18370 // it to i32 first.
18371 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18372 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18373 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18374 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18375 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18376 return DAG.getBitcast(VT, N1);
18377 }
18378 }
18379
18380  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18381 // argument. SSE41 required for pinsrb.
18382 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18383 unsigned Opc;
18384 if (VT == MVT::v8i16) {
18385 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18386 Opc = X86ISD::PINSRW;
18387 } else {
18388 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18389 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18390 Opc = X86ISD::PINSRB;
18391 }
18392
18393 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18394 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18395 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18396 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18397 }
18398
18399 if (Subtarget.hasSSE41()) {
18400 if (EltVT == MVT::f32) {
18401 // Bits [7:6] of the constant are the source select. This will always be
18402 // zero here. The DAG Combiner may combine an extract_elt index into
18403 // these bits. For example (insert (extract, 3), 2) could be matched by
18404 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18405 // Bits [5:4] of the constant are the destination select. This is the
18406 // value of the incoming immediate.
18407 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18408 // combine either bitwise AND or insert of float 0.0 to set these bits.
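      // Illustrative example (editorial addition, not in the original source):
      // inserting into element 2 yields an INSERTPS immediate of 0x20
      // (IdxVal << 4): source select 0, destination select 2, zero mask 0.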
18409
18410 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18411 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18412 // If this is an insertion of 32-bits into the low 32-bits of
18413 // a vector, we prefer to generate a blend with immediate rather
18414 // than an insertps. Blends are simpler operations in hardware and so
18415 // will always have equal or better performance than insertps.
18416 // But if optimizing for size and there's a load folding opportunity,
18417 // generate insertps because blendps does not have a 32-bit memory
18418 // operand form.
18419 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18420 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18421 DAG.getTargetConstant(1, dl, MVT::i8));
18422 }
18423      // Create this as a scalar to vector.
18424 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18425 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18426 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18427 }
18428
18429 // PINSR* works with constant index.
18430 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18431 return Op;
18432 }
18433
18434 return SDValue();
18435}
18436
18437static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18438                                     SelectionDAG &DAG) {
18439 SDLoc dl(Op);
18440 MVT OpVT = Op.getSimpleValueType();
18441
18442  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18443  // further combines.
18444 if (X86::isZeroNode(Op.getOperand(0)))
18445 return getZeroVector(OpVT, Subtarget, DAG, dl);
18446
18447 // If this is a 256-bit vector result, first insert into a 128-bit
18448 // vector and then insert into the 256-bit vector.
18449 if (!OpVT.is128BitVector()) {
18450 // Insert into a 128-bit vector.
18451 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18452    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18453                                 OpVT.getVectorNumElements() / SizeFactor);
18454
18455 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18456
18457 // Insert the 128-bit vector.
18458 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18459 }
18460 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18461 "Expected an SSE type!");
18462
18463  // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18464 // tblgen.
18465 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18466 return Op;
18467
18468 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18469 return DAG.getBitcast(
18470 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18471}
18472
18473// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18474// simple superregister reference or explicit instructions to insert
18475// the upper bits of a vector.
18476static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18477                                     SelectionDAG &DAG) {
18478 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18479
18480 return insert1BitVector(Op, DAG, Subtarget);
18481}
18482
18483static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18484                                      SelectionDAG &DAG) {
18485 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18486 "Only vXi1 extract_subvectors need custom lowering");
18487
18488 SDLoc dl(Op);
18489 SDValue Vec = Op.getOperand(0);
18490 uint64_t IdxVal = Op.getConstantOperandVal(1);
18491
18492 if (IdxVal == 0) // the operation is legal
18493 return Op;
18494
18495 // Extend to natively supported kshift.
18496 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18497
18498 // Shift to the LSB.
18499 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18500 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18501
18502 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18503 DAG.getIntPtrConstant(0, dl));
18504}
18505
18506// Returns the appropriate wrapper opcode for a global reference.
18507unsigned X86TargetLowering::getGlobalWrapperKind(
18508 const GlobalValue *GV, const unsigned char OpFlags) const {
18509 // References to absolute symbols are never PC-relative.
18510 if (GV && GV->isAbsoluteSymbolRef())
18511 return X86ISD::Wrapper;
18512
18513 // The following OpFlags under RIP-rel PIC use RIP.
18514 if (Subtarget.isPICStyleRIPRel() &&
18515 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18516 OpFlags == X86II::MO_DLLIMPORT))
18517 return X86ISD::WrapperRIP;
18518
18519 // GOTPCREL references must always use RIP.
18520 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18521 return X86ISD::WrapperRIP;
18522
18523 return X86ISD::Wrapper;
18524}
18525
18526// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18527// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18528// one of the above-mentioned nodes. It has to be wrapped because otherwise
18529// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18530// be used to form an addressing mode. These wrapped nodes will be selected
18531// into MOV32ri.
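// Illustrative example (editorial addition, not in the original source): a
// ConstantPool node is lowered below to Wrapper(TargetConstantPool); under
// non-RIP-relative PIC an explicit ADD of the GlobalBaseReg is then emitted
// on top of the wrapper.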
18532SDValue
18533X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18534 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18535
18536 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18537 // global base reg.
18538 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18539
18540 auto PtrVT = getPointerTy(DAG.getDataLayout());
18541  SDValue Result = DAG.getTargetConstantPool(
18542      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18543 SDLoc DL(CP);
18544 Result =
18545 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18546 // With PIC, the address is actually $g + Offset.
18547 if (OpFlag) {
18548 Result =
18549 DAG.getNode(ISD::ADD, DL, PtrVT,
18550 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18551 }
18552
18553 return Result;
18554}
18555
18556SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18557 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18558
18559 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18560 // global base reg.
18561 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18562
18563 auto PtrVT = getPointerTy(DAG.getDataLayout());
18564 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18565 SDLoc DL(JT);
18566 Result =
18567 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18568
18569 // With PIC, the address is actually $g + Offset.
18570 if (OpFlag)
18571 Result =
18572 DAG.getNode(ISD::ADD, DL, PtrVT,
18573 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18574
18575 return Result;
18576}
18577
18578SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18579 SelectionDAG &DAG) const {
18580 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18581}
18582
18583SDValue
18584X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18585 // Create the TargetBlockAddressAddress node.
18586 unsigned char OpFlags =
18587      Subtarget.classifyBlockAddressReference();
18588  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18589 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18590 SDLoc dl(Op);
18591 auto PtrVT = getPointerTy(DAG.getDataLayout());
18592 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18593 Result =
18594 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18595
18596 // With PIC, the address is actually $g + Offset.
18597 if (isGlobalRelativeToPICBase(OpFlags)) {
18598 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18599 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18600 }
18601
18602 return Result;
18603}
18604
18605/// Creates target global address or external symbol nodes for calls or
18606/// other uses.
18607SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18608 bool ForCall) const {
18609 // Unpack the global address or external symbol.
18610 SDLoc dl(Op);
18611 const GlobalValue *GV = nullptr;
18612 int64_t Offset = 0;
18613 const char *ExternalSym = nullptr;
18614 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18615 GV = G->getGlobal();
18616 Offset = G->getOffset();
18617 } else {
18618 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18619 ExternalSym = ES->getSymbol();
18620 }
18621
18622 // Calculate some flags for address lowering.
18623  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18624  unsigned char OpFlags;
18625 if (ForCall)
18626 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18627 else
18628 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18629 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18630 bool NeedsLoad = isGlobalStubReference(OpFlags);
18631
18632  CodeModel::Model M = DAG.getTarget().getCodeModel();
18633  auto PtrVT = getPointerTy(DAG.getDataLayout());
18634  SDValue Result;
18635
18636 if (GV) {
18637 // Create a target global address if this is a global. If possible, fold the
18638 // offset into the global address reference. Otherwise, ADD it on later.
18639 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18640 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18641 // relocation will compute to a negative value, which is invalid.
18642 int64_t GlobalOffset = 0;
18643 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18644        X86::isOffsetSuitableForCodeModel(Offset, M)) {
18645      std::swap(GlobalOffset, Offset);
18646 }
18647 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18648 } else {
18649 // If this is not a global address, this must be an external symbol.
18650 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18651 }
18652
18653 // If this is a direct call, avoid the wrapper if we don't need to do any
18654 // loads or adds. This allows SDAG ISel to match direct calls.
18655 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18656 return Result;
18657
18658 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18659
18660 // With PIC, the address is actually $g + Offset.
18661 if (HasPICReg) {
18662 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18663 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18664 }
18665
18666 // For globals that require a load from a stub to get the address, emit the
18667 // load.
18668 if (NeedsLoad)
18669 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18670 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18671
18672 // If there was a non-zero offset that we didn't fold, create an explicit
18673 // addition for it.
18674 if (Offset != 0)
18675 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18676 DAG.getConstant(Offset, dl, PtrVT));
18677
18678 return Result;
18679}
18680
18681SDValue
18682X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18683 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18684}
18685
18686 static SDValue
18687 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18688 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18689 unsigned char OperandFlags, bool LocalDynamic = false) {
18690 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18691 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18692 SDLoc dl(GA);
18693 SDValue TGA;
18694 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18695 if (LocalDynamic && UseTLSDESC) {
18696 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18697 auto UI = TGA->use_begin();
18698 // Reuse existing GetTLSADDR node if we can find it.
18699 if (UI != TGA->use_end())
18700 return SDValue(*UI->use_begin()->use_begin(), 0);
18701 } else {
18702 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18703 GA->getOffset(), OperandFlags);
18704 }
18705
18706 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18707 : LocalDynamic ? X86ISD::TLSBASEADDR
18708 : X86ISD::TLSADDR;
18709
18710 if (InGlue) {
18711 SDValue Ops[] = { Chain, TGA, *InGlue };
18712 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18713 } else {
18714 SDValue Ops[] = { Chain, TGA };
18715 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18716 }
18717
18718 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
18719 MFI.setAdjustsStack(true);
18720 MFI.setHasCalls(true);
18721
18722 SDValue Glue = Chain.getValue(1);
18723 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18724
18725 if (!UseTLSDESC)
18726 return Ret;
18727
18728 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18729 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18730
18731 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18732 SDValue Offset =
18733 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18734 MachinePointerInfo(Ptr));
18735 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18736}
18737
18738// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18739 static SDValue
18740 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18741 const EVT PtrVT) {
18742 SDValue InGlue;
18743 SDLoc dl(GA); // ? function entry point might be better
18744 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18745 DAG.getNode(X86ISD::GlobalBaseReg,
18746 SDLoc(), PtrVT), InGlue);
18747 InGlue = Chain.getValue(1);
18748
18749 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18750}
18751
18752// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18753 static SDValue
18754 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18755 const EVT PtrVT) {
18756 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18757 X86::RAX, X86II::MO_TLSGD);
18758}
18759
18760// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18761 static SDValue
18762 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18763 const EVT PtrVT) {
18764 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18765 X86::EAX, X86II::MO_TLSGD);
18766}
18767
18768 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18769 SelectionDAG &DAG, const EVT PtrVT,
18770 bool Is64Bit, bool Is64BitLP64) {
18771 SDLoc dl(GA);
18772
18773 // Get the start address of the TLS block for this module.
18774 X86MachineFunctionInfo *MFI =
18775 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
18776 MFI->incNumLocalDynamicTLSAccesses();
18777
18778 SDValue Base;
18779 if (Is64Bit) {
18780 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18781 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18782 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18783 } else {
18784 SDValue InGlue;
18785 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18786 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18787 InGlue = Chain.getValue(1);
18788 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18789 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18790 }
18791
18792 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18793 // of Base.
18794
18795 // Build x@dtpoff.
18796 unsigned char OperandFlags = X86II::MO_DTPOFF;
18797 unsigned WrapperKind = X86ISD::Wrapper;
18798 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18799 GA->getValueType(0),
18800 GA->getOffset(), OperandFlags);
18801 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18802
18803 // Add x@dtpoff with the base.
18804 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18805}
18806
18807// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18808 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18809 const EVT PtrVT, TLSModel::Model model,
18810 bool is64Bit, bool isPIC) {
18811 SDLoc dl(GA);
18812
18813 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18814 Value *Ptr = Constant::getNullValue(
18815 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18816
18817 SDValue ThreadPointer =
18818 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18819 MachinePointerInfo(Ptr));
18820
18821 unsigned char OperandFlags = 0;
18822 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18823 // initialexec.
18824 unsigned WrapperKind = X86ISD::Wrapper;
18825 if (model == TLSModel::LocalExec) {
18826 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18827 } else if (model == TLSModel::InitialExec) {
18828 if (is64Bit) {
18829 OperandFlags = X86II::MO_GOTTPOFF;
18830 WrapperKind = X86ISD::WrapperRIP;
18831 } else {
18832 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18833 }
18834 } else {
18835 llvm_unreachable("Unexpected model");
18836 }
18837
18838 // emit "addl x@ntpoff,%eax" (local exec)
18839 // or "addl x@indntpoff,%eax" (initial exec)
18840 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18841 SDValue TGA =
18842 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18843 GA->getOffset(), OperandFlags);
18844 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18845
18846 if (model == TLSModel::InitialExec) {
18847 if (isPIC && !is64Bit) {
18848 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18849 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18850 Offset);
18851 }
18852
18853 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18854 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18855 }
18856
18857 // The address of the thread local variable is the add of the thread
18858 // pointer with the offset of the variable.
18859 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18860}
18861
18862SDValue
18863X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18864
18865 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18866
18867 if (DAG.getTarget().useEmulatedTLS())
18868 return LowerToTLSEmulatedModel(GA, DAG);
18869
18870 const GlobalValue *GV = GA->getGlobal();
18871 auto PtrVT = getPointerTy(DAG.getDataLayout());
18872 bool PositionIndependent = isPositionIndependent();
18873
18874 if (Subtarget.isTargetELF()) {
18875 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18876 switch (model) {
18877 case TLSModel::GeneralDynamic:
18878 if (Subtarget.is64Bit()) {
18879 if (Subtarget.isTarget64BitLP64())
18880 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18881 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18882 }
18883 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18884 case TLSModel::LocalDynamic:
18885 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18886 Subtarget.isTarget64BitLP64());
18887 case TLSModel::InitialExec:
18888 case TLSModel::LocalExec:
18889 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18890 PositionIndependent);
18891 }
18892 llvm_unreachable("Unknown TLS model.");
18893 }
18894
18895 if (Subtarget.isTargetDarwin()) {
18896 // Darwin only has one model of TLS. Lower to that.
18897 unsigned char OpFlag = 0;
18898 unsigned WrapperKind = 0;
18899
18900 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18901 // global base reg.
18902 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18903 if (PIC32) {
18904 OpFlag = X86II::MO_TLVP_PIC_BASE;
18905 WrapperKind = X86ISD::Wrapper;
18906 } else {
18907 OpFlag = X86II::MO_TLVP;
18908 WrapperKind = X86ISD::WrapperRIP;
18909 }
18910 SDLoc DL(Op);
18911 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18912 GA->getValueType(0),
18913 GA->getOffset(), OpFlag);
18914 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18915
18916 // With PIC32, the address is actually $g + Offset.
18917 if (PIC32)
18918 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18919 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18920 Offset);
18921
18922 // Lowering the machine isd will make sure everything is in the right
18923 // location.
18924 SDValue Chain = DAG.getEntryNode();
18925 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18926 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18927 SDValue Args[] = { Chain, Offset };
18928 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18929 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18930
18931 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18932 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18933 MFI.setAdjustsStack(true);
18934
18935 // And our return value (tls address) is in the standard call return value
18936 // location.
18937 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18938 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18939 }
18940
18941 if (Subtarget.isOSWindows()) {
18942 // Just use the implicit TLS architecture
18943 // Need to generate something similar to:
18944 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18945 // ; from TEB
18946 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18947 // mov rcx, qword [rdx+rcx*8]
18948 // mov eax, .tls$:tlsvar
18949 // [rax+rcx] contains the address
18950 // Windows 64bit: gs:0x58
18951 // Windows 32bit: fs:__tls_array
18952
18953 SDLoc dl(GA);
18954 SDValue Chain = DAG.getEntryNode();
18955
18956 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18957 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18958 // use its literal value of 0x2C.
18959 Value *Ptr = Constant::getNullValue(
18960 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18961 : PointerType::get(*DAG.getContext(), 257));
18962
18963 SDValue TlsArray = Subtarget.is64Bit()
18964 ? DAG.getIntPtrConstant(0x58, dl)
18965 : (Subtarget.isTargetWindowsGNU()
18966 ? DAG.getIntPtrConstant(0x2C, dl)
18967 : DAG.getExternalSymbol("_tls_array", PtrVT));
18968
18969 SDValue ThreadPointer =
18970 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18971
18972 SDValue res;
18973 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18974 res = ThreadPointer;
18975 } else {
18976 // Load the _tls_index variable
18977 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18978 if (Subtarget.is64Bit())
18979 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18980 MachinePointerInfo(), MVT::i32);
18981 else
18982 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18983
18984 const DataLayout &DL = DAG.getDataLayout();
18985 SDValue Scale =
18986 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18987 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18988
18989 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18990 }
18991
18992 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18993
18994 // Get the offset of start of .tls section
18995 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18996 GA->getValueType(0),
18997 GA->getOffset(), X86II::MO_SECREL);
18998 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18999
19000 // The address of the thread local variable is the add of the thread
19001 // pointer with the offset of the variable.
19002 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19003 }
19004
19005 llvm_unreachable("TLS not implemented for this target.");
19006}
19007
19008 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19009 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19010 const TargetMachine &TM = getTargetMachine();
19011 TLSModel::Model Model = TM.getTLSModel(&GV);
19012 switch (Model) {
19013 case TLSModel::LocalExec:
19014 case TLSModel::InitialExec:
19015 // We can include the %fs segment register in addressing modes.
19016 return true;
19017 case TLSModel::GeneralDynamic:
19018 case TLSModel::LocalDynamic:
19019 // These models do not result in %fs-relative addresses unless
19020 // TLS descriptors are used.
19021 //
19022 // Even with TLS descriptors we currently have no way to model the
19023 // difference between the %fs access and the computation of the offset;
19024 // returning `true` for TLS-desc would duplicate both, which is
19025 // detrimental :-/
19026 return false;
19027 }
19028 }
19029 return false;
19030}
19031
19032/// Lower SRA_PARTS and friends, which return two i32 values
19033/// and take a 2 x i32 value to shift plus a shift amount.
19034/// TODO: Can this be moved to general expansion code?
19035 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19036 SDValue Lo, Hi;
19037 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19038 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19039}
19040
19041// Try to use a packed vector operation to handle i64 on 32-bit targets when
19042// AVX512DQ is enabled.
19043 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19044 SelectionDAG &DAG,
19045 const X86Subtarget &Subtarget) {
19046 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19047 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19048 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19049 Op.getOpcode() == ISD::UINT_TO_FP) &&
19050 "Unexpected opcode!");
19051 bool IsStrict = Op->isStrictFPOpcode();
19052 unsigned OpNo = IsStrict ? 1 : 0;
19053 SDValue Src = Op.getOperand(OpNo);
19054 MVT SrcVT = Src.getSimpleValueType();
19055 MVT VT = Op.getSimpleValueType();
19056
19057 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19058 (VT != MVT::f32 && VT != MVT::f64))
19059 return SDValue();
19060
19061 // Pack the i64 into a vector, do the operation and extract.
19062
19063 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
19064 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19065 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19066 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19067
19068 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19069 if (IsStrict) {
19070 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19071 {Op.getOperand(0), InVec});
19072 SDValue Chain = CvtVec.getValue(1);
19073 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19074 DAG.getIntPtrConstant(0, dl));
19075 return DAG.getMergeValues({Value, Chain}, dl);
19076 }
19077
19078 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19079
19080 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19081 DAG.getIntPtrConstant(0, dl));
19082}
19083
19084// Try to use a packed vector operation to handle i64 on 32-bit targets.
19085 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19086 const X86Subtarget &Subtarget) {
19087 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19088 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19089 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19090 Op.getOpcode() == ISD::UINT_TO_FP) &&
19091 "Unexpected opcode!");
19092 bool IsStrict = Op->isStrictFPOpcode();
19093 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19094 MVT SrcVT = Src.getSimpleValueType();
19095 MVT VT = Op.getSimpleValueType();
19096
19097 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19098 return SDValue();
19099
19100 // Pack the i64 into a vector, do the operation and extract.
19101
19102 assert(Subtarget.hasFP16() && "Expected FP16");
19103
19104 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19105 if (IsStrict) {
19106 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19107 {Op.getOperand(0), InVec});
19108 SDValue Chain = CvtVec.getValue(1);
19109 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19110 DAG.getIntPtrConstant(0, dl));
19111 return DAG.getMergeValues({Value, Chain}, dl);
19112 }
19113
19114 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19115
19116 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19117 DAG.getIntPtrConstant(0, dl));
19118}
19119
19120static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19121 const X86Subtarget &Subtarget) {
19122 switch (Opcode) {
19123 case ISD::SINT_TO_FP:
19124 // TODO: Handle wider types with AVX/AVX512.
19125 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19126 return false;
19127 // CVTDQ2PS or (V)CVTDQ2PD
19128 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19129
19130 case ISD::UINT_TO_FP:
19131 // TODO: Handle wider types and i64 elements.
19132 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19133 return false;
19134 // VCVTUDQ2PS or VCVTUDQ2PD
19135 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19136
19137 default:
19138 return false;
19139 }
19140}
19141
19142/// Given a scalar cast operation that is extracted from a vector, try to
19143/// vectorize the cast op followed by extraction. This will avoid an expensive
19144/// round-trip between XMM and GPR.
19145 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19146 SelectionDAG &DAG,
19147 const X86Subtarget &Subtarget) {
19148 // TODO: This could be enhanced to handle smaller integer types by peeking
19149 // through an extend.
19150 SDValue Extract = Cast.getOperand(0);
19151 MVT DestVT = Cast.getSimpleValueType();
19152 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19153 !isa<ConstantSDNode>(Extract.getOperand(1)))
19154 return SDValue();
19155
19156 // See if we have a 128-bit vector cast op for this type of cast.
19157 SDValue VecOp = Extract.getOperand(0);
19158 MVT FromVT = VecOp.getSimpleValueType();
19159 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19160 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19161 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19162 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19163 return SDValue();
19164
19165 // If we are extracting from a non-zero element, first shuffle the source
19166 // vector to allow extracting from element zero.
19167 if (!isNullConstant(Extract.getOperand(1))) {
19168 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19169 Mask[0] = Extract.getConstantOperandVal(1);
19170 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19171 }
19172 // If the source vector is wider than 128-bits, extract the low part. Do not
19173 // create an unnecessarily wide vector cast op.
19174 if (FromVT != Vec128VT)
19175 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19176
19177 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19178 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19179 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19180 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19181 DAG.getIntPtrConstant(0, DL));
19182}
19183
19184/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19185/// try to vectorize the cast ops. This will avoid an expensive round-trip
19186/// between XMM and GPR.
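/// For example, a scalar f64 -> i32 -> f64 round trip would otherwise lower to
/// cvttsd2si (XMM -> GPR) followed by cvtsi2sd (GPR -> XMM); keeping the value
/// in a vector allows a cvttpd2dq + cvtdq2pd sequence instead (illustrative
/// summary; the exact instructions depend on the subtarget).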
19187static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19188 SelectionDAG &DAG,
19189 const X86Subtarget &Subtarget) {
19190 // TODO: Allow FP_TO_UINT.
19191 SDValue CastToInt = CastToFP.getOperand(0);
19192 MVT VT = CastToFP.getSimpleValueType();
19193 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19194 return SDValue();
19195
19196 MVT IntVT = CastToInt.getSimpleValueType();
19197 SDValue X = CastToInt.getOperand(0);
19198 MVT SrcVT = X.getSimpleValueType();
19199 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19200 return SDValue();
19201
19202 // See if we have 128-bit vector cast instructions for this type of cast.
19203 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19204 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19205 IntVT != MVT::i32)
19206 return SDValue();
19207
19208 unsigned SrcSize = SrcVT.getSizeInBits();
19209 unsigned IntSize = IntVT.getSizeInBits();
19210 unsigned VTSize = VT.getSizeInBits();
19211 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19212 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19213 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19214
19215 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19216 unsigned ToIntOpcode =
19217 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19218 unsigned ToFPOpcode =
19219 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19220
19221 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19222 //
19223 // We are not defining the high elements (for example, zero them) because
19224 // that could nullify any performance advantage that we hoped to gain from
19225 // this vector op hack. We do not expect any adverse effects (like denorm
19226 // penalties) with cast ops.
19227 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19228 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19229 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19230 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19231 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19232}
19233
19234 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19235 SelectionDAG &DAG,
19236 const X86Subtarget &Subtarget) {
19237 bool IsStrict = Op->isStrictFPOpcode();
19238 MVT VT = Op->getSimpleValueType(0);
19239 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19240
19241 if (Subtarget.hasDQI()) {
19242 assert(!Subtarget.hasVLX() && "Unexpected features");
19243
19244 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19245 Src.getSimpleValueType() == MVT::v4i64) &&
19246 "Unsupported custom type");
19247
19248 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19249 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19250 "Unexpected VT!");
19251 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19252
19253 // Need to concat with zero vector for strict fp to avoid spurious
19254 // exceptions.
19255 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19256 : DAG.getUNDEF(MVT::v8i64);
19257 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19258 DAG.getIntPtrConstant(0, DL));
19259 SDValue Res, Chain;
19260 if (IsStrict) {
19261 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19262 {Op->getOperand(0), Src});
19263 Chain = Res.getValue(1);
19264 } else {
19265 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19266 }
19267
19268 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19269 DAG.getIntPtrConstant(0, DL));
19270
19271 if (IsStrict)
19272 return DAG.getMergeValues({Res, Chain}, DL);
19273 return Res;
19274 }
19275
19276 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19277 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19278 if (VT != MVT::v4f32 || IsSigned)
19279 return SDValue();
19280
19281 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19282 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19283 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19284 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19285 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19286 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19287 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19288 SmallVector<SDValue, 4> SignCvts(4);
19289 SmallVector<SDValue, 4> Chains(4);
19290 for (int i = 0; i != 4; ++i) {
19291 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19292 DAG.getIntPtrConstant(i, DL));
19293 if (IsStrict) {
19294 SignCvts[i] =
19295 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19296 {Op.getOperand(0), Elt});
19297 Chains[i] = SignCvts[i].getValue(1);
19298 } else {
19299 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19300 }
19301 }
19302 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19303
19304 SDValue Slow, Chain;
19305 if (IsStrict) {
19306 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19307 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19308 {Chain, SignCvt, SignCvt});
19309 Chain = Slow.getValue(1);
19310 } else {
19311 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19312 }
19313
19314 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19315 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19316
19317 if (IsStrict)
19318 return DAG.getMergeValues({Cvt, Chain}, DL);
19319
19320 return Cvt;
19321}
19322
19323 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19324 SelectionDAG &DAG) {
19325 bool IsStrict = Op->isStrictFPOpcode();
19326 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19327 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19328 MVT VT = Op.getSimpleValueType();
19329 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19330
19331 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19332 if (IsStrict)
19333 return DAG.getNode(
19334 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19335 {Chain,
19336 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19337 Rnd});
19338 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19339 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19340}
19341
19342static bool isLegalConversion(MVT VT, bool IsSigned,
19343 const X86Subtarget &Subtarget) {
19344 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19345 return true;
19346 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19347 return true;
19348 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19349 return true;
19350 if (Subtarget.useAVX512Regs()) {
19351 if (VT == MVT::v16i32)
19352 return true;
19353 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19354 return true;
19355 }
19356 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19357 (VT == MVT::v2i64 || VT == MVT::v4i64))
19358 return true;
19359 return false;
19360}
19361
19362SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19363 SelectionDAG &DAG) const {
19364 bool IsStrict = Op->isStrictFPOpcode();
19365 unsigned OpNo = IsStrict ? 1 : 0;
19366 SDValue Src = Op.getOperand(OpNo);
19367 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19368 MVT SrcVT = Src.getSimpleValueType();
19369 MVT VT = Op.getSimpleValueType();
19370 SDLoc dl(Op);
19371
19372 if (isSoftF16(VT, Subtarget))
19373 return promoteXINT_TO_FP(Op, dl, DAG);
19374 else if (isLegalConversion(SrcVT, true, Subtarget))
19375 return Op;
19376
19377 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19378 return LowerWin64_INT128_TO_FP(Op, DAG);
19379
19380 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19381 return Extract;
19382
19383 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19384 return R;
19385
19386 if (SrcVT.isVector()) {
19387 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19388 // Note: since v2f64 is a legal type, we don't need to zero extend the
19389 // source for strict FP.
19390 if (IsStrict)
19391 return DAG.getNode(
19392 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19393 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19394 DAG.getUNDEF(SrcVT))});
19395 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19396 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19397 DAG.getUNDEF(SrcVT)));
19398 }
19399 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19400 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19401
19402 return SDValue();
19403 }
19404
19405 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19406 "Unknown SINT_TO_FP to lower!");
19407
19408 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19409
19410 // These are really Legal; return the operand so the caller accepts it as
19411 // Legal.
19412 if (SrcVT == MVT::i32 && UseSSEReg)
19413 return Op;
19414 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19415 return Op;
19416
19417 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19418 return V;
19419 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19420 return V;
19421
19422 // SSE doesn't have an i16 conversion so we need to promote.
19423 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19424 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19425 if (IsStrict)
19426 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19427 {Chain, Ext});
19428
19429 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19430 }
19431
19432 if (VT == MVT::f128 || !Subtarget.hasX87())
19433 return SDValue();
19434
19435 SDValue ValueToStore = Src;
19436 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19437 // Bitcasting to f64 here allows us to do a single 64-bit store from
19438 // an SSE register, avoiding the store forwarding penalty that would come
19439 // with two 32-bit stores.
19440 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19441
19442 unsigned Size = SrcVT.getStoreSize();
19443 Align Alignment(Size);
19444 MachineFunction &MF = DAG.getMachineFunction();
19445 auto PtrVT = getPointerTy(MF.getDataLayout());
19446 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19447 MachinePointerInfo MPI =
19448 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19449 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19450 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19451 std::pair<SDValue, SDValue> Tmp =
19452 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19453
19454 if (IsStrict)
19455 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19456
19457 return Tmp.first;
19458}
19459
19460std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19461 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19462 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19463 // Build the FILD
19464 SDVTList Tys;
19465 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19466 if (useSSE)
19467 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19468 else
19469 Tys = DAG.getVTList(DstVT, MVT::Other);
19470
19471 SDValue FILDOps[] = {Chain, Pointer};
19472 SDValue Result =
19473 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19474 Alignment, MachineMemOperand::MOLoad);
19475 Chain = Result.getValue(1);
19476
19477 if (useSSE) {
19478 MachineFunction &MF = DAG.getMachineFunction();
19479 unsigned SSFISize = DstVT.getStoreSize();
19480 int SSFI =
19481 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19482 auto PtrVT = getPointerTy(MF.getDataLayout());
19483 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19484 Tys = DAG.getVTList(MVT::Other);
19485 SDValue FSTOps[] = {Chain, Result, StackSlot};
19486 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
19487 MachinePointerInfo::getFixedStack(MF, SSFI),
19488 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19489
19490 Chain =
19491 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19492 Result = DAG.getLoad(
19493 DstVT, DL, Chain, StackSlot,
19494 MachinePointerInfo::getFixedStack(MF, SSFI));
19495 Chain = Result.getValue(1);
19496 }
19497
19498 return { Result, Chain };
19499}
19500
19501/// Horizontal vector math instructions may be slower than normal math with
19502/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19503/// implementation, and likely shuffle complexity of the alternate sequence.
19504static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19505 const X86Subtarget &Subtarget) {
19506 bool IsOptimizingSize = DAG.shouldOptForSize();
19507 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19508 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19509}
19510
19511/// 64-bit unsigned integer to double expansion.
19512 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19513 SelectionDAG &DAG,
19514 const X86Subtarget &Subtarget) {
19515 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19516 // when converting 0 while rounding toward negative infinity. The caller will
19517 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19518 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19519 // This algorithm is not obvious. Here it is what we're trying to output:
19520 /*
19521 movq %rax, %xmm0
19522 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19523 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19524 #ifdef __SSE3__
19525 haddpd %xmm0, %xmm0
19526 #else
19527 pshufd $0x4e, %xmm0, %xmm1
19528 addpd %xmm1, %xmm0
19529 #endif
19530 */
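// Illustrative arithmetic for the sequence above: punpckldq places the two
// 32-bit halves of the input into the low mantissa bits of 0x1.0p52 and
// 0x1.0p84, giving the doubles (2^52 + lo) and (2^84 + hi * 2^32). The subpd
// removes both biases exactly, leaving {lo, hi * 2^32}, and the horizontal
// add yields lo + hi * 2^32, i.e. the original u64 value rounded to double.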
19531
19532 LLVMContext *Context = DAG.getContext();
19533
19534 // Build some magic constants.
19535 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19536 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19537 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19538 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19539
19541 CV1.push_back(
19542 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19543 APInt(64, 0x4330000000000000ULL))));
19544 CV1.push_back(
19545 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19546 APInt(64, 0x4530000000000000ULL))));
19547 Constant *C1 = ConstantVector::get(CV1);
19548 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19549
19550 // Load the 64-bit value into an XMM register.
19551 SDValue XR1 =
19552 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19553 SDValue CLod0 = DAG.getLoad(
19554 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19555 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19556 SDValue Unpck1 =
19557 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19558
19559 SDValue CLod1 = DAG.getLoad(
19560 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19561 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19562 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19563 // TODO: Are there any fast-math-flags to propagate here?
19564 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19565 SDValue Result;
19566
19567 if (Subtarget.hasSSE3() &&
19568 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19569 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19570 } else {
19571 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19572 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19573 }
19574 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19575 DAG.getIntPtrConstant(0, dl));
19576 return Result;
19577}
19578
19579/// 32-bit unsigned integer to float expansion.
19580 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19581 SelectionDAG &DAG,
19582 const X86Subtarget &Subtarget) {
19583 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19584 // FP constant to bias correct the final result.
19585 SDValue Bias = DAG.getConstantFP(
19586 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
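// Illustrative arithmetic: 0x4330000000000000 is the bit pattern of 2^52, and
// OR-ing a 32-bit value into its low mantissa bits yields exactly 2^52 + value
// (any u32 fits in the 52-bit mantissa). Subtracting the bias below therefore
// recovers the unsigned value exactly before the final rounding to the
// destination type.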
19587
19588 // Load the 32-bit value into an XMM register.
19589 SDValue Load =
19590 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19591
19592 // Zero out the upper parts of the register.
19593 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19594
19595 // Or the load with the bias.
19596 SDValue Or = DAG.getNode(
19597 ISD::OR, dl, MVT::v2i64,
19598 DAG.getBitcast(MVT::v2i64, Load),
19599 DAG.getBitcast(MVT::v2i64,
19600 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19601 Or =
19602 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19603 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19604
19605 if (Op.getNode()->isStrictFPOpcode()) {
19606 // Subtract the bias.
19607 // TODO: Are there any fast-math-flags to propagate here?
19608 SDValue Chain = Op.getOperand(0);
19609 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19610 {Chain, Or, Bias});
19611
19612 if (Op.getValueType() == Sub.getValueType())
19613 return Sub;
19614
19615 // Handle final rounding.
19616 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19617 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19618
19619 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19620 }
19621
19622 // Subtract the bias.
19623 // TODO: Are there any fast-math-flags to propagate here?
19624 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19625
19626 // Handle final rounding.
19627 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19628}
19629
19630 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19631 SelectionDAG &DAG,
19632 const X86Subtarget &Subtarget) {
19633 if (Op.getSimpleValueType() != MVT::v2f64)
19634 return SDValue();
19635
19636 bool IsStrict = Op->isStrictFPOpcode();
19637
19638 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19639 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19640
19641 if (Subtarget.hasAVX512()) {
19642 if (!Subtarget.hasVLX()) {
19643 // Let generic type legalization widen this.
19644 if (!IsStrict)
19645 return SDValue();
19646 // Otherwise pad the integer input with 0s and widen the operation.
19647 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19648 DAG.getConstant(0, DL, MVT::v2i32));
19649 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19650 {Op.getOperand(0), N0});
19651 SDValue Chain = Res.getValue(1);
19652 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19653 DAG.getIntPtrConstant(0, DL));
19654 return DAG.getMergeValues({Res, Chain}, DL);
19655 }
19656
19657 // Legalize to v4i32 type.
19658 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19659 DAG.getUNDEF(MVT::v2i32));
19660 if (IsStrict)
19661 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19662 {Op.getOperand(0), N0});
19663 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19664 }
19665
19666 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19667 // This gives us the floating point equivalent of 2^52 + the i32 integer
19668 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19669 // point leaving just our i32 integers in double format.
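// E.g. an input lane of 7 becomes the double with bits 0x4330000000000007,
// i.e. 2^52 + 7, and subtracting 2^52 leaves exactly 7.0.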
19670 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19671 SDValue VBias = DAG.getConstantFP(
19672 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19673 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19674 DAG.getBitcast(MVT::v2i64, VBias));
19675 Or = DAG.getBitcast(MVT::v2f64, Or);
19676
19677 if (IsStrict)
19678 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19679 {Op.getOperand(0), Or, VBias});
19680 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19681}
19682
19683 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19684 SelectionDAG &DAG,
19685 const X86Subtarget &Subtarget) {
19686 bool IsStrict = Op->isStrictFPOpcode();
19687 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19688 MVT VecIntVT = V.getSimpleValueType();
19689 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19690 "Unsupported custom type");
19691
19692 if (Subtarget.hasAVX512()) {
19693 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19694 assert(!Subtarget.hasVLX() && "Unexpected features");
19695 MVT VT = Op->getSimpleValueType(0);
19696
19697 // v8i32->v8f64 is legal with AVX512 so just return it.
19698 if (VT == MVT::v8f64)
19699 return Op;
19700
19701 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19702 "Unexpected VT!");
19703 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19704 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19705 // Need to concat with zero vector for strict fp to avoid spurious
19706 // exceptions.
19707 SDValue Tmp =
19708 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19709 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19710 DAG.getIntPtrConstant(0, DL));
19711 SDValue Res, Chain;
19712 if (IsStrict) {
19713 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19714 {Op->getOperand(0), V});
19715 Chain = Res.getValue(1);
19716 } else {
19717 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19718 }
19719
19720 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19721 DAG.getIntPtrConstant(0, DL));
19722
19723 if (IsStrict)
19724 return DAG.getMergeValues({Res, Chain}, DL);
19725 return Res;
19726 }
19727
19728 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19729 Op->getSimpleValueType(0) == MVT::v4f64) {
19730 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19731 Constant *Bias = ConstantFP::get(
19732 *DAG.getContext(),
19733 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19734 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19735 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19736 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19737 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19738 SDValue VBias = DAG.getMemIntrinsicNode(
19739 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19740 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
19741 MachineMemOperand::MOLoad);
19742
19743 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19744 DAG.getBitcast(MVT::v4i64, VBias));
19745 Or = DAG.getBitcast(MVT::v4f64, Or);
19746
19747 if (IsStrict)
19748 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19749 {Op.getOperand(0), Or, VBias});
19750 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19751 }
19752
19753 // The algorithm is the following:
19754 // #ifdef __SSE4_1__
19755 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19756 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19757 // (uint4) 0x53000000, 0xaa);
19758 // #else
19759 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19760 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19761 // #endif
19762 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19763 // return (float4) lo + fhi;
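// Illustrative arithmetic: as bit patterns, 0x4b000000 is the float 2^23 and
// 0x53000000 is 2^39, so lo reads back as 2^23 + (v & 0xffff) and hi as
// 2^39 + (v >> 16) * 2^16. Computing fhi = hi - (2^39 + 2^23) and then
// lo + fhi cancels both biases, leaving (v >> 16) * 2^16 + (v & 0xffff) == v,
// with at most one rounding step in the final add.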
19764
19765 bool Is128 = VecIntVT == MVT::v4i32;
19766 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19767 // If we convert to something else than the supported type, e.g., to v4f64,
19768 // abort early.
19769 if (VecFloatVT != Op->getSimpleValueType(0))
19770 return SDValue();
19771
19772 // In the #idef/#else code, we have in common:
19773 // - The vector of constants:
19774 // -- 0x4b000000
19775 // -- 0x53000000
19776 // - A shift:
19777 // -- v >> 16
19778
19779 // Create the splat vector for 0x4b000000.
19780 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19781 // Create the splat vector for 0x53000000.
19782 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19783
19784 // Create the right shift.
19785 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19786 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19787
19788 SDValue Low, High;
19789 if (Subtarget.hasSSE41()) {
19790 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19791 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19792 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19793 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19794 // Low will be bitcasted right away, so do not bother bitcasting back to its
19795 // original type.
19796 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19797 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19798 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19799 // (uint4) 0x53000000, 0xaa);
19800 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19801 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19802 // High will be bitcasted right away, so do not bother bitcasting back to
19803 // its original type.
19804 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19805 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19806 } else {
19807 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19808 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19809 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19810 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19811
19812 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19813 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19814 }
19815
19816 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19817 SDValue VecCstFSub = DAG.getConstantFP(
19818 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19819
19820 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19821 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19822 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19823 // enabled. See PR24512.
19824 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19825 // TODO: Are there any fast-math-flags to propagate here?
19826 // (float4) lo;
19827 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19828 // return (float4) lo + fhi;
19829 if (IsStrict) {
19830 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19831 {Op.getOperand(0), HighBitcast, VecCstFSub});
19832 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19833 {FHigh.getValue(1), LowBitcast, FHigh});
19834 }
19835
19836 SDValue FHigh =
19837 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19838 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19839}
19840
19841 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19842 const X86Subtarget &Subtarget) {
19843 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19844 SDValue N0 = Op.getOperand(OpNo);
19845 MVT SrcVT = N0.getSimpleValueType();
19846
19847 switch (SrcVT.SimpleTy) {
19848 default:
19849 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19850 case MVT::v2i32:
19851 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19852 case MVT::v4i32:
19853 case MVT::v8i32:
19854 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19855 case MVT::v2i64:
19856 case MVT::v4i64:
19857 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19858 }
19859}
19860
19861SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19862 SelectionDAG &DAG) const {
19863 bool IsStrict = Op->isStrictFPOpcode();
19864 unsigned OpNo = IsStrict ? 1 : 0;
19865 SDValue Src = Op.getOperand(OpNo);
19866 SDLoc dl(Op);
19867 auto PtrVT = getPointerTy(DAG.getDataLayout());
19868 MVT SrcVT = Src.getSimpleValueType();
19869 MVT DstVT = Op->getSimpleValueType(0);
19870 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19871
19872 // Bail out when we don't have native conversion instructions.
19873 if (DstVT == MVT::f128)
19874 return SDValue();
19875
19876 if (isSoftF16(DstVT, Subtarget))
19877 return promoteXINT_TO_FP(Op, dl, DAG);
19878 else if (isLegalConversion(SrcVT, false, Subtarget))
19879 return Op;
19880
19881 if (DstVT.isVector())
19882 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19883
19884 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19885 return LowerWin64_INT128_TO_FP(Op, DAG);
19886
19887 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19888 return Extract;
19889
19890 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19891 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19892 // Conversions from unsigned i32 to f32/f64 are legal,
19893 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19894 return Op;
19895 }
19896
19897 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19898 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19899 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19900 if (IsStrict)
19901 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19902 {Chain, Src});
19903 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19904 }
19905
19906 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19907 return V;
19908 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19909 return V;
19910
19911 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19912 // infinity. It produces -0.0, so disable under strictfp.
19913 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19914 !IsStrict)
19915 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19916 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19917 // negative infinity. So disable under strictfp. Using FILD instead.
19918 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19919 !IsStrict)
19920 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19921 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19922 (DstVT == MVT::f32 || DstVT == MVT::f64))
19923 return SDValue();
19924
19925 // Make a 64-bit buffer, and use it to build an FILD.
19926 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19927 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19928 Align SlotAlign(8);
19929 MachinePointerInfo MPI =
19930 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19931 if (SrcVT == MVT::i32) {
19932 SDValue OffsetSlot =
19933 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19934 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19935 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19936 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19937 std::pair<SDValue, SDValue> Tmp =
19938 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19939 if (IsStrict)
19940 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19941
19942 return Tmp.first;
19943 }
19944
19945 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19946 SDValue ValueToStore = Src;
19947 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19948 // Bitcasting to f64 here allows us to do a single 64-bit store from
19949 // an SSE register, avoiding the store forwarding penalty that would come
19950 // with two 32-bit stores.
19951 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19952 }
19953 SDValue Store =
19954 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19955 // For i64 source, we need to add the appropriate power of 2 if the input
19956 // was negative. We must be careful to do the computation in x87 extended
19957 // precision, not in SSE.
19958 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19959 SDValue Ops[] = {Store, StackSlot};
19960 SDValue Fild =
19961 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19962 SlotAlign, MachineMemOperand::MOLoad);
19963 Chain = Fild.getValue(1);
19964
19965 // Check whether the sign bit is set.
19966 SDValue SignSet = DAG.getSetCC(
19967 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19968 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19969
19970 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19971 APInt FF(64, 0x5F80000000000000ULL);
19972 SDValue FudgePtr =
19973 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19974 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19975
19976 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19977 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19978 SDValue Four = DAG.getIntPtrConstant(4, dl);
19979 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19980 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19981
19982 // Load the value out, extending it from f32 to f80.
19983 SDValue Fudge = DAG.getExtLoad(
19984 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19985 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19986 CPAlignment);
19987 Chain = Fudge.getValue(1);
19988 // Extend everything to 80 bits to force it to be done on x87.
19989 // TODO: Are there any fast-math-flags to propagate here?
19990 if (IsStrict) {
19991 unsigned Opc = ISD::STRICT_FADD;
19992 // Windows needs the precision control changed to 80bits around this add.
19993 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19994 Opc = X86ISD::STRICT_FP80_ADD;
19995
19996 SDValue Add =
19997 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19998 // STRICT_FP_ROUND can't handle equal types.
19999 if (DstVT == MVT::f80)
20000 return Add;
20001 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20002 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20003 }
20004 unsigned Opc = ISD::FADD;
20005 // Windows needs the precision control changed to 80bits around this add.
20006 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20007 Opc = X86ISD::FP80_ADD;
20008
20009 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20010 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20011 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20012}
20013
20014// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20015// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20016// just return an SDValue().
20017// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20018// to i16, i32 or i64, and we lower it to a legal sequence and return the
20019// result.
20020SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20021 bool IsSigned,
20022 SDValue &Chain) const {
20023 bool IsStrict = Op->isStrictFPOpcode();
20024 SDLoc DL(Op);
20025
20026 EVT DstTy = Op.getValueType();
20027 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20028 EVT TheVT = Value.getValueType();
20029 auto PtrVT = getPointerTy(DAG.getDataLayout());
20030
20031 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20032 // f16 must be promoted before using the lowering in this routine.
20033 // fp128 does not use this lowering.
20034 return SDValue();
20035 }
20036
20037 // If using FIST to compute an unsigned i64, we'll need some fixup
20038 // to handle values above the maximum signed i64. A FIST is always
20039 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20040 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20041
20042 // FIXME: This does not generate an invalid exception if the input does not
20043 // fit in i32. PR44019
20044 if (!IsSigned && DstTy != MVT::i64) {
20045 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20046 // The low 32 bits of the fist result will have the correct uint32 result.
20047 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20048 DstTy = MVT::i64;
20049 }
20050
20051 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20052 DstTy.getSimpleVT() >= MVT::i16 &&
20053 "Unknown FP_TO_INT to lower!");
20054
20055 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20056 // stack slot.
20057 MachineFunction &MF = DAG.getMachineFunction();
20058 unsigned MemSize = DstTy.getStoreSize();
20059 int SSFI =
20060 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20061 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20062
20063 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20064
20065 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20066
20067 if (UnsignedFixup) {
20068 //
20069 // Conversion to unsigned i64 is implemented with a select,
20070 // depending on whether the source value fits in the range
20071 // of a signed i64. Let Thresh be the FP equivalent of
20072 // 0x8000000000000000ULL.
20073 //
20074 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20075 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20076 // FistSrc = (Value - FltOfs);
20077 // Fist-to-mem64 FistSrc
20078 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20079 // to XOR'ing the high 32 bits with Adjust.
20080 //
20081 // Being a power of 2, Thresh is exactly representable in all FP formats.
20082 // For X87 we'd like to use the smallest FP type for this constant, but
20083 // for DAG type consistency we have to match the FP operand type.
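// Worked example (hypothetical value): for Value == 3 * 2^62, which is
// >= Thresh == 2^63, FltOfs == Thresh, so FistSrc == 2^62 and the FIST
// stores 0x4000000000000000; XOR'ing with Adjust == (1ULL << 63) restores
// 0xC000000000000000 == 3 * 2^62.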
20084
20085 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20086 APFloat::opStatus Status = APFloat::opOK;
20087 bool LosesInfo = false;
20088 if (TheVT == MVT::f64)
20089 // The rounding mode is irrelevant as the conversion should be exact.
20090 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20091 &LosesInfo);
20092 else if (TheVT == MVT::f80)
20093 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20094 APFloat::rmNearestTiesToEven, &LosesInfo);
20095
20096 assert(Status == APFloat::opOK && !LosesInfo &&
20097 "FP conversion should have been exact");
20098
20099 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20100
20101 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20102 *DAG.getContext(), TheVT);
20103 SDValue Cmp;
20104 if (IsStrict) {
20105 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20106 /*IsSignaling*/ true);
20107 Chain = Cmp.getValue(1);
20108 } else {
20109 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20110 }
20111
20112 // Our preferred lowering of
20113 //
20114 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20115 //
20116 // is
20117 //
20118 // (Value >= Thresh) << 63
20119 //
20120 // but since we can get here after LegalOperations, DAGCombine might do the
20121 // wrong thing if we create a select. So, directly create the preferred
20122 // version.
20123 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20124 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20125 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20126
20127 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20128 DAG.getConstantFP(0.0, DL, TheVT));
20129
20130 if (IsStrict) {
20131 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20132 { Chain, Value, FltOfs });
20133 Chain = Value.getValue(1);
20134 } else
20135 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20136 }
20137
20138 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20139
20140 // FIXME This causes a redundant load/store if the SSE-class value is already
20141 // in memory, such as if it is on the callstack.
20142 if (isScalarFPTypeInSSEReg(TheVT)) {
20143 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20144 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20145 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20146 SDValue Ops[] = { Chain, StackSlot };
20147
20148 unsigned FLDSize = TheVT.getStoreSize();
20149 assert(FLDSize <= MemSize && "Stack slot not big enough");
20150 MachineMemOperand *MMO = MF.getMachineMemOperand(
20151 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20152 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20153 Chain = Value.getValue(1);
20154 }
20155
20156 // Build the FP_TO_INT*_IN_MEM
20157 MachineMemOperand *MMO = MF.getMachineMemOperand(
20158 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20159 SDValue Ops[] = { Chain, Value, StackSlot };
20160 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20161 DAG.getVTList(MVT::Other),
20162 Ops, DstTy, MMO);
20163
20164 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20165 Chain = Res.getValue(1);
20166
20167 // If we need an unsigned fixup, XOR the result with adjust.
20168 if (UnsignedFixup)
20169 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20170
20171 return Res;
20172}
20173
20174 static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20175 const X86Subtarget &Subtarget) {
20176 MVT VT = Op.getSimpleValueType();
20177 SDValue In = Op.getOperand(0);
20178 MVT InVT = In.getSimpleValueType();
20179 unsigned Opc = Op.getOpcode();
20180
20181 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20182 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20183 "Unexpected extension opcode");
20184 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20185 "Expected same number of elements");
20186 assert((VT.getVectorElementType() == MVT::i16 ||
20187 VT.getVectorElementType() == MVT::i32 ||
20188 VT.getVectorElementType() == MVT::i64) &&
20189 "Unexpected element type");
20190 assert((InVT.getVectorElementType() == MVT::i8 ||
20191 InVT.getVectorElementType() == MVT::i16 ||
20192 InVT.getVectorElementType() == MVT::i32) &&
20193 "Unexpected element type");
20194
20195 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20196
20197 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20198 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20199 return splitVectorIntUnary(Op, DAG, dl);
20200 }
20201
20202 if (Subtarget.hasInt256())
20203 return Op;
20204
20205 // Optimize vectors in AVX mode:
20206 //
20207 // v8i16 -> v8i32
20208 // Use vpmovzxwd for the 4 lower elements: v8i16 -> v4i32.
20209 // Use vpunpckhwd for the 4 upper elements: v8i16 -> v4i32.
20210 // Concat upper and lower parts.
20211 //
20212 // v4i32 -> v4i64
20213 // Use vpmovzxdq for the 2 lower elements: v4i32 -> v2i64.
20214 // Use vpunpckhdq for the 2 upper elements: v4i32 -> v2i64.
20215 // Concat upper and lower parts.
20216 //
20217 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20218 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20219
20220 // Short-circuit if we can determine that each 128-bit half is the same value.
20221 // Otherwise, this is difficult to match and optimize.
20222 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20223 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20224 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20225
20226 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20227 SDValue Undef = DAG.getUNDEF(InVT);
20228 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20229 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20230 OpHi = DAG.getBitcast(HalfVT, OpHi);
20231
20232 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20233}
20234
20235// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20236static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20237 const SDLoc &dl, SelectionDAG &DAG) {
20238 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20239 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20240 DAG.getIntPtrConstant(0, dl));
20241 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20242 DAG.getIntPtrConstant(8, dl));
20243 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20244 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20245 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20246 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20247}
20248
20249 static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20250 const X86Subtarget &Subtarget,
20251 SelectionDAG &DAG) {
20252 MVT VT = Op->getSimpleValueType(0);
20253 SDValue In = Op->getOperand(0);
20254 MVT InVT = In.getSimpleValueType();
20255 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20256 unsigned NumElts = VT.getVectorNumElements();
20257
20258 // For all vectors except vXi8 we can just emit a sign_extend and a shift. This
20259 // avoids a constant pool load.
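 // For example (sketch), a v8i1 -> v8i32 zero_extend becomes a sign_extend
 // producing 0 / -1 per lane, followed by a logical shift right by 31 that
 // turns this into the required 0 / 1 values.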
20260 if (VT.getVectorElementType() != MVT::i8) {
20261 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20262 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20263 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20264 }
20265
20266 // Extend VT if BWI is not supported.
20267 MVT ExtVT = VT;
20268 if (!Subtarget.hasBWI()) {
20269 // If v16i32 is to be avoided, we'll need to split and concatenate.
20270 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20271 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20272
20273 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20274 }
20275
20276 // Widen to 512-bits if VLX is not supported.
20277 MVT WideVT = ExtVT;
20278 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20279 NumElts *= 512 / ExtVT.getSizeInBits();
20280 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20281 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20282 In, DAG.getIntPtrConstant(0, DL));
20283 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20284 NumElts);
20285 }
20286
20287 SDValue One = DAG.getConstant(1, DL, WideVT);
20288 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20289
20290 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20291
20292 // Truncate if we had to extend above.
20293 if (VT != ExtVT) {
20294 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20295 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20296 }
20297
20298 // Extract back to 128/256-bit if we widened.
20299 if (WideVT != VT)
20300 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20301 DAG.getIntPtrConstant(0, DL));
20302
20303 return SelectedVal;
20304}
20305
20306 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20307 SelectionDAG &DAG) {
20308 SDValue In = Op.getOperand(0);
20309 MVT SVT = In.getSimpleValueType();
20310 SDLoc DL(Op);
20311
20312 if (SVT.getVectorElementType() == MVT::i1)
20313 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20314
20315 assert(Subtarget.hasAVX() && "Expected AVX support");
20316 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20317}
20318
20319/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20320/// It makes use of the fact that vectors with enough leading sign/zero bits
20321/// prevent the PACKSS/PACKUS from saturating the results.
20322/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20323/// within each 128-bit lane.
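/// For example (illustrative), a v8i32 -> v8i8 truncation is done in two
/// halving stages: v8i32 -> v8i16 via PACKSSDW/PACKUSDW, then v8i16 -> v8i8
/// via PACKSSWB/PACKUSWB, relying on the spare sign/zero bits of the input to
/// avoid saturation at each stage.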
20324static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20325 const SDLoc &DL, SelectionDAG &DAG,
20326 const X86Subtarget &Subtarget) {
20327 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20328 "Unexpected PACK opcode");
20329 assert(DstVT.isVector() && "VT not a vector?");
20330
20331 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20332 if (!Subtarget.hasSSE2())
20333 return SDValue();
20334
20335 EVT SrcVT = In.getValueType();
20336
20337 // No truncation required, we might get here due to recursive calls.
20338 if (SrcVT == DstVT)
20339 return In;
20340
20341 unsigned NumElems = SrcVT.getVectorNumElements();
20342 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20343 return SDValue();
20344
20345 unsigned DstSizeInBits = DstVT.getSizeInBits();
20346 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20347 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20348 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20349
20350 LLVMContext &Ctx = *DAG.getContext();
20351 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20352 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20353
20354 // Pack to the largest type possible:
20355 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20356 EVT InVT = MVT::i16, OutVT = MVT::i8;
20357 if (SrcVT.getScalarSizeInBits() > 16 &&
20358 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20359 InVT = MVT::i32;
20360 OutVT = MVT::i16;
20361 }
20362
20363 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20364 // On pre-AVX512, pack the src in both halves to help value tracking.
20365 if (SrcSizeInBits <= 128) {
20366 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20367 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20368 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20369 SDValue LHS = DAG.getBitcast(InVT, In);
20370 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20371 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20372 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20373 Res = DAG.getBitcast(PackedVT, Res);
20374 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20375 }
20376
20377 // Split lower/upper subvectors.
20378 SDValue Lo, Hi;
20379 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20380
20381 // If Hi is undef, then don't bother packing it and widen the result instead.
20382 if (Hi.isUndef()) {
20383 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20384 if (SDValue Res =
20385 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20386 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20387 }
20388
20389 unsigned SubSizeInBits = SrcSizeInBits / 2;
20390 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20391 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20392
20393 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20394 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20395 Lo = DAG.getBitcast(InVT, Lo);
20396 Hi = DAG.getBitcast(InVT, Hi);
20397 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20398 return DAG.getBitcast(DstVT, Res);
20399 }
20400
20401 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20402 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20403 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20404 Lo = DAG.getBitcast(InVT, Lo);
20405 Hi = DAG.getBitcast(InVT, Hi);
20406 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20407
20408 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20409 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20410 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20411 SmallVector<int, 64> Mask;
20412 int Scale = 64 / OutVT.getScalarSizeInBits();
20413 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20414 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20415
20416 if (DstVT.is256BitVector())
20417 return DAG.getBitcast(DstVT, Res);
20418
20419 // If 512bit -> 128bit truncate another stage.
20420 Res = DAG.getBitcast(PackedVT, Res);
20421 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20422 }
20423
20424 // Recursively pack lower/upper subvectors, concat result and pack again.
20425 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20426
20427 if (PackedVT.is128BitVector()) {
20428 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20429 // type legalization.
20430 SDValue Res =
20431 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20432 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20433 }
20434
20435 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20436 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20437 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20438 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20439 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20440}
20441
20442/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20443/// e.g. trunc <8 x i32> X to <8 x i16> -->
20444/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20445/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20446 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20447 const X86Subtarget &Subtarget,
20448 SelectionDAG &DAG) {
20449 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20450 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20451}
20452
20453/// Truncate using inreg sign extension and X86ISD::PACKSS.
20454 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20455 const X86Subtarget &Subtarget,
20456 SelectionDAG &DAG) {
20457 EVT SrcVT = In.getValueType();
20458 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20459 DAG.getValueType(DstVT));
20460 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20461}
20462
20463/// Helper to determine if \p In truncated to \p DstVT has the necessary
20464/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20465/// possibly by converting a SRL node to SRA for sign extension.
20466static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20467 SDValue In, const SDLoc &DL,
20468 SelectionDAG &DAG,
20469 const X86Subtarget &Subtarget) {
20470 // Requires SSE2.
20471 if (!Subtarget.hasSSE2())
20472 return SDValue();
20473
20474 EVT SrcVT = In.getValueType();
20475 EVT DstSVT = DstVT.getVectorElementType();
20476 EVT SrcSVT = SrcVT.getVectorElementType();
20477 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20478 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20479
20480 // Check we have a truncation suited for PACKSS/PACKUS.
20481 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20482 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20483 return SDValue();
20484
20485 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20486 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20487
20488 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20489 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20490 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20491 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20492 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20493 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20494 return SDValue();
20495
20496 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20497 // split this for packing.
20498 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20499 !isFreeToSplitVector(In.getNode(), DAG) &&
20500 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20501 return SDValue();
20502
20503 // Don't truncate on AVX512 targets as multiple PACK node stages.
20504 if (Subtarget.hasAVX512() && NumStages > 1)
20505 return SDValue();
20506
20507 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20508 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20509
20510 // Truncate with PACKUS if we are truncating a vector with leading zero
20511 // bits that extend all the way to the packed/truncated value.
20512 // e.g. Masks, zext_in_reg, etc.
20513 // Pre-SSE41 we can only use PACKUSWB.
20514 KnownBits Known = DAG.computeKnownBits(In);
20515 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20516 PackOpcode = X86ISD::PACKUS;
20517 return In;
20518 }
20519
20520 // Truncate with PACKSS if we are truncating a vector with sign-bits
20521 // that extend all the way to the packed/truncated value.
20522 // e.g. Comparison result, sext_in_reg, etc.
20523 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20524
20525 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20526 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20527 // see through BITCASTs later on and combines/simplifications can't then use
20528 // it.
20529 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20530 !Subtarget.hasAVX512())
20531 return SDValue();
20532
20533 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20534 if (MinSignBits < NumSignBits) {
20535 PackOpcode = X86ISD::PACKSS;
20536 return In;
20537 }
20538
20539 // If we have a srl that only generates signbits that we will discard in
20540 // the truncation then we can use PACKSS by converting the srl to a sra.
20541 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
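 // e.g. for a vXi32 -> vXi16 truncate (MinSignBits == 16), (srl X, 16) and
 // (sra X, 16) keep the same low 16 bits, but the sra form stays within the
 // signed i16 range, so PACKSSDW returns those bits without saturating.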
20542 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20543 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20544 if (*ShAmt == MinSignBits) {
20545 PackOpcode = X86ISD::PACKSS;
20546 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20547 }
20548 }
20549
20550 return SDValue();
20551}
20552
20553/// This function lowers a vector truncation of 'extended sign-bits' or
20554/// 'extended zero-bits' values.
20555/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20556 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20557 const SDLoc &DL,
20558 const X86Subtarget &Subtarget,
20559 SelectionDAG &DAG) {
20560 MVT SrcVT = In.getSimpleValueType();
20561 MVT DstSVT = DstVT.getVectorElementType();
20562 MVT SrcSVT = SrcVT.getVectorElementType();
20563 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20564 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20565 return SDValue();
20566
20567 // If the upper half of the source is undef, then attempt to split and
20568 // only truncate the lower half.
20569 if (DstVT.getSizeInBits() >= 128) {
20570 SmallVector<SDValue> LowerOps;
20571 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20572 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20573 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20574 Subtarget, DAG))
20575 return widenSubVector(Res, false, Subtarget, DAG, DL,
20576 DstVT.getSizeInBits());
20577 }
20578 }
20579
20580 unsigned PackOpcode;
20581 if (SDValue Src =
20582 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20583 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20584
20585 return SDValue();
20586}
20587
20588/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20589/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20590 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20591 const X86Subtarget &Subtarget,
20592 SelectionDAG &DAG) {
20593 MVT SrcVT = In.getSimpleValueType();
20594 MVT DstSVT = DstVT.getVectorElementType();
20595 MVT SrcSVT = SrcVT.getVectorElementType();
20596 unsigned NumElems = DstVT.getVectorNumElements();
20597 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20598 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20599 NumElems >= 8))
20600 return SDValue();
20601
20602 // SSSE3's pshufb results in fewer instructions in the cases below.
20603 if (Subtarget.hasSSSE3() && NumElems == 8) {
20604 if (SrcSVT == MVT::i16)
20605 return SDValue();
20606 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20607 return SDValue();
20608 }
20609
20610 // If the upper half of the source is undef, then attempt to split and
20611 // only truncate the lower half.
20612 if (DstVT.getSizeInBits() >= 128) {
20613 SmallVector<SDValue> LowerOps;
20614 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20615 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20616 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20617 return widenSubVector(Res, false, Subtarget, DAG, DL,
20618 DstVT.getSizeInBits());
20619 }
20620 }
20621
20622 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20623 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20624 // truncate 2 x v4i32 to v8i16.
20625 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20626 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20627
20628 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20629 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20630
20631 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20632 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20633 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20634 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20635 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20636 }
20637
20638 return SDValue();
20639}
20640
20641 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20642 SelectionDAG &DAG,
20643 const X86Subtarget &Subtarget) {
20644 MVT VT = Op.getSimpleValueType();
20645 SDValue In = Op.getOperand(0);
20646 MVT InVT = In.getSimpleValueType();
20647 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20648
20649 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20650 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20651 if (InVT.getScalarSizeInBits() <= 16) {
20652 if (Subtarget.hasBWI()) {
20653 // legal, will go to VPMOVB2M, VPMOVW2M
20654 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20655 // We need to shift to get the lsb into sign position.
20656 // Shift packed bytes not supported natively, bitcast to word
20657 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20658 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20659 DAG.getBitcast(ExtVT, In),
20660 DAG.getConstant(ShiftInx, DL, ExtVT));
20661 In = DAG.getBitcast(InVT, In);
20662 }
20663 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20664 In, ISD::SETGT);
20665 }
20666 // Use TESTD/Q on the vector extended to packed dword/qword elements.
20667 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20668 "Unexpected vector type.");
20669 unsigned NumElts = InVT.getVectorNumElements();
20670 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20671 // We need to change to a wider element type that we have support for.
20672 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20673 // For 16 element vectors we extend to v16i32 unless we are explicitly
20674 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20675 // we need to split into two 8 element vectors which we can extend to v8i32,
20676 // truncate and concat the results. There's an additional complication if
20677 // the original type is v16i8. In that case we can't split the v16i8
20678 // directly, so we need to shuffle high elements to low and use
20679 // sign_extend_vector_inreg.
20680 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20681 SDValue Lo, Hi;
20682 if (InVT == MVT::v16i8) {
20683 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20684 Hi = DAG.getVectorShuffle(
20685 InVT, DL, In, In,
20686 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20687 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20688 } else {
20689 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20690 Lo = extract128BitVector(In, 0, DAG, DL);
20691 Hi = extract128BitVector(In, 8, DAG, DL);
20692 }
20693 // We're split now, just emit two truncates and a concat. The two
20694 // truncates will trigger legalization to come back to this function.
20695 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20696 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20697 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20698 }
20699 // We either have 8 elements or we're allowed to use 512-bit vectors.
20700 // If we have VLX, we want to use the narrowest vector that can get the
20701 // job done so we use vXi32.
20702 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20703 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20704 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20705 InVT = ExtVT;
20706 ShiftInx = InVT.getScalarSizeInBits() - 1;
20707 }
20708
20709 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20710 // We need to shift to get the lsb into sign position.
20711 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20712 DAG.getConstant(ShiftInx, DL, InVT));
20713 }
20714 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20715 if (Subtarget.hasDQI())
20716 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20717 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20718}
20719
20720SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20721 SDLoc DL(Op);
20722 MVT VT = Op.getSimpleValueType();
20723 SDValue In = Op.getOperand(0);
20724 MVT InVT = In.getSimpleValueType();
20725 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20726 "Invalid TRUNCATE operation");
20727
20728 // If we're called by the type legalizer, handle a few cases.
20729 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20730 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20731 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20732 VT.is128BitVector() && Subtarget.hasAVX512()) {
20733 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20734 "Unexpected subtarget!");
20735 // The default behavior is to truncate one step, concatenate, and then
20736 // truncate the remainder. We'd rather produce two 64-bit results and
20737 // concatenate those.
20738 SDValue Lo, Hi;
20739 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20740
20741 EVT LoVT, HiVT;
20742 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20743
20744 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20745 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20746 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20747 }
20748
20749 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20750 if (!Subtarget.hasAVX512() ||
20751 (InVT.is512BitVector() && VT.is256BitVector()))
20752 if (SDValue SignPack =
20753 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20754 return SignPack;
20755
20756 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20757 if (!Subtarget.hasAVX512())
20758 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20759
20760 // Otherwise let default legalization handle it.
20761 return SDValue();
20762 }
20763
20764 if (VT.getVectorElementType() == MVT::i1)
20765 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
20766
20767 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20768 // concat from subvectors to use VPTRUNC etc.
20769 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20770 if (SDValue SignPack =
20771 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20772 return SignPack;
20773
20774 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20775 if (Subtarget.hasAVX512()) {
20776 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20777 assert(VT == MVT::v32i8 && "Unexpected VT!");
20778 return splitVectorIntUnary(Op, DAG, DL);
20779 }
20780
20781 // Word to byte only under BWI. Otherwise we have to promote to v16i32
20782 // and then truncate that. But we should only do that if we haven't been
20783 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20784 // handled by isel patterns.
20785 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20786 Subtarget.canExtendTo512DQ())
20787 return Op;
20788 }
20789
20790 // Handle truncation of V256 to V128 using shuffles.
20791 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20792
20793 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20794 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20795 if (Subtarget.hasInt256()) {
20796 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20797 In = DAG.getBitcast(MVT::v8i32, In);
20798 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20799 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20800 DAG.getIntPtrConstant(0, DL));
20801 }
20802
20803 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20804 DAG.getIntPtrConstant(0, DL));
20805 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20806 DAG.getIntPtrConstant(2, DL));
20807 static const int ShufMask[] = {0, 2, 4, 6};
20808 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20809 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20810 }
20811
20812 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20813 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20814 if (Subtarget.hasInt256()) {
20815 // The PSHUFB mask:
20816 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20817 -1, -1, -1, -1, -1, -1, -1, -1,
20818 16, 17, 20, 21, 24, 25, 28, 29,
20819 -1, -1, -1, -1, -1, -1, -1, -1 };
20820 In = DAG.getBitcast(MVT::v32i8, In);
20821 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20822 In = DAG.getBitcast(MVT::v4i64, In);
20823
20824 static const int ShufMask2[] = {0, 2, -1, -1};
20825 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20826 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20827 DAG.getIntPtrConstant(0, DL));
20828 return DAG.getBitcast(MVT::v8i16, In);
20829 }
20830
20831 return Subtarget.hasSSE41()
20832 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20833 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20834 }
20835
20836 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20837 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20838
20839 llvm_unreachable("All 256->128 cases should have been handled above!");
20840}
20841
20842// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20843// behaves on out of range inputs to generate optimized conversions.
20844 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20845 SelectionDAG &DAG,
20846 const X86Subtarget &Subtarget) {
20847 MVT SrcVT = Src.getSimpleValueType();
20848 unsigned DstBits = VT.getScalarSizeInBits();
20849 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20850
20851 // Calculate the converted result for values in the range 0 to
20852 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20853 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20854 SDValue Big =
20855 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20856 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20857 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20858
20859 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20860 // and only if the value was out of range. So we can use that
20861 // as our indicator that we'd rather use "Big" instead of "Small".
20862 //
20863 // Use "Small" if "IsOverflown" has all bits cleared
20864 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
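 //
 // A scalar model of the trick (illustration only; "cvtt" stands for the
 // cvttps2dq behaviour of returning 0x80000000 on out-of-range inputs):
 //   int32_t Small = cvtt(x);                  // 0x80000000 if x >= 2^31
 //   int32_t Big   = cvtt(x - 2147483648.0f);  // x - 2^31 for large x
 //   int32_t Mask  = Small >> 31;              // all ones iff overflown
 //   uint32_t Res  = Small | (Big & Mask);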
20865
20866 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20867 // use the slightly slower blendv select instead.
20868 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20869 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20870 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20871 }
20872
20873 SDValue IsOverflown =
20874 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20875 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20876 return DAG.getNode(ISD::OR, dl, VT, Small,
20877 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20878}
20879
20880SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20881 bool IsStrict = Op->isStrictFPOpcode();
20882 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20883 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20884 MVT VT = Op->getSimpleValueType(0);
20885 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20886 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20887 MVT SrcVT = Src.getSimpleValueType();
20888 SDLoc dl(Op);
20889
20890 SDValue Res;
20891 if (isSoftF16(SrcVT, Subtarget)) {
20892 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20893 if (IsStrict)
20894 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20895 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20896 {NVT, MVT::Other}, {Chain, Src})});
20897 return DAG.getNode(Op.getOpcode(), dl, VT,
20898 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20899 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20900 return Op;
20901 }
20902
20903 if (VT.isVector()) {
20904 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20905 MVT ResVT = MVT::v4i32;
20906 MVT TruncVT = MVT::v4i1;
20907 unsigned Opc;
20908 if (IsStrict)
20909 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20910 else
20911 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20912
20913 if (!IsSigned && !Subtarget.hasVLX()) {
20914 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20915 // Widen to 512-bits.
20916 ResVT = MVT::v8i32;
20917 TruncVT = MVT::v8i1;
20918 Opc = Op.getOpcode();
20919 // Need to concat with zero vector for strict fp to avoid spurious
20920 // exceptions.
20921 // TODO: Should we just do this for non-strict as well?
20922 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20923 : DAG.getUNDEF(MVT::v8f64);
20924 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20925 DAG.getIntPtrConstant(0, dl));
20926 }
20927 if (IsStrict) {
20928 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20929 Chain = Res.getValue(1);
20930 } else {
20931 Res = DAG.getNode(Opc, dl, ResVT, Src);
20932 }
20933
20934 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20935 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20936 DAG.getIntPtrConstant(0, dl));
20937 if (IsStrict)
20938 return DAG.getMergeValues({Res, Chain}, dl);
20939 return Res;
20940 }
20941
20942 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20943 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20944 return Op;
20945
20946 MVT ResVT = VT;
20947 MVT EleVT = VT.getVectorElementType();
20948 if (EleVT != MVT::i64)
20949 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20950
20951 if (SrcVT != MVT::v8f16) {
20952 SDValue Tmp =
20953 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20954 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20955 Ops[0] = Src;
20956 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20957 }
20958
20959 if (IsStrict) {
20960 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20961 : X86ISD::STRICT_CVTTP2UI,
20962 dl, {ResVT, MVT::Other}, {Chain, Src});
20963 Chain = Res.getValue(1);
20964 } else {
20965 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20966 ResVT, Src);
20967 }
20968
20969 // TODO: Need to add exception check code for strict FP.
20970 if (EleVT.getSizeInBits() < 16) {
20971 ResVT = MVT::getVectorVT(EleVT, 8);
20972 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20973 }
20974
20975 if (ResVT != VT)
20976 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20977 DAG.getIntPtrConstant(0, dl));
20978
20979 if (IsStrict)
20980 return DAG.getMergeValues({Res, Chain}, dl);
20981 return Res;
20982 }
20983
20984 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20985 if (VT.getVectorElementType() == MVT::i16) {
20986 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20987 SrcVT.getVectorElementType() == MVT::f64) &&
20988 "Expected f32/f64 vector!");
20989 MVT NVT = VT.changeVectorElementType(MVT::i32);
20990 if (IsStrict) {
20991 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20992 : ISD::STRICT_FP_TO_UINT,
20993 dl, {NVT, MVT::Other}, {Chain, Src});
20994 Chain = Res.getValue(1);
20995 } else {
20996 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20997 NVT, Src);
20998 }
20999
21000 // TODO: Need to add exception check code for strict FP.
21001 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21002
21003 if (IsStrict)
21004 return DAG.getMergeValues({Res, Chain}, dl);
21005 return Res;
21006 }
21007
21008 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21009 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21010 assert(!IsSigned && "Expected unsigned conversion!");
21011 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21012 return Op;
21013 }
21014
21015 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21016 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21017 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21018 Subtarget.useAVX512Regs()) {
21019 assert(!IsSigned && "Expected unsigned conversion!");
21020 assert(!Subtarget.hasVLX() && "Unexpected features!");
21021 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21022 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21023 // Need to concat with zero vector for strict fp to avoid spurious
21024 // exceptions.
21025 // TODO: Should we just do this for non-strict as well?
21026 SDValue Tmp =
21027 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21028 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21029 DAG.getIntPtrConstant(0, dl));
21030
21031 if (IsStrict) {
21032 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21033 {Chain, Src});
21034 Chain = Res.getValue(1);
21035 } else {
21036 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21037 }
21038
21039 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21040 DAG.getIntPtrConstant(0, dl));
21041
21042 if (IsStrict)
21043 return DAG.getMergeValues({Res, Chain}, dl);
21044 return Res;
21045 }
21046
21047 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21048 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21049 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21050 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21051 assert(!Subtarget.hasVLX() && "Unexpected features!");
21052 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21053 // Need to concat with zero vector for strict fp to avoid spurious
21054 // exceptions.
21055 // TODO: Should we just do this for non-strict as well?
21056 SDValue Tmp =
21057 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21058 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21059 DAG.getIntPtrConstant(0, dl));
21060
21061 if (IsStrict) {
21062 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21063 {Chain, Src});
21064 Chain = Res.getValue(1);
21065 } else {
21066 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21067 }
21068
21069 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21070 DAG.getIntPtrConstant(0, dl));
21071
21072 if (IsStrict)
21073 return DAG.getMergeValues({Res, Chain}, dl);
21074 return Res;
21075 }
21076
21077 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21078 if (!Subtarget.hasVLX()) {
21079 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21080 // legalizer and then widened again by vector op legalization.
21081 if (!IsStrict)
21082 return SDValue();
21083
21084 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21085 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21086 {Src, Zero, Zero, Zero});
21087 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21088 {Chain, Tmp});
21089 SDValue Chain = Tmp.getValue(1);
21090 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21091 DAG.getIntPtrConstant(0, dl));
21092 return DAG.getMergeValues({Tmp, Chain}, dl);
21093 }
21094
21095 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21096 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21097 DAG.getUNDEF(MVT::v2f32));
21098 if (IsStrict) {
21099 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21100 : X86ISD::STRICT_CVTTP2UI;
21101 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21102 }
21103 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21104 return DAG.getNode(Opc, dl, VT, Tmp);
21105 }
21106
21107 // Generate optimized instructions for pre AVX512 unsigned conversions from
21108 // vXf32 to vXi32.
21109 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21110 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21111 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21112 assert(!IsSigned && "Expected unsigned conversion!");
21113 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21114 }
21115
21116 return SDValue();
21117 }
21118
21119 assert(!VT.isVector());
21120
21121 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21122
21123 if (!IsSigned && UseSSEReg) {
21124 // Conversions from f32/f64 with AVX512 should be legal.
21125 if (Subtarget.hasAVX512())
21126 return Op;
21127
21128 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21129 // behaves on out of range inputs to generate optimized conversions.
21130 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21131 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21132 unsigned DstBits = VT.getScalarSizeInBits();
21133 APInt UIntLimit = APInt::getSignMask(DstBits);
21134 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21135 DAG.getConstant(UIntLimit, dl, VT));
21136 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21137
21138 // Calculate the converted result for values in the range:
21139 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21140 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21141 SDValue Small =
21142 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21143 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21144 SDValue Big = DAG.getNode(
21145 X86ISD::CVTTS2SI, dl, VT,
21146 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21147 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21148
21149 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21150 // and only if the value was out of range. So we can use that
21151 // as our indicator that we'd rather use "Big" instead of "Small".
21152 //
21153 // Use "Small" if "IsOverflown" has all bits cleared
21154 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21155 SDValue IsOverflown = DAG.getNode(
21156 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21157 return DAG.getNode(ISD::OR, dl, VT, Small,
21158 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21159 }
21160
21161 // Use default expansion for i64.
21162 if (VT == MVT::i64)
21163 return SDValue();
21164
21165 assert(VT == MVT::i32 && "Unexpected VT!");
21166
21167 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21168 // FIXME: This does not generate an invalid exception if the input does not
21169 // fit in i32. PR44019
21170 if (Subtarget.is64Bit()) {
21171 if (IsStrict) {
21172 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21173 {Chain, Src});
21174 Chain = Res.getValue(1);
21175 } else
21176 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21177
21178 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21179 if (IsStrict)
21180 return DAG.getMergeValues({Res, Chain}, dl);
21181 return Res;
21182 }
21183
21184 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21185 // use fisttp which will be handled later.
21186 if (!Subtarget.hasSSE3())
21187 return SDValue();
21188 }
21189
21190 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21191 // FIXME: This does not generate an invalid exception if the input does not
21192 // fit in i16. PR44019
21193 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21194 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21195 if (IsStrict) {
21196 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21197 {Chain, Src});
21198 Chain = Res.getValue(1);
21199 } else
21200 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21201
21202 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21203 if (IsStrict)
21204 return DAG.getMergeValues({Res, Chain}, dl);
21205 return Res;
21206 }
21207
21208 // If this is a FP_TO_SINT using SSEReg we're done.
21209 if (UseSSEReg && IsSigned)
21210 return Op;
21211
21212 // fp128 needs to use a libcall.
21213 if (SrcVT == MVT::f128) {
21214 RTLIB::Libcall LC;
21215 if (IsSigned)
21216 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21217 else
21218 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21219
21220 MakeLibCallOptions CallOptions;
21221 std::pair<SDValue, SDValue> Tmp =
21222 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21223
21224 if (IsStrict)
21225 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21226
21227 return Tmp.first;
21228 }
21229
21230 // Fall back to X87.
21231 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21232 if (IsStrict)
21233 return DAG.getMergeValues({V, Chain}, dl);
21234 return V;
21235 }
21236
21237 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21238}
21239
21240SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21241 SelectionDAG &DAG) const {
21242 SDValue Src = Op.getOperand(0);
21243 EVT DstVT = Op.getSimpleValueType();
21244 MVT SrcVT = Src.getSimpleValueType();
21245
21246 if (SrcVT.isVector())
21247 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21248
21249 if (SrcVT == MVT::f16)
21250 return SDValue();
21251
21252 // If the source is in an SSE register, the node is Legal.
21253 if (isScalarFPTypeInSSEReg(SrcVT))
21254 return Op;
21255
21256 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21257}
21258
21259SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21260 SelectionDAG &DAG) const {
21261 EVT DstVT = N->getValueType(0);
21262 SDValue Src = N->getOperand(0);
21263 EVT SrcVT = Src.getValueType();
21264
21265 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21266 // f16 must be promoted before using the lowering in this routine.
21267 // fp128 does not use this lowering.
21268 return SDValue();
21269 }
21270
21271 SDLoc DL(N);
21272 SDValue Chain = DAG.getEntryNode();
21273
21274 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21275
21276 // If we're converting from SSE, the stack slot needs to hold both types.
21277 // Otherwise it only needs to hold the DstVT.
21278 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21279 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21280 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21281 MachinePointerInfo MPI =
21282 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21283
21284 if (UseSSE) {
21285 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21286 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21287 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21288 SDValue Ops[] = { Chain, StackPtr };
21289
21290 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21291 /*Align*/ std::nullopt,
21292 MachineMemOperand::MOLoad);
21293 Chain = Src.getValue(1);
21294 }
21295
21296 SDValue StoreOps[] = { Chain, Src, StackPtr };
21297 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21298 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21299 MachineMemOperand::MOStore);
21300
21301 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21302}
21303
21304SDValue
21305X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21306 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21307 // but making use of X86 specifics to produce better instruction sequences.
21308 SDNode *Node = Op.getNode();
21309 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21310 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21311 SDLoc dl(SDValue(Node, 0));
21312 SDValue Src = Node->getOperand(0);
21313
21314 // There are three types involved here: SrcVT is the source floating point
21315 // type, DstVT is the type of the result, and TmpVT is the result of the
21316 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21317 // DstVT).
21318 EVT SrcVT = Src.getValueType();
21319 EVT DstVT = Node->getValueType(0);
21320 EVT TmpVT = DstVT;
21321
21322 // This code is only for floats and doubles. Fall back to generic code for
21323 // anything else.
21324 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21325 return SDValue();
21326
21327 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21328 unsigned SatWidth = SatVT.getScalarSizeInBits();
21329 unsigned DstWidth = DstVT.getScalarSizeInBits();
21330 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21331 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21332 "Expected saturation width smaller than result width");
21333
21334 // Promote result of FP_TO_*INT to at least 32 bits.
21335 if (TmpWidth < 32) {
21336 TmpVT = MVT::i32;
21337 TmpWidth = 32;
21338 }
21339
21340 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21341 // us to use a native signed conversion instead.
21342 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21343 TmpVT = MVT::i64;
21344 TmpWidth = 64;
21345 }
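 // e.g. an f32 -> i32 unsigned saturating conversion on a 64-bit target can
 // then use a plain signed cvttss2si into a 64-bit register, since every
 // value in [0, 2^32-1] fits in the signed i64 range; the SatWidth < TmpWidth
 // check below switches FpToIntOpcode to the signed form accordingly.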
21346
21347 // If the saturation width is smaller than the size of the temporary result,
21348 // we can always use signed conversion, which is native.
21349 if (SatWidth < TmpWidth)
21350 FpToIntOpcode = ISD::FP_TO_SINT;
21351
21352 // Determine minimum and maximum integer values and their corresponding
21353 // floating-point values.
21354 APInt MinInt, MaxInt;
21355 if (IsSigned) {
21356 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21357 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21358 } else {
21359 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21360 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21361 }
21362
21363 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21364 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21365
21366 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21367 MinInt, IsSigned, APFloat::rmTowardZero);
21368 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21369 MaxInt, IsSigned, APFloat::rmTowardZero);
21370 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21371 && !(MaxStatus & APFloat::opStatus::opInexact);
21372
21373 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21374 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21375
21376 // If the integer bounds are exactly representable as floats, emit a
21377 // min+max+fptoi sequence. Otherwise use comparisons and selects.
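 // e.g. for f64 -> i32 both bounds (-2^31 and 2^31-1) are exactly
 // representable doubles, so the clamp+convert path below is used; for
 // f32 -> i32 the upper bound 2147483647 is not exactly representable as a
 // float, so the compare/select path further down is taken instead.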
21378 if (AreExactFloatBounds) {
21379 if (DstVT != TmpVT) {
21380 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21381 SDValue MinClamped = DAG.getNode(
21382 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21383 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21384 SDValue BothClamped = DAG.getNode(
21385 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21386 // Convert clamped value to integer.
21387 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21388
21389 // NaN will become INDVAL, with the top bit set and the rest zero.
21390 // Truncation will discard the top bit, resulting in zero.
21391 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21392 }
21393
21394 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21395 SDValue MinClamped = DAG.getNode(
21396 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21397 // Clamp by MaxFloat from above. NaN cannot occur.
21398 SDValue BothClamped = DAG.getNode(
21399 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21400 // Convert clamped value to integer.
21401 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21402
21403 if (!IsSigned) {
21404 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21405 // which is zero.
21406 return FpToInt;
21407 }
21408
21409 // Otherwise, select zero if Src is NaN.
21410 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21411 return DAG.getSelectCC(
21412 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21413 }
21414
21415 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21416 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21417
21418 // Result of direct conversion, which may be selected away.
21419 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21420
21421 if (DstVT != TmpVT) {
21422 // NaN will become INDVAL, with the top bit set and the rest zero.
21423 // Truncation will discard the top bit, resulting in zero.
21424 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21425 }
21426
21427 SDValue Select = FpToInt;
21428 // For signed conversions where we saturate to the same size as the
21429 // result type of the fptoi instructions, INDVAL coincides with integer
21430 // minimum, so we don't need to explicitly check it.
21431 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21432 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21433 // MinInt if Src is NaN.
21434 Select = DAG.getSelectCC(
21435 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21436 }
21437
21438 // If Src OGT MaxFloat, select MaxInt.
21439 Select = DAG.getSelectCC(
21440 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21441
21442 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21443 // is already zero. The promoted case was already handled above.
21444 if (!IsSigned || DstVT != TmpVT) {
21445 return Select;
21446 }
21447
21448 // Otherwise, select 0 if Src is NaN.
21449 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21450 return DAG.getSelectCC(
21451 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21452}
21453
21454SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21455 bool IsStrict = Op->isStrictFPOpcode();
21456
21457 SDLoc DL(Op);
21458 MVT VT = Op.getSimpleValueType();
21459 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21460 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21461 MVT SVT = In.getSimpleValueType();
21462
21463 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21464 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21465 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21466 !Subtarget.getTargetTriple().isOSDarwin()))
21467 return SDValue();
21468
21469 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21470 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21471 return Op;
21472
21473 if (SVT == MVT::f16) {
21474 if (Subtarget.hasFP16())
21475 return Op;
21476
21477 if (VT != MVT::f32) {
21478 if (IsStrict)
21479 return DAG.getNode(
21480 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21481 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21482 {MVT::f32, MVT::Other}, {Chain, In})});
21483
21484 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21485 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21486 }
21487
21488 if (!Subtarget.hasF16C()) {
21489 if (!Subtarget.getTargetTriple().isOSDarwin())
21490 return SDValue();
21491
21492 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21493
21494 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21495 TargetLowering::CallLoweringInfo CLI(DAG);
21496 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21497
21498 In = DAG.getBitcast(MVT::i16, In);
21499 TargetLowering::ArgListTy Args;
21500 TargetLowering::ArgListEntry Entry;
21501 Entry.Node = In;
21502 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21503 Entry.IsSExt = false;
21504 Entry.IsZExt = true;
21505 Args.push_back(Entry);
21506
21507 SDValue Callee = DAG.getExternalSymbol(
21508 getLibcallName(RTLIB::FPEXT_F16_F32),
21509 getPointerTy(DAG.getDataLayout()));
21510 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21511 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21512 std::move(Args));
21513
21514 SDValue Res;
21515 std::tie(Res,Chain) = LowerCallTo(CLI);
21516 if (IsStrict)
21517 Res = DAG.getMergeValues({Res, Chain}, DL);
21518
21519 return Res;
21520 }
21521
21522 In = DAG.getBitcast(MVT::i16, In);
21523 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21524 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21525 DAG.getIntPtrConstant(0, DL));
21526 SDValue Res;
21527 if (IsStrict) {
21528 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21529 {Chain, In});
21530 Chain = Res.getValue(1);
21531 } else {
21532 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21533 DAG.getTargetConstant(4, DL, MVT::i32));
21534 }
21535 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21536 DAG.getIntPtrConstant(0, DL));
21537 if (IsStrict)
21538 return DAG.getMergeValues({Res, Chain}, DL);
21539 return Res;
21540 }
21541
21542 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21543 return Op;
21544
21545 if (SVT.getVectorElementType() == MVT::f16) {
21546 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21547 return Op;
21548 assert(Subtarget.hasF16C() && "Unexpected features!");
21549 if (SVT == MVT::v2f16)
21550 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21551 DAG.getUNDEF(MVT::v2f16));
21552 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21553 DAG.getUNDEF(MVT::v4f16));
21554 if (IsStrict)
21555 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21556 {Op->getOperand(0), Res});
21557 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21558 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21559 return Op;
21560 }
21561
21562 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21563
21564 SDValue Res =
21565 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21566 if (IsStrict)
21567 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21568 {Op->getOperand(0), Res});
21569 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21570}
21571
21572SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21573 bool IsStrict = Op->isStrictFPOpcode();
21574
21575 SDLoc DL(Op);
21576 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21577 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21578 MVT VT = Op.getSimpleValueType();
21579 MVT SVT = In.getSimpleValueType();
21580
21581 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21582 return SDValue();
21583
21584 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21585 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21586 if (!Subtarget.getTargetTriple().isOSDarwin())
21587 return SDValue();
21588
21589 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21590 TargetLowering::CallLoweringInfo CLI(DAG);
21591 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21592
21593 TargetLowering::ArgListTy Args;
21594 TargetLowering::ArgListEntry Entry;
21595 Entry.Node = In;
21596 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21597 Entry.IsSExt = false;
21598 Entry.IsZExt = true;
21599 Args.push_back(Entry);
21600
21601 SDValue Callee = DAG.getExternalSymbol(
21602     getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21603                                    : RTLIB::FPROUND_F32_F16),
21604     getPointerTy(DAG.getDataLayout()));
21605 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21606 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21607 std::move(Args));
21608
21609 SDValue Res;
21610 std::tie(Res, Chain) = LowerCallTo(CLI);
21611
21612 Res = DAG.getBitcast(MVT::f16, Res);
21613
21614 if (IsStrict)
21615 Res = DAG.getMergeValues({Res, Chain}, DL);
21616
21617 return Res;
21618 }
21619
21620 if (VT.getScalarType() == MVT::bf16) {
21621 if (SVT.getScalarType() == MVT::f32 &&
21622 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21623 Subtarget.hasAVXNECONVERT()))
21624 return Op;
21625 return SDValue();
21626 }
21627
21628 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21629 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21630 return SDValue();
21631
21632 if (VT.isVector())
21633 return Op;
21634
21635 SDValue Res;
21636 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21637                                     MVT::i32);
21638 if (IsStrict) {
21639 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21640 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21641 DAG.getIntPtrConstant(0, DL));
21642 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21643 {Chain, Res, Rnd});
21644 Chain = Res.getValue(1);
21645 } else {
21646 // FIXME: Should we use zeros for upper elements for non-strict?
21647 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21648 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21649 }
21650
21651 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21652 DAG.getIntPtrConstant(0, DL));
21653 Res = DAG.getBitcast(MVT::f16, Res);
21654
21655 if (IsStrict)
21656 return DAG.getMergeValues({Res, Chain}, DL);
21657
21658 return Res;
21659 }
21660
21661 return Op;
21662}
21663
21664 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21665 bool IsStrict = Op->isStrictFPOpcode();
21666 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21667 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21668 "Unexpected VT!");
21669
21670 SDLoc dl(Op);
21671 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21672 DAG.getConstant(0, dl, MVT::v8i16), Src,
21673 DAG.getIntPtrConstant(0, dl));
21674
21675 SDValue Chain;
21676 if (IsStrict) {
21677 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21678 {Op.getOperand(0), Res});
21679 Chain = Res.getValue(1);
21680 } else {
21681 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21682 }
21683
21684 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21685 DAG.getIntPtrConstant(0, dl));
21686
21687 if (IsStrict)
21688 return DAG.getMergeValues({Res, Chain}, dl);
21689
21690 return Res;
21691}
21692
21693 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21694 bool IsStrict = Op->isStrictFPOpcode();
21695 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21696 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21697 "Unexpected VT!");
21698
21699 SDLoc dl(Op);
21700 SDValue Res, Chain;
21701 if (IsStrict) {
21702 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21703 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21704 DAG.getIntPtrConstant(0, dl));
21705 Res = DAG.getNode(
21706 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21707 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21708 Chain = Res.getValue(1);
21709 } else {
21710 // FIXME: Should we use zeros for upper elements for non-strict?
21711 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21712 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21713 DAG.getTargetConstant(4, dl, MVT::i32));
21714 }
21715
21716 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21717 DAG.getIntPtrConstant(0, dl));
21718
21719 if (IsStrict)
21720 return DAG.getMergeValues({Res, Chain}, dl);
21721
21722 return Res;
21723}
21724
21725SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21726 SelectionDAG &DAG) const {
21727 SDLoc DL(Op);
21728
21729 MVT SVT = Op.getOperand(0).getSimpleValueType();
21730 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21731 Subtarget.hasAVXNECONVERT())) {
21732 SDValue Res;
21733 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21734 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21735 Res = DAG.getBitcast(MVT::v8i16, Res);
21736 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21737 DAG.getIntPtrConstant(0, DL));
21738 }
21739
21740 MakeLibCallOptions CallOptions;
21741 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21742 SDValue Res =
21743 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21744 return DAG.getBitcast(MVT::i16, Res);
21745}
21746
21747/// Depending on uarch and/or optimizing for size, we might prefer to use a
21748/// vector operation in place of the typical scalar operation.
21749 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21750                                          SelectionDAG &DAG,
21751 const X86Subtarget &Subtarget) {
21752 // If both operands have other uses, this is probably not profitable.
21753 SDValue LHS = Op.getOperand(0);
21754 SDValue RHS = Op.getOperand(1);
21755 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21756 return Op;
21757
21758 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21759 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21760 if (IsFP && !Subtarget.hasSSE3())
21761 return Op;
21762 if (!IsFP && !Subtarget.hasSSSE3())
21763 return Op;
21764
21765 // Extract from a common vector.
21766 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21767 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21768 LHS.getOperand(0) != RHS.getOperand(0) ||
21769 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21770 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21771 !shouldUseHorizontalOp(true, DAG, Subtarget))
21772 return Op;
21773
21774 // Allow commuted 'hadd' ops.
21775 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21776 unsigned HOpcode;
21777 switch (Op.getOpcode()) {
21778 // clang-format off
21779 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21780 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21781 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21782 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21783 default:
21784 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21785 // clang-format on
21786 }
21787 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21788 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21789 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21790 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21791 std::swap(LExtIndex, RExtIndex);
21792
21793 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21794 return Op;
21795
21796 SDValue X = LHS.getOperand(0);
21797 EVT VecVT = X.getValueType();
21798 unsigned BitWidth = VecVT.getSizeInBits();
21799 unsigned NumLanes = BitWidth / 128;
21800 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21801 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21802 "Not expecting illegal vector widths here");
21803
21804 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21805 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21806 if (BitWidth == 256 || BitWidth == 512) {
21807 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21808 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21809 LExtIndex %= NumEltsPerLane;
21810 }
21811
21812 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21813 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21814 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21815 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21816 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21817 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21818 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21819}
21820
21821/// Depending on uarch and/or optimizing for size, we might prefer to use a
21822/// vector operation in place of the typical scalar operation.
21823SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21824 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21825 "Only expecting float/double");
21826 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21827}
21828
21829/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21830/// This mode isn't supported in hardware on X86. But as long as we aren't
21831/// compiling with trapping math, we can emulate this with
21832/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21833 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21834 SDValue N0 = Op.getOperand(0);
21835 SDLoc dl(Op);
21836 MVT VT = Op.getSimpleValueType();
21837
21838 // N0 += copysign(nextafter(0.5, 0.0), N0)
21839 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21840 bool Ignored;
21841 APFloat Point5Pred = APFloat(0.5f);
21842 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21843 Point5Pred.next(/*nextDown*/true);
21844
21845 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21846 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21847 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21848
21849 // Truncate the result to remove fraction.
21850 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21851}
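
// Illustrative sketch (not part of the original file): the same FROUND
// emulation written against plain <cmath> scalar math, assuming the default
// round-to-nearest-even environment and non-trapping math. The helper name
// froundEmulated is hypothetical.
#include <cmath>
static float froundEmulated(float X) {
  // nextafterf(0.5f, 0.0f) is the largest float strictly below 0.5.
  float Adder = std::copysign(std::nextafterf(0.5f, 0.0f), X);
  return std::trunc(X + Adder); // e.g. froundEmulated(2.5f) == 3.0f, (-2.5f) == -3.0f
}
// Using the value just below 0.5 keeps an input that is itself just below 0.5
// from being rounded up to 1.0 by the addition.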
21852
21853/// The only differences between FABS and FNEG are the mask and the logic op.
21854/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21855 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21856 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21857 "Wrong opcode for lowering FABS or FNEG.");
21858
21859 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21860
21861 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21862 // into an FNABS. We'll lower the FABS after that if it is still in use.
21863 if (IsFABS)
21864 for (SDNode *User : Op->uses())
21865 if (User->getOpcode() == ISD::FNEG)
21866 return Op;
21867
21868 SDLoc dl(Op);
21869 MVT VT = Op.getSimpleValueType();
21870
21871 bool IsF128 = (VT == MVT::f128);
21872 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21874 "Unexpected type in LowerFABSorFNEG");
21875
21876 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21877 // decide if we should generate a 16-byte constant mask when we only need 4 or
21878 // 8 bytes for the scalar case.
21879
21880 // There are no scalar bitwise logical SSE/AVX instructions, so we
21881 // generate a 16-byte vector constant and logic op even for the scalar case.
21882 // Using a 16-byte mask allows folding the load of the mask with
21883 // the logic op, so it can save (~4 bytes) on code size.
21884 bool IsFakeVector = !VT.isVector() && !IsF128;
21885 MVT LogicVT = VT;
21886 if (IsFakeVector)
21887 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21888 : (VT == MVT::f32) ? MVT::v4f32
21889 : MVT::v8f16;
21890
21891 unsigned EltBits = VT.getScalarSizeInBits();
21892 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21893 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21894 APInt::getSignMask(EltBits);
21895 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21896 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21897
21898 SDValue Op0 = Op.getOperand(0);
21899 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21900 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21901 IsFNABS ? X86ISD::FOR :
21902                     X86ISD::FXOR;
21903 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21904
21905 if (VT.isVector() || IsF128)
21906 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21907
21908 // For the scalar case extend to a 128-bit vector, perform the logic op,
21909 // and extract the scalar result back out.
21910 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21911 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21912 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21913 DAG.getIntPtrConstant(0, dl));
21914}
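
// Illustrative sketch (not part of the original file): the scalar effect of
// the two masks chosen above, shown on f32 bit patterns using only <cstdint>
// and <cstring>. Function names are hypothetical.
#include <cstdint>
#include <cstring>
static float fabsViaMask(float X) {  // FABS: AND with 0x7fffffff clears the sign bit.
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}
static float fnegViaMask(float X) {  // FNEG: XOR with 0x80000000 flips the sign bit.
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}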
21915
21916 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21917 SDValue Mag = Op.getOperand(0);
21918 SDValue Sign = Op.getOperand(1);
21919 SDLoc dl(Op);
21920
21921 // If the sign operand is smaller, extend it first.
21922 MVT VT = Op.getSimpleValueType();
21923 if (Sign.getSimpleValueType().bitsLT(VT))
21924 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21925
21926 // And if it is bigger, shrink it first.
21927 if (Sign.getSimpleValueType().bitsGT(VT))
21928 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21929 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21930
21931 // At this point the operands and the result should have the same
21932 // type, and that won't be f80 since that is not custom lowered.
21933 bool IsF128 = (VT == MVT::f128);
21934 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21936 "Unexpected type in LowerFCOPYSIGN");
21937
21938 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21939
21940 // Perform all scalar logic operations as 16-byte vectors because there are no
21941 // scalar FP logic instructions in SSE.
21942 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21943 // unnecessary splats, but we might miss load folding opportunities. Should
21944 // this decision be based on OptimizeForSize?
21945 bool IsFakeVector = !VT.isVector() && !IsF128;
21946 MVT LogicVT = VT;
21947 if (IsFakeVector)
21948 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21949 : (VT == MVT::f32) ? MVT::v4f32
21950 : MVT::v8f16;
21951
21952 // The mask constants are automatically splatted for vector types.
21953 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21954 SDValue SignMask = DAG.getConstantFP(
21955 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21956 SDValue MagMask = DAG.getConstantFP(
21957 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21958
21959 // First, clear all bits but the sign bit from the second operand (sign).
21960 if (IsFakeVector)
21961 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21962 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21963
21964 // Next, clear the sign bit from the first operand (magnitude).
21965 // TODO: If we had general constant folding for FP logic ops, this check
21966 // wouldn't be necessary.
21967 SDValue MagBits;
21968 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21969 APFloat APF = Op0CN->getValueAPF();
21970 APF.clearSign();
21971 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21972 } else {
21973 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21974 if (IsFakeVector)
21975 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21976 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21977 }
21978
21979 // OR the magnitude value with the sign bit.
21980 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21981 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21982 DAG.getIntPtrConstant(0, dl));
21983}
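
// Illustrative sketch (not part of the original file): the FAND/FAND/FOR bit
// manipulation above, performed on scalar f32 values with only <cstdint> and
// <cstring>. The function name is hypothetical.
#include <cstdint>
#include <cstring>
static float copysignViaMasks(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t SignBit = S & 0x80000000u; // keep only the sign bit of Sign
  uint32_t MagBits = M & 0x7fffffffu; // clear the sign bit of Mag
  uint32_t Bits = MagBits | SignBit;  // OR magnitude bits with the sign bit
  float Res;
  std::memcpy(&Res, &Bits, sizeof(Bits));
  return Res;
}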
21984
21985 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21986 SDValue N0 = Op.getOperand(0);
21987 SDLoc dl(Op);
21988 MVT VT = Op.getSimpleValueType();
21989
21990 MVT OpVT = N0.getSimpleValueType();
21991 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21992 "Unexpected type for FGETSIGN");
21993
21994 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21995 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21996 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21997 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21998 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21999 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22000 return Res;
22001}
22002
22003/// Helper for attempting to create a X86ISD::BT node.
22004static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22005 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22006 // instruction. Since the shift amount is in-range-or-undefined, we know
22007 // that doing a bittest on the i32 value is ok. We extend to i32 because
22008 // the encoding for the i16 version is larger than the i32 version.
22009 // Also promote i16 to i32 for performance / code size reasons.
22010 if (Src.getValueType().getScalarSizeInBits() < 32)
22011 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22012
22013 // No legal type found, give up.
22014 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22015 return SDValue();
22016
22017 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22018 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22019 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22020 // known to be zero.
22021 if (Src.getValueType() == MVT::i64 &&
22022 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22023 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22024
22025 // If the operand types disagree, extend the shift amount to match. Since
22026 // BT ignores high bits (like shifts) we can use anyextend.
22027 if (Src.getValueType() != BitNo.getValueType()) {
22028 // Peek through a mask/modulo operation.
22029 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22030 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22031 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22032 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22033 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22034 BitNo.getOperand(0)),
22035 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22036 BitNo.getOperand(1)));
22037 else
22038 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22039 }
22040
22041 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22042}
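
// Illustrative sketch (not part of the original file): what the emitted BT
// computes on register operands, in plain C++. The 32-bit form tests bit
// (BitNo mod 32) and the 64-bit form bit (BitNo mod 64), which is why the
// 64-to-32-bit shrink above is only done when bit 5 of BitNo is known zero.
// Function names are hypothetical.
#include <cstdint>
static bool bt32(uint32_t Src, uint32_t BitNo) { return (Src >> (BitNo & 31)) & 1; }
static bool bt64(uint64_t Src, uint64_t BitNo) { return (Src >> (BitNo & 63)) & 1; }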
22043
22044/// Helper for creating a X86ISD::SETCC node.
22045 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22046                         SelectionDAG &DAG) {
22047 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22048 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22049}
22050
22051/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22052/// recognizable memcmp expansion.
22053static bool isOrXorXorTree(SDValue X, bool Root = true) {
22054 if (X.getOpcode() == ISD::OR)
22055 return isOrXorXorTree(X.getOperand(0), false) &&
22056 isOrXorXorTree(X.getOperand(1), false);
22057 if (Root)
22058 return false;
22059 return X.getOpcode() == ISD::XOR;
22060}
22061
22062/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22063/// expansion.
22064template <typename F>
22065 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22066                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22067 SDValue Op0 = X.getOperand(0);
22068 SDValue Op1 = X.getOperand(1);
22069 if (X.getOpcode() == ISD::OR) {
22070 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22071 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22072 if (VecVT != CmpVT)
22073 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22074 if (HasPT)
22075 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22076 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22077 }
22078 if (X.getOpcode() == ISD::XOR) {
22079 SDValue A = SToV(Op0);
22080 SDValue B = SToV(Op1);
22081 if (VecVT != CmpVT)
22082 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22083 if (HasPT)
22084 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22085 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22086 }
22087 llvm_unreachable("Impossible");
22088}
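
// Illustrative sketch (not part of the original file): the identity behind the
// or-xor-xor tree. OR-ing the XORs of each pair is zero exactly when every
// pair is equal, so one wide test replaces several scalar compares. The helper
// name is hypothetical.
#include <cstdint>
static bool pairsEqual(uint64_t A, uint64_t B, uint64_t C, uint64_t D) {
  return ((A ^ B) | (C ^ D)) == 0; // equivalent to A == B && C == D
}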
22089
22090/// Try to map a 128-bit or larger integer comparison to vector instructions
22091/// before type legalization splits it up into chunks.
22092 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22093                                                ISD::CondCode CC,
22094                                                const SDLoc &DL,
22095 SelectionDAG &DAG,
22096 const X86Subtarget &Subtarget) {
22097 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22098
22099 // We're looking for an oversized integer equality comparison.
22100 EVT OpVT = X.getValueType();
22101 unsigned OpSize = OpVT.getSizeInBits();
22102 if (!OpVT.isScalarInteger() || OpSize < 128)
22103 return SDValue();
22104
22105 // Ignore a comparison with zero because that gets special treatment in
22106 // EmitTest(). But make an exception for the special case of a pair of
22107 // logically-combined vector-sized operands compared to zero. This pattern may
22108 // be generated by the memcmp expansion pass with oversized integer compares
22109 // (see PR33325).
22110 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22111 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22112 return SDValue();
22113
22114 // Don't perform this combine if constructing the vector will be expensive.
22115 auto IsVectorBitCastCheap = [](SDValue X) {
22116 X = peekThroughBitcasts(X);
22117 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22118 X.getOpcode() == ISD::LOAD;
22119 };
22120 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22121 !IsOrXorXorTreeCCZero)
22122 return SDValue();
22123
22124 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22125 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22126 // Otherwise use PCMPEQ (plus AND) and mask testing.
22127 bool NoImplicitFloatOps =
22129     DAG.getMachineFunction().getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
22130 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22131 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22132 (OpSize == 256 && Subtarget.hasAVX()) ||
22133 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22134 bool HasPT = Subtarget.hasSSE41();
22135
22136 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22137 // vector registers are essentially free. (Technically, widening registers
22138 // prevents load folding, but the tradeoff is worth it.)
22139 bool PreferKOT = Subtarget.preferMaskRegisters();
22140 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22141
22142 EVT VecVT = MVT::v16i8;
22143 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22144 if (OpSize == 256) {
22145 VecVT = MVT::v32i8;
22146 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22147 }
22148 EVT CastVT = VecVT;
22149 bool NeedsAVX512FCast = false;
22150 if (OpSize == 512 || NeedZExt) {
22151 if (Subtarget.hasBWI()) {
22152 VecVT = MVT::v64i8;
22153 CmpVT = MVT::v64i1;
22154 if (OpSize == 512)
22155 CastVT = VecVT;
22156 } else {
22157 VecVT = MVT::v16i32;
22158 CmpVT = MVT::v16i1;
22159 CastVT = OpSize == 512 ? VecVT
22160 : OpSize == 256 ? MVT::v8i32
22161 : MVT::v4i32;
22162 NeedsAVX512FCast = true;
22163 }
22164 }
22165
22166 auto ScalarToVector = [&](SDValue X) -> SDValue {
22167 bool TmpZext = false;
22168 EVT TmpCastVT = CastVT;
22169 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22170 SDValue OrigX = X.getOperand(0);
22171 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22172 if (OrigSize < OpSize) {
22173 if (OrigSize == 128) {
22174 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22175 X = OrigX;
22176 TmpZext = true;
22177 } else if (OrigSize == 256) {
22178 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22179 X = OrigX;
22180 TmpZext = true;
22181 }
22182 }
22183 }
22184 X = DAG.getBitcast(TmpCastVT, X);
22185 if (!NeedZExt && !TmpZext)
22186 return X;
22187 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22188 DAG.getConstant(0, DL, VecVT), X,
22189 DAG.getVectorIdxConstant(0, DL));
22190 };
22191
22192 SDValue Cmp;
22193 if (IsOrXorXorTreeCCZero) {
22194 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22195 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22196 // Use 2 vector equality compares and 'and' the results before doing a
22197 // MOVMSK.
22198 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22199 } else {
22200 SDValue VecX = ScalarToVector(X);
22201 SDValue VecY = ScalarToVector(Y);
22202 if (VecVT != CmpVT) {
22203 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22204 } else if (HasPT) {
22205 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22206 } else {
22207 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22208 }
22209 }
22210 // AVX512 should emit a setcc that will lower to kortest.
22211 if (VecVT != CmpVT) {
22212 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22213 : CmpVT == MVT::v32i1 ? MVT::i32
22214 : MVT::i16;
22215 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22216 DAG.getConstant(0, DL, KRegVT), CC);
22217 }
22218 if (HasPT) {
22219 SDValue BCCmp =
22220 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22221 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22222 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22223 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22224 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22225 }
22226 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22227 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22228 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22229 assert(Cmp.getValueType() == MVT::v16i8 &&
22230 "Non 128-bit vector on pre-SSE41 target");
22231 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22232 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22233 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22234 }
22235
22236 return SDValue();
22237}
22238
22239/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22240/// style scalarized (associative) reduction patterns. Partial reductions
22241/// are supported when the pointer SrcMask is non-null.
22242/// TODO - move this to SelectionDAG?
22243 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22244                                  SmallVectorImpl<SDValue> &SrcOps,
22245                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22246 SmallVector<SDValue, 8> Opnds;
22247 DenseMap<SDValue, APInt> SrcOpMap;
22248 EVT VT = MVT::Other;
22249
22250 // Recognize a special case where a vector is casted into wide integer to
22251 // test all 0s.
22252 assert(Op.getOpcode() == unsigned(BinOp) &&
22253 "Unexpected bit reduction opcode");
22254 Opnds.push_back(Op.getOperand(0));
22255 Opnds.push_back(Op.getOperand(1));
22256
22257 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22258 SDValue I = Opnds[Slot];
22259 // BFS traverse all BinOp operands.
22260 if (I->getOpcode() == unsigned(BinOp)) {
22261 Opnds.push_back(I->getOperand(0));
22262 Opnds.push_back(I->getOperand(1));
22263 // Re-evaluate the number of nodes to be traversed.
22264 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22265 continue;
22266 }
22267
22268 // Quit if a non-EXTRACT_VECTOR_ELT
22269 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22270 return false;
22271
22272 // Quit if without a constant index.
22273 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22274 if (!Idx)
22275 return false;
22276
22277 SDValue Src = I->getOperand(0);
22278 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22279 if (M == SrcOpMap.end()) {
22280 VT = Src.getValueType();
22281 // Quit if not the same type.
22282 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22283 return false;
22284 unsigned NumElts = VT.getVectorNumElements();
22285 APInt EltCount = APInt::getZero(NumElts);
22286 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22287 SrcOps.push_back(Src);
22288 }
22289
22290 // Quit if element already used.
22291 unsigned CIdx = Idx->getZExtValue();
22292 if (M->second[CIdx])
22293 return false;
22294 M->second.setBit(CIdx);
22295 }
22296
22297 if (SrcMask) {
22298 // Collect the source partial masks.
22299 for (SDValue &SrcOp : SrcOps)
22300 SrcMask->push_back(SrcOpMap[SrcOp]);
22301 } else {
22302 // Quit if not all elements are used.
22303 for (const auto &I : SrcOpMap)
22304 if (!I.second.isAllOnes())
22305 return false;
22306 }
22307
22308 return true;
22309}
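
// Illustrative sketch (not part of the original file): the scalarized
// reduction shape that matchScalarReduction recognizes, written out for a
// hypothetical four-element source vector X. Every element is extracted
// exactly once and combined with the same binary operator.
#include <cstdint>
static uint32_t orReduce4(const uint32_t X[4]) {
  return (X[0] | X[1]) | (X[2] | X[3]); // OR(EXTRACT(X,0), OR(EXTRACT(X,1), ...))
}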
22310
22311// Helper function for comparing all bits of two vectors.
22312 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22313                                    ISD::CondCode CC, const APInt &OriginalMask,
22314 const X86Subtarget &Subtarget,
22315 SelectionDAG &DAG, X86::CondCode &X86CC) {
22316 EVT VT = LHS.getValueType();
22317 unsigned ScalarSize = VT.getScalarSizeInBits();
22318 if (OriginalMask.getBitWidth() != ScalarSize) {
22319 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22320 return SDValue();
22321 }
22322
22323 // Quit if not convertible to a legal scalar or 128/256-bit vector.
22324 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22325 return SDValue();
22326
22327 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22328 if (VT.isFloatingPoint())
22329 return SDValue();
22330
22331 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22332 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22333
22334 APInt Mask = OriginalMask;
22335
22336 auto MaskBits = [&](SDValue Src) {
22337 if (Mask.isAllOnes())
22338 return Src;
22339 EVT SrcVT = Src.getValueType();
22340 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22341 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22342 };
22343
22344 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22345 if (VT.getSizeInBits() < 128) {
22346 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22347 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22348 if (IntVT != MVT::i64)
22349 return SDValue();
22350 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22351 MVT::i32, MVT::i32);
22352 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22353 MVT::i32, MVT::i32);
22354 SDValue Lo =
22355 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22356 SDValue Hi =
22357 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22358 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22359 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22360 DAG.getConstant(0, DL, MVT::i32));
22361 }
22362 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22363 DAG.getBitcast(IntVT, MaskBits(LHS)),
22364 DAG.getBitcast(IntVT, MaskBits(RHS)));
22365 }
22366
22367 // Without PTEST, a masked v2i64 or-reduction is not faster than
22368 // scalarization.
22369 bool UseKORTEST = Subtarget.useAVX512Regs();
22370 bool UsePTEST = Subtarget.hasSSE41();
22371 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22372 return SDValue();
22373
22374 // Split down to 128/256/512-bit vector.
22375 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22376
22377 // If the input vector has vector elements wider than the target test size,
22378 // then cast to <X x i64> so it will safely split.
22379 if (ScalarSize > TestSize) {
22380 if (!Mask.isAllOnes())
22381 return SDValue();
22382 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22383 LHS = DAG.getBitcast(VT, LHS);
22384 RHS = DAG.getBitcast(VT, RHS);
22385 Mask = APInt::getAllOnes(64);
22386 }
22387
22388 if (VT.getSizeInBits() > TestSize) {
22389 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22390 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22391 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22392 while (VT.getSizeInBits() > TestSize) {
22393 auto Split = DAG.SplitVector(LHS, DL);
22394 VT = Split.first.getValueType();
22395 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22396 }
22397 RHS = DAG.getAllOnesConstant(DL, VT);
22398 } else if (!UsePTEST && !KnownRHS.isZero()) {
22399 // MOVMSK Special Case:
22400 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22401 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22402 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22403 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22404 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22405 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22406 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22407 V = DAG.getSExtOrTrunc(V, DL, VT);
22408 while (VT.getSizeInBits() > TestSize) {
22409 auto Split = DAG.SplitVector(V, DL);
22410 VT = Split.first.getValueType();
22411 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22412 }
22413 V = DAG.getNOT(DL, V, VT);
22414 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22415 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22416 DAG.getConstant(0, DL, MVT::i32));
22417 } else {
22418 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22419 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22420 while (VT.getSizeInBits() > TestSize) {
22421 auto Split = DAG.SplitVector(V, DL);
22422 VT = Split.first.getValueType();
22423 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22424 }
22425 LHS = V;
22426 RHS = DAG.getConstant(0, DL, VT);
22427 }
22428 }
22429
22430 if (UseKORTEST && VT.is512BitVector()) {
22431 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22432 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22433 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22434 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22435 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22436 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22437 }
22438
22439 if (UsePTEST) {
22440 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22441 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22442 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22443 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22444 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22445 }
22446
22447 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22448 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22449 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22450 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22451 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22452 V = DAG.getNOT(DL, V, MaskVT);
22453 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22454 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22455 DAG.getConstant(0, DL, MVT::i32));
22456}
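
// Illustrative sketch (not part of the original file): the pre-SSE4.1 tail
// above as scalar pseudo-code. PCMPEQB writes 0xFF for each equal byte,
// PMOVMSKB gathers the byte sign bits into a 16-bit mask, and the vectors are
// equal exactly when that mask is 0xFFFF. The helper name is hypothetical.
#include <cstdint>
static bool allBytesEqual(const uint8_t X[16], const uint8_t Y[16]) {
  unsigned Msk = 0;
  for (int I = 0; I != 16; ++I)
    Msk |= unsigned(X[I] == Y[I]) << I; // pcmpeqb + pmovmskb
  return Msk == 0xFFFF;                 // cmp $0xFFFF, then sete/setne
}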
22457
22458// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
22459// to CMP(MOVMSK(PCMPEQB(X,Y))).
22460 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22461                                        ISD::CondCode CC, const SDLoc &DL,
22462 const X86Subtarget &Subtarget,
22463 SelectionDAG &DAG,
22464 X86::CondCode &X86CC) {
22465 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22466
22467 bool CmpNull = isNullConstant(RHS);
22468 bool CmpAllOnes = isAllOnesConstant(RHS);
22469 if (!CmpNull && !CmpAllOnes)
22470 return SDValue();
22471
22472 SDValue Op = LHS;
22473 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22474 return SDValue();
22475
22476 // Check whether we're masking/truncating an OR-reduction result, in which
22477 // case track the masked bits.
22478 // TODO: Add CmpAllOnes support.
22479 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22480 if (CmpNull) {
22481 switch (Op.getOpcode()) {
22482 case ISD::TRUNCATE: {
22483 SDValue Src = Op.getOperand(0);
22484 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22485 Op.getScalarValueSizeInBits());
22486 Op = Src;
22487 break;
22488 }
22489 case ISD::AND: {
22490 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22491 Mask = Cst->getAPIntValue();
22492 Op = Op.getOperand(0);
22493 }
22494 break;
22495 }
22496 }
22497 }
22498
22499 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22500
22501 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22502 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22503 SmallVector<SDValue, 8> VecIns;
22504 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22505 EVT VT = VecIns[0].getValueType();
22506 assert(llvm::all_of(VecIns,
22507 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22508 "Reduction source vector mismatch");
22509
22510 // Quit if not splittable to scalar/128/256/512-bit vector.
22511 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22512 return SDValue();
22513
22514 // If more than one full vector is evaluated, AND/OR them first before
22515 // PTEST.
22516 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22517 Slot += 2, e += 1) {
22518 // Each iteration will AND/OR 2 nodes and append the result until there is
22519 // only 1 node left, i.e. the final value of all vectors.
22520 SDValue LHS = VecIns[Slot];
22521 SDValue RHS = VecIns[Slot + 1];
22522 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22523 }
22524
22525 return LowerVectorAllEqual(DL, VecIns.back(),
22526 CmpNull ? DAG.getConstant(0, DL, VT)
22527 : DAG.getAllOnesConstant(DL, VT),
22528 CC, Mask, Subtarget, DAG, X86CC);
22529 }
22530
22531 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22532 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22533 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22534 ISD::NodeType BinOp;
22535 if (SDValue Match =
22536 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22537 EVT MatchVT = Match.getValueType();
22538 return LowerVectorAllEqual(DL, Match,
22539                            CmpNull ? DAG.getConstant(0, DL, MatchVT)
22540 : DAG.getAllOnesConstant(DL, MatchVT),
22541 CC, Mask, Subtarget, DAG, X86CC);
22542 }
22543 }
22544
22545 if (Mask.isAllOnes()) {
22546 assert(!Op.getValueType().isVector() &&
22547 "Illegal vector type for reduction pattern");
22548 SDValue Src = peekThroughBitcasts(Op);
22549 if (Src.getValueType().isFixedLengthVector() &&
22550 Src.getValueType().getScalarType() == MVT::i1) {
22551 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22552 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22553 if (Src.getOpcode() == ISD::SETCC) {
22554 SDValue LHS = Src.getOperand(0);
22555 SDValue RHS = Src.getOperand(1);
22556 EVT LHSVT = LHS.getValueType();
22557 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22558 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22559 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22560 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22561 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22562 X86CC);
22563 }
22564 }
22565 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22566 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22567 // Peek through truncation, mask the LSB and compare against zero/LSB.
22568 if (Src.getOpcode() == ISD::TRUNCATE) {
22569 SDValue Inner = Src.getOperand(0);
22570 EVT InnerVT = Inner.getValueType();
22571 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22572 unsigned BW = InnerVT.getScalarSizeInBits();
22573 APInt SrcMask = APInt(BW, 1);
22574 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22575 return LowerVectorAllEqual(DL, Inner,
22576 DAG.getConstant(Cmp, DL, InnerVT), CC,
22577 SrcMask, Subtarget, DAG, X86CC);
22578 }
22579 }
22580 }
22581 }
22582
22583 return SDValue();
22584}
22585
22586/// return true if \c Op has a use that doesn't just read flags.
22587 static bool hasNonFlagsUse(SDValue Op) {
22588 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22589 ++UI) {
22590 SDNode *User = *UI;
22591 unsigned UOpNo = UI.getOperandNo();
22592 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22593 // Look past the truncate.
22594 UOpNo = User->use_begin().getOperandNo();
22595 User = *User->use_begin();
22596 }
22597
22598 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22599 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22600 return true;
22601 }
22602 return false;
22603}
22604
22605// Transform to an x86-specific ALU node with flags if there is a chance of
22606// using an RMW op or only the flags are used. Otherwise, leave
22607// the node alone and emit a 'cmp' or 'test' instruction.
22608 static bool isProfitableToUseFlagOp(SDValue Op) {
22609 for (SDNode *U : Op->uses())
22610 if (U->getOpcode() != ISD::CopyToReg &&
22611 U->getOpcode() != ISD::SETCC &&
22612 U->getOpcode() != ISD::STORE)
22613 return false;
22614
22615 return true;
22616}
22617
22618/// Emit nodes that will be selected as "test Op0,Op0", or something
22619/// equivalent.
22620static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22621 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22622 // CF and OF aren't always set the way we want. Determine which
22623 // of these we need.
22624 bool NeedCF = false;
22625 bool NeedOF = false;
22626 switch (X86CC) {
22627 default: break;
22628 case X86::COND_A: case X86::COND_AE:
22629 case X86::COND_B: case X86::COND_BE:
22630 NeedCF = true;
22631 break;
22632 case X86::COND_G: case X86::COND_GE:
22633 case X86::COND_L: case X86::COND_LE:
22634 case X86::COND_O: case X86::COND_NO: {
22635 // Check if we really need to set the
22636 // Overflow flag. If NoSignedWrap is present
22637 // that is not actually needed.
22638 switch (Op->getOpcode()) {
22639 case ISD::ADD:
22640 case ISD::SUB:
22641 case ISD::MUL:
22642 case ISD::SHL:
22643 if (Op.getNode()->getFlags().hasNoSignedWrap())
22644 break;
22645 [[fallthrough]];
22646 default:
22647 NeedOF = true;
22648 break;
22649 }
22650 break;
22651 }
22652 }
22653 // See if we can use the EFLAGS value from the operand instead of
22654 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22655 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22656 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22657 // Emit a CMP with 0, which is the TEST pattern.
22658 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22659 DAG.getConstant(0, dl, Op.getValueType()));
22660 }
22661 unsigned Opcode = 0;
22662 unsigned NumOperands = 0;
22663
22664 SDValue ArithOp = Op;
22665
22666 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22667 // which may be the result of a CAST. We use the variable 'Op', which is the
22668 // non-casted variable when we check for possible users.
22669 switch (ArithOp.getOpcode()) {
22670 case ISD::AND:
22671 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22672 // because a TEST instruction will be better.
22673 if (!hasNonFlagsUse(Op))
22674 break;
22675
22676 [[fallthrough]];
22677 case ISD::ADD:
22678 case ISD::SUB:
22679 case ISD::OR:
22680 case ISD::XOR:
22681 if (!isProfitableToUseFlagOp(Op))
22682   break;
22683
22684 // Otherwise use a regular EFLAGS-setting instruction.
22685 switch (ArithOp.getOpcode()) {
22686 // clang-format off
22687 default: llvm_unreachable("unexpected operator!");
22688 case ISD::ADD: Opcode = X86ISD::ADD; break;
22689 case ISD::SUB: Opcode = X86ISD::SUB; break;
22690 case ISD::XOR: Opcode = X86ISD::XOR; break;
22691 case ISD::AND: Opcode = X86ISD::AND; break;
22692 case ISD::OR: Opcode = X86ISD::OR; break;
22693 // clang-format on
22694 }
22695
22696 NumOperands = 2;
22697 break;
22698 case X86ISD::ADD:
22699 case X86ISD::SUB:
22700 case X86ISD::OR:
22701 case X86ISD::XOR:
22702 case X86ISD::AND:
22703 return SDValue(Op.getNode(), 1);
22704 case ISD::SSUBO:
22705 case ISD::USUBO: {
22706 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22707 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22708 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22709 Op->getOperand(1)).getValue(1);
22710 }
22711 default:
22712 break;
22713 }
22714
22715 if (Opcode == 0) {
22716 // Emit a CMP with 0, which is the TEST pattern.
22717 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22718 DAG.getConstant(0, dl, Op.getValueType()));
22719 }
22720 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22721 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22722
22723 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22724 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22725 return SDValue(New.getNode(), 1);
22726}
22727
22728/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22729/// equivalent.
22730static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22731 const SDLoc &dl, SelectionDAG &DAG,
22732 const X86Subtarget &Subtarget) {
22733 if (isNullConstant(Op1))
22734 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22735
22736 EVT CmpVT = Op0.getValueType();
22737
22738 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22739 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22740
22741 // Only promote the compare up to I32 if it is a 16 bit operation
22742 // with an immediate. 16 bit immediates are to be avoided unless the target
22743 // isn't slowed down by length-changing prefixes, we're optimizing for
22744 // code size, or the comparison is with a folded load.
22745 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
22746 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
22747     !DAG.getMachineFunction().getFunction().hasMinSize()) {
22748 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
22749 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
22750 // Don't do this if the immediate can fit in 8-bits.
22751 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22752 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22753 unsigned ExtendOp =
22754     isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22755 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22756 // For equality comparisons try to use SIGN_EXTEND if the input was
22757 // truncate from something with enough sign bits.
22758 if (Op0.getOpcode() == ISD::TRUNCATE) {
22759 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22760 ExtendOp = ISD::SIGN_EXTEND;
22761 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22762 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22763 ExtendOp = ISD::SIGN_EXTEND;
22764 }
22765 }
22766
22767 CmpVT = MVT::i32;
22768 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22769 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22770 }
22771 }
22772
22773 // Try to shrink i64 compares if the input has enough zero bits.
22774 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
22775 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
22776 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22777 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
22778 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22779 CmpVT = MVT::i32;
22780 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22781 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22782 }
22783
22784 // 0-x == y --> x+y == 0
22785 // 0-x != y --> x+y != 0
22786 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22787 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22788 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22789 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22790 return Add.getValue(1);
22791 }
22792
22793 // x == 0-y --> x+y == 0
22794 // x != 0-y --> x+y != 0
22795 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22796 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22797 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22798 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22799 return Add.getValue(1);
22800 }
22801
22802 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22803 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22804 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22805 return Sub.getValue(1);
22806}
22807
22808 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22809                                                           EVT VT) const {
22810 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22811}
22812
22813bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22814 SDNode *N, SDValue, SDValue IntPow2) const {
22815 if (N->getOpcode() == ISD::FDIV)
22816 return true;
22817
22818 EVT FPVT = N->getValueType(0);
22819 EVT IntVT = IntPow2.getValueType();
22820
22821 // This indicates a non-free bitcast.
22822 // TODO: This is probably overly conservative as we will need to scale the
22823 // integer vector anyways for the int->fp cast.
22824 if (FPVT.isVector() &&
22825 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22826 return false;
22827
22828 return true;
22829}
22830
22831/// Check if replacement of SQRT with RSQRT should be disabled.
22832bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22833 EVT VT = Op.getValueType();
22834
22835 // We don't need to replace SQRT with RSQRT for half type.
22836 if (VT.getScalarType() == MVT::f16)
22837 return true;
22838
22839 // We never want to use both SQRT and RSQRT instructions for the same input.
22840 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22841 return false;
22842
22843 if (VT.isVector())
22844 return Subtarget.hasFastVectorFSQRT();
22845 return Subtarget.hasFastScalarFSQRT();
22846}
22847
22848/// The minimum architected relative accuracy is 2^-12. We need one
22849/// Newton-Raphson step to have a good float result (24 bits of precision).
22850SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22851 SelectionDAG &DAG, int Enabled,
22852 int &RefinementSteps,
22853 bool &UseOneConstNR,
22854 bool Reciprocal) const {
22855 SDLoc DL(Op);
22856 EVT VT = Op.getValueType();
22857
22858 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22859 // It is likely not profitable to do this for f64 because a double-precision
22860 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22861 // instructions: convert to single, rsqrtss, convert back to double, refine
22862 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22863 // along with FMA, this could be a throughput win.
22864 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22865 // after legalize types.
22866 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22867 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22868 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22869 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22870 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22871 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22872 RefinementSteps = 1;
22873
22874 UseOneConstNR = false;
22875 // There is no FSQRT for 512-bits, but there is RSQRT14.
22876 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22877 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22878 if (RefinementSteps == 0 && !Reciprocal)
22879 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22880 return Estimate;
22881 }
22882
22883 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22884 Subtarget.hasFP16()) {
22885 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22886 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22887 RefinementSteps = 0;
22888
22889 if (VT == MVT::f16) {
22890 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22891 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22892 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22893 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22895 }
22896
22897 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22898 }
22899 return SDValue();
22900}
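
// Illustrative sketch (not part of the original file): roughly the single
// refinement step the generic estimate machinery applies to the RSQRT
// estimate returned above, assuming the textbook Newton-Raphson formula
// x1 = x0 * (1.5 - 0.5 * a * x0 * x0). The helper name is hypothetical.
static float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est); // one N-R iteration
}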
22901
22902/// The minimum architected relative accuracy is 2^-12. We need one
22903/// Newton-Raphson step to have a good float result (24 bits of precision).
22904SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22905 int Enabled,
22906 int &RefinementSteps) const {
22907 SDLoc DL(Op);
22908 EVT VT = Op.getValueType();
22909
22910 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22911 // It is likely not profitable to do this for f64 because a double-precision
22912 // reciprocal estimate with refinement on x86 prior to FMA requires
22913 // 15 instructions: convert to single, rcpss, convert back to double, refine
22914 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22915 // along with FMA, this could be a throughput win.
22916
22917 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22918 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22919 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22920 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22921 // Enable estimate codegen with 1 refinement step for vector division.
22922 // Scalar division estimates are disabled because they break too much
22923 // real-world code. These defaults are intended to match GCC behavior.
22924 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22925 return SDValue();
22926
22927 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22928 RefinementSteps = 1;
22929
22930 // There is no FRCP for 512-bits, but there is RCP14.
22931 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22932 return DAG.getNode(Opcode, DL, VT, Op);
22933 }
22934
22935 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22936 Subtarget.hasFP16()) {
22937 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22938 RefinementSteps = 0;
22939
22940 if (VT == MVT::f16) {
22941 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22942 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22943 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22944 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22946 }
22947
22948 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22949 }
22950 return SDValue();
22951}
22952
22953/// If we have at least two divisions that use the same divisor, convert to
22954/// multiplication by a reciprocal. This may need to be adjusted for a given
22955/// CPU if a division's cost is not at least twice the cost of a multiplication.
22956/// This is because we still need one division to calculate the reciprocal and
22957/// then we need two multiplies by that reciprocal as replacements for the
22958/// original divisions.
22959unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22960 return 2;
22961}
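
// Illustrative sketch (not part of the original file): with two or more
// divisions sharing a divisor, one real division plus one multiply per use
// replaces the repeated divides. Names are hypothetical.
static void divideBothBy(double &A, double &B, double D) {
  double Recip = 1.0 / D; // the single remaining division
  A *= Recip;             // was A / D
  B *= Recip;             // was B / D
}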
22962
22963SDValue
22964X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22965 SelectionDAG &DAG,
22966 SmallVectorImpl<SDNode *> &Created) const {
22967 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22968 if (isIntDivCheap(N->getValueType(0), Attr))
22969 return SDValue(N,0); // Lower SDIV as SDIV
22970
22971 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22972 "Unexpected divisor!");
22973
22974 // Only perform this transform if CMOV is supported otherwise the select
22975 // below will become a branch.
22976 if (!Subtarget.canUseCMOV())
22977 return SDValue();
22978
22979 // fold (sdiv X, pow2)
22980 EVT VT = N->getValueType(0);
22981 // FIXME: Support i8.
22982 if (VT != MVT::i16 && VT != MVT::i32 &&
22983 !(Subtarget.is64Bit() && VT == MVT::i64))
22984 return SDValue();
22985
22986 // If the divisor is 2 or -2, the default expansion is better.
22987 if (Divisor == 2 ||
22988 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22989 return SDValue();
22990
22991 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22992}
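
// Illustrative sketch (not part of the original file): the CMOV-based shape
// that the generic buildSDIVPow2WithCMov expansion produces, written for
// X / 8 on i32. Negative inputs are biased by 2^k - 1 before the arithmetic
// shift so the result rounds toward zero. The helper name is hypothetical and
// the sketch assumes arithmetic right shift of negative values (as on x86).
#include <cstdint>
static int32_t sdivBy8(int32_t X) {
  int32_t Src = X < 0 ? X + 7 : X; // bias negatives by (1 << 3) - 1 (conditional move)
  return Src >> 3;                 // arithmetic shift right by 3
}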
22993
22994/// Result of 'and' is compared against zero. Change to a BT node if possible.
22995/// Returns the BT node and the condition code needed to use it.
22996 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22997                             SelectionDAG &DAG, X86::CondCode &X86CC) {
22998 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22999 SDValue Op0 = And.getOperand(0);
23000 SDValue Op1 = And.getOperand(1);
23001 if (Op0.getOpcode() == ISD::TRUNCATE)
23002 Op0 = Op0.getOperand(0);
23003 if (Op1.getOpcode() == ISD::TRUNCATE)
23004 Op1 = Op1.getOperand(0);
23005
23006 SDValue Src, BitNo;
23007 if (Op1.getOpcode() == ISD::SHL)
23008 std::swap(Op0, Op1);
23009 if (Op0.getOpcode() == ISD::SHL) {
23010 if (isOneConstant(Op0.getOperand(0))) {
23011 // If we looked past a truncate, check that it's only truncating away
23012 // known zeros.
23013 unsigned BitWidth = Op0.getValueSizeInBits();
23014 unsigned AndBitWidth = And.getValueSizeInBits();
23015 if (BitWidth > AndBitWidth) {
23016 KnownBits Known = DAG.computeKnownBits(Op0);
23017 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23018 return SDValue();
23019 }
23020 Src = Op1;
23021 BitNo = Op0.getOperand(1);
23022 }
23023 } else if (Op1.getOpcode() == ISD::Constant) {
23024 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23025 uint64_t AndRHSVal = AndRHS->getZExtValue();
23026 SDValue AndLHS = Op0;
23027
23028 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23029 Src = AndLHS.getOperand(0);
23030 BitNo = AndLHS.getOperand(1);
23031 } else {
23032 // Use BT if the immediate can't be encoded in a TEST instruction or we
23033 // are optimizing for size and the immediate won't fit in a byte.
23034 bool OptForSize = DAG.shouldOptForSize();
23035 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23036 isPowerOf2_64(AndRHSVal)) {
23037 Src = AndLHS;
23038 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23039 Src.getValueType());
23040 }
23041 }
23042 }
23043
23044 // No patterns found, give up.
23045 if (!Src.getNode())
23046 return SDValue();
23047
23048 // Remove any bit flip.
23049 if (isBitwiseNot(Src)) {
23050 Src = Src.getOperand(0);
23051 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23052 }
23053
23054 // Attempt to create the X86ISD::BT node.
23055 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23056 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23057 return BT;
23058 }
23059
23060 return SDValue();
23061}
23062
23063// Check if pre-AVX condcode can be performed by a single FCMP op.
23064static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23065 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23066}
23067
23068/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23069/// CMPs.
23070static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23071 SDValue &Op1, bool &IsAlwaysSignaling) {
23072 unsigned SSECC;
23073 bool Swap = false;
23074
23075 // SSE Condition code mapping:
23076 // 0 - EQ
23077 // 1 - LT
23078 // 2 - LE
23079 // 3 - UNORD
23080 // 4 - NEQ
23081 // 5 - NLT
23082 // 6 - NLE
23083 // 7 - ORD
23084 switch (SetCCOpcode) {
23085 // clang-format off
23086 default: llvm_unreachable("Unexpected SETCC condition");
23087 case ISD::SETOEQ:
23088 case ISD::SETEQ: SSECC = 0; break;
23089 case ISD::SETOGT:
23090 case ISD::SETGT: Swap = true; [[fallthrough]];
23091 case ISD::SETLT:
23092 case ISD::SETOLT: SSECC = 1; break;
23093 case ISD::SETOGE:
23094 case ISD::SETGE: Swap = true; [[fallthrough]];
23095 case ISD::SETLE:
23096 case ISD::SETOLE: SSECC = 2; break;
23097 case ISD::SETUO: SSECC = 3; break;
23098 case ISD::SETUNE:
23099 case ISD::SETNE: SSECC = 4; break;
23100 case ISD::SETULE: Swap = true; [[fallthrough]];
23101 case ISD::SETUGE: SSECC = 5; break;
23102 case ISD::SETULT: Swap = true; [[fallthrough]];
23103 case ISD::SETUGT: SSECC = 6; break;
23104 case ISD::SETO: SSECC = 7; break;
23105 case ISD::SETUEQ: SSECC = 8; break;
23106 case ISD::SETONE: SSECC = 12; break;
23107 // clang-format on
23108 }
23109 if (Swap)
23110 std::swap(Op0, Op1);
23111
23112 switch (SetCCOpcode) {
23113 default:
23114 IsAlwaysSignaling = true;
23115 break;
23116 case ISD::SETEQ:
23117 case ISD::SETOEQ:
23118 case ISD::SETUEQ:
23119 case ISD::SETNE:
23120 case ISD::SETONE:
23121 case ISD::SETUNE:
23122 case ISD::SETO:
23123 case ISD::SETUO:
23124 IsAlwaysSignaling = false;
23125 break;
23126 }
23127
23128 return SSECC;
23129}
23130
23131 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23132 /// concatenate the result back.
23133 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23134                               ISD::CondCode Cond, SelectionDAG &DAG,
23135                               const SDLoc &dl) {
23136 assert(VT.isInteger() && VT == LHS.getValueType() &&
23137 VT == RHS.getValueType() && "Unsupported VTs!");
23138
23139 SDValue CC = DAG.getCondCode(Cond);
23140
23141 // Extract the LHS Lo/Hi vectors
23142 SDValue LHS1, LHS2;
23143 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23144
23145 // Extract the RHS Lo/Hi vectors
23146 SDValue RHS1, RHS2;
23147 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23148
23149 // Issue the operation on the smaller types and concatenate the result back
23150 EVT LoVT, HiVT;
23151 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23152 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23153 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23154 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23155}
23156
23157 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23158                                      SelectionDAG &DAG) {
23159 SDValue Op0 = Op.getOperand(0);
23160 SDValue Op1 = Op.getOperand(1);
23161 SDValue CC = Op.getOperand(2);
23162 MVT VT = Op.getSimpleValueType();
23163 assert(VT.getVectorElementType() == MVT::i1 &&
23164 "Cannot set masked compare for this operation");
23165
23166 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23167
23168 // Prefer SETGT over SETLT.
23169 if (SetCCOpcode == ISD::SETLT) {
23170 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23171 std::swap(Op0, Op1);
23172 }
23173
23174 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23175}
23176
23177/// Given a buildvector constant, return a new vector constant with each element
23178/// incremented or decremented. If incrementing or decrementing would result in
23179/// unsigned overflow or underflow or this is not a simple vector constant,
23180/// return an empty value.
23181 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23182                                     bool NSW) {
23183 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23184 if (!BV || !V.getValueType().isSimple())
23185 return SDValue();
23186
23187 MVT VT = V.getSimpleValueType();
23188 MVT EltVT = VT.getVectorElementType();
23189 unsigned NumElts = VT.getVectorNumElements();
23190 SmallVector<SDValue, 8> NewVecC;
23191 SDLoc DL(V);
23192 for (unsigned i = 0; i < NumElts; ++i) {
23193 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23194 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23195 return SDValue();
23196
23197 // Avoid overflow/underflow.
23198 const APInt &EltC = Elt->getAPIntValue();
23199 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23200 return SDValue();
23201 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23202 (!IsInc && EltC.isMinSignedValue())))
23203 return SDValue();
23204
23205 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23206 }
23207
23208 return DAG.getBuildVector(VT, DL, NewVecC);
23209}
23210
23211/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23212/// Op0 u<= Op1:
23213/// t = psubus Op0, Op1
23214/// pcmpeq t, <0..0>
23215 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23216                                     ISD::CondCode Cond, const SDLoc &dl,
23217 const X86Subtarget &Subtarget,
23218 SelectionDAG &DAG) {
23219 if (!Subtarget.hasSSE2())
23220 return SDValue();
23221
23222 MVT VET = VT.getVectorElementType();
23223 if (VET != MVT::i8 && VET != MVT::i16)
23224 return SDValue();
23225
23226 switch (Cond) {
23227 default:
23228 return SDValue();
23229 case ISD::SETULT: {
23230 // If the comparison is against a constant we can turn this into a
23231 // setule. With psubus, setule does not require a swap. This is
23232 // beneficial because the constant in the register is no longer
23233 // clobbered as the destination, so it can be hoisted out of a loop.
23234 // Only do this pre-AVX since vpcmp* is no longer destructive.
23235 if (Subtarget.hasAVX())
23236 return SDValue();
23237 SDValue ULEOp1 =
23238 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23239 if (!ULEOp1)
23240 return SDValue();
23241 Op1 = ULEOp1;
23242 break;
23243 }
23244 case ISD::SETUGT: {
23245 // If the comparison is against a constant, we can turn this into a setuge.
23246 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23247 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23248 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23249 SDValue UGEOp1 =
23250 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23251 if (!UGEOp1)
23252 return SDValue();
23253 Op1 = Op0;
23254 Op0 = UGEOp1;
23255 break;
23256 }
23257 // Psubus is better than flip-sign because it requires no inversion.
23258 case ISD::SETUGE:
23259 std::swap(Op0, Op1);
23260 break;
23261 case ISD::SETULE:
23262 break;
23263 }
23264
23265 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23266 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23267 DAG.getConstant(0, dl, VT));
23268}
23269
23270static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23271 SelectionDAG &DAG) {
23272 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23273 Op.getOpcode() == ISD::STRICT_FSETCCS;
23274 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23275 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23276 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23277 MVT VT = Op->getSimpleValueType(0);
23278 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23279 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23280 SDLoc dl(Op);
23281
23282 if (isFP) {
23283 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23284 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23285 if (isSoftF16(EltVT, Subtarget))
23286 return SDValue();
23287
23288 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23289 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23290
23291 // If we have a strict compare with a vXi1 result and the input is 128/256
23292 // bits we can't use a masked compare unless we have VLX. If we use a wider
23293 // compare like we do for non-strict, we might trigger spurious exceptions
23294 // from the upper elements. Instead emit an AVX compare and convert to mask.
23295 unsigned Opc;
23296 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23297 (!IsStrict || Subtarget.hasVLX() ||
23298 Op0.getSimpleValueType().is512BitVector())) {
23299#ifndef NDEBUG
23300 unsigned Num = VT.getVectorNumElements();
23301 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23302#endif
23303 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23304 } else {
23305 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23306 // The SSE/AVX packed FP comparison nodes are defined with a
23307 // floating-point vector result that matches the operand type. This allows
23308 // them to work with an SSE1 target (integer vector types are not legal).
23309 VT = Op0.getSimpleValueType();
23310 }
23311
23312 SDValue Cmp;
23313 bool IsAlwaysSignaling;
23314 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23315 if (!Subtarget.hasAVX()) {
23316 // TODO: We could use the following steps to handle a quiet compare with
23317 // signaling encodings.
23318 // 1. Get ordered masks from a quiet ISD::SETO
23319 // 2. Use the masks to mask potential unordered elements in operands A, B
23320 // 3. Get the compare results of the masked A, B
23321 // 4. Calculate the final result using the mask and the result from 3
23322 // But currently, we just fall back to scalar operations.
23323 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23324 return SDValue();
23325
23326 // Insert an extra signaling instruction to raise exception.
23327 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23328 SDValue SignalCmp = DAG.getNode(
23329 Opc, dl, {VT, MVT::Other},
23330 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23331 // FIXME: It seems we need to update the flags of all new strict nodes.
23332 // Otherwise, mayRaiseFPException in MI will return false due to
23333 // NoFPExcept = false by default. However, I didn't find it in other
23334 // patches.
23335 SignalCmp->setFlags(Op->getFlags());
23336 Chain = SignalCmp.getValue(1);
23337 }
23338
23339 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23340 // emit two comparisons and a logic op to tie them together.
23341 if (!cheapX86FSETCC_SSE(Cond)) {
23342 // LLVM predicate is SETUEQ or SETONE.
23343 unsigned CC0, CC1;
23344 unsigned CombineOpc;
23345 if (Cond == ISD::SETUEQ) {
23346 CC0 = 3; // UNORD
23347 CC1 = 0; // EQ
23348 CombineOpc = X86ISD::FOR;
23349 } else {
23350 assert(Cond == ISD::SETONE);
23351 CC0 = 7; // ORD
23352 CC1 = 4; // NEQ
23353 CombineOpc = X86ISD::FAND;
23354 }
23355
23356 SDValue Cmp0, Cmp1;
23357 if (IsStrict) {
23358 Cmp0 = DAG.getNode(
23359 Opc, dl, {VT, MVT::Other},
23360 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23361 Cmp1 = DAG.getNode(
23362 Opc, dl, {VT, MVT::Other},
23363 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23364 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23365 Cmp1.getValue(1));
23366 } else {
23367 Cmp0 = DAG.getNode(
23368 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23369 Cmp1 = DAG.getNode(
23370 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23371 }
23372 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23373 } else {
23374 if (IsStrict) {
23375 Cmp = DAG.getNode(
23376 Opc, dl, {VT, MVT::Other},
23377 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23378 Chain = Cmp.getValue(1);
23379 } else
23380 Cmp = DAG.getNode(
23381 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23382 }
23383 } else {
23384 // Handle all other FP comparisons here.
23385 if (IsStrict) {
23386 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23387 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23388 Cmp = DAG.getNode(
23389 Opc, dl, {VT, MVT::Other},
23390 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23391 Chain = Cmp.getValue(1);
23392 } else
23393 Cmp = DAG.getNode(
23394 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23395 }
23396
23397 if (VT.getFixedSizeInBits() >
23398 Op.getSimpleValueType().getFixedSizeInBits()) {
23399 // We emitted a compare with an XMM/YMM result. Finish converting to a
23400 // mask register using a vptestm.
23401 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23402 Cmp = DAG.getBitcast(CastVT, Cmp);
23403 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23404 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23405 } else {
23406 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23407 // the result type of SETCC. The bitcast is expected to be optimized
23408 // away during combining/isel.
23409 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23410 }
23411
23412 if (IsStrict)
23413 return DAG.getMergeValues({Cmp, Chain}, dl);
23414
23415 return Cmp;
23416 }
23417
23418 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23419
23420 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
23421 assert(VTOp0 == Op1.getSimpleValueType() &&
23422 "Expected operands with same type!");
23424 "Invalid number of packed elements for source and destination!");
23425
23426 // The non-AVX512 code below works under the assumption that source and
23427 // destination types are the same.
23428 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23429 "Value types for source and destination must be the same!");
23430
23431 // The result is boolean, but operands are int/float
23432 if (VT.getVectorElementType() == MVT::i1) {
23433 // In the AVX-512 architecture setcc returns a mask with i1 elements,
23434 // but there is no compare instruction for i8 and i16 elements in KNL.
23435 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23436 "Unexpected operand type");
23437 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23438 }
23439
23440 // Lower using XOP integer comparisons.
23441 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23442 // Translate compare code to XOP PCOM compare mode.
23443 unsigned CmpMode = 0;
23444 switch (Cond) {
23445 // clang-format off
23446 default: llvm_unreachable("Unexpected SETCC condition");
23447 case ISD::SETULT:
23448 case ISD::SETLT: CmpMode = 0x00; break;
23449 case ISD::SETULE:
23450 case ISD::SETLE: CmpMode = 0x01; break;
23451 case ISD::SETUGT:
23452 case ISD::SETGT: CmpMode = 0x02; break;
23453 case ISD::SETUGE:
23454 case ISD::SETGE: CmpMode = 0x03; break;
23455 case ISD::SETEQ: CmpMode = 0x04; break;
23456 case ISD::SETNE: CmpMode = 0x05; break;
23457 // clang-format on
23458 }
23459
23460 // Are we comparing unsigned or signed integers?
23461 unsigned Opc =
23462     ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23463
23464 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23465 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23466 }
23467
23468 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23469 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23470 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23471 SDValue BC0 = peekThroughBitcasts(Op0);
23472 if (BC0.getOpcode() == ISD::AND) {
23473 APInt UndefElts;
23474 SmallVector<APInt, 64> EltBits;
23475 if (getTargetConstantBitsFromNode(
23476         BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23477 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23478 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23479 Cond = ISD::SETEQ;
23480 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23481 }
23482 }
23483 }
23484 }
23485
23486 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23487 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23488 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23489 auto *C1 = isConstOrConstSplat(Op1);
23490 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23491 unsigned BitWidth = VT.getScalarSizeInBits();
23492 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23493
23494 SDValue Result = Op0.getOperand(0);
23495 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23496 DAG.getConstant(ShiftAmt, dl, VT));
23497 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23498 DAG.getConstant(BitWidth - 1, dl, VT));
23499 return Result;
23500 }
23501 }
23502
23503 // Break 256-bit integer vector compare into smaller ones.
23504 if (VT.is256BitVector() && !Subtarget.hasInt256())
23505 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23506
23507 // Break 512-bit integer vector compare into smaller ones.
23508 // TODO: Try harder to use VPCMPx + VPMOV2x?
23509 if (VT.is512BitVector())
23510 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23511
23512 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23513 // not-of-PCMPEQ:
23514 // X != INT_MIN --> X >s INT_MIN
23515 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23516 // +X != 0 --> +X >s 0
23517 APInt ConstValue;
23518 if (Cond == ISD::SETNE &&
23519 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23520 if (ConstValue.isMinSignedValue())
23521 Cond = ISD::SETGT;
23522 else if (ConstValue.isMaxSignedValue())
23523 Cond = ISD::SETLT;
23524 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23525 Cond = ISD::SETGT;
23526 }
23527
23528 // If both operands are known non-negative, then an unsigned compare is the
23529 // same as a signed compare and there's no need to flip signbits.
23530 // TODO: We could check for more general simplifications here since we're
23531 // computing known bits.
23532 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23533 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23534
23535 // Special case: Use min/max operations for unsigned compares.
23536 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23537 if (ISD::isUnsignedIntSetCC(Cond) &&
23538     (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23539 TLI.isOperationLegal(ISD::UMIN, VT)) {
23540 // If we have a constant operand, increment/decrement it and change the
23541 // condition to avoid an invert.
23542 if (Cond == ISD::SETUGT) {
23543 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23544 if (SDValue UGTOp1 =
23545 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23546 Op1 = UGTOp1;
23547 Cond = ISD::SETUGE;
23548 }
23549 }
23550 if (Cond == ISD::SETULT) {
23551 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23552 if (SDValue ULTOp1 =
23553 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23554 Op1 = ULTOp1;
23555 Cond = ISD::SETULE;
23556 }
23557 }
23558 bool Invert = false;
23559 unsigned Opc;
23560 switch (Cond) {
23561 // clang-format off
23562 default: llvm_unreachable("Unexpected condition code");
23563 case ISD::SETUGT: Invert = true; [[fallthrough]];
23564 case ISD::SETULE: Opc = ISD::UMIN; break;
23565 case ISD::SETULT: Invert = true; [[fallthrough]];
23566 case ISD::SETUGE: Opc = ISD::UMAX; break;
23567 // clang-format on
23568 }
23569
23570 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23571 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23572
23573 // If the logical-not of the result is required, perform that now.
23574 if (Invert)
23575 Result = DAG.getNOT(dl, Result, VT);
23576
23577 return Result;
23578 }
23579
23580 // Try to use SUBUS and PCMPEQ.
23581 if (FlipSigns)
23582 if (SDValue V =
23583 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23584 return V;
23585
23586 // We are handling one of the integer comparisons here. Since SSE only has
23587 // GT and EQ comparisons for integer, swapping operands and multiple
23588 // operations may be required for some comparisons.
23589 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23590                                                           : X86ISD::PCMPGT;
23591 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23592             Cond == ISD::SETGE || Cond == ISD::SETUGE;
23593 bool Invert = Cond == ISD::SETNE ||
23594               (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23595
23596 if (Swap)
23597 std::swap(Op0, Op1);
23598
23599 // Check that the operation in question is available (most are plain SSE2,
23600 // but PCMPGTQ and PCMPEQQ have different requirements).
23601 if (VT == MVT::v2i64) {
23602 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23603 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23604
23605 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23606 // the odd elements over the even elements.
23607 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23608 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23609 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23610
23611 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23612 static const int MaskHi[] = { 1, 1, 3, 3 };
23613 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23614
23615 return DAG.getBitcast(VT, Result);
23616 }
23617
23618 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23619 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23620 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23621
23622 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23623 static const int MaskHi[] = { 1, 1, 3, 3 };
23624 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23625
23626 return DAG.getBitcast(VT, Result);
23627 }
23628
23629 // If the i64 elements are sign-extended enough to be representable as i32
23630 // then we can compare the lower i32 bits and splat.
23631 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23632 DAG.ComputeNumSignBits(Op1) > 32) {
23633 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23634 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23635
23636 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23637 static const int MaskLo[] = {0, 0, 2, 2};
23638 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23639
23640 return DAG.getBitcast(VT, Result);
23641 }
23642
23643 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23644 // bits of the inputs before performing those operations. The lower
23645 // compare is always unsigned.
23646 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23647 : 0x0000000080000000ULL,
23648 dl, MVT::v2i64);
23649
23650 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23651 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23652
23653 // Cast everything to the right type.
23654 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23655 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23656
23657 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23658 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23659 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23660
23661 // Create masks for only the low parts/high parts of the 64 bit integers.
23662 static const int MaskHi[] = { 1, 1, 3, 3 };
23663 static const int MaskLo[] = { 0, 0, 2, 2 };
23664 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23665 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23666 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23667
23668 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23669 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23670
23671 if (Invert)
23672 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23673
23674 return DAG.getBitcast(VT, Result);
23675 }
23676
23677 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23678 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23679 // pcmpeqd + pshufd + pand.
23680 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23681
23682 // First cast everything to the right type.
23683 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23684 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23685
23686 // Do the compare.
23687 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23688
23689 // Make sure the lower and upper halves are both all-ones.
23690 static const int Mask[] = { 1, 0, 3, 2 };
23691 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23692 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23693
23694 if (Invert)
23695 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23696
23697 return DAG.getBitcast(VT, Result);
23698 }
23699 }
23700
23701 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23702 // bits of the inputs before performing those operations.
23703 if (FlipSigns) {
23704 MVT EltVT = VT.getVectorElementType();
23705 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23706                              VT);
23707 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23708 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23709 }
23710
23711 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23712
23713 // If the logical-not of the result is required, perform that now.
23714 if (Invert)
23715 Result = DAG.getNOT(dl, Result, VT);
23716
23717 return Result;
23718}
23719
23720// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23721 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23722                               const SDLoc &dl, SelectionDAG &DAG,
23723 const X86Subtarget &Subtarget,
23724 SDValue &X86CC) {
23725 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23726
23727 // Must be a bitcast from vXi1.
23728 if (Op0.getOpcode() != ISD::BITCAST)
23729 return SDValue();
23730
23731 Op0 = Op0.getOperand(0);
23732 MVT VT = Op0.getSimpleValueType();
23733 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23734 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23735 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23736 return SDValue();
23737
23738 X86::CondCode X86Cond;
23739 if (isNullConstant(Op1)) {
23740 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23741 } else if (isAllOnesConstant(Op1)) {
23742 // C flag is set for all ones.
23743 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23744 } else
23745 return SDValue();
23746
23747 // If the input is an AND, we can combine its operands into the KTEST.
23748 bool KTestable = false;
23749 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23750 KTestable = true;
23751 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23752 KTestable = true;
23753 if (!isNullConstant(Op1))
23754 KTestable = false;
23755 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23756 SDValue LHS = Op0.getOperand(0);
23757 SDValue RHS = Op0.getOperand(1);
23758 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23759 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23760 }
23761
23762 // If the input is an OR, we can combine its operands into the KORTEST.
23763 SDValue LHS = Op0;
23764 SDValue RHS = Op0;
23765 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23766 LHS = Op0.getOperand(0);
23767 RHS = Op0.getOperand(1);
23768 }
23769
23770 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23771 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23772}
23773
23774/// Emit flags for the given setcc condition and operands. Also returns the
23775/// corresponding X86 condition code constant in X86CC.
23776SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23777 ISD::CondCode CC, const SDLoc &dl,
23778 SelectionDAG &DAG,
23779 SDValue &X86CC) const {
23780 // Equality Combines.
23781 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23782 X86::CondCode X86CondCode;
23783
23784 // Optimize to BT if possible.
23785 // Lower (X & (1 << N)) == 0 to BT(X, N).
23786 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23787 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23788 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23789 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23790 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23791 return BT;
23792 }
23793 }
23794
23795 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23796 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23797 X86CondCode)) {
23798 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23799 return CmpZ;
23800 }
23801
23802 // Try to lower using KORTEST or KTEST.
23803 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23804 return Test;
23805
23806 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23807 // of these.
23808 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23809 // If the input is a setcc, then reuse the input setcc or use a new one
23810 // with the inverted condition.
23811 if (Op0.getOpcode() == X86ISD::SETCC) {
23812 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23813
23814 X86CC = Op0.getOperand(0);
23815 if (Invert) {
23816 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23817 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23818 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23819 }
23820
23821 return Op0.getOperand(1);
23822 }
23823 }
23824
23825 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
23826 // overflow.
23827 if (isMinSignedConstant(Op1)) {
23828 EVT VT = Op0.getValueType();
23829 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
23830 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
23831 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
23832 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23833 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
23834 DAG.getConstant(0, dl, VT), Op0);
23835 return SDValue(Neg.getNode(), 1);
23836 }
23837 }
23838
23839 // Try to use the carry flag from the add in place of a separate CMP for:
23840 // (seteq (add X, -1), -1). Similar for setne.
23841 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23842 Op0.getOperand(1) == Op1) {
23843 if (isProfitableToUseFlagOp(Op0)) {
23844 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23845
23846 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23847 Op0.getOperand(1));
23848 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23849 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23850 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23851 return SDValue(New.getNode(), 1);
23852 }
23853 }
23854 }
23855
23856 X86::CondCode CondCode =
23857     TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23858 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23859
23860 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23861 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23862 return EFLAGS;
23863}
23864
23865SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23866
23867 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23868 Op.getOpcode() == ISD::STRICT_FSETCCS;
23869 MVT VT = Op->getSimpleValueType(0);
23870
23871 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23872
23873 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23874 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23875 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23876 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23877 SDLoc dl(Op);
23878 ISD::CondCode CC =
23879     cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23880
23881 if (isSoftF16(Op0.getValueType(), Subtarget))
23882 return SDValue();
23883
23884 // Handle f128 first, since one possible outcome is a normal integer
23885 // comparison which gets handled by emitFlagsForSetcc.
23886 if (Op0.getValueType() == MVT::f128) {
23887 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23888 Op.getOpcode() == ISD::STRICT_FSETCCS);
23889
23890 // If softenSetCCOperands returned a scalar, use it.
23891 if (!Op1.getNode()) {
23892 assert(Op0.getValueType() == Op.getValueType() &&
23893 "Unexpected setcc expansion!");
23894 if (IsStrict)
23895 return DAG.getMergeValues({Op0, Chain}, dl);
23896 return Op0;
23897 }
23898 }
23899
23900 if (Op0.getSimpleValueType().isInteger()) {
23901 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23902 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
23903 // this may translate to fewer uops depending on uarch implementation. The
23904 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23905 // canonicalize to that CondCode.
23906 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23907 // encoding size - so it must either already be an i8 or i32 immediate, or it
23908 // shrinks down to that. We don't do this for any i64's to avoid additional
23909 // constant materializations.
23910 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23911 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23912 const APInt &Op1Val = Op1C->getAPIntValue();
23913 if (!Op1Val.isZero()) {
23914 // Ensure the constant+1 doesn't overflow.
23915 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23916 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23917 APInt Op1ValPlusOne = Op1Val + 1;
23918 if (Op1ValPlusOne.isSignedIntN(32) &&
23919 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23920 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23921           CC = (CC == ISD::CondCode::SETGT) ? ISD::CondCode::SETGE
23922                                             : ISD::CondCode::SETUGE;
23923         }
23924 }
23925 }
23926 }
23927
23928 SDValue X86CC;
23929 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23930 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23931 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23932 }
23933
23934 // Handle floating point.
23935 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23936 if (CondCode == X86::COND_INVALID)
23937 return SDValue();
23938
23939 SDValue EFLAGS;
23940 if (IsStrict) {
23941 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23942 EFLAGS =
23943     DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23944                 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23945 Chain = EFLAGS.getValue(1);
23946 } else {
23947 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23948 }
23949
23950 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23951 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23952 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23953}
23954
23955SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23956 SDValue LHS = Op.getOperand(0);
23957 SDValue RHS = Op.getOperand(1);
23958 SDValue Carry = Op.getOperand(2);
23959 SDValue Cond = Op.getOperand(3);
23960 SDLoc DL(Op);
23961
23962 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23963 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23964
23965 // Recreate the carry if needed.
23966 EVT CarryVT = Carry.getValueType();
23967 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23968 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23969
23970 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23971 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23972 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23973}
23974
23975// This function returns three things: the arithmetic computation itself
23976// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23977// flag and the condition code define the case in which the arithmetic
23978// computation overflows.
23979static std::pair<SDValue, SDValue>
23980 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23981 assert(Op.getResNo() == 0 && "Unexpected result number!");
23982 SDValue Value, Overflow;
23983 SDValue LHS = Op.getOperand(0);
23984 SDValue RHS = Op.getOperand(1);
23985 unsigned BaseOp = 0;
23986 SDLoc DL(Op);
23987 switch (Op.getOpcode()) {
23988 default: llvm_unreachable("Unknown ovf instruction!");
23989 case ISD::SADDO:
23990 BaseOp = X86ISD::ADD;
23991 Cond = X86::COND_O;
23992 break;
23993 case ISD::UADDO:
23994 BaseOp = X86ISD::ADD;
23995 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23996 break;
23997 case ISD::SSUBO:
23998 BaseOp = X86ISD::SUB;
23999 Cond = X86::COND_O;
24000 break;
24001 case ISD::USUBO:
24002 BaseOp = X86ISD::SUB;
24003 Cond = X86::COND_B;
24004 break;
24005 case ISD::SMULO:
24006 BaseOp = X86ISD::SMUL;
24007 Cond = X86::COND_O;
24008 break;
24009 case ISD::UMULO:
24010 BaseOp = X86ISD::UMUL;
24011 Cond = X86::COND_O;
24012 break;
24013 }
24014
24015 if (BaseOp) {
24016 // Also sets EFLAGS.
24017 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24018 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24019 Overflow = Value.getValue(1);
24020 }
24021
24022 return std::make_pair(Value, Overflow);
24023}
24024
24025 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24026 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24027 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24028 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24029 // has only one use.
24030 SDLoc DL(Op);
24031 X86::CondCode Cond;
24032 SDValue Value, Overflow;
24033 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24034
24035 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24036 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24037 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24038}
24039
24040/// Return true if opcode is a X86 logical comparison.
24041 static bool isX86LogicalCmp(SDValue Op) {
24042 unsigned Opc = Op.getOpcode();
24043 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24044 Opc == X86ISD::FCMP)
24045 return true;
24046 if (Op.getResNo() == 1 &&
24047 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24048 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24049 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24050 return true;
24051
24052 return false;
24053}
24054
24055 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24056 if (V.getOpcode() != ISD::TRUNCATE)
24057 return false;
24058
24059 SDValue VOp0 = V.getOperand(0);
24060 unsigned InBits = VOp0.getValueSizeInBits();
24061 unsigned Bits = V.getValueSizeInBits();
24062 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24063}
24064
24065SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24066 bool AddTest = true;
24067 SDValue Cond = Op.getOperand(0);
24068 SDValue Op1 = Op.getOperand(1);
24069 SDValue Op2 = Op.getOperand(2);
24070 SDLoc DL(Op);
24071 MVT VT = Op1.getSimpleValueType();
24072 SDValue CC;
24073
24074 if (isSoftF16(VT, Subtarget)) {
24075 MVT NVT = VT.changeTypeToInteger();
24076 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24077 DAG.getBitcast(NVT, Op1),
24078 DAG.getBitcast(NVT, Op2)));
24079 }
24080
24081 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24082 // are available or VBLENDV if AVX is available.
24083 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24084 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24085 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24086 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24087 bool IsAlwaysSignaling;
24088 unsigned SSECC =
24089 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24090 CondOp0, CondOp1, IsAlwaysSignaling);
24091
24092 if (Subtarget.hasAVX512()) {
24093 SDValue Cmp =
24094 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24095 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24096 assert(!VT.isVector() && "Not a scalar type?");
24097 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24098 }
24099
24100 if (SSECC < 8 || Subtarget.hasAVX()) {
24101 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24102 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24103
24104 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24105 // of 3 logic instructions for size savings and potentially speed.
24106 // Unfortunately, there is no scalar form of VBLENDV.
24107
24108 // If either operand is a +0.0 constant, don't try this. We can expect to
24109 // optimize away at least one of the logic instructions later in that
24110 // case, so that sequence would be faster than a variable blend.
24111
24112 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24113 // uses XMM0 as the selection register. That may need just as many
24114 // instructions as the AND/ANDN/OR sequence due to register moves, so
24115 // don't bother.
24116 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24117 !isNullFPConstant(Op2)) {
24118 // Convert to vectors, do a VSELECT, and convert back to scalar.
24119 // All of the conversions should be optimized away.
24120 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24121 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24122 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24123 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24124
24125 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24126 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24127
24128 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24129
24130 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24131 VSel, DAG.getIntPtrConstant(0, DL));
24132 }
24133 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24134 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24135 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24136 }
24137 }
24138
24139 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24140 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24141 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24142 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24143 }
24144
24145 if (Cond.getOpcode() == ISD::SETCC &&
24146 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24147 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24148 Cond = NewCond;
24149 // If the condition was updated, it's possible that the operands of the
24150 // select were also updated (for example, EmitTest has a RAUW). Refresh
24151 // the local references to the select operands in case they got stale.
24152 Op1 = Op.getOperand(1);
24153 Op2 = Op.getOperand(2);
24154 }
24155 }
24156
24157 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24158 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24159 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24160 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24161 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24162 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24163 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24164 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24165 if (Cond.getOpcode() == X86ISD::SETCC &&
24166 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24167 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24168 SDValue Cmp = Cond.getOperand(1);
24169 SDValue CmpOp0 = Cmp.getOperand(0);
24170 unsigned CondCode = Cond.getConstantOperandVal(0);
24171
24172 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24173 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24174 // handling to keep the CMP with 0. This should be removed by
24175 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24176 // cttz_zero_undef.
24177 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24178 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24179 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24180 };
24181 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24182 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24183 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24184 // Keep Cmp.
24185 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24186 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24187 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24188 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24189
24190 // 'X - 1' sets the carry flag if X == 0.
24191 // '0 - X' sets the carry flag if X != 0.
24192 // Convert the carry flag to a -1/0 mask with sbb:
24193 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24194 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24195 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24196 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24197 SDValue Sub;
24198 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24199 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24200 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24201 } else {
24202 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24203 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24204 }
24205 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24206                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24207 Sub.getValue(1));
24208 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24209 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24210 CmpOp0.getOpcode() == ISD::AND &&
24211 isOneConstant(CmpOp0.getOperand(1))) {
24212 SDValue Src1, Src2;
24213 // Returns true if Op2 is an XOR or OR operation and one of its
24214 // operands is equal to Op1:
24215 //   (a, a op b) || (b, a op b)
24216 auto isOrXorPattern = [&]() {
24217 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24218 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24219 Src1 =
24220 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24221 Src2 = Op1;
24222 return true;
24223 }
24224 return false;
24225 };
24226
24227 if (isOrXorPattern()) {
24228 SDValue Neg;
24229 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24230 // We need a mask of all zeros or all ones with the same size as the
24231 // other operands.
24232 if (CmpSz > VT.getSizeInBits())
24233 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24234 else if (CmpSz < VT.getSizeInBits())
24235 Neg = DAG.getNode(ISD::AND, DL, VT,
24236 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24237 DAG.getConstant(1, DL, VT));
24238 else
24239 Neg = CmpOp0;
24240 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24241 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24242 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24243 }
24244 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24245 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24246 ((CondCode == X86::COND_S) || // smin(x, 0)
24247 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24248 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24249 //
24250 // If the comparison is testing for a positive value, we have to invert
24251 // the sign bit mask, so only do that transform if the target has a
24252 // bitwise 'and not' instruction (the invert is free).
24253 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24254 unsigned ShCt = VT.getSizeInBits() - 1;
24255 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24256 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24257 if (CondCode == X86::COND_G)
24258 Shift = DAG.getNOT(DL, Shift, VT);
24259 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24260 }
24261 }
24262
24263 // Look past (and (setcc_carry (cmp ...)), 1).
24264 if (Cond.getOpcode() == ISD::AND &&
24265 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24266 isOneConstant(Cond.getOperand(1)))
24267 Cond = Cond.getOperand(0);
24268
24269 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24270 // setting operand in place of the X86ISD::SETCC.
24271 unsigned CondOpcode = Cond.getOpcode();
24272 if (CondOpcode == X86ISD::SETCC ||
24273 CondOpcode == X86ISD::SETCC_CARRY) {
24274 CC = Cond.getOperand(0);
24275
24276 SDValue Cmp = Cond.getOperand(1);
24277 bool IllegalFPCMov = false;
24278 if (VT.isFloatingPoint() && !VT.isVector() &&
24279 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24280 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24281
24282 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24283 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24284 Cond = Cmp;
24285 AddTest = false;
24286 }
24287 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24288 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24289 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24290 SDValue Value;
24291 X86::CondCode X86Cond;
24292 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24293
24294 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24295 AddTest = false;
24296 }
24297
24298 if (AddTest) {
24299 // Look past the truncate if the high bits are known zero.
24300 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24301   Cond = Cond.getOperand(0);
24302
24303 // We know the result of AND is compared against zero. Try to match
24304 // it to BT.
24305 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24306 X86::CondCode X86CondCode;
24307 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24308 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24309 Cond = BT;
24310 AddTest = false;
24311 }
24312 }
24313 }
24314
24315 if (AddTest) {
24316 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24317 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24318 }
24319
24320 // a < b ? -1 : 0 -> RES = ~setcc_carry
24321 // a < b ? 0 : -1 -> RES = setcc_carry
24322 // a >= b ? -1 : 0 -> RES = setcc_carry
24323 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24324 if (Cond.getOpcode() == X86ISD::SUB) {
24325 unsigned CondCode = CC->getAsZExtVal();
24326
24327 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24328 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24329 (isNullConstant(Op1) || isNullConstant(Op2))) {
24330 SDValue Res =
24331 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24332 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24333 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24334 return DAG.getNOT(DL, Res, Res.getValueType());
24335 return Res;
24336 }
24337 }
24338
24339 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24340 // widen the cmov and push the truncate through. This avoids introducing a new
24341 // branch during isel and doesn't add any extensions.
24342 if (Op.getValueType() == MVT::i8 &&
24343 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24344 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24345 if (T1.getValueType() == T2.getValueType() &&
24346 // Exclude CopyFromReg to avoid partial register stalls.
24347 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24348 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24349 CC, Cond);
24350 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24351 }
24352 }
24353
24354 // Or finally, promote i8 cmovs if we have CMOV,
24355 // or i16 cmovs if it won't prevent folding a load.
24356 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24357 // legal, but EmitLoweredSelect() can not deal with these extensions
24358 // being inserted between two CMOV's. (in i16 case too TBN)
24359 // https://bugs.llvm.org/show_bug.cgi?id=40974
24360 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24361 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24362 !X86::mayFoldLoad(Op2, Subtarget))) {
24363 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24364 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24365 SDValue Ops[] = { Op2, Op1, CC, Cond };
24366 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24367 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24368 }
24369
24370 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24371 // condition is true.
24372 SDValue Ops[] = { Op2, Op1, CC, Cond };
24373 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24374}
24375
24376 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
24377                                      const X86Subtarget &Subtarget,
24378 SelectionDAG &DAG) {
24379 MVT VT = Op->getSimpleValueType(0);
24380 SDValue In = Op->getOperand(0);
24381 MVT InVT = In.getSimpleValueType();
24382 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24383 MVT VTElt = VT.getVectorElementType();
24384 unsigned NumElts = VT.getVectorNumElements();
24385
24386 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24387 MVT ExtVT = VT;
24388 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24389 // If v16i32 is to be avoided, we'll need to split and concatenate.
24390 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24391 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24392
24393 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24394 }
24395
24396 // Widen to 512-bits if VLX is not supported.
24397 MVT WideVT = ExtVT;
24398 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24399 NumElts *= 512 / ExtVT.getSizeInBits();
24400 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24401 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24402 In, DAG.getIntPtrConstant(0, dl));
24403 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24404 }
24405
24406 SDValue V;
24407 MVT WideEltVT = WideVT.getVectorElementType();
24408 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24409 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24410 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24411 } else {
24412 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24413 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24414 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24415 }
24416
24417 // Truncate if we had to extend i16/i8 above.
24418 if (VT != ExtVT) {
24419 WideVT = MVT::getVectorVT(VTElt, NumElts);
24420 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24421 }
24422
24423 // Extract back to 128/256-bit if we widened.
24424 if (WideVT != VT)
24425 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24426 DAG.getIntPtrConstant(0, dl));
24427
24428 return V;
24429}
24430
24431 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24432                                SelectionDAG &DAG) {
24433 SDValue In = Op->getOperand(0);
24434 MVT InVT = In.getSimpleValueType();
24435 SDLoc DL(Op);
24436
24437 if (InVT.getVectorElementType() == MVT::i1)
24438 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
24439
24440 assert(Subtarget.hasAVX() && "Expected AVX support");
24441 return LowerAVXExtend(Op, DL, DAG, Subtarget);
24442}
24443
24444// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24445// For sign extend this needs to handle all vector sizes and SSE4.1 and
24446// non-SSE4.1 targets. For zero extend this should only handle inputs of
24447// MVT::v64i8 when BWI is not supported, but AVX512 is.
24448 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24449                                         const X86Subtarget &Subtarget,
24450 SelectionDAG &DAG) {
24451 SDValue In = Op->getOperand(0);
24452 MVT VT = Op->getSimpleValueType(0);
24453 MVT InVT = In.getSimpleValueType();
24454
24455 MVT SVT = VT.getVectorElementType();
24456 MVT InSVT = InVT.getVectorElementType();
24457 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24458
24459 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24460 return SDValue();
24461 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24462 return SDValue();
24463 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24464 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24465 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24466 return SDValue();
24467
24468 SDLoc dl(Op);
24469 unsigned Opc = Op.getOpcode();
24470 unsigned NumElts = VT.getVectorNumElements();
24471
24472 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24473 // For 512-bit vectors, we need 128-bits or 256-bits.
24474 if (InVT.getSizeInBits() > 128) {
24475 // Input needs to be at least the same number of elements as output, and
24476 // at least 128-bits.
24477 int InSize = InSVT.getSizeInBits() * NumElts;
24478 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24479 InVT = In.getSimpleValueType();
24480 }
24481
24482 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24483 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24484 // need to be handled here for 256/512-bit results.
24485 if (Subtarget.hasInt256()) {
24486 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24487
24488 if (InVT.getVectorNumElements() != NumElts)
24489 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24490
24491 // FIXME: Apparently we create inreg operations that could be regular
24492 // extends.
24493 unsigned ExtOpc =
24494 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24495 : ISD::ZERO_EXTEND;
24496 return DAG.getNode(ExtOpc, dl, VT, In);
24497 }
24498
24499 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24500 if (Subtarget.hasAVX()) {
24501 assert(VT.is256BitVector() && "256-bit vector expected");
24502 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24503 int HalfNumElts = HalfVT.getVectorNumElements();
24504
24505 unsigned NumSrcElts = InVT.getVectorNumElements();
24506 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24507 for (int i = 0; i != HalfNumElts; ++i)
24508 HiMask[i] = HalfNumElts + i;
24509
24510 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24511 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24512 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24513 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24514 }
24515
24516 // We should only get here for sign extend.
24517 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24518 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24519 unsigned InNumElts = InVT.getVectorNumElements();
24520
24521 // If the source elements are already all-signbits, we don't need to extend,
24522 // just splat the elements.
24523 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24524 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24525 unsigned Scale = InNumElts / NumElts;
24526 SmallVector<int, 16> ShuffleMask;
24527 for (unsigned I = 0; I != NumElts; ++I)
24528 ShuffleMask.append(Scale, I);
24529 return DAG.getBitcast(VT,
24530 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24531 }
24532
24533 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24534 SDValue Curr = In;
24535 SDValue SignExt = Curr;
24536
24537 // As SRAI is only available on i16/i32 types, we expand only up to i32
24538 // and handle i64 separately.
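// For example, for (v4i32 sign_extend_vector_inreg (v16i8 X)) the mask built
// below places byte i at position i*4+3, i.e. in the MSB of each 32-bit
// lane, so the following psrad by 24 produces the sign-extended values.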
24539 if (InVT != MVT::v4i32) {
24540 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24541
24542 unsigned DestWidth = DestVT.getScalarSizeInBits();
24543 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24544 unsigned DestElts = DestVT.getVectorNumElements();
24545
24546 // Build a shuffle mask that takes each input element and places it in the
24547 // MSBs of the new element size.
24548 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24549 for (unsigned i = 0; i != DestElts; ++i)
24550 Mask[i * Scale + (Scale - 1)] = i;
24551
24552 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24553 Curr = DAG.getBitcast(DestVT, Curr);
24554
24555 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24556 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24557 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24558 }
24559
24560 if (VT == MVT::v2i64) {
24561 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24562 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24563 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24564 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24565 SignExt = DAG.getBitcast(VT, SignExt);
24566 }
24567
24568 return SignExt;
24569}
24570
24571static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24572 SelectionDAG &DAG) {
24573 MVT VT = Op->getSimpleValueType(0);
24574 SDValue In = Op->getOperand(0);
24575 MVT InVT = In.getSimpleValueType();
24576 SDLoc dl(Op);
24577
24578 if (InVT.getVectorElementType() == MVT::i1)
24579 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
24580
24581 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24583 "Expected same number of elements");
24584 assert((VT.getVectorElementType() == MVT::i16 ||
24585 VT.getVectorElementType() == MVT::i32 ||
24586 VT.getVectorElementType() == MVT::i64) &&
24587 "Unexpected element type");
24588 assert((InVT.getVectorElementType() == MVT::i8 ||
24589 InVT.getVectorElementType() == MVT::i16 ||
24590 InVT.getVectorElementType() == MVT::i32) &&
24591 "Unexpected element type");
24592
24593 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24594 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24595 return splitVectorIntUnary(Op, DAG, dl);
24596 }
24597
24598 if (Subtarget.hasInt256())
24599 return Op;
24600
24601 // Optimize vectors in AVX mode
24602 // Sign extend v8i16 to v8i32 and
24603 // v4i32 to v4i64
24604 //
24605 // Divide input vector into two parts
24606 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24607 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24608 // concat the vectors to original VT
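// e.g. v8i16 -> v8i32 on AVX1: vpmovsxwd extends the low 4 elements, a
// shuffle moves elements 4..7 into the low half so a second vpmovsxwd can
// extend them, and vinsertf128 concatenates the two v4i32 halves.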
24609 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24610 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24611
24612 unsigned NumElems = InVT.getVectorNumElements();
24613 SmallVector<int,8> ShufMask(NumElems, -1);
24614 for (unsigned i = 0; i != NumElems/2; ++i)
24615 ShufMask[i] = i + NumElems/2;
24616
24617 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24618 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24619
24620 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24621}
24622
24623/// Change a vector store into a pair of half-size vector stores.
24624static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24625 SDValue StoredVal = Store->getValue();
24626 assert((StoredVal.getValueType().is256BitVector() ||
24627 StoredVal.getValueType().is512BitVector()) &&
24628 "Expecting 256/512-bit op");
24629
24630 // Splitting volatile memory ops is not allowed unless the operation was not
24631 // legal to begin with. Assume the input store is legal (this transform is
24632 // only used for targets with AVX). Note: It is possible that we have an
24633 // illegal type like v2i128, and so we could allow splitting a volatile store
24634 // in that case if that is important.
24635 if (!Store->isSimple())
24636 return SDValue();
24637
24638 SDLoc DL(Store);
24639 SDValue Value0, Value1;
24640 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24641 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24642 SDValue Ptr0 = Store->getBasePtr();
24643 SDValue Ptr1 =
24644 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24645 SDValue Ch0 =
24646 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24647 Store->getOriginalAlign(),
24648 Store->getMemOperand()->getFlags());
24649 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24650 Store->getPointerInfo().getWithOffset(HalfOffset),
24651 Store->getOriginalAlign(),
24652 Store->getMemOperand()->getFlags());
24653 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24654}
24655
24656/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24657/// type.
24658static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24659 SelectionDAG &DAG) {
24660 SDValue StoredVal = Store->getValue();
24661 assert(StoreVT.is128BitVector() &&
24662 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24663 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24664
24665 // Splitting volatile memory ops is not allowed unless the operation was not
24666 // legal to begin with. We are assuming the input op is legal (this transform
24667 // is only used for targets with AVX).
24668 if (!Store->isSimple())
24669 return SDValue();
24670
24671 MVT StoreSVT = StoreVT.getScalarType();
24672 unsigned NumElems = StoreVT.getVectorNumElements();
24673 unsigned ScalarSize = StoreSVT.getStoreSize();
24674
24675 SDLoc DL(Store);
24676 SmallVector<SDValue, 4> Stores;
24677 for (unsigned i = 0; i != NumElems; ++i) {
24678 unsigned Offset = i * ScalarSize;
24679 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24680 TypeSize::getFixed(Offset), DL);
24681 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24682 DAG.getIntPtrConstant(i, DL));
24683 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24684 Store->getPointerInfo().getWithOffset(Offset),
24685 Store->getOriginalAlign(),
24686 Store->getMemOperand()->getFlags());
24687 Stores.push_back(Ch);
24688 }
24689 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24690}
24691
24692static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24693 SelectionDAG &DAG) {
24694 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24695 SDLoc dl(St);
24696 SDValue StoredVal = St->getValue();
24697
24698 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24699 if (StoredVal.getValueType().isVector() &&
24700 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24701 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24702 assert(NumElts <= 8 && "Unexpected VT");
24703 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24704 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24705 "Expected AVX512F without AVX512DQI");
24706
24707 // We must pad with zeros to ensure we store zeroes to any unused bits.
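// e.g. a v4i1 store is widened to v16i1, bitcast to i16 and truncated to i8;
// the zero-extend-in-reg below then clears bits 4..7 so the in-memory byte
// has zeros in the unused positions.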
24708 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24709 DAG.getUNDEF(MVT::v16i1), StoredVal,
24710 DAG.getIntPtrConstant(0, dl));
24711 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24712 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24713 // Make sure we store zeros in the extra bits.
24714 if (NumElts < 8)
24715 StoredVal = DAG.getZeroExtendInReg(
24716 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24717
24718 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24719 St->getPointerInfo(), St->getOriginalAlign(),
24720 St->getMemOperand()->getFlags());
24721 }
24722
24723 if (St->isTruncatingStore())
24724 return SDValue();
24725
24726 // If this is a 256-bit store of concatenated ops, we are better off splitting
24727 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24728 // and each half can execute independently. Some cores would split the op into
24729 // halves anyway, so the concat (vinsertf128) is purely an extra op.
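// For example, a 32-byte store of (concat_vectors v4f32:X, v4f32:Y) becomes
// a 16-byte store of X at the base address and a 16-byte store of Y at
// base+16, and the vinsertf128 that built the 256-bit value disappears.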
24730 MVT StoreVT = StoredVal.getSimpleValueType();
24731 if (StoreVT.is256BitVector() ||
24732 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24733 !Subtarget.hasBWI())) {
24734 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24735 return splitVectorStore(St, DAG);
24736 return SDValue();
24737 }
24738
24739 if (StoreVT.is32BitVector())
24740 return SDValue();
24741
24742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24743 assert(StoreVT.is64BitVector() && "Unexpected VT");
24744 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24746 "Unexpected type action!");
24747
24748 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24749 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24750 DAG.getUNDEF(StoreVT));
24751
24752 if (Subtarget.hasSSE2()) {
24753 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24754 // and store it.
24755 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24756 MVT CastVT = MVT::getVectorVT(StVT, 2);
24757 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24758 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24759 DAG.getIntPtrConstant(0, dl));
24760
24761 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24762 St->getPointerInfo(), St->getOriginalAlign(),
24763 St->getMemOperand()->getFlags());
24764 }
24765 assert(Subtarget.hasSSE1() && "Expected SSE");
24766 SDVTList Tys = DAG.getVTList(MVT::Other);
24767 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24768 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24769 St->getMemOperand());
24770}
24771
24772// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24773// may emit an illegal shuffle but the expansion is still better than scalar
24774// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24775 // we'll emit a shuffle and an arithmetic shift.
24776// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24777// TODO: It is possible to support ZExt by zeroing the undef values during
24778// the shuffle phase or after the shuffle.
24779static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24780 SelectionDAG &DAG) {
24781 MVT RegVT = Op.getSimpleValueType();
24782 assert(RegVT.isVector() && "We only custom lower vector loads.");
24783 assert(RegVT.isInteger() &&
24784 "We only custom lower integer vector loads.");
24785
24786 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24787 SDLoc dl(Ld);
24788
24789 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24790 if (RegVT.getVectorElementType() == MVT::i1) {
24791 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24792 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24793 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24794 "Expected AVX512F without AVX512DQI");
24795
24796 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24797 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24798 Ld->getMemOperand()->getFlags());
24799
24800 // Replace chain users with the new chain.
24801 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24802
24803 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24804 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24805 DAG.getBitcast(MVT::v16i1, Val),
24806 DAG.getIntPtrConstant(0, dl));
24807 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24808 }
24809
24810 return SDValue();
24811}
24812
24813/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24814/// each of which has no other use apart from the AND / OR.
24815static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24816 Opc = Op.getOpcode();
24817 if (Opc != ISD::OR && Opc != ISD::AND)
24818 return false;
24819 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24820 Op.getOperand(0).hasOneUse() &&
24821 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24822 Op.getOperand(1).hasOneUse());
24823}
24824
24825SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24826 SDValue Chain = Op.getOperand(0);
24827 SDValue Cond = Op.getOperand(1);
24828 SDValue Dest = Op.getOperand(2);
24829 SDLoc dl(Op);
24830
24831 // Bail out when we don't have native compare instructions.
24832 if (Cond.getOpcode() == ISD::SETCC &&
24833 Cond.getOperand(0).getValueType() != MVT::f128 &&
24834 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24835 SDValue LHS = Cond.getOperand(0);
24836 SDValue RHS = Cond.getOperand(1);
24837 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24838
24839 // Special case for
24840 // setcc([su]{add,sub,mul}o == 0)
24841 // setcc([su]{add,sub,mul}o != 1)
24842 if (ISD::isOverflowIntrOpRes(LHS) &&
24843 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24844 (isNullConstant(RHS) || isOneConstant(RHS))) {
24845 SDValue Value, Overflow;
24846 X86::CondCode X86Cond;
24847 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24848
24849 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24850 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24851
24852 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24853 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24854 Overflow);
24855 }
24856
24857 if (LHS.getSimpleValueType().isInteger()) {
24858 SDValue CCVal;
24859 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24860 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24861 EFLAGS);
24862 }
24863
24864 if (CC == ISD::SETOEQ) {
24865 // For FCMP_OEQ, we can emit
24866 // two branches instead of an explicit AND instruction with a
24867 // separate test. However, we only do this if this block doesn't
24868 // have a fall-through edge, because this requires an explicit
24869 // jmp when the condition is false.
24870 if (Op.getNode()->hasOneUse()) {
24871 SDNode *User = *Op.getNode()->use_begin();
24872 // Look for an unconditional branch following this conditional branch.
24873 // We need this because we need to reverse the successors in order
24874 // to implement FCMP_OEQ.
24875 if (User->getOpcode() == ISD::BR) {
24876 SDValue FalseBB = User->getOperand(1);
24877 SDNode *NewBR =
24878 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24879 assert(NewBR == User);
24880 (void)NewBR;
24881 Dest = FalseBB;
24882
24883 SDValue Cmp =
24884 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24885 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24886 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24887 CCVal, Cmp);
24888 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24889 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24890 Cmp);
24891 }
24892 }
24893 } else if (CC == ISD::SETUNE) {
24894 // For FCMP_UNE, we can emit
24895 // two branches instead of an explicit OR instruction with a
24896 // separate test.
24897 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24898 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24899 Chain =
24900 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24901 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24902 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24903 Cmp);
24904 } else {
24905 X86::CondCode X86Cond =
24906 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24907 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24908 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24909 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24910 Cmp);
24911 }
24912 }
24913
24914 if (ISD::isOverflowIntrOpRes(Cond)) {
24915 SDValue Value, Overflow;
24916 X86::CondCode X86Cond;
24917 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24918
24919 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24920 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24921 Overflow);
24922 }
24923
24924 // Look past the truncate if the high bits are known zero.
24925 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24926 Cond = Cond.getOperand(0);
24927
24928 EVT CondVT = Cond.getValueType();
24929
24930 // Add an AND with 1 if we don't already have one.
24931 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24932 Cond =
24933 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24934
24935 SDValue LHS = Cond;
24936 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24937
24938 SDValue CCVal;
24939 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24940 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24941 EFLAGS);
24942}
24943
24944// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24945// Calls to _alloca are needed to probe the stack when allocating more than 4k
24946// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24947// that the guard pages used by the OS virtual memory manager are allocated in
24948// correct sequence.
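// For example, an allocation of 16384 bytes is lowered to a probing call that
// touches the stack one 4K page at a time; a single 16K adjustment of the
// stack pointer could jump past the guard page without ever faulting on it.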
24949SDValue
24950X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24951 SelectionDAG &DAG) const {
24952 MachineFunction &MF = DAG.getMachineFunction();
24953 bool SplitStack = MF.shouldSplitStack();
24954 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24955 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24956 SplitStack || EmitStackProbeCall;
24957 SDLoc dl(Op);
24958
24959 // Get the inputs.
24960 SDNode *Node = Op.getNode();
24961 SDValue Chain = Op.getOperand(0);
24962 SDValue Size = Op.getOperand(1);
24963 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24964 EVT VT = Node->getValueType(0);
24965
24966 // Chain the dynamic stack allocation so that it doesn't modify the stack
24967 // pointer when other instructions are using the stack.
24968 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24969
24970 bool Is64Bit = Subtarget.is64Bit();
24971 MVT SPTy = getPointerTy(DAG.getDataLayout());
24972 SDValue Result;
24973
24974 if (!Lower) {
24975 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24976 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24977 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24978 " not tell us which reg is the stack pointer!");
24979
24980 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24981 const Align StackAlign = TFI.getStackAlign();
24982 if (hasInlineStackProbe(MF)) {
24983 MachineRegisterInfo &MRI = MF.getRegInfo();
24984
24985 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24986 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24987 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24988 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24989 DAG.getRegister(Vreg, SPTy));
24990 } else {
24991 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24992 Chain = SP.getValue(1);
24993 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24994 }
24995 if (Alignment && *Alignment > StackAlign)
24996 Result =
24997 DAG.getNode(ISD::AND, dl, VT, Result,
24998 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24999 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25000 } else if (SplitStack) {
25001 MachineRegisterInfo &MRI = MF.getRegInfo();
25002
25003 if (Is64Bit) {
25004 // The 64 bit implementation of segmented stacks needs to clobber both r10
25005 // and r11. This makes it impossible to use it along with nested parameters.
25006 const Function &F = MF.getFunction();
25007 for (const auto &A : F.args()) {
25008 if (A.hasNestAttr())
25009 report_fatal_error("Cannot use segmented stacks with functions that "
25010 "have nested arguments.");
25011 }
25012 }
25013
25014 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25015 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25016 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25017 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25018 DAG.getRegister(Vreg, SPTy));
25019 } else {
25020 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25021 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25022 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25023
25024 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25025 Register SPReg = RegInfo->getStackRegister();
25026 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25027 Chain = SP.getValue(1);
25028
25029 if (Alignment) {
25030 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25031 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25032 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25033 }
25034
25035 Result = SP;
25036 }
25037
25038 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25039
25040 SDValue Ops[2] = {Result, Chain};
25041 return DAG.getMergeValues(Ops, dl);
25042}
25043
25044SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25045 MachineFunction &MF = DAG.getMachineFunction();
25046 auto PtrVT = getPointerTy(MF.getDataLayout());
25047 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25048
25049 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25050 SDLoc DL(Op);
25051
25052 if (!Subtarget.is64Bit() ||
25053 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25054 // vastart just stores the address of the VarArgsFrameIndex slot into the
25055 // memory location argument.
25056 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25057 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25058 MachinePointerInfo(SV));
25059 }
25060
25061 // __va_list_tag:
25062 // gp_offset (0 - 6 * 8)
25063 // fp_offset (48 - 48 + 8 * 16)
25064 // overflow_arg_area (point to parameters coming in memory).
25065 // reg_save_area
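// Laid out as a C struct (LP64), matching the offsets stored below:
//   struct __va_list_tag {
//     unsigned gp_offset;        // byte offset 0
//     unsigned fp_offset;        // byte offset 4
//     void *overflow_arg_area;   // byte offset 8
//     void *reg_save_area;       // byte offset 16
//   };                           // 24 bytes total (16 with 4-byte pointers)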
25066 SmallVector<SDValue, 8> MemOps;
25067 SDValue FIN = Op.getOperand(1);
25068 // Store gp_offset
25069 SDValue Store = DAG.getStore(
25070 Op.getOperand(0), DL,
25071 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25072 MachinePointerInfo(SV));
25073 MemOps.push_back(Store);
25074
25075 // Store fp_offset
25076 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25077 Store = DAG.getStore(
25078 Op.getOperand(0), DL,
25079 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25080 MachinePointerInfo(SV, 4));
25081 MemOps.push_back(Store);
25082
25083 // Store ptr to overflow_arg_area
25084 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25085 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25086 Store =
25087 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25088 MemOps.push_back(Store);
25089
25090 // Store ptr to reg_save_area.
25091 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25092 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25093 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25094 Store = DAG.getStore(
25095 Op.getOperand(0), DL, RSFIN, FIN,
25096 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25097 MemOps.push_back(Store);
25098 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25099}
25100
25101SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25102 assert(Subtarget.is64Bit() &&
25103 "LowerVAARG only handles 64-bit va_arg!");
25104 assert(Op.getNumOperands() == 4);
25105
25106 MachineFunction &MF = DAG.getMachineFunction();
25107 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25108 // The Win64 ABI uses char* instead of a structure.
25109 return DAG.expandVAArg(Op.getNode());
25110
25111 SDValue Chain = Op.getOperand(0);
25112 SDValue SrcPtr = Op.getOperand(1);
25113 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25114 unsigned Align = Op.getConstantOperandVal(3);
25115 SDLoc dl(Op);
25116
25117 EVT ArgVT = Op.getNode()->getValueType(0);
25118 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25119 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25120 uint8_t ArgMode;
25121
25122 // Decide which area this value should be read from.
25123 // TODO: Implement the AMD64 ABI in its entirety. This simple
25124 // selection mechanism works only for the basic types.
25125 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25126 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25127 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25128 } else {
25129 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25130 "Unhandled argument type in LowerVAARG");
25131 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25132 }
25133
25134 if (ArgMode == 2) {
25135 // Make sure using fp_offset makes sense.
25136 assert(!Subtarget.useSoftFloat() &&
25137 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25138 Subtarget.hasSSE1());
25139 }
25140
25141 // Insert VAARG node into the DAG
25142 // VAARG returns two values: Variable Argument Address, Chain
25143 SDValue InstOps[] = {Chain, SrcPtr,
25144 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25145 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25146 DAG.getTargetConstant(Align, dl, MVT::i32)};
25147 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25148 SDValue VAARG = DAG.getMemIntrinsicNode(
25149 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25150 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25151 /*Alignment=*/std::nullopt,
25152 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25153 Chain = VAARG.getValue(1);
25154
25155 // Load the next argument and return it
25156 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25157}
25158
25159static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25160 SelectionDAG &DAG) {
25161 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25162 // where a va_list is still an i8*.
25163 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25164 if (Subtarget.isCallingConvWin64(
25165 DAG.getMachineFunction().getFunction().getCallingConv()))
25166 // Probably a Win64 va_copy.
25167 return DAG.expandVACopy(Op.getNode());
25168
25169 SDValue Chain = Op.getOperand(0);
25170 SDValue DstPtr = Op.getOperand(1);
25171 SDValue SrcPtr = Op.getOperand(2);
25172 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25173 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25174 SDLoc DL(Op);
25175
25176 return DAG.getMemcpy(
25177 Chain, DL, DstPtr, SrcPtr,
25178 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25179 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25180 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25181}
25182
25183// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25184static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25185 switch (Opc) {
25186 case ISD::SHL:
25187 case X86ISD::VSHL:
25188 case X86ISD::VSHLI:
25189 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25190 case ISD::SRL:
25191 case X86ISD::VSRL:
25192 case X86ISD::VSRLI:
25193 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25194 case ISD::SRA:
25195 case X86ISD::VSRA:
25196 case X86ISD::VSRAI:
25197 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25198 }
25199 llvm_unreachable("Unknown target vector shift node");
25200}
25201
25202/// Handle vector element shifts where the shift amount is a constant.
25203/// Takes immediate version of shift as input.
25204static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25205 SDValue SrcOp, uint64_t ShiftAmt,
25206 SelectionDAG &DAG) {
25207 MVT ElementType = VT.getVectorElementType();
25208
25209 // Bitcast the source vector to the output type, this is mainly necessary for
25210 // vXi8/vXi64 shifts.
25211 if (VT != SrcOp.getSimpleValueType())
25212 SrcOp = DAG.getBitcast(VT, SrcOp);
25213
25214 // Fold this packed shift into its first operand if ShiftAmt is 0.
25215 if (ShiftAmt == 0)
25216 return SrcOp;
25217
25218 // Check for ShiftAmt >= element width
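// e.g. (vsrai v4i32 X, 34) is clamped to (vsrai v4i32 X, 31), which still
// yields the sign of each element, while an out-of-range logical shift such
// as (vsrli v4i32 X, 34) simply folds to zero.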
25219 if (ShiftAmt >= ElementType.getSizeInBits()) {
25220 if (Opc == X86ISD::VSRAI)
25221 ShiftAmt = ElementType.getSizeInBits() - 1;
25222 else
25223 return DAG.getConstant(0, dl, VT);
25224 }
25225
25226 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25227 && "Unknown target vector shift-by-constant node");
25228
25229 // Fold this packed vector shift into a build vector if SrcOp is a
25230 // vector of Constants or UNDEFs.
25231 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25232 unsigned ShiftOpc;
25233 switch (Opc) {
25234 default: llvm_unreachable("Unknown opcode!");
25235 case X86ISD::VSHLI:
25236 ShiftOpc = ISD::SHL;
25237 break;
25238 case X86ISD::VSRLI:
25239 ShiftOpc = ISD::SRL;
25240 break;
25241 case X86ISD::VSRAI:
25242 ShiftOpc = ISD::SRA;
25243 break;
25244 }
25245
25246 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25247 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25248 return C;
25249 }
25250
25251 return DAG.getNode(Opc, dl, VT, SrcOp,
25252 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25253}
25254
25255/// Handle vector element shifts by a splat shift amount
25256static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25257 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25258 const X86Subtarget &Subtarget,
25259 SelectionDAG &DAG) {
25260 MVT AmtVT = ShAmt.getSimpleValueType();
25261 assert(AmtVT.isVector() && "Vector shift type mismatch");
25262 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25263 "Illegal vector splat index");
25264
25265 // Move the splat element to the bottom element.
25266 if (ShAmtIdx != 0) {
25267 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25268 Mask[0] = ShAmtIdx;
25269 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25270 }
25271
25272 // Peek through any zext node if we can get back to a 128-bit source.
25273 if (AmtVT.getScalarSizeInBits() == 64 &&
25274 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25275 ShAmt.getOpcode() == ISD::ANY_EXTEND) &&
25276 ShAmt.getOperand(0).getValueType().isSimple() &&
25277 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25278 ShAmt = ShAmt.getOperand(0);
25279 AmtVT = ShAmt.getSimpleValueType();
25280 }
25281
25282 // See if we can mask off the upper elements using the existing source node.
25283 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25284 // do this for vXi64 types.
25285 bool IsMasked = false;
25286 if (AmtVT.getScalarSizeInBits() < 64) {
25287 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25288 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25289 // If the shift amount has come from a scalar, then zero-extend the scalar
25290 // before moving to the vector.
25291 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25292 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25293 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25294 AmtVT = MVT::v4i32;
25295 IsMasked = true;
25296 } else if (ShAmt.getOpcode() == ISD::AND) {
25297 // See if the shift amount is already masked (e.g. for rotation modulo),
25298 // then we can zero-extend it by setting all the other mask elements to
25299 // zero.
25300 SmallVector<SDValue> MaskElts(
25301 AmtVT.getVectorNumElements(),
25302 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25303 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25304 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25305 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25306 {ShAmt.getOperand(1), Mask}))) {
25307 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25308 IsMasked = true;
25309 }
25310 }
25311 }
25312
25313 // Extract if the shift amount vector is larger than 128-bits.
25314 if (AmtVT.getSizeInBits() > 128) {
25315 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25316 AmtVT = ShAmt.getSimpleValueType();
25317 }
25318
25319 // Zero-extend bottom element to v2i64 vector type, either by extension or
25320 // shuffle masking.
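// e.g. for a v8i16 amount on pre-SSE4.1 targets, the byte shifts below use
// (128 - 16) / 8 = 14: pslldq $14 moves the low word to the top of the
// vector and psrldq $14 brings it back, leaving the amount zero-extended in
// the low 64 bits.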
25321 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25322 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25323 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25324 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25325 } else if (Subtarget.hasSSE41()) {
25326 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25327 MVT::v2i64, ShAmt);
25328 } else {
25329 SDValue ByteShift = DAG.getTargetConstant(
25330 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25331 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25332 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25333 ByteShift);
25334 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25335 ByteShift);
25336 }
25337 }
25338
25339 // Change opcode to non-immediate version.
25340 Opc = getTargetVShiftUniformOpcode(Opc, true);
25341
25342 // The return type has to be a 128-bit type with the same element
25343 // type as the input type.
25344 MVT EltVT = VT.getVectorElementType();
25345 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25346
25347 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25348 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25349}
25350
25351/// Return Mask with the necessary casting or extending
25352/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25353static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25354 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25355 const SDLoc &dl) {
25356
25357 if (isAllOnesConstant(Mask))
25358 return DAG.getConstant(1, dl, MaskVT);
25359 if (X86::isZeroNode(Mask))
25360 return DAG.getConstant(0, dl, MaskVT);
25361
25362 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25363
25364 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25365 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25366 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25367 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
25368 SDValue Lo, Hi;
25369 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25370 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25371 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25372 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25373 } else {
25374 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25375 Mask.getSimpleValueType().getSizeInBits());
25376 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25377 // are extracted by EXTRACT_SUBVECTOR.
25378 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25379 DAG.getBitcast(BitcastVT, Mask),
25380 DAG.getIntPtrConstant(0, dl));
25381 }
25382}
25383
25384/// Return (and \p Op, \p Mask) for compare instructions or
25385/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25386/// necessary casting or extending for \p Mask when lowering masking intrinsics
25387static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25388 SDValue PreservedSrc,
25389 const X86Subtarget &Subtarget,
25390 SelectionDAG &DAG) {
25391 MVT VT = Op.getSimpleValueType();
25392 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25393 unsigned OpcodeSelect = ISD::VSELECT;
25394 SDLoc dl(Op);
25395
25396 if (isAllOnesConstant(Mask))
25397 return Op;
25398
25399 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25400
25401 if (PreservedSrc.isUndef())
25402 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25403 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25404}
25405
25406/// Creates an SDNode for a predicated scalar operation.
25407/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25408/// The mask is coming as MVT::i8 and it should be transformed
25409/// to MVT::v1i1 while lowering masking intrinsics.
25410/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25411/// "X86select" instead of "vselect". We just can't create the "vselect" node
25412/// for a scalar instruction.
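/// For example, a masked scalar operation is emitted as
/// (X86selects (extract_subvector (bitcast v8i1 Mask), 0), Op, PreservedSrc),
/// while FSETCCM/VFPCLASSS results are simply ANDed with the mask bit.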
25413static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25414 SDValue PreservedSrc,
25415 const X86Subtarget &Subtarget,
25416 SelectionDAG &DAG) {
25417
25418 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25419 if (MaskConst->getZExtValue() & 0x1)
25420 return Op;
25421
25422 MVT VT = Op.getSimpleValueType();
25423 SDLoc dl(Op);
25424
25425 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
25426 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25427 DAG.getBitcast(MVT::v8i1, Mask),
25428 DAG.getIntPtrConstant(0, dl));
25429 if (Op.getOpcode() == X86ISD::FSETCCM ||
25430 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25431 Op.getOpcode() == X86ISD::VFPCLASSS)
25432 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25433
25434 if (PreservedSrc.isUndef())
25435 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25436 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25437}
25438
25439static int getSEHRegistrationNodeSize(const Function *Fn) {
25440 if (!Fn->hasPersonalityFn())
25441 report_fatal_error(
25442 "querying registration node size for function without personality");
25443 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25444 // WinEHStatePass for the full struct definition.
25445 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25446 case EHPersonality::MSVC_X86SEH: return 24;
25447 case EHPersonality::MSVC_CXX: return 16;
25448 default: break;
25449 }
25451 "can only recover FP for 32-bit MSVC EH personality functions");
25452}
25453
25454/// When the MSVC runtime transfers control to us, either to an outlined
25455/// function or when returning to a parent frame after catching an exception, we
25456/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25457/// Here's the math:
25458/// RegNodeBase = EntryEBP - RegNodeSize
25459/// ParentFP = RegNodeBase - ParentFrameOffset
25460/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25461/// subtracting the offset (negative on x86) takes us back to the parent FP.
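/// For example (illustrative values only), for a C++ EH parent frame with
/// RegNodeSize = 16, EntryEBP = 0x2000 and ParentFrameOffset = -0x20:
/// RegNodeBase = 0x2000 - 16 = 0x1ff0, ParentFP = 0x1ff0 - (-0x20) = 0x2010.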
25462static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25463 SDValue EntryEBP) {
25464 MachineFunction &MF = DAG.getMachineFunction();
25465 SDLoc dl;
25466
25467 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25468 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25469
25470 // It's possible that the parent function no longer has a personality function
25471 // if the exceptional code was optimized away, in which case we just return
25472 // the incoming EBP.
25473 if (!Fn->hasPersonalityFn())
25474 return EntryEBP;
25475
25476 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25477 // registration, or the .set_setframe offset.
25478 MCSymbol *OffsetSym =
25479 MF.getContext().getOrCreateParentFrameOffsetSymbol(
25480 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25481 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25482 SDValue ParentFrameOffset =
25483 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25484
25485 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25486 // prologue to RBP in the parent function.
25487 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25488 if (Subtarget.is64Bit())
25489 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25490
25491 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25492 // RegNodeBase = EntryEBP - RegNodeSize
25493 // ParentFP = RegNodeBase - ParentFrameOffset
25494 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25495 DAG.getConstant(RegNodeSize, dl, PtrVT));
25496 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25497}
25498
25499SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25500 SelectionDAG &DAG) const {
25501 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25502 auto isRoundModeCurDirection = [](SDValue Rnd) {
25503 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25504 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25505
25506 return false;
25507 };
25508 auto isRoundModeSAE = [](SDValue Rnd) {
25509 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25510 unsigned RC = C->getZExtValue();
25511 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25512 // Clear the NO_EXC bit and check remaining bits.
25513 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25514 // As a convenience we allow no other bits or explicitly
25515 // current direction.
25516 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25517 }
25518 }
25519
25520 return false;
25521 };
25522 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25523 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25524 RC = C->getZExtValue();
25525 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25526 // Clear the NO_EXC bit and check remaining bits.
25527 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25528 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25529 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25530 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25531 RC == X86::STATIC_ROUNDING::TO_ZERO;
25532 }
25533 }
25534
25535 return false;
25536 };
25537
25538 SDLoc dl(Op);
25539 unsigned IntNo = Op.getConstantOperandVal(0);
25540 MVT VT = Op.getSimpleValueType();
25541 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25542
25543 // Propagate flags from original node to transformed node(s).
25544 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25545
25546 if (IntrData) {
25547 switch(IntrData->Type) {
25548 case INTR_TYPE_1OP: {
25549 // We specify 2 possible opcodes for intrinsics with rounding modes.
25550 // First, we check if the intrinsic may have non-default rounding mode,
25551 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25552 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25553 if (IntrWithRoundingModeOpcode != 0) {
25554 SDValue Rnd = Op.getOperand(2);
25555 unsigned RC = 0;
25556 if (isRoundModeSAEToX(Rnd, RC))
25557 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25558 Op.getOperand(1),
25559 DAG.getTargetConstant(RC, dl, MVT::i32));
25560 if (!isRoundModeCurDirection(Rnd))
25561 return SDValue();
25562 }
25563 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25564 Op.getOperand(1));
25565 }
25566 case INTR_TYPE_1OP_SAE: {
25567 SDValue Sae = Op.getOperand(2);
25568
25569 unsigned Opc;
25570 if (isRoundModeCurDirection(Sae))
25571 Opc = IntrData->Opc0;
25572 else if (isRoundModeSAE(Sae))
25573 Opc = IntrData->Opc1;
25574 else
25575 return SDValue();
25576
25577 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25578 }
25579 case INTR_TYPE_2OP: {
25580 SDValue Src2 = Op.getOperand(2);
25581
25582 // We specify 2 possible opcodes for intrinsics with rounding modes.
25583 // First, we check if the intrinsic may have non-default rounding mode,
25584 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25585 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25586 if (IntrWithRoundingModeOpcode != 0) {
25587 SDValue Rnd = Op.getOperand(3);
25588 unsigned RC = 0;
25589 if (isRoundModeSAEToX(Rnd, RC))
25590 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25591 Op.getOperand(1), Src2,
25592 DAG.getTargetConstant(RC, dl, MVT::i32));
25593 if (!isRoundModeCurDirection(Rnd))
25594 return SDValue();
25595 }
25596
25597 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25598 Op.getOperand(1), Src2);
25599 }
25600 case INTR_TYPE_2OP_SAE: {
25601 SDValue Sae = Op.getOperand(3);
25602
25603 unsigned Opc;
25604 if (isRoundModeCurDirection(Sae))
25605 Opc = IntrData->Opc0;
25606 else if (isRoundModeSAE(Sae))
25607 Opc = IntrData->Opc1;
25608 else
25609 return SDValue();
25610
25611 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25612 Op.getOperand(2));
25613 }
25614 case INTR_TYPE_3OP:
25615 case INTR_TYPE_3OP_IMM8: {
25616 SDValue Src1 = Op.getOperand(1);
25617 SDValue Src2 = Op.getOperand(2);
25618 SDValue Src3 = Op.getOperand(3);
25619
25620 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25621 Src3.getValueType() != MVT::i8) {
25622 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25623 }
25624
25625 // We specify 2 possible opcodes for intrinsics with rounding modes.
25626 // First, we check if the intrinsic may have non-default rounding mode,
25627 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25628 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25629 if (IntrWithRoundingModeOpcode != 0) {
25630 SDValue Rnd = Op.getOperand(4);
25631 unsigned RC = 0;
25632 if (isRoundModeSAEToX(Rnd, RC))
25633 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25634 Src1, Src2, Src3,
25635 DAG.getTargetConstant(RC, dl, MVT::i32));
25636 if (!isRoundModeCurDirection(Rnd))
25637 return SDValue();
25638 }
25639
25640 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25641 {Src1, Src2, Src3});
25642 }
25643 case INTR_TYPE_4OP_IMM8: {
25644 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25645 SDValue Src4 = Op.getOperand(4);
25646 if (Src4.getValueType() != MVT::i8) {
25647 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25648 }
25649
25650 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25651 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25652 Src4);
25653 }
25654 case INTR_TYPE_1OP_MASK: {
25655 SDValue Src = Op.getOperand(1);
25656 SDValue PassThru = Op.getOperand(2);
25657 SDValue Mask = Op.getOperand(3);
25658 // We add rounding mode to the Node when
25659 // - RC Opcode is specified and
25660 // - RC is not "current direction".
25661 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25662 if (IntrWithRoundingModeOpcode != 0) {
25663 SDValue Rnd = Op.getOperand(4);
25664 unsigned RC = 0;
25665 if (isRoundModeSAEToX(Rnd, RC))
25666 return getVectorMaskingNode(
25667 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25668 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25669 Mask, PassThru, Subtarget, DAG);
25670 if (!isRoundModeCurDirection(Rnd))
25671 return SDValue();
25672 }
25673 return getVectorMaskingNode(
25674 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25675 Subtarget, DAG);
25676 }
25677 case INTR_TYPE_1OP_MASK_SAE: {
25678 SDValue Src = Op.getOperand(1);
25679 SDValue PassThru = Op.getOperand(2);
25680 SDValue Mask = Op.getOperand(3);
25681 SDValue Rnd = Op.getOperand(4);
25682
25683 unsigned Opc;
25684 if (isRoundModeCurDirection(Rnd))
25685 Opc = IntrData->Opc0;
25686 else if (isRoundModeSAE(Rnd))
25687 Opc = IntrData->Opc1;
25688 else
25689 return SDValue();
25690
25691 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25692 Subtarget, DAG);
25693 }
25694 case INTR_TYPE_SCALAR_MASK: {
25695 SDValue Src1 = Op.getOperand(1);
25696 SDValue Src2 = Op.getOperand(2);
25697 SDValue passThru = Op.getOperand(3);
25698 SDValue Mask = Op.getOperand(4);
25699 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25700 // There are 2 kinds of intrinsics in this group:
25701 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
25702 // (2) With rounding mode and sae - 7 operands.
25703 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25704 if (Op.getNumOperands() == (5U + HasRounding)) {
25705 if (HasRounding) {
25706 SDValue Rnd = Op.getOperand(5);
25707 unsigned RC = 0;
25708 if (isRoundModeSAEToX(Rnd, RC))
25709 return getScalarMaskingNode(
25710 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25711 DAG.getTargetConstant(RC, dl, MVT::i32)),
25712 Mask, passThru, Subtarget, DAG);
25713 if (!isRoundModeCurDirection(Rnd))
25714 return SDValue();
25715 }
25716 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25717 Src2),
25718 Mask, passThru, Subtarget, DAG);
25719 }
25720
25721 assert(Op.getNumOperands() == (6U + HasRounding) &&
25722 "Unexpected intrinsic form");
25723 SDValue RoundingMode = Op.getOperand(5);
25724 unsigned Opc = IntrData->Opc0;
25725 if (HasRounding) {
25726 SDValue Sae = Op.getOperand(6);
25727 if (isRoundModeSAE(Sae))
25728 Opc = IntrWithRoundingModeOpcode;
25729 else if (!isRoundModeCurDirection(Sae))
25730 return SDValue();
25731 }
25732 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25733 Src2, RoundingMode),
25734 Mask, passThru, Subtarget, DAG);
25735 }
25736 case INTR_TYPE_SCALAR_MASK_RND: {
25737 SDValue Src1 = Op.getOperand(1);
25738 SDValue Src2 = Op.getOperand(2);
25739 SDValue passThru = Op.getOperand(3);
25740 SDValue Mask = Op.getOperand(4);
25741 SDValue Rnd = Op.getOperand(5);
25742
25743 SDValue NewOp;
25744 unsigned RC = 0;
25745 if (isRoundModeCurDirection(Rnd))
25746 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25747 else if (isRoundModeSAEToX(Rnd, RC))
25748 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25749 DAG.getTargetConstant(RC, dl, MVT::i32));
25750 else
25751 return SDValue();
25752
25753 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25754 }
25755 case INTR_TYPE_SCALAR_MASK_SAE: {
25756 SDValue Src1 = Op.getOperand(1);
25757 SDValue Src2 = Op.getOperand(2);
25758 SDValue passThru = Op.getOperand(3);
25759 SDValue Mask = Op.getOperand(4);
25760 SDValue Sae = Op.getOperand(5);
25761 unsigned Opc;
25762 if (isRoundModeCurDirection(Sae))
25763 Opc = IntrData->Opc0;
25764 else if (isRoundModeSAE(Sae))
25765 Opc = IntrData->Opc1;
25766 else
25767 return SDValue();
25768
25769 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25770 Mask, passThru, Subtarget, DAG);
25771 }
25772 case INTR_TYPE_2OP_MASK: {
25773 SDValue Src1 = Op.getOperand(1);
25774 SDValue Src2 = Op.getOperand(2);
25775 SDValue PassThru = Op.getOperand(3);
25776 SDValue Mask = Op.getOperand(4);
25777 SDValue NewOp;
25778 if (IntrData->Opc1 != 0) {
25779 SDValue Rnd = Op.getOperand(5);
25780 unsigned RC = 0;
25781 if (isRoundModeSAEToX(Rnd, RC))
25782 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25783 DAG.getTargetConstant(RC, dl, MVT::i32));
25784 else if (!isRoundModeCurDirection(Rnd))
25785 return SDValue();
25786 }
25787 if (!NewOp)
25788 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25789 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25790 }
25791 case INTR_TYPE_2OP_MASK_SAE: {
25792 SDValue Src1 = Op.getOperand(1);
25793 SDValue Src2 = Op.getOperand(2);
25794 SDValue PassThru = Op.getOperand(3);
25795 SDValue Mask = Op.getOperand(4);
25796
25797 unsigned Opc = IntrData->Opc0;
25798 if (IntrData->Opc1 != 0) {
25799 SDValue Sae = Op.getOperand(5);
25800 if (isRoundModeSAE(Sae))
25801 Opc = IntrData->Opc1;
25802 else if (!isRoundModeCurDirection(Sae))
25803 return SDValue();
25804 }
25805
25806 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25807 Mask, PassThru, Subtarget, DAG);
25808 }
25809 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25810 SDValue Src1 = Op.getOperand(1);
25811 SDValue Src2 = Op.getOperand(2);
25812 SDValue Src3 = Op.getOperand(3);
25813 SDValue PassThru = Op.getOperand(4);
25814 SDValue Mask = Op.getOperand(5);
25815 SDValue Sae = Op.getOperand(6);
25816 unsigned Opc;
25817 if (isRoundModeCurDirection(Sae))
25818 Opc = IntrData->Opc0;
25819 else if (isRoundModeSAE(Sae))
25820 Opc = IntrData->Opc1;
25821 else
25822 return SDValue();
25823
25824 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25825 Mask, PassThru, Subtarget, DAG);
25826 }
25827 case INTR_TYPE_3OP_MASK_SAE: {
25828 SDValue Src1 = Op.getOperand(1);
25829 SDValue Src2 = Op.getOperand(2);
25830 SDValue Src3 = Op.getOperand(3);
25831 SDValue PassThru = Op.getOperand(4);
25832 SDValue Mask = Op.getOperand(5);
25833
25834 unsigned Opc = IntrData->Opc0;
25835 if (IntrData->Opc1 != 0) {
25836 SDValue Sae = Op.getOperand(6);
25837 if (isRoundModeSAE(Sae))
25838 Opc = IntrData->Opc1;
25839 else if (!isRoundModeCurDirection(Sae))
25840 return SDValue();
25841 }
25842 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25843 Mask, PassThru, Subtarget, DAG);
25844 }
25845 case BLENDV: {
25846 SDValue Src1 = Op.getOperand(1);
25847 SDValue Src2 = Op.getOperand(2);
25848 SDValue Src3 = Op.getOperand(3);
25849
25850 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25851 Src3 = DAG.getBitcast(MaskVT, Src3);
25852
25853 // Reverse the operands to match VSELECT order.
25854 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25855 }
25856 case VPERM_2OP : {
25857 SDValue Src1 = Op.getOperand(1);
25858 SDValue Src2 = Op.getOperand(2);
25859
25860 // Swap Src1 and Src2 in the node creation
25861 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25862 }
25863 case CFMA_OP_MASKZ:
25864 case CFMA_OP_MASK: {
25865 SDValue Src1 = Op.getOperand(1);
25866 SDValue Src2 = Op.getOperand(2);
25867 SDValue Src3 = Op.getOperand(3);
25868 SDValue Mask = Op.getOperand(4);
25869 MVT VT = Op.getSimpleValueType();
25870
25871 SDValue PassThru = Src3;
25872 if (IntrData->Type == CFMA_OP_MASKZ)
25873 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25874
25875 // We add rounding mode to the Node when
25876 // - RC Opcode is specified and
25877 // - RC is not "current direction".
25878 SDValue NewOp;
25879 if (IntrData->Opc1 != 0) {
25880 SDValue Rnd = Op.getOperand(5);
25881 unsigned RC = 0;
25882 if (isRoundModeSAEToX(Rnd, RC))
25883 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25884 DAG.getTargetConstant(RC, dl, MVT::i32));
25885 else if (!isRoundModeCurDirection(Rnd))
25886 return SDValue();
25887 }
25888 if (!NewOp)
25889 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25890 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25891 }
25892 case IFMA_OP:
25893 // NOTE: We need to swizzle the operands to pass the multiply operands
25894 // first.
25895 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25896 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25897 case FPCLASSS: {
25898 SDValue Src1 = Op.getOperand(1);
25899 SDValue Imm = Op.getOperand(2);
25900 SDValue Mask = Op.getOperand(3);
25901 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25902 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25903 Subtarget, DAG);
25904 // Need to fill with zeros to ensure the bitcast will produce zeroes
25905 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25906 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25907 DAG.getConstant(0, dl, MVT::v8i1),
25908 FPclassMask, DAG.getIntPtrConstant(0, dl));
25909 return DAG.getBitcast(MVT::i8, Ins);
25910 }
25911
25912 case CMP_MASK_CC: {
25913 MVT MaskVT = Op.getSimpleValueType();
25914 SDValue CC = Op.getOperand(3);
25915 SDValue Mask = Op.getOperand(4);
25916 // We specify 2 possible opcodes for intrinsics with rounding modes.
25917 // First, we check if the intrinsic may have non-default rounding mode,
25918 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25919 if (IntrData->Opc1 != 0) {
25920 SDValue Sae = Op.getOperand(5);
25921 if (isRoundModeSAE(Sae))
25922 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25923 Op.getOperand(2), CC, Mask, Sae);
25924 if (!isRoundModeCurDirection(Sae))
25925 return SDValue();
25926 }
25927 //default rounding mode
25928 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25929 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25930 }
25931 case CMP_MASK_SCALAR_CC: {
25932 SDValue Src1 = Op.getOperand(1);
25933 SDValue Src2 = Op.getOperand(2);
25934 SDValue CC = Op.getOperand(3);
25935 SDValue Mask = Op.getOperand(4);
25936
25937 SDValue Cmp;
25938 if (IntrData->Opc1 != 0) {
25939 SDValue Sae = Op.getOperand(5);
25940 if (isRoundModeSAE(Sae))
25941 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25942 else if (!isRoundModeCurDirection(Sae))
25943 return SDValue();
25944 }
25945 // Default rounding mode.
25946 if (!Cmp.getNode())
25947 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25948
25949 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25950 Subtarget, DAG);
25951 // Need to fill with zeros to ensure the bitcast will produce zeroes
25952 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25953 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25954 DAG.getConstant(0, dl, MVT::v8i1),
25955 CmpMask, DAG.getIntPtrConstant(0, dl));
25956 return DAG.getBitcast(MVT::i8, Ins);
25957 }
25958 case COMI: { // Comparison intrinsics
25959 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25960 SDValue LHS = Op.getOperand(1);
25961 SDValue RHS = Op.getOperand(2);
25962 // Some conditions require the operands to be swapped.
25963 if (CC == ISD::SETLT || CC == ISD::SETLE)
25964 std::swap(LHS, RHS);
25965
25966 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25967 SDValue SetCC;
25968 switch (CC) {
25969 case ISD::SETEQ: { // (ZF = 1 and PF = 0)
25970 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25971 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25972 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25973 break;
25974 }
25975 case ISD::SETNE: { // (ZF = 0 or PF = 1)
25976 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25977 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25978 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25979 break;
25980 }
25981 case ISD::SETGT: // (CF = 0 and ZF = 0)
25982 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25983 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25984 break;
25985 }
25986 case ISD::SETGE: // CF = 0
25987 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25988 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25989 break;
25990 default:
25991 llvm_unreachable("Unexpected illegal condition!");
25992 }
25993 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25994 }
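// Illustrative example (a hedged sketch): an ordered-equal comparison such as
//   %r = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a, <2 x double> %b)
// lowers to COMI(a, b) followed by SETE and SETNP on EFLAGS, ANDed together so
// unordered inputs (PF = 1) do not report equality, then zero-extended to i32.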
25995 case COMI_RM: { // Comparison intrinsics with Sae
25996 SDValue LHS = Op.getOperand(1);
25997 SDValue RHS = Op.getOperand(2);
25998 unsigned CondVal = Op.getConstantOperandVal(3);
25999 SDValue Sae = Op.getOperand(4);
26000
26001 SDValue FCmp;
26002 if (isRoundModeCurDirection(Sae))
26003 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26004 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26005 else if (isRoundModeSAE(Sae))
26006 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26007 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26008 else
26009 return SDValue();
26010 // Need to fill with zeros to ensure the bitcast will produce zeroes
26011 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26012 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26013 DAG.getConstant(0, dl, MVT::v16i1),
26014 FCmp, DAG.getIntPtrConstant(0, dl));
26015 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26016 DAG.getBitcast(MVT::i16, Ins));
26017 }
26018 case VSHIFT: {
26019 SDValue SrcOp = Op.getOperand(1);
26020 SDValue ShAmt = Op.getOperand(2);
26021 assert(ShAmt.getValueType() == MVT::i32 &&
26022 "Unexpected VSHIFT amount type");
26023
26024 // Catch shift-by-constant.
26025 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26026 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26027 Op.getSimpleValueType(), SrcOp,
26028 CShAmt->getZExtValue(), DAG);
26029
26030 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26031 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26032 SrcOp, ShAmt, 0, Subtarget, DAG);
26033 }
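// Illustrative example (a hedged sketch, assuming pslli.d is one of the VSHIFT
// table entries): for
//   %r = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %v, i32 %amt)
// a constant %amt produces an immediate shift node directly, while a variable
// %amt is first widened to v4i32 via SCALAR_TO_VECTOR and the register form of
// the shift is used.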
26034 case COMPRESS_EXPAND_IN_REG: {
26035 SDValue Mask = Op.getOperand(3);
26036 SDValue DataToCompress = Op.getOperand(1);
26037 SDValue PassThru = Op.getOperand(2);
26038 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26039 return Op.getOperand(1);
26040
26041 // Avoid false dependency.
26042 if (PassThru.isUndef())
26043 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26044
26045 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26046 Mask);
26047 }
26048 case FIXUPIMM:
26049 case FIXUPIMM_MASKZ: {
26050 SDValue Src1 = Op.getOperand(1);
26051 SDValue Src2 = Op.getOperand(2);
26052 SDValue Src3 = Op.getOperand(3);
26053 SDValue Imm = Op.getOperand(4);
26054 SDValue Mask = Op.getOperand(5);
26055 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26056 ? Src1
26057 : getZeroVector(VT, Subtarget, DAG, dl);
26058
26059 unsigned Opc = IntrData->Opc0;
26060 if (IntrData->Opc1 != 0) {
26061 SDValue Sae = Op.getOperand(6);
26062 if (isRoundModeSAE(Sae))
26063 Opc = IntrData->Opc1;
26064 else if (!isRoundModeCurDirection(Sae))
26065 return SDValue();
26066 }
26067
26068 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26069
26070 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26071 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26072
26073 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26074 }
26075 case ROUNDP: {
26076 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26077 // Clear the upper bits of the rounding immediate so that the legacy
26078 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26079 uint64_t Round = Op.getConstantOperandVal(2);
26080 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26081 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26082 Op.getOperand(1), RoundingMode);
26083 }
26084 case ROUNDS: {
26085 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26086 // Clear the upper bits of the rounding immediate so that the legacy
26087 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26088 uint64_t Round = Op.getConstantOperandVal(3);
26089 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26090 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26091 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26092 }
26093 case BEXTRI: {
26094 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26095
26096 uint64_t Imm = Op.getConstantOperandVal(2);
26097 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26098 Op.getValueType());
26099 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26100 Op.getOperand(1), Control);
26101 }
26102 // ADC/SBB
26103 case ADX: {
26104 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26105 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26106
26107 SDValue Res;
26108 // If the carry in is zero, then we should just use ADD/SUB instead of
26109 // ADC/SBB.
26110 if (isNullConstant(Op.getOperand(1))) {
26111 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26112 Op.getOperand(3));
26113 } else {
26114 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26115 DAG.getConstant(-1, dl, MVT::i8));
26116 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26117 Op.getOperand(3), GenCF.getValue(1));
26118 }
26119 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26120 SDValue Results[] = { SetCC, Res };
26121 return DAG.getMergeValues(Results, dl);
26122 }
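// Illustrative example (a hedged sketch): for
//   %r = call { i8, i32 } @llvm.x86.addcarry.32(i8 %cin, i32 %a, i32 %b)
// a provably-zero carry-in selects a plain ADD; otherwise %cin is turned into
// CF by adding -1 to it and an ADC consumes that flag. The i8 result is a SETB
// of the carry flag produced by the addition.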
26123 case CVTPD2PS_MASK:
26124 case CVTPD2DQ_MASK:
26125 case CVTQQ2PS_MASK:
26126 case TRUNCATE_TO_REG: {
26127 SDValue Src = Op.getOperand(1);
26128 SDValue PassThru = Op.getOperand(2);
26129 SDValue Mask = Op.getOperand(3);
26130
26131 if (isAllOnesConstant(Mask))
26132 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26133
26134 MVT SrcVT = Src.getSimpleValueType();
26135 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26136 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26137 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26138 {Src, PassThru, Mask});
26139 }
26140 case CVTPS2PH_MASK: {
26141 SDValue Src = Op.getOperand(1);
26142 SDValue Rnd = Op.getOperand(2);
26143 SDValue PassThru = Op.getOperand(3);
26144 SDValue Mask = Op.getOperand(4);
26145
26146 unsigned RC = 0;
26147 unsigned Opc = IntrData->Opc0;
26148 bool SAE = Src.getValueType().is512BitVector() &&
26149 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26150 if (SAE) {
26152 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26153 }
26154
26155 if (isAllOnesConstant(Mask))
26156 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26157
26158 if (SAE)
26160 else
26161 Opc = IntrData->Opc1;
26162 MVT SrcVT = Src.getSimpleValueType();
26163 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26164 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26165 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26166 }
26167 case CVTNEPS2BF16_MASK: {
26168 SDValue Src = Op.getOperand(1);
26169 SDValue PassThru = Op.getOperand(2);
26170 SDValue Mask = Op.getOperand(3);
26171
26172 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26173 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26174
26175 // Break false dependency.
26176 if (PassThru.isUndef())
26177 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26178
26179 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26180 Mask);
26181 }
26182 default:
26183 break;
26184 }
26185 }
26186
26187 switch (IntNo) {
26188 default: return SDValue(); // Don't custom lower most intrinsics.
26189
26190 // ptest and testp intrinsics. The intrinsics these come from are designed to
26191 // return an integer value, not just set flags, so lower them to the ptest
26192 // or testp pattern and a setcc for the result.
26193 case Intrinsic::x86_avx512_ktestc_b:
26194 case Intrinsic::x86_avx512_ktestc_w:
26195 case Intrinsic::x86_avx512_ktestc_d:
26196 case Intrinsic::x86_avx512_ktestc_q:
26197 case Intrinsic::x86_avx512_ktestz_b:
26198 case Intrinsic::x86_avx512_ktestz_w:
26199 case Intrinsic::x86_avx512_ktestz_d:
26200 case Intrinsic::x86_avx512_ktestz_q:
26201 case Intrinsic::x86_sse41_ptestz:
26202 case Intrinsic::x86_sse41_ptestc:
26203 case Intrinsic::x86_sse41_ptestnzc:
26204 case Intrinsic::x86_avx_ptestz_256:
26205 case Intrinsic::x86_avx_ptestc_256:
26206 case Intrinsic::x86_avx_ptestnzc_256:
26207 case Intrinsic::x86_avx_vtestz_ps:
26208 case Intrinsic::x86_avx_vtestc_ps:
26209 case Intrinsic::x86_avx_vtestnzc_ps:
26210 case Intrinsic::x86_avx_vtestz_pd:
26211 case Intrinsic::x86_avx_vtestc_pd:
26212 case Intrinsic::x86_avx_vtestnzc_pd:
26213 case Intrinsic::x86_avx_vtestz_ps_256:
26214 case Intrinsic::x86_avx_vtestc_ps_256:
26215 case Intrinsic::x86_avx_vtestnzc_ps_256:
26216 case Intrinsic::x86_avx_vtestz_pd_256:
26217 case Intrinsic::x86_avx_vtestc_pd_256:
26218 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26219 unsigned TestOpc = X86ISD::PTEST;
26220 X86::CondCode X86CC;
26221 switch (IntNo) {
26222 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26223 case Intrinsic::x86_avx512_ktestc_b:
26224 case Intrinsic::x86_avx512_ktestc_w:
26225 case Intrinsic::x86_avx512_ktestc_d:
26226 case Intrinsic::x86_avx512_ktestc_q:
26227 // CF = 1
26228 TestOpc = X86ISD::KTEST;
26229 X86CC = X86::COND_B;
26230 break;
26231 case Intrinsic::x86_avx512_ktestz_b:
26232 case Intrinsic::x86_avx512_ktestz_w:
26233 case Intrinsic::x86_avx512_ktestz_d:
26234 case Intrinsic::x86_avx512_ktestz_q:
26235 TestOpc = X86ISD::KTEST;
26236 X86CC = X86::COND_E;
26237 break;
26238 case Intrinsic::x86_avx_vtestz_ps:
26239 case Intrinsic::x86_avx_vtestz_pd:
26240 case Intrinsic::x86_avx_vtestz_ps_256:
26241 case Intrinsic::x86_avx_vtestz_pd_256:
26242 TestOpc = X86ISD::TESTP;
26243 [[fallthrough]];
26244 case Intrinsic::x86_sse41_ptestz:
26245 case Intrinsic::x86_avx_ptestz_256:
26246 // ZF = 1
26247 X86CC = X86::COND_E;
26248 break;
26249 case Intrinsic::x86_avx_vtestc_ps:
26250 case Intrinsic::x86_avx_vtestc_pd:
26251 case Intrinsic::x86_avx_vtestc_ps_256:
26252 case Intrinsic::x86_avx_vtestc_pd_256:
26253 TestOpc = X86ISD::TESTP;
26254 [[fallthrough]];
26255 case Intrinsic::x86_sse41_ptestc:
26256 case Intrinsic::x86_avx_ptestc_256:
26257 // CF = 1
26258 X86CC = X86::COND_B;
26259 break;
26260 case Intrinsic::x86_avx_vtestnzc_ps:
26261 case Intrinsic::x86_avx_vtestnzc_pd:
26262 case Intrinsic::x86_avx_vtestnzc_ps_256:
26263 case Intrinsic::x86_avx_vtestnzc_pd_256:
26264 TestOpc = X86ISD::TESTP;
26265 [[fallthrough]];
26266 case Intrinsic::x86_sse41_ptestnzc:
26267 case Intrinsic::x86_avx_ptestnzc_256:
26268 // ZF and CF = 0
26269 X86CC = X86::COND_A;
26270 break;
26271 }
26272
26273 SDValue LHS = Op.getOperand(1);
26274 SDValue RHS = Op.getOperand(2);
26275 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26276 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26277 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26278 }
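// Illustrative example (a hedged sketch): for
//   %r = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a, <2 x i64> %b)
// the flag-producing PTEST node is emitted and the i32 result is recovered as
// a zero-extended SETE of EFLAGS (SETB for ptestc, SETA for ptestnzc).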
26279
26280 case Intrinsic::x86_sse42_pcmpistria128:
26281 case Intrinsic::x86_sse42_pcmpestria128:
26282 case Intrinsic::x86_sse42_pcmpistric128:
26283 case Intrinsic::x86_sse42_pcmpestric128:
26284 case Intrinsic::x86_sse42_pcmpistrio128:
26285 case Intrinsic::x86_sse42_pcmpestrio128:
26286 case Intrinsic::x86_sse42_pcmpistris128:
26287 case Intrinsic::x86_sse42_pcmpestris128:
26288 case Intrinsic::x86_sse42_pcmpistriz128:
26289 case Intrinsic::x86_sse42_pcmpestriz128: {
26290 unsigned Opcode;
26291 X86::CondCode X86CC;
26292 switch (IntNo) {
26293 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26294 case Intrinsic::x86_sse42_pcmpistria128:
26295 Opcode = X86ISD::PCMPISTR;
26296 X86CC = X86::COND_A;
26297 break;
26298 case Intrinsic::x86_sse42_pcmpestria128:
26299 Opcode = X86ISD::PCMPESTR;
26300 X86CC = X86::COND_A;
26301 break;
26302 case Intrinsic::x86_sse42_pcmpistric128:
26303 Opcode = X86ISD::PCMPISTR;
26304 X86CC = X86::COND_B;
26305 break;
26306 case Intrinsic::x86_sse42_pcmpestric128:
26307 Opcode = X86ISD::PCMPESTR;
26308 X86CC = X86::COND_B;
26309 break;
26310 case Intrinsic::x86_sse42_pcmpistrio128:
26311 Opcode = X86ISD::PCMPISTR;
26312 X86CC = X86::COND_O;
26313 break;
26314 case Intrinsic::x86_sse42_pcmpestrio128:
26315 Opcode = X86ISD::PCMPESTR;
26316 X86CC = X86::COND_O;
26317 break;
26318 case Intrinsic::x86_sse42_pcmpistris128:
26319 Opcode = X86ISD::PCMPISTR;
26320 X86CC = X86::COND_S;
26321 break;
26322 case Intrinsic::x86_sse42_pcmpestris128:
26323 Opcode = X86ISD::PCMPESTR;
26324 X86CC = X86::COND_S;
26325 break;
26326 case Intrinsic::x86_sse42_pcmpistriz128:
26327 Opcode = X86ISD::PCMPISTR;
26328 X86CC = X86::COND_E;
26329 break;
26330 case Intrinsic::x86_sse42_pcmpestriz128:
26331 Opcode = X86ISD::PCMPESTR;
26332 X86CC = X86::COND_E;
26333 break;
26334 }
26336 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26337 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26338 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26339 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26340 }
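// Illustrative example (a hedged sketch): a flag-reporting string compare like
//   %r = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a, <16 x i8> %b, i8 7)
// builds a single PCMPISTR node, takes its third result (EFLAGS), and converts
// the requested flag into a zero-extended SETcc.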
26341
26342 case Intrinsic::x86_sse42_pcmpistri128:
26343 case Intrinsic::x86_sse42_pcmpestri128: {
26344 unsigned Opcode;
26345 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26346 Opcode = X86ISD::PCMPISTR;
26347 else
26348 Opcode = X86ISD::PCMPESTR;
26349
26351 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26352 return DAG.getNode(Opcode, dl, VTs, NewOps);
26353 }
26354
26355 case Intrinsic::x86_sse42_pcmpistrm128:
26356 case Intrinsic::x86_sse42_pcmpestrm128: {
26357 unsigned Opcode;
26358 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26359 Opcode = X86ISD::PCMPISTR;
26360 else
26361 Opcode = X86ISD::PCMPESTR;
26362
26364 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26365 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26366 }
26367
26368 case Intrinsic::eh_sjlj_lsda: {
26370 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26371 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26372 auto &Context = MF.getMMI().getContext();
26373 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26374 Twine(MF.getFunctionNumber()));
26375 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26376 DAG.getMCSymbol(S, PtrVT));
26377 }
26378
26379 case Intrinsic::x86_seh_lsda: {
26380 // Compute the symbol for the LSDA. We know it'll get emitted later.
26382 SDValue Op1 = Op.getOperand(1);
26383 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26386
26387 // Generate a simple absolute symbol reference. This intrinsic is only
26388 // supported on 32-bit Windows, which isn't PIC.
26389 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26390 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26391 }
26392
26393 case Intrinsic::eh_recoverfp: {
26394 SDValue FnOp = Op.getOperand(1);
26395 SDValue IncomingFPOp = Op.getOperand(2);
26396 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26397 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26398 if (!Fn)
26400 "llvm.eh.recoverfp must take a function as the first argument");
26401 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26402 }
26403
26404 case Intrinsic::localaddress: {
26405 // Returns one of the stack, base, or frame pointer registers, depending on
26406 // which is used to reference local variables.
26408 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26409 unsigned Reg;
26410 if (RegInfo->hasBasePointer(MF))
26411 Reg = RegInfo->getBaseRegister();
26412 else { // Handles the SP or FP case.
26413 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26414 if (CantUseFP)
26415 Reg = RegInfo->getPtrSizedStackRegister(MF);
26416 else
26417 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26418 }
26419 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26420 }
26421 case Intrinsic::x86_avx512_vp2intersect_q_512:
26422 case Intrinsic::x86_avx512_vp2intersect_q_256:
26423 case Intrinsic::x86_avx512_vp2intersect_q_128:
26424 case Intrinsic::x86_avx512_vp2intersect_d_512:
26425 case Intrinsic::x86_avx512_vp2intersect_d_256:
26426 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26427 MVT MaskVT = Op.getSimpleValueType();
26428
26429 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26430 SDLoc DL(Op);
26431
26434 Op->getOperand(1), Op->getOperand(2));
26435
26436 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26437 MaskVT, Operation);
26438 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26439 MaskVT, Operation);
26440 return DAG.getMergeValues({Result0, Result1}, DL);
26441 }
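// Illustrative example (a hedged sketch): for
//   %p = call { <16 x i1>, <16 x i1> }
//            @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %a, <16 x i32> %b)
// one VP2INTERSECT node yields an untyped mask-register pair, and the two
// k-register halves are extracted as subregisters and returned as a pair.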
26442 case Intrinsic::x86_mmx_pslli_w:
26443 case Intrinsic::x86_mmx_pslli_d:
26444 case Intrinsic::x86_mmx_pslli_q:
26445 case Intrinsic::x86_mmx_psrli_w:
26446 case Intrinsic::x86_mmx_psrli_d:
26447 case Intrinsic::x86_mmx_psrli_q:
26448 case Intrinsic::x86_mmx_psrai_w:
26449 case Intrinsic::x86_mmx_psrai_d: {
26450 SDLoc DL(Op);
26451 SDValue ShAmt = Op.getOperand(2);
26452 // If the argument is a constant, convert it to a target constant.
26453 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26454 // Clamp out-of-bounds shift amounts since they will otherwise be masked
26455 // to 8 bits, which may make them no longer out of bounds.
26456 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26457 if (ShiftAmount == 0)
26458 return Op.getOperand(1);
26459
26460 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26461 Op.getOperand(0), Op.getOperand(1),
26462 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26463 }
26464
26465 unsigned NewIntrinsic;
26466 switch (IntNo) {
26467 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26468 case Intrinsic::x86_mmx_pslli_w:
26469 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26470 break;
26471 case Intrinsic::x86_mmx_pslli_d:
26472 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26473 break;
26474 case Intrinsic::x86_mmx_pslli_q:
26475 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26476 break;
26477 case Intrinsic::x86_mmx_psrli_w:
26478 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26479 break;
26480 case Intrinsic::x86_mmx_psrli_d:
26481 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26482 break;
26483 case Intrinsic::x86_mmx_psrli_q:
26484 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26485 break;
26486 case Intrinsic::x86_mmx_psrai_w:
26487 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26488 break;
26489 case Intrinsic::x86_mmx_psrai_d:
26490 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26491 break;
26492 }
26493
26494 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
26495 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26496 // MMX register.
26497 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26498 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26499 DAG.getTargetConstant(NewIntrinsic, DL,
26501 Op.getOperand(1), ShAmt);
26502 }
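// Illustrative example (a hedged sketch): an @llvm.x86.mmx.pslli.d call with a
// constant amount keeps the immediate form with the clamped amount as a target
// constant, whereas a variable amount is rewritten to the corresponding
// register-shift intrinsic with the i32 amount moved into an MMX register via
// MMX_MOVW2D.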
26503 case Intrinsic::thread_pointer: {
26504 if (Subtarget.isTargetELF()) {
26505 SDLoc dl(Op);
26506 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26507 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26509 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26510 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26512 }
26514 "Target OS doesn't support __builtin_thread_pointer() yet.");
26515 }
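// Illustrative example (a hedged sketch): on ELF targets
//   %tp = call ptr @llvm.thread.pointer()
// becomes a load from address zero in the FS (64-bit) or GS (32-bit) segment
// address space, i.e. the canonical %fs:0 / %gs:0 thread-pointer slot.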
26516 }
26517}
26518
26519 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26520 SDValue Src, SDValue Mask, SDValue Base,
26521 SDValue Index, SDValue ScaleOp, SDValue Chain,
26522 const X86Subtarget &Subtarget) {
26523 SDLoc dl(Op);
26524 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26525 // Scale must be constant.
26526 if (!C)
26527 return SDValue();
26528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26529 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26530 TLI.getPointerTy(DAG.getDataLayout()));
26531 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26532 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26533 // If source is undef or we know it won't be used, use a zero vector
26534 // to break register dependency.
26535 // TODO: use undef instead and let BreakFalseDeps deal with it?
26536 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26537 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26538
26539 // Cast mask to an integer type.
26540 Mask = DAG.getBitcast(MaskVT, Mask);
26541
26542 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26543
26544 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26545 SDValue Res =
26546 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26547 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26548 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26549}
26550
26551 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26552 SDValue Src, SDValue Mask, SDValue Base,
26553 SDValue Index, SDValue ScaleOp, SDValue Chain,
26554 const X86Subtarget &Subtarget) {
26555 MVT VT = Op.getSimpleValueType();
26556 SDLoc dl(Op);
26557 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26558 // Scale must be constant.
26559 if (!C)
26560 return SDValue();
26561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26562 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26563 TLI.getPointerTy(DAG.getDataLayout()));
26564 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26565 VT.getVectorNumElements());
26566 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26567
26568 // We support two versions of the gather intrinsics. One with scalar mask and
26569 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26570 if (Mask.getValueType() != MaskVT)
26571 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26572
26573 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26574 // If source is undef or we know it won't be used, use a zero vector
26575 // to break register dependency.
26576 // TODO: use undef instead and let BreakFalseDeps deal with it?
26577 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26578 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26579
26580 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26581
26582 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26583 SDValue Res =
26584 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26585 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26586 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26587}
26588
26589static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26590 SDValue Src, SDValue Mask, SDValue Base,
26591 SDValue Index, SDValue ScaleOp, SDValue Chain,
26592 const X86Subtarget &Subtarget) {
26593 SDLoc dl(Op);
26594 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26595 // Scale must be constant.
26596 if (!C)
26597 return SDValue();
26598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26599 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26600 TLI.getPointerTy(DAG.getDataLayout()));
26601 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26602 Src.getSimpleValueType().getVectorNumElements());
26603 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26604
26605 // We support two versions of the scatter intrinsics. One with scalar mask and
26606 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26607 if (Mask.getValueType() != MaskVT)
26608 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26609
26610 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26611
26612 SDVTList VTs = DAG.getVTList(MVT::Other);
26613 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26614 SDValue Res =
26615 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26616 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26617 return Res;
26618}
26619
26620static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26621 SDValue Mask, SDValue Base, SDValue Index,
26622 SDValue ScaleOp, SDValue Chain,
26623 const X86Subtarget &Subtarget) {
26624 SDLoc dl(Op);
26625 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26626 // Scale must be constant.
26627 if (!C)
26628 return SDValue();
26629 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26630 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26631 TLI.getPointerTy(DAG.getDataLayout()));
26632 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26633 SDValue Segment = DAG.getRegister(0, MVT::i32);
26634 MVT MaskVT =
26635 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26636 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26637 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26638 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26639 return SDValue(Res, 0);
26640}
26641
26642 /// Handles the lowering of builtin intrinsics with a chain that return their
26643 /// value in registers EDX:EAX.
26644 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26645 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26646 /// TargetOpcode.
26647 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
26648 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26649 /// EDX:EAX).
26650 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26651 SelectionDAG &DAG,
26652 unsigned TargetOpcode,
26653 unsigned SrcReg,
26654 const X86Subtarget &Subtarget,
26655 SmallVectorImpl<SDValue> &Results) {
26656 SDValue Chain = N->getOperand(0);
26657 SDValue Glue;
26658
26659 if (SrcReg) {
26660 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26661 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26662 Glue = Chain.getValue(1);
26663 }
26664
26665 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26666 SDValue N1Ops[] = {Chain, Glue};
26667 SDNode *N1 = DAG.getMachineNode(
26668 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26669 Chain = SDValue(N1, 0);
26670
26671 // Read the result back from EDX:EAX (RAX/RDX on 64-bit targets).
26672 SDValue LO, HI;
26673 if (Subtarget.is64Bit()) {
26674 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26675 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26676 LO.getValue(2));
26677 } else {
26678 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26679 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26680 LO.getValue(2));
26681 }
26682 Chain = HI.getValue(1);
26683 Glue = HI.getValue(2);
26684
26685 if (Subtarget.is64Bit()) {
26686 // Merge the two 32-bit values into a 64-bit one.
26687 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26688 DAG.getConstant(32, DL, MVT::i8));
26689 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26690 Results.push_back(Chain);
26691 return Glue;
26692 }
26693
26694 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26695 SDValue Ops[] = { LO, HI };
26696 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26697 Results.push_back(Pair);
26698 Results.push_back(Chain);
26699 return Glue;
26700}
26701
26702/// Handles the lowering of builtin intrinsics that read the time stamp counter
26703/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26704/// READCYCLECOUNTER nodes.
26705static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26706 SelectionDAG &DAG,
26707 const X86Subtarget &Subtarget,
26708 SmallVectorImpl<SDValue> &Results) {
26709 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26710 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26711 // and the EAX register is loaded with the low-order 32 bits.
26712 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26713 /* NoRegister */0, Subtarget,
26714 Results);
26715 if (Opcode != X86::RDTSCP)
26716 return;
26717
26718 SDValue Chain = Results[1];
26719 // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
26720 // into the ECX register. Add 'ecx' explicitly to the chain.
26721 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26722 Results[1] = ecx;
26723 Results.push_back(ecx.getValue(1));
26724}
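// Illustrative example (a hedged sketch): on x86-64, @llvm.x86.rdtscp() expands
// to the RDTSCP machine node; the counter is read back from RAX/RDX and merged
// with a shift-and-or, and an extra copy from ECX yields the IA32_TSC_AUX value.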
26725
26727 SelectionDAG &DAG) {
26729 SDLoc DL(Op);
26730 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26731 Results);
26732 return DAG.getMergeValues(Results, DL);
26733}
26734
26735 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26736 MachineFunction &MF = DAG.getMachineFunction();
26737 SDValue Chain = Op.getOperand(0);
26738 SDValue RegNode = Op.getOperand(2);
26739 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26740 if (!EHInfo)
26741 report_fatal_error("EH registrations only live in functions using WinEH");
26742
26743 // Cast the operand to an alloca, and remember the frame index.
26744 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26745 if (!FINode)
26746 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26747 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26748
26749 // Return the chain operand without making any DAG nodes.
26750 return Chain;
26751}
26752
26753 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26754 MachineFunction &MF = DAG.getMachineFunction();
26755 SDValue Chain = Op.getOperand(0);
26756 SDValue EHGuard = Op.getOperand(2);
26757 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26758 if (!EHInfo)
26759 report_fatal_error("EHGuard only lives in functions using WinEH");
26760
26761 // Cast the operand to an alloca, and remember the frame index.
26762 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26763 if (!FINode)
26764 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26765 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26766
26767 // Return the chain operand without making any DAG nodes.
26768 return Chain;
26769}
26770
26771/// Emit Truncating Store with signed or unsigned saturation.
26772static SDValue
26773EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26774 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26775 SelectionDAG &DAG) {
26776 SDVTList VTs = DAG.getVTList(MVT::Other);
26777 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26778 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26779 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26780 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26781}
26782
26783/// Emit Masked Truncating Store with signed or unsigned saturation.
26784static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26785 const SDLoc &DL,
26786 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26787 MachineMemOperand *MMO, SelectionDAG &DAG) {
26788 SDVTList VTs = DAG.getVTList(MVT::Other);
26789 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26790 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26791 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26792}
26793
26795 const MachineFunction &MF) {
26796 if (!Subtarget.is64Bit())
26797 return false;
26798 // 64-bit targets support extended Swift async frame setup,
26799 // except for targets that use the Windows 64 prologue.
26800 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26801}
26802
26804 SelectionDAG &DAG) {
26805 unsigned IntNo = Op.getConstantOperandVal(1);
26806 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26807 if (!IntrData) {
26808 switch (IntNo) {
26809
26810 case Intrinsic::swift_async_context_addr: {
26811 SDLoc dl(Op);
26812 auto &MF = DAG.getMachineFunction();
26813 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
26814 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26816 X86FI->setHasSwiftAsyncContext(true);
26817 SDValue Chain = Op->getOperand(0);
26818 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26819 SDValue Result =
26820 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26821 DAG.getTargetConstant(8, dl, MVT::i32)),
26822 0);
26823 // Return { result, chain }.
26824 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26825 CopyRBP.getValue(1));
26826 } else {
26827 // No special extended frame, create or reuse an existing stack slot.
26828 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26829 if (!X86FI->getSwiftAsyncContextFrameIdx())
26830 X86FI->setSwiftAsyncContextFrameIdx(
26831 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26832 false));
26833 SDValue Result =
26834 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26835 PtrSize == 8 ? MVT::i64 : MVT::i32);
26836 // Return { result, chain }.
26837 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26838 Op->getOperand(0));
26839 }
26840 }
26841
26842 case llvm::Intrinsic::x86_seh_ehregnode:
26843 return MarkEHRegistrationNode(Op, DAG);
26844 case llvm::Intrinsic::x86_seh_ehguard:
26845 return MarkEHGuard(Op, DAG);
26846 case llvm::Intrinsic::x86_rdpkru: {
26847 SDLoc dl(Op);
26848 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26849 // Create a RDPKRU node and pass 0 to the ECX parameter.
26850 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26851 DAG.getConstant(0, dl, MVT::i32));
26852 }
26853 case llvm::Intrinsic::x86_wrpkru: {
26854 SDLoc dl(Op);
26855 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26856 // to the EDX and ECX parameters.
26857 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26858 Op.getOperand(0), Op.getOperand(2),
26859 DAG.getConstant(0, dl, MVT::i32),
26860 DAG.getConstant(0, dl, MVT::i32));
26861 }
26862 case llvm::Intrinsic::asan_check_memaccess: {
26863 // Mark this as adjustsStack because it will be lowered to a call.
26865 // Don't do anything here, we will expand these intrinsics out later.
26866 return Op;
26867 }
26868 case llvm::Intrinsic::x86_flags_read_u32:
26869 case llvm::Intrinsic::x86_flags_read_u64:
26870 case llvm::Intrinsic::x86_flags_write_u32:
26871 case llvm::Intrinsic::x86_flags_write_u64: {
26872 // We need a frame pointer because this will get lowered to a PUSH/POP
26873 // sequence.
26876 // Don't do anything here, we will expand these intrinsics out later
26877 // during FinalizeISel in EmitInstrWithCustomInserter.
26878 return Op;
26879 }
26880 case Intrinsic::x86_lwpins32:
26881 case Intrinsic::x86_lwpins64:
26882 case Intrinsic::x86_umwait:
26883 case Intrinsic::x86_tpause: {
26884 SDLoc dl(Op);
26885 SDValue Chain = Op->getOperand(0);
26886 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26887 unsigned Opcode;
26888
26889 switch (IntNo) {
26890 default: llvm_unreachable("Impossible intrinsic");
26891 case Intrinsic::x86_umwait:
26892 Opcode = X86ISD::UMWAIT;
26893 break;
26894 case Intrinsic::x86_tpause:
26895 Opcode = X86ISD::TPAUSE;
26896 break;
26897 case Intrinsic::x86_lwpins32:
26898 case Intrinsic::x86_lwpins64:
26899 Opcode = X86ISD::LWPINS;
26900 break;
26901 }
26902
26904 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26905 Op->getOperand(3), Op->getOperand(4));
26906 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26907 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26908 Operation.getValue(1));
26909 }
26910 case Intrinsic::x86_enqcmd:
26911 case Intrinsic::x86_enqcmds: {
26912 SDLoc dl(Op);
26913 SDValue Chain = Op.getOperand(0);
26914 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26915 unsigned Opcode;
26916 switch (IntNo) {
26917 default: llvm_unreachable("Impossible intrinsic!");
26918 case Intrinsic::x86_enqcmd:
26919 Opcode = X86ISD::ENQCMD;
26920 break;
26921 case Intrinsic::x86_enqcmds:
26922 Opcode = X86ISD::ENQCMDS;
26923 break;
26924 }
26925 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26926 Op.getOperand(3));
26927 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26928 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26929 Operation.getValue(1));
26930 }
26931 case Intrinsic::x86_aesenc128kl:
26932 case Intrinsic::x86_aesdec128kl:
26933 case Intrinsic::x86_aesenc256kl:
26934 case Intrinsic::x86_aesdec256kl: {
26935 SDLoc DL(Op);
26936 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26937 SDValue Chain = Op.getOperand(0);
26938 unsigned Opcode;
26939
26940 switch (IntNo) {
26941 default: llvm_unreachable("Impossible intrinsic");
26942 case Intrinsic::x86_aesenc128kl:
26943 Opcode = X86ISD::AESENC128KL;
26944 break;
26945 case Intrinsic::x86_aesdec128kl:
26946 Opcode = X86ISD::AESDEC128KL;
26947 break;
26948 case Intrinsic::x86_aesenc256kl:
26949 Opcode = X86ISD::AESENC256KL;
26950 break;
26951 case Intrinsic::x86_aesdec256kl:
26952 Opcode = X86ISD::AESDEC256KL;
26953 break;
26954 }
26955
26956 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26957 MachineMemOperand *MMO = MemIntr->getMemOperand();
26958 EVT MemVT = MemIntr->getMemoryVT();
26960 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26961 MMO);
26962 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26963
26964 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26965 {ZF, Operation.getValue(0), Operation.getValue(2)});
26966 }
26967 case Intrinsic::x86_aesencwide128kl:
26968 case Intrinsic::x86_aesdecwide128kl:
26969 case Intrinsic::x86_aesencwide256kl:
26970 case Intrinsic::x86_aesdecwide256kl: {
26971 SDLoc DL(Op);
26972 SDVTList VTs = DAG.getVTList(
26973 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26974 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26975 SDValue Chain = Op.getOperand(0);
26976 unsigned Opcode;
26977
26978 switch (IntNo) {
26979 default: llvm_unreachable("Impossible intrinsic");
26980 case Intrinsic::x86_aesencwide128kl:
26981 Opcode = X86ISD::AESENCWIDE128KL;
26982 break;
26983 case Intrinsic::x86_aesdecwide128kl:
26984 Opcode = X86ISD::AESDECWIDE128KL;
26985 break;
26986 case Intrinsic::x86_aesencwide256kl:
26987 Opcode = X86ISD::AESENCWIDE256KL;
26988 break;
26989 case Intrinsic::x86_aesdecwide256kl:
26990 Opcode = X86ISD::AESDECWIDE256KL;
26991 break;
26992 }
26993
26994 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26995 MachineMemOperand *MMO = MemIntr->getMemOperand();
26996 EVT MemVT = MemIntr->getMemoryVT();
26998 Opcode, DL, VTs,
26999 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27000 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27001 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27002 MemVT, MMO);
27003 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27004
27005 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27006 {ZF, Operation.getValue(1), Operation.getValue(2),
27007 Operation.getValue(3), Operation.getValue(4),
27008 Operation.getValue(5), Operation.getValue(6),
27009 Operation.getValue(7), Operation.getValue(8),
27010 Operation.getValue(9)});
27011 }
27012 case Intrinsic::x86_testui: {
27013 SDLoc dl(Op);
27014 SDValue Chain = Op.getOperand(0);
27015 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27016 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27017 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27018 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27019 Operation.getValue(1));
27020 }
27021 case Intrinsic::x86_atomic_bts_rm:
27022 case Intrinsic::x86_atomic_btc_rm:
27023 case Intrinsic::x86_atomic_btr_rm: {
27024 SDLoc DL(Op);
27025 MVT VT = Op.getSimpleValueType();
27026 SDValue Chain = Op.getOperand(0);
27027 SDValue Op1 = Op.getOperand(2);
27028 SDValue Op2 = Op.getOperand(3);
27029 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27030 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27032 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27033 SDValue Res =
27034 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27035 {Chain, Op1, Op2}, VT, MMO);
27036 Chain = Res.getValue(1);
27037 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27038 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27039 }
27040 case Intrinsic::x86_atomic_bts:
27041 case Intrinsic::x86_atomic_btc:
27042 case Intrinsic::x86_atomic_btr: {
27043 SDLoc DL(Op);
27044 MVT VT = Op.getSimpleValueType();
27045 SDValue Chain = Op.getOperand(0);
27046 SDValue Op1 = Op.getOperand(2);
27047 SDValue Op2 = Op.getOperand(3);
27048 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27049 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27050 : X86ISD::LBTR;
27051 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27053 SDValue Res =
27054 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27055 {Chain, Op1, Op2, Size}, VT, MMO);
27056 Chain = Res.getValue(1);
27057 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27058 unsigned Imm = Op2->getAsZExtVal();
27059 if (Imm)
27060 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27061 DAG.getShiftAmountConstant(Imm, VT, DL));
27062 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27063 }
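// Illustrative note (a hedged sketch): for a constant bit index N, the lowering
// above emits LBTS/LBTC/LBTR, converts CF (the original bit value) to 0/1 with
// SETB, and shifts it left by N so the returned value carries the tested bit in
// its original position, matching the pre-lowering bit-test pattern.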
27064 case Intrinsic::x86_cmpccxadd32:
27065 case Intrinsic::x86_cmpccxadd64: {
27066 SDLoc DL(Op);
27067 SDValue Chain = Op.getOperand(0);
27068 SDValue Addr = Op.getOperand(2);
27069 SDValue Src1 = Op.getOperand(3);
27070 SDValue Src2 = Op.getOperand(4);
27071 SDValue CC = Op.getOperand(5);
27072 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27074 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27075 MVT::i32, MMO);
27076 return Operation;
27077 }
27078 case Intrinsic::x86_aadd32:
27079 case Intrinsic::x86_aadd64:
27080 case Intrinsic::x86_aand32:
27081 case Intrinsic::x86_aand64:
27082 case Intrinsic::x86_aor32:
27083 case Intrinsic::x86_aor64:
27084 case Intrinsic::x86_axor32:
27085 case Intrinsic::x86_axor64: {
27086 SDLoc DL(Op);
27087 SDValue Chain = Op.getOperand(0);
27088 SDValue Op1 = Op.getOperand(2);
27089 SDValue Op2 = Op.getOperand(3);
27090 MVT VT = Op2.getSimpleValueType();
27091 unsigned Opc = 0;
27092 switch (IntNo) {
27093 default:
27094 llvm_unreachable("Unknown Intrinsic");
27095 case Intrinsic::x86_aadd32:
27096 case Intrinsic::x86_aadd64:
27097 Opc = X86ISD::AADD;
27098 break;
27099 case Intrinsic::x86_aand32:
27100 case Intrinsic::x86_aand64:
27101 Opc = X86ISD::AAND;
27102 break;
27103 case Intrinsic::x86_aor32:
27104 case Intrinsic::x86_aor64:
27105 Opc = X86ISD::AOR;
27106 break;
27107 case Intrinsic::x86_axor32:
27108 case Intrinsic::x86_axor64:
27109 Opc = X86ISD::AXOR;
27110 break;
27111 }
27112 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27113 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27114 {Chain, Op1, Op2}, VT, MMO);
27115 }
27116 case Intrinsic::x86_atomic_add_cc:
27117 case Intrinsic::x86_atomic_sub_cc:
27118 case Intrinsic::x86_atomic_or_cc:
27119 case Intrinsic::x86_atomic_and_cc:
27120 case Intrinsic::x86_atomic_xor_cc: {
27121 SDLoc DL(Op);
27122 SDValue Chain = Op.getOperand(0);
27123 SDValue Op1 = Op.getOperand(2);
27124 SDValue Op2 = Op.getOperand(3);
27125 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27126 MVT VT = Op2.getSimpleValueType();
27127 unsigned Opc = 0;
27128 switch (IntNo) {
27129 default:
27130 llvm_unreachable("Unknown Intrinsic");
27131 case Intrinsic::x86_atomic_add_cc:
27132 Opc = X86ISD::LADD;
27133 break;
27134 case Intrinsic::x86_atomic_sub_cc:
27135 Opc = X86ISD::LSUB;
27136 break;
27137 case Intrinsic::x86_atomic_or_cc:
27138 Opc = X86ISD::LOR;
27139 break;
27140 case Intrinsic::x86_atomic_and_cc:
27141 Opc = X86ISD::LAND;
27142 break;
27143 case Intrinsic::x86_atomic_xor_cc:
27144 Opc = X86ISD::LXOR;
27145 break;
27146 }
27147 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27148 SDValue LockArith =
27149 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27150 {Chain, Op1, Op2}, VT, MMO);
27151 Chain = LockArith.getValue(1);
27152 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27153 }
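// Illustrative note (a hedged sketch): x86_atomic_add_cc and friends become a
// flag-producing locked RMW node (LADD/LSUB/LOR/LAND/LXOR) whose EFLAGS result
// feeds a SETcc for the condition code in operand 4, so the intrinsic returns
// only the requested flag rather than the old memory value.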
27154 }
27155 return SDValue();
27156 }
27157
27158 SDLoc dl(Op);
27159 switch(IntrData->Type) {
27160 default: llvm_unreachable("Unknown Intrinsic Type");
27161 case RDSEED:
27162 case RDRAND: {
27163 // Emit the node with the right value type.
27164 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27165 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27166
27167 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27168 // Otherwise return the value from Rand, which is always 0, cast to i32.
27169 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27170 DAG.getConstant(1, dl, Op->getValueType(1)),
27171 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27172 SDValue(Result.getNode(), 1)};
27173 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27174
27175 // Return { result, isValid, chain }.
27176 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27177 SDValue(Result.getNode(), 2));
27178 }
27179 case GATHER_AVX2: {
27180 SDValue Chain = Op.getOperand(0);
27181 SDValue Src = Op.getOperand(2);
27182 SDValue Base = Op.getOperand(3);
27183 SDValue Index = Op.getOperand(4);
27184 SDValue Mask = Op.getOperand(5);
27185 SDValue Scale = Op.getOperand(6);
27186 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27187 Scale, Chain, Subtarget);
27188 }
27189 case GATHER: {
27190 // gather(v1, mask, index, base, scale);
27191 SDValue Chain = Op.getOperand(0);
27192 SDValue Src = Op.getOperand(2);
27193 SDValue Base = Op.getOperand(3);
27194 SDValue Index = Op.getOperand(4);
27195 SDValue Mask = Op.getOperand(5);
27196 SDValue Scale = Op.getOperand(6);
27197 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27198 Chain, Subtarget);
27199 }
27200 case SCATTER: {
27201 // scatter(base, mask, index, v1, scale);
27202 SDValue Chain = Op.getOperand(0);
27203 SDValue Base = Op.getOperand(2);
27204 SDValue Mask = Op.getOperand(3);
27205 SDValue Index = Op.getOperand(4);
27206 SDValue Src = Op.getOperand(5);
27207 SDValue Scale = Op.getOperand(6);
27208 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27209 Scale, Chain, Subtarget);
27210 }
27211 case PREFETCH: {
27212 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27213 assert((HintVal == 2 || HintVal == 3) &&
27214 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27215 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27216 SDValue Chain = Op.getOperand(0);
27217 SDValue Mask = Op.getOperand(2);
27218 SDValue Index = Op.getOperand(3);
27219 SDValue Base = Op.getOperand(4);
27220 SDValue Scale = Op.getOperand(5);
27221 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27222 Subtarget);
27223 }
27224 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27225 case RDTSC: {
27227 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27228 Results);
27229 return DAG.getMergeValues(Results, dl);
27230 }
27231 // Read Performance Monitoring Counters.
27232 case RDPMC:
27233 // Read Processor Register.
27234 case RDPRU:
27235 // GetExtended Control Register.
27236 case XGETBV: {
27238
27239 // RDPMC uses ECX to select the index of the performance counter to read.
27240 // RDPRU uses ECX to select the processor register to read.
27241 // XGETBV uses ECX to select the index of the XCR register to return.
27242 // The result is stored into registers EDX:EAX.
27243 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27244 Subtarget, Results);
27245 return DAG.getMergeValues(Results, dl);
27246 }
27247 // XTEST intrinsics.
27248 case XTEST: {
27249 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27250 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27251
27252 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27253 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27254 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27255 Ret, SDValue(InTrans.getNode(), 1));
27256 }
27257 case TRUNCATE_TO_MEM_VI8:
27258 case TRUNCATE_TO_MEM_VI16:
27259 case TRUNCATE_TO_MEM_VI32: {
27260 SDValue Mask = Op.getOperand(4);
27261 SDValue DataToTruncate = Op.getOperand(3);
27262 SDValue Addr = Op.getOperand(2);
27263 SDValue Chain = Op.getOperand(0);
27264
27265 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27266 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27267
27268 EVT MemVT = MemIntr->getMemoryVT();
27269
27270 uint16_t TruncationOp = IntrData->Opc0;
27271 switch (TruncationOp) {
27272 case X86ISD::VTRUNC: {
27273 if (isAllOnesConstant(Mask)) // return just a truncate store
27274 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27275 MemIntr->getMemOperand());
27276
27277 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27278 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27279 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27280
27281 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27282 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27283 true /* truncating */);
27284 }
27285 case X86ISD::VTRUNCUS:
27286 case X86ISD::VTRUNCS: {
27287 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27288 if (isAllOnesConstant(Mask))
27289 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27290 MemIntr->getMemOperand(), DAG);
27291
27292 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27293 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27294
27295 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27296 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27297 }
27298 default:
27299 llvm_unreachable("Unsupported truncstore intrinsic");
27300 }
27301 }
27302 }
27303}
27304
27305SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27306 SelectionDAG &DAG) const {
27308 MFI.setReturnAddressIsTaken(true);
27309
27311 return SDValue();
27312
27313 unsigned Depth = Op.getConstantOperandVal(0);
27314 SDLoc dl(Op);
27315 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27316
27317 if (Depth > 0) {
27318 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27319 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27320 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27321 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27322 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27324 }
27325
27326 // Just load the return address.
27327 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27328 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27330}
27331
27332SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27333 SelectionDAG &DAG) const {
27335 return getReturnAddressFrameIndex(DAG);
27336}
27337
27338SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27340 MachineFrameInfo &MFI = MF.getFrameInfo();
27342 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27343 EVT VT = Op.getValueType();
27344
27345 MFI.setFrameAddressIsTaken(true);
27346
27347 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27348 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27349 // is not possible to crawl up the stack without looking at the unwind codes
27350 // simultaneously.
27351 int FrameAddrIndex = FuncInfo->getFAIndex();
27352 if (!FrameAddrIndex) {
27353 // Set up a frame object for the return address.
27354 unsigned SlotSize = RegInfo->getSlotSize();
27355 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27356 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27357 FuncInfo->setFAIndex(FrameAddrIndex);
27358 }
27359 return DAG.getFrameIndex(FrameAddrIndex, VT);
27360 }
27361
27362 unsigned FrameReg =
27363 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27364 SDLoc dl(Op); // FIXME probably not meaningful
27365 unsigned Depth = Op.getConstantOperandVal(0);
27366 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27367 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27368 "Invalid Frame Register!");
27369 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27370 while (Depth--)
27371 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27373 return FrameAddr;
27374}
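// Illustrative example (a hedged sketch): outside the Windows-CFI path,
//   %fp = call ptr @llvm.frameaddress.p0(i32 1)
// copies RBP/EBP out of the entry node and then performs one load per level of
// depth, walking the chain of saved frame pointers.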
27375
27376// FIXME? Maybe this could be a TableGen attribute on some registers and
27377// this table could be generated automatically from RegInfo.
27379 const MachineFunction &MF) const {
27380 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27381
27383 .Case("esp", X86::ESP)
27384 .Case("rsp", X86::RSP)
27385 .Case("ebp", X86::EBP)
27386 .Case("rbp", X86::RBP)
27387 .Case("r14", X86::R14)
27388 .Case("r15", X86::R15)
27389 .Default(0);
27390
27391 if (Reg == X86::EBP || Reg == X86::RBP) {
27392 if (!TFI.hasFP(MF))
27393 report_fatal_error("register " + StringRef(RegName) +
27394 " is allocatable: function has no frame pointer");
27395#ifndef NDEBUG
27396 else {
27397 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27398 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27399 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27400 "Invalid Frame Register!");
27401 }
27402#endif
27403 }
27404
27405 if (Reg)
27406 return Reg;
27407
27408 report_fatal_error("Invalid register name global variable");
27409}
27410
27411SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27412 SelectionDAG &DAG) const {
27413 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27414 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27415}
27416
27418 const Constant *PersonalityFn) const {
27419 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27420 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27421
27422 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27423}
27424
27426 const Constant *PersonalityFn) const {
27427 // Funclet personalities don't use selectors (the runtime does the selection).
27429 return X86::NoRegister;
27430 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27431}
27432
27434 return Subtarget.isTargetWin64();
27435}
27436
27437SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27438 SDValue Chain = Op.getOperand(0);
27439 SDValue Offset = Op.getOperand(1);
27440 SDValue Handler = Op.getOperand(2);
27441 SDLoc dl (Op);
27442
27443 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27444 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27445 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27446 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27447 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27448 "Invalid Frame Register!");
27449 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27450 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27451
27452 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27453 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27454 dl));
27455 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27456 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27457 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27458
27459 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27460 DAG.getRegister(StoreAddrReg, PtrVT));
27461}
27462
27463SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27464 SelectionDAG &DAG) const {
27465 SDLoc DL(Op);
27466 // If the subtarget is not 64-bit, we may need the global base reg
27467 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
27468 // Therefore, ask for the GlobalBaseReg now, so that the pass
27469 // inserts the code for us in case we need it.
27470 // Otherwise, we will end up in a situation where we will
27471 // reference a virtual register that is not defined!
27472 if (!Subtarget.is64Bit()) {
27473 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27474 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27475 }
27476 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27477 DAG.getVTList(MVT::i32, MVT::Other),
27478 Op.getOperand(0), Op.getOperand(1));
27479}
27480
27481SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27482 SelectionDAG &DAG) const {
27483 SDLoc DL(Op);
27484 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27485 Op.getOperand(0), Op.getOperand(1));
27486}
27487
27488SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27489 SelectionDAG &DAG) const {
27490 SDLoc DL(Op);
27491 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27492 Op.getOperand(0));
27493}
27494
27495 SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
27496 return Op.getOperand(0);
27497}
27498
27499SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27500 SelectionDAG &DAG) const {
27501 SDValue Root = Op.getOperand(0);
27502 SDValue Trmp = Op.getOperand(1); // trampoline
27503 SDValue FPtr = Op.getOperand(2); // nested function
27504 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27505 SDLoc dl (Op);
27506
27507 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27508 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27509
27510 if (Subtarget.is64Bit()) {
27511 SDValue OutChains[6];
27512
27513 // Large code-model.
27514 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27515 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27516
27517 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27518 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27519
27520 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27521
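// Illustrative summary (not in the original source): the stores below emit a
// 23-byte trampoline; the little-endian i16 stores interleave the REX prefix
// with the opcode byte:
//   +0:  49 BB <FPtr imm64>   movabsq <nested function>, %r11
//   +10: 49 BA <Nest imm64>   movabsq <nest value>, %r10
//   +20: 49 FF E3             jmpq    *%r11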
27522 // Load the pointer to the nested function into R11.
27523 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27524 SDValue Addr = Trmp;
27525 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27526 Addr, MachinePointerInfo(TrmpAddr));
27527
27528 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27529 DAG.getConstant(2, dl, MVT::i64));
27530 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27531 MachinePointerInfo(TrmpAddr, 2), Align(2));
27532
27533 // Load the 'nest' parameter value into R10.
27534 // R10 is specified in X86CallingConv.td
27535 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27536 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27537 DAG.getConstant(10, dl, MVT::i64));
27538 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27539 Addr, MachinePointerInfo(TrmpAddr, 10));
27540
27541 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27542 DAG.getConstant(12, dl, MVT::i64));
27543 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27544 MachinePointerInfo(TrmpAddr, 12), Align(2));
27545
27546 // Jump to the nested function.
27547 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27548 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27549 DAG.getConstant(20, dl, MVT::i64));
27550 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27551 Addr, MachinePointerInfo(TrmpAddr, 20));
27552
27553 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27554 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27555 DAG.getConstant(22, dl, MVT::i64));
27556 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27557 Addr, MachinePointerInfo(TrmpAddr, 22));
27558
27559 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27560 } else {
27561 const Function *Func =
27562 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27563 CallingConv::ID CC = Func->getCallingConv();
27564 unsigned NestReg;
27565
27566 switch (CC) {
27567 default:
27568 llvm_unreachable("Unsupported calling convention");
27569 case CallingConv::C:
27570 case CallingConv::X86_StdCall: {
27571 // Pass 'nest' parameter in ECX.
27572 // Must be kept in sync with X86CallingConv.td
27573 NestReg = X86::ECX;
27574
27575 // Check that ECX wasn't needed by an 'inreg' parameter.
27576 FunctionType *FTy = Func->getFunctionType();
27577 const AttributeList &Attrs = Func->getAttributes();
27578
27579 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27580 unsigned InRegCount = 0;
27581 unsigned Idx = 0;
27582
27583 for (FunctionType::param_iterator I = FTy->param_begin(),
27584 E = FTy->param_end(); I != E; ++I, ++Idx)
27585 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27586 const DataLayout &DL = DAG.getDataLayout();
27587 // FIXME: should only count parameters that are lowered to integers.
27588 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27589 }
27590
27591 if (InRegCount > 2) {
27592 report_fatal_error("Nest register in use - reduce number of inreg"
27593 " parameters!");
27594 }
27595 }
27596 break;
27597 }
27598 case CallingConv::X86_FastCall:
27599 case CallingConv::X86_ThisCall:
27600 case CallingConv::Fast:
27601 case CallingConv::Tail:
27602 case CallingConv::SwiftTail:
27603 // Pass 'nest' parameter in EAX.
27604 // Must be kept in sync with X86CallingConv.td
27605 NestReg = X86::EAX;
27606 break;
27607 }
27608
27609 SDValue OutChains[4];
27610 SDValue Addr, Disp;
27611
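// Illustrative summary (not in the original source): the stores below emit a
// 10-byte trampoline:
//   +0: B8+r <Nest imm32>         movl $<nest value>, %ecx (or %eax)
//   +5: E9 <FPtr - (Trmp + 10)>   jmp  <nested function>  (rel32 from the
//                                 end of the jmp instruction)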
27612 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27613 DAG.getConstant(10, dl, MVT::i32));
27614 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27615
27616 // This is storing the opcode for MOV32ri.
27617 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27618 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27619 OutChains[0] =
27620 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27621 Trmp, MachinePointerInfo(TrmpAddr));
27622
27623 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27624 DAG.getConstant(1, dl, MVT::i32));
27625 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27626 MachinePointerInfo(TrmpAddr, 1), Align(1));
27627
27628 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27629 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27630 DAG.getConstant(5, dl, MVT::i32));
27631 OutChains[2] =
27632 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27633 MachinePointerInfo(TrmpAddr, 5), Align(1));
27634
27635 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27636 DAG.getConstant(6, dl, MVT::i32));
27637 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27638 MachinePointerInfo(TrmpAddr, 6), Align(1));
27639
27640 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27641 }
27642}
27643
27644SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27645 SelectionDAG &DAG) const {
27646 /*
27647 The rounding mode is in bits 11:10 of FPSR, and has the following
27648 settings:
27649 00 Round to nearest
27650 01 Round to -inf
27651 10 Round to +inf
27652 11 Round to 0
27653
27654 GET_ROUNDING, on the other hand, expects the following:
27655 -1 Undefined
27656 0 Round to 0
27657 1 Round to nearest
27658 2 Round to +inf
27659 3 Round to -inf
27660
27661 To perform the conversion, we use a packed lookup table of the four 2-bit
27662 values that we can index by FPSR[11:10]
27663 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27664
27665 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27666 */
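// Worked example (illustrative): if FPSR[11:10] == 01 (round to -inf), then
// (FPSR & 0xc00) >> 9 == 2 and (0x2d >> 2) & 3 == 3, which is the
// GET_ROUNDING value for "round to -inf".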
27667
27668 MachineFunction &MF = DAG.getMachineFunction();
27669 MVT VT = Op.getSimpleValueType();
27670 SDLoc DL(Op);
27671
27672 // Save FP Control Word to stack slot
27673 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27674 SDValue StackSlot =
27675 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27676
27677 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27678
27679 SDValue Chain = Op.getOperand(0);
27680 SDValue Ops[] = {Chain, StackSlot};
27681 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27682 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27683 MachineMemOperand::MOStore);
27684
27685 // Load FP Control Word from stack slot
27686 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27687 Chain = CWD.getValue(1);
27688
27689 // Mask and turn the control bits into a shift for the lookup table.
27690 SDValue Shift =
27691 DAG.getNode(ISD::SRL, DL, MVT::i16,
27692 DAG.getNode(ISD::AND, DL, MVT::i16,
27693 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27694 DAG.getConstant(9, DL, MVT::i8));
27695 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27696
27697 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27698 SDValue RetVal =
27699 DAG.getNode(ISD::AND, DL, MVT::i32,
27700 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27701 DAG.getConstant(3, DL, MVT::i32));
27702
27703 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27704
27705 return DAG.getMergeValues({RetVal, Chain}, DL);
27706}
27707
27708SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27709 SelectionDAG &DAG) const {
27710 MachineFunction &MF = DAG.getMachineFunction();
27711 SDLoc DL(Op);
27712 SDValue Chain = Op.getNode()->getOperand(0);
27713
27714 // FP control word may be set only from data in memory. So we need to allocate
27715 // stack space to save/load FP control word.
27716 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27717 SDValue StackSlot =
27718 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27719 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27720 MachineMemOperand *MMO =
27721 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27722
27723 // Store FP control word into memory.
27724 SDValue Ops[] = {Chain, StackSlot};
27725 Chain = DAG.getMemIntrinsicNode(
27726 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27727
27728 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27729 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27730 Chain = CWD.getValue(1);
27731 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27732 DAG.getConstant(0xf3ff, DL, MVT::i16));
27733
27734 // Calculate new rounding mode.
27735 SDValue NewRM = Op.getNode()->getOperand(1);
27736 SDValue RMBits;
27737 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27738 uint64_t RM = CVal->getZExtValue();
27739 int FieldVal;
27740 switch (static_cast<RoundingMode>(RM)) {
27741 // clang-format off
27742 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27743 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27744 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27745 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27746 default:
27747 llvm_unreachable("rounding mode is not supported by X86 hardware");
27748 // clang-format on
27749 }
27750 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27751 } else {
27752 // Need to convert argument into bits of control word:
27753 // 0 Round to 0 -> 11
27754 // 1 Round to nearest -> 00
27755 // 2 Round to +inf -> 10
27756 // 3 Round to -inf -> 01
27757 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27758 // To make the conversion, put all these values into a value 0xc9 and shift
27759 // it left depending on the rounding mode:
27760 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27761 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27762 // ...
27763 // (0xc9 << (2 * NewRM + 4)) & 0xc00
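// Worked example (illustrative): NewRM == 2 (round to +inf) gives a shift of
// 2 * 2 + 4 == 8, and (0xc9 << 8) & 0xc00 == 0x800, i.e. bits 11:10 == 10,
// the x87 RC encoding for "round to +inf".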
27764 SDValue ShiftValue =
27765 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27766 DAG.getNode(ISD::ADD, DL, MVT::i32,
27767 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27768 DAG.getConstant(1, DL, MVT::i8)),
27769 DAG.getConstant(4, DL, MVT::i32)));
27770 SDValue Shifted =
27771 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27772 ShiftValue);
27773 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27774 DAG.getConstant(0xc00, DL, MVT::i16));
27775 }
27776
27777 // Update rounding mode bits and store the new FP Control Word into stack.
27778 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27779 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27780
27781 // Load FP control word from the slot.
27782 SDValue OpsLD[] = {Chain, StackSlot};
27783 MachineMemOperand *MMOL =
27784 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27785 Chain = DAG.getMemIntrinsicNode(
27786 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27787
27788 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27789 // same way but in bits 14:13.
27790 if (Subtarget.hasSSE1()) {
27791 // Store MXCSR into memory.
27792 Chain = DAG.getNode(
27793 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27794 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27795 StackSlot);
27796
27797 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27798 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27799 Chain = CWD.getValue(1);
27800 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27801 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27802
27803 // Shift X87 RM bits from 11:10 to 14:13.
27804 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27805 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27806 DAG.getConstant(3, DL, MVT::i8));
27807
27808 // Update rounding mode bits and store the new FP Control Word into stack.
27809 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27810 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27811
27812 // Load MXCSR from the slot.
27813 Chain = DAG.getNode(
27814 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27815 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27816 StackSlot);
27817 }
27818
27819 return Chain;
27820}
27821
27822const unsigned X87StateSize = 28;
27823const unsigned FPStateSize = 32;
27824[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
27825
27826SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27827 SelectionDAG &DAG) const {
27828 MachineFunction &MF = DAG.getMachineFunction();
27829 SDLoc DL(Op);
27830 SDValue Chain = Op->getOperand(0);
27831 SDValue Ptr = Op->getOperand(1);
27832 auto *Node = cast<FPStateAccessSDNode>(Op);
27833 EVT MemVT = Node->getMemoryVT();
27834 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected memory size");
27835 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27836
27837 // Get the x87 state, if present.
27838 if (Subtarget.hasX87()) {
27839 Chain =
27840 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27841 {Chain, Ptr}, MemVT, MMO);
27842
27843 // FNSTENV changes the exception mask, so load back the stored environment.
27844 MachineMemOperand::Flags NewFlags =
27845 MachineMemOperand::MOLoad |
27846 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27847 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27848 Chain =
27849 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27850 {Chain, Ptr}, MemVT, MMO);
27851 }
27852
27853 // If target supports SSE, get MXCSR as well.
27854 if (Subtarget.hasSSE1()) {
27855 // Get pointer to the MXCSR location in memory.
27856 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27857 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27858 DAG.getConstant(X87StateSize, DL, PtrVT));
27859 // Store MXCSR into memory.
27860 Chain = DAG.getNode(
27861 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27862 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27863 MXCSRAddr);
27864 }
27865
27866 return Chain;
27867}
27868
27869 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
27870 EVT MemVT, MachineMemOperand *MMO,
27871 SelectionDAG &DAG,
27872 const X86Subtarget &Subtarget) {
27873 // Set the x87 state, if present.
27874 if (Subtarget.hasX87())
27875 Chain =
27876 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27877 {Chain, Ptr}, MemVT, MMO);
27878 // If target supports SSE, set MXCSR as well.
27879 if (Subtarget.hasSSE1()) {
27880 // Get pointer to the MXCSR location in memory.
27881 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27882 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27883 DAG.getConstant(X87StateSize, DL, PtrVT));
27884 // Load MXCSR from memory.
27885 Chain = DAG.getNode(
27886 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27887 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27888 MXCSRAddr);
27889 }
27890 return Chain;
27891}
27892
27893SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27894 SelectionDAG &DAG) const {
27895 SDLoc DL(Op);
27896 SDValue Chain = Op->getOperand(0);
27897 SDValue Ptr = Op->getOperand(1);
27898 auto *Node = cast<FPStateAccessSDNode>(Op);
27899 EVT MemVT = Node->getMemoryVT();
27900 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected memory size");
27901 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27902 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27903}
27904
27905SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27906 SelectionDAG &DAG) const {
27907 MachineFunction &MF = DAG.getMachineFunction();
27908 SDLoc DL(Op);
27909 SDValue Chain = Op.getNode()->getOperand(0);
27910
27911 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27912 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27913 SmallVector<Constant *, 8> FPEnvVals;
27914
27915 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
27916 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27917 // for compatibility with glibc.
27918 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27919 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27920 Constant *Zero = ConstantInt::get(ItemTy, 0);
27921 for (unsigned I = 0; I < 6; ++I)
27922 FPEnvVals.push_back(Zero);
27923
27924 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
27925 // all exception flags, and sets DAZ and FTZ to 0.
27926 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27927 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27928 MVT PtrVT = getPointerTy(DAG.getDataLayout());
27929 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27930 MachinePointerInfo MPI =
27931 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27932 MachineMemOperand *MMO = MF.getMachineMemOperand(
27933 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
27934
27935 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27936}
27937
27938/// Lower a vector CTLZ using native supported vector CTLZ instruction.
27939//
27940// i8/i16 vector implemented using dword LZCNT vector instruction
27941// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27942 // split the vector, perform the operation on its Lo and Hi parts and
27943// concatenate the results.
27944 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27945 const X86Subtarget &Subtarget) {
27946 assert(Op.getOpcode() == ISD::CTLZ);
27947 SDLoc dl(Op);
27948 MVT VT = Op.getSimpleValueType();
27949 MVT EltVT = VT.getVectorElementType();
27950 unsigned NumElems = VT.getVectorNumElements();
27951
27952 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27953 "Unsupported element type");
27954
27955 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27956 if (NumElems > 16 ||
27957 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27958 return splitVectorIntUnary(Op, DAG, dl);
27959
27960 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27961 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27962 "Unsupported value type for operation");
27963
27964 // Use native supported vector instruction vplzcntd.
27965 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27966 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27967 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27968 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27969
27970 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27971}
27972
27973// Lower CTLZ using a PSHUFB lookup table implementation.
27974 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27975 const X86Subtarget &Subtarget,
27976 SelectionDAG &DAG) {
27977 MVT VT = Op.getSimpleValueType();
27978 int NumElts = VT.getVectorNumElements();
27979 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27980 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27981
27982 // Per-nibble leading zero PSHUFB lookup table.
27983 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27984 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27985 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27986 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
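// Worked example (illustrative): for the byte 0x3a the hi nibble is 0x3, so
// HiZ is false, the lo lookup is discarded and the result is LUT[0x3] == 2 ==
// ctlz8(0x3a); for 0x05 the hi nibble is zero, so both lookups are added:
// LUT[0x0] + LUT[0x5] == 4 + 1 == 5 == ctlz8(0x05).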
27987
27988 SmallVector<SDValue, 64> LUTVec;
27989 for (int i = 0; i < NumBytes; ++i)
27990 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27991 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27992
27993 // Begin by bitcasting the input to byte vector, then split those bytes
27994 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27995 // If the hi input nibble is zero then we add both results together, otherwise
27996 // we just take the hi result (by masking the lo result to zero before the
27997 // add).
27998 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27999 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28000
28001 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28002 SDValue Lo = Op0;
28003 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28004 SDValue HiZ;
28005 if (CurrVT.is512BitVector()) {
28006 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28007 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28008 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28009 } else {
28010 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28011 }
28012
28013 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28014 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28015 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28016 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28017
28018 // Merge result back from vXi8 back to VT, working on the lo/hi halves
28019 // of the current vector width in the same way we did for the nibbles.
28020 // If the upper half of the input element is zero then add the halves'
28021 // leading zero counts together, otherwise just use the upper half's.
28022 // Double the width of the result until we are at target width.
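// Worked example (illustrative) of the first i8 -> i16 step: for the element
// 0x00f3 the upper byte is zero, so its count (8) and the lower byte's count
// (0) are added, giving ctlz16(0x00f3) == 8; for 0x2300 only the upper byte's
// count is used, giving ctlz16(0x2300) == 2.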
28023 while (CurrVT != VT) {
28024 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28025 int CurrNumElts = CurrVT.getVectorNumElements();
28026 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28027 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28028 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28029
28030 // Check if the upper half of the input element is zero.
28031 if (CurrVT.is512BitVector()) {
28032 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28033 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28034 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28035 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28036 } else {
28037 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28038 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28039 }
28040 HiZ = DAG.getBitcast(NextVT, HiZ);
28041
28042 // Move the upper/lower halves to the lower bits as we'll be extending to
28043 // NextVT. Mask the lower result to zero unless the upper half is zero (HiZ
28044 // set) and add the results together.
28045 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28046 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28047 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28048 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28049 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28050 CurrVT = NextVT;
28051 }
28052
28053 return Res;
28054}
28055
28056 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28057 const X86Subtarget &Subtarget,
28058 SelectionDAG &DAG) {
28059 MVT VT = Op.getSimpleValueType();
28060
28061 if (Subtarget.hasCDI() &&
28062 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28063 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28064 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28065
28066 // Decompose 256-bit ops into smaller 128-bit ops.
28067 if (VT.is256BitVector() && !Subtarget.hasInt256())
28068 return splitVectorIntUnary(Op, DAG, DL);
28069
28070 // Decompose 512-bit ops into smaller 256-bit ops.
28071 if (VT.is512BitVector() && !Subtarget.hasBWI())
28072 return splitVectorIntUnary(Op, DAG, DL);
28073
28074 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28075 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28076}
28077
28078static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28079 SelectionDAG &DAG) {
28080 MVT VT = Op.getSimpleValueType();
28081 MVT OpVT = VT;
28082 unsigned NumBits = VT.getSizeInBits();
28083 SDLoc dl(Op);
28084 unsigned Opc = Op.getOpcode();
28085
28086 if (VT.isVector())
28087 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28088
28089 Op = Op.getOperand(0);
28090 if (VT == MVT::i8) {
28091 // Zero extend to i32 since there is not an i8 bsr.
28092 OpVT = MVT::i32;
28093 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28094 }
28095
28096 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28097 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28098 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28099
28100 if (Opc == ISD::CTLZ) {
28101 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28102 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28103 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28104 Op.getValue(1)};
28105 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28106 }
28107
28108 // Finally xor with NumBits-1.
28109 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28110 DAG.getConstant(NumBits - 1, dl, OpVT));
28111
28112 if (VT == MVT::i8)
28113 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28114 return Op;
28115}
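// Illustrative scalar sketch (not in the original source) of the i32 lowering
// above, where bsr() stands for the hardware bit-scan-reverse result on a
// non-zero input:
//   unsigned Idx = X ? bsr(X) : 2 * 32 - 1; // CMOV substitutes 63 when X == 0.
//   return Idx ^ (32 - 1);                  // ctlz(X); 63 ^ 31 == 32 for X == 0.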
28116
28117static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28118 SelectionDAG &DAG) {
28119 MVT VT = Op.getSimpleValueType();
28120 unsigned NumBits = VT.getScalarSizeInBits();
28121 SDValue N0 = Op.getOperand(0);
28122 SDLoc dl(Op);
28123
28124 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28125 "Only scalar CTTZ requires custom lowering");
28126
28127 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28128 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28129 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28130
28131 // If src is known never zero we can skip the CMOV.
28132 if (DAG.isKnownNeverZero(N0))
28133 return Op;
28134
28135 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28136 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28137 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28138 Op.getValue(1)};
28139 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28140}
28141
28142 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28143 const X86Subtarget &Subtarget) {
28144 MVT VT = Op.getSimpleValueType();
28145 SDLoc DL(Op);
28146
28147 if (VT == MVT::i16 || VT == MVT::i32)
28148 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28149
28150 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28151 return splitVectorIntBinary(Op, DAG, DL);
28152
28153 assert(Op.getSimpleValueType().is256BitVector() &&
28154 Op.getSimpleValueType().isInteger() &&
28155 "Only handle AVX 256-bit vector integer operation");
28156 return splitVectorIntBinary(Op, DAG, DL);
28157}
28158
28159 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28160 const X86Subtarget &Subtarget) {
28161 MVT VT = Op.getSimpleValueType();
28162 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28163 unsigned Opcode = Op.getOpcode();
28164 SDLoc DL(Op);
28165
28166 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28167 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28168 assert(Op.getSimpleValueType().isInteger() &&
28169 "Only handle AVX vector integer operation");
28170 return splitVectorIntBinary(Op, DAG, DL);
28171 }
28172
28173 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28174 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28175 EVT SetCCResultType =
28176 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28177
28178 unsigned BitWidth = VT.getScalarSizeInBits();
28179 if (Opcode == ISD::USUBSAT) {
28180 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28181 // Handle a special-case with a bit-hack instead of cmp+select:
28182 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28183 // If the target can use VPTERNLOG, DAGToDAG will match this as
28184 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28185 // "broadcast" constant load.
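// Worked example (illustrative, i8 lanes): usubsat(0x90, 0x80) == 0x10, and
// (0x90 ^ 0x80) & (0x90 s>> 7) == 0x10 & 0xff == 0x10; for 0x30 the
// arithmetic shift yields 0x00, matching usubsat(0x30, 0x80) == 0.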
28186 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28187 if (C && C->getAPIntValue().isSignMask()) {
28188 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28189 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28190 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28191 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28192 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28193 }
28194 }
28195 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28196 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28197 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28198 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28199 // TODO: Move this to DAGCombiner?
28200 if (SetCCResultType == VT &&
28201 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28202 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28203 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28204 }
28205 }
28206
28207 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28208 (!VT.isVector() || VT == MVT::v2i64)) {
28209 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28210 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28211 SDValue Zero = DAG.getConstant(0, DL, VT);
28212 SDValue Result =
28213 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28214 DAG.getVTList(VT, SetCCResultType), X, Y);
28215 SDValue SumDiff = Result.getValue(0);
28216 SDValue Overflow = Result.getValue(1);
28217 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28218 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28219 SDValue SumNeg =
28220 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28221 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28222 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28223 }
28224
28225 // Use default expansion.
28226 return SDValue();
28227}
28228
28229static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28230 SelectionDAG &DAG) {
28231 MVT VT = Op.getSimpleValueType();
28232 SDLoc DL(Op);
28233
28234 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28235 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28236 // 8-bit integer abs to NEG and CMOV.
28237 SDValue N0 = Op.getOperand(0);
28238 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28239 DAG.getConstant(0, DL, VT), N0);
28240 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28241 SDValue(Neg.getNode(), 1)};
28242 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28243 }
28244
28245 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28246 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28247 SDValue Src = Op.getOperand(0);
28248 SDValue Neg = DAG.getNegative(Src, DL, VT);
28249 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28250 }
28251
28252 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28253 assert(VT.isInteger() &&
28254 "Only handle AVX 256-bit vector integer operation");
28255 return splitVectorIntUnary(Op, DAG, DL);
28256 }
28257
28258 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28259 return splitVectorIntUnary(Op, DAG, DL);
28260
28261 // Default to expand.
28262 return SDValue();
28263}
28264
28265static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28266 SelectionDAG &DAG) {
28267 MVT VT = Op.getSimpleValueType();
28268 SDLoc DL(Op);
28269
28270 // For AVX1 cases, split to use legal ops.
28271 if (VT.is256BitVector() && !Subtarget.hasInt256())
28272 return splitVectorIntBinary(Op, DAG, DL);
28273
28274 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28275 return splitVectorIntBinary(Op, DAG, DL);
28276
28277 // Default to expand.
28278 return SDValue();
28279}
28280
28281static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28282 SelectionDAG &DAG) {
28283 MVT VT = Op.getSimpleValueType();
28284 SDLoc DL(Op);
28285
28286 // For AVX1 cases, split to use legal ops.
28287 if (VT.is256BitVector() && !Subtarget.hasInt256())
28288 return splitVectorIntBinary(Op, DAG, DL);
28289
28290 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28291 return splitVectorIntBinary(Op, DAG, DL);
28292
28293 // Default to expand.
28294 return SDValue();
28295}
28296
28297 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28298 SelectionDAG &DAG) {
28299 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28300 "Expected FMAXIMUM or FMINIMUM opcode");
28301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28302 EVT VT = Op.getValueType();
28303 SDValue X = Op.getOperand(0);
28304 SDValue Y = Op.getOperand(1);
28305 SDLoc DL(Op);
28306 uint64_t SizeInBits = VT.getScalarSizeInBits();
28307 APInt PreferredZero = APInt::getZero(SizeInBits);
28308 APInt OppositeZero = PreferredZero;
28309 EVT IVT = VT.changeTypeToInteger();
28310 X86ISD::NodeType MinMaxOp;
28311 if (Op.getOpcode() == ISD::FMAXIMUM) {
28312 MinMaxOp = X86ISD::FMAX;
28313 OppositeZero.setSignBit();
28314 } else {
28315 PreferredZero.setSignBit();
28316 MinMaxOp = X86ISD::FMIN;
28317 }
28318 EVT SetCCType =
28319 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28320
28321 // The tables below show the expected result of Max in cases of NaN and
28322 // signed zeros.
28323 //
28324 // Y Y
28325 // Num xNaN +0 -0
28326 // --------------- ---------------
28327 // Num | Max | Y | +0 | +0 | +0 |
28328 // X --------------- X ---------------
28329 // xNaN | X | X/Y | -0 | +0 | -0 |
28330 // --------------- ---------------
28331 //
28332 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28333 // reordering.
28334 //
28335 // We check if any of operands is NaN and return NaN. Then we check if any of
28336 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28337 // to ensure the correct zero is returned.
28338 auto MatchesZero = [](SDValue Op, APInt Zero) {
28339 Op = peekThroughBitcasts(Op);
28340 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28341 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28342 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28343 return CstOp->getAPIntValue() == Zero;
28344 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28345 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28346 for (const SDValue &OpVal : Op->op_values()) {
28347 if (OpVal.isUndef())
28348 continue;
28349 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28350 if (!CstOp)
28351 return false;
28352 if (!CstOp->getValueAPF().isZero())
28353 continue;
28354 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28355 return false;
28356 }
28357 return true;
28358 }
28359 return false;
28360 };
28361
28362 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28363 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28364 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28365 Op->getFlags().hasNoSignedZeros() ||
28366 DAG.isKnownNeverZeroFloat(X) ||
28367 DAG.isKnownNeverZeroFloat(Y);
28368 SDValue NewX, NewY;
28369 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28370 MatchesZero(X, OppositeZero)) {
28371 // Operands are already in right order or order does not matter.
28372 NewX = X;
28373 NewY = Y;
28374 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28375 NewX = Y;
28376 NewY = X;
28377 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28378 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28379 if (IsXNeverNaN)
28380 std::swap(X, Y);
28381 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28382 // to an xmm register.
28383 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28384 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28385 // Bits of classes:
28386 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28387 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28388 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28389 DL, MVT::i32);
28390 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28391 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28392 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28393 DAG.getIntPtrConstant(0, DL));
28394 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28395 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28396 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28397 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28398 } else {
28399 SDValue IsXSigned;
28400 if (Subtarget.is64Bit() || VT != MVT::f64) {
28401 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28402 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28403 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28404 } else {
28405 assert(VT == MVT::f64);
28406 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28407 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28408 DAG.getIntPtrConstant(0, DL));
28409 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28410 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28411 DAG.getIntPtrConstant(1, DL));
28412 Hi = DAG.getBitcast(MVT::i32, Hi);
28413 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28414 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28415 *DAG.getContext(), MVT::i32);
28416 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28417 }
28418 if (MinMaxOp == X86ISD::FMAX) {
28419 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28420 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28421 } else {
28422 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28423 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28424 }
28425 }
28426
28427 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28428 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28429
28430 // If we did not reorder the operands for signed-zero handling, we still need
28431 // to handle NaN, and the second operand is known not to be NaN, then put it
28432 // in the first operand so we do not need to post-process NaN after the max/min.
28433 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28434 std::swap(NewX, NewY);
28435
28436 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28437
28438 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28439 return MinMax;
28440
28441 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28442 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28443}
28444
28445static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28446 SelectionDAG &DAG) {
28447 MVT VT = Op.getSimpleValueType();
28448 SDLoc dl(Op);
28449
28450 // For AVX1 cases, split to use legal ops.
28451 if (VT.is256BitVector() && !Subtarget.hasInt256())
28452 return splitVectorIntBinary(Op, DAG, dl);
28453
28454 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28455 return splitVectorIntBinary(Op, DAG, dl);
28456
28457 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28459
28460 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28461 if (VT.isScalarInteger()) {
28462 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28463 MVT WideVT = MVT::getIntegerVT(WideBits);
28464 if (TLI.isTypeLegal(WideVT)) {
28465 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28466 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
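// Worked example (illustrative): abdu(i8 10, i8 250) widens to i32, where
// 10 - 250 == -240 and abs(-240) == 240 == |10 - 250|, which truncates back
// to i8 without loss.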
28467 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28468 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28469 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28470 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28471 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28472 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28473 }
28474 }
28475
28476 // Default to expand.
28477 return SDValue();
28478}
28479
28480static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28481 SelectionDAG &DAG) {
28482 SDLoc dl(Op);
28483 MVT VT = Op.getSimpleValueType();
28484
28485 // Decompose 256-bit ops into 128-bit ops.
28486 if (VT.is256BitVector() && !Subtarget.hasInt256())
28487 return splitVectorIntBinary(Op, DAG, dl);
28488
28489 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28490 return splitVectorIntBinary(Op, DAG, dl);
28491
28492 SDValue A = Op.getOperand(0);
28493 SDValue B = Op.getOperand(1);
28494
28495 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28496 // vector pairs, multiply and truncate.
28497 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28498 unsigned NumElts = VT.getVectorNumElements();
28499 unsigned NumLanes = VT.getSizeInBits() / 128;
28500 unsigned NumEltsPerLane = NumElts / NumLanes;
28501
28502 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28503 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28504 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28505 return DAG.getNode(
28506 ISD::TRUNCATE, dl, VT,
28507 DAG.getNode(ISD::MUL, dl, ExVT,
28508 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28509 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28510 }
28511
28512 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28513
28514 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
28515 // Don't do this if we only need to unpack one half.
28516 if (Subtarget.hasSSSE3()) {
28517 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
28518 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
28519 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
28520 if (BIsBuildVector) {
28521 for (auto [Idx, Val] : enumerate(B->ops())) {
28522 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
28523 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
28524 else
28525 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
28526 }
28527 }
28528 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
28529 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
28530 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
28531 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
28532 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
28533 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
28534 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
28535 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
28536 DAG.getTargetConstant(8, dl, MVT::i8));
28537 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
28538 }
28539 }
28540
28541 // Extract the lo/hi parts to any extend to i16.
28542 // We're going to mask off the low byte of each result element of the
28543 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28544 // element.
28545 SDValue Undef = DAG.getUNDEF(VT);
28546 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28547 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28548
28549 SDValue BLo, BHi;
28550 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28551 // If the RHS is a constant, manually unpackl/unpackh.
28552 SmallVector<SDValue, 16> LoOps, HiOps;
28553 for (unsigned i = 0; i != NumElts; i += 16) {
28554 for (unsigned j = 0; j != 8; ++j) {
28555 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28556 MVT::i16));
28557 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28558 MVT::i16));
28559 }
28560 }
28561
28562 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28563 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28564 } else {
28565 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28566 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28567 }
28568
28569 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28570 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28571 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28572 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28573 }
28574
28575 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28576 if (VT == MVT::v4i32) {
28577 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28578 "Should not custom lower when pmulld is available!");
28579
28580 // Extract the odd parts.
28581 static const int UnpackMask[] = { 1, -1, 3, -1 };
28582 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28583 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28584
28585 // Multiply the even parts.
28586 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28587 DAG.getBitcast(MVT::v2i64, A),
28588 DAG.getBitcast(MVT::v2i64, B));
28589 // Now multiply odd parts.
28590 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28591 DAG.getBitcast(MVT::v2i64, Aodds),
28592 DAG.getBitcast(MVT::v2i64, Bodds));
28593
28594 Evens = DAG.getBitcast(VT, Evens);
28595 Odds = DAG.getBitcast(VT, Odds);
28596
28597 // Merge the two vectors back together with a shuffle. This expands into 2
28598 // shuffles.
28599 static const int ShufMask[] = { 0, 4, 2, 6 };
28600 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28601 }
28602
28603 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28604 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28605 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28606
28607 // Ahi = psrlqi(a, 32);
28608 // Bhi = psrlqi(b, 32);
28609 //
28610 // AloBlo = pmuludq(a, b);
28611 // AloBhi = pmuludq(a, Bhi);
28612 // AhiBlo = pmuludq(Ahi, b);
28613 //
28614 // Hi = psllqi(AloBhi + AhiBlo, 32);
28615 // return AloBlo + Hi;
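// Worked equation (illustrative): with a == Alo + Ahi * 2^32 and
// b == Blo + Bhi * 2^32, modulo 2^64 we have
//   a * b == Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
// since the Ahi*Bhi term is shifted out entirely; every partial product is a
// 32x32->64 multiply, which is exactly what PMULUDQ provides.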
28616 KnownBits AKnown = DAG.computeKnownBits(A);
28617 KnownBits BKnown = DAG.computeKnownBits(B);
28618
28619 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28620 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28621 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28622
28623 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28624 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28625 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28626
28627 SDValue Zero = DAG.getConstant(0, dl, VT);
28628
28629 // Only multiply lo/hi halves that aren't known to be zero.
28630 SDValue AloBlo = Zero;
28631 if (!ALoIsZero && !BLoIsZero)
28632 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28633
28634 SDValue AloBhi = Zero;
28635 if (!ALoIsZero && !BHiIsZero) {
28636 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28637 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28638 }
28639
28640 SDValue AhiBlo = Zero;
28641 if (!AHiIsZero && !BLoIsZero) {
28642 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28643 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28644 }
28645
28646 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28647 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28648
28649 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28650}
28651
28652 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28653 MVT VT, bool IsSigned,
28654 const X86Subtarget &Subtarget,
28655 SelectionDAG &DAG,
28656 SDValue *Low = nullptr) {
28657 unsigned NumElts = VT.getVectorNumElements();
28658
28659 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28660 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28661 // lane results back together.
28662
28663 // We'll take different approaches for signed and unsigned.
28664 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28665 // and use pmullw to calculate the full 16-bit product.
28666 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28667 // shift them left into the upper byte of each word. This allows us to use
28668 // pmulhw to calculate the full 16-bit product. This trick means we don't
28669 // need to sign extend the bytes to use pmullw.
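// Worked equation (illustrative): unpacking a signed byte a into the high
// byte of a word gives the word value a * 2^8 (sign preserved), so
//   pmulhw(a * 2^8, b * 2^8) == ((a * 2^8) * (b * 2^8)) >> 16 == a * b,
// i.e. the full 16-bit signed product without explicit sign extension.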
28670
28671 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28672 SDValue Zero = DAG.getConstant(0, dl, VT);
28673
28674 SDValue ALo, AHi;
28675 if (IsSigned) {
28676 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28677 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28678 } else {
28679 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28680 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28681 }
28682
28683 SDValue BLo, BHi;
28684 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28685 // If the RHS is a constant, manually unpackl/unpackh and extend.
28686 SmallVector<SDValue, 16> LoOps, HiOps;
28687 for (unsigned i = 0; i != NumElts; i += 16) {
28688 for (unsigned j = 0; j != 8; ++j) {
28689 SDValue LoOp = B.getOperand(i + j);
28690 SDValue HiOp = B.getOperand(i + j + 8);
28691
28692 if (IsSigned) {
28693 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28694 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28695 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28696 DAG.getConstant(8, dl, MVT::i16));
28697 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28698 DAG.getConstant(8, dl, MVT::i16));
28699 } else {
28700 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28701 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28702 }
28703
28704 LoOps.push_back(LoOp);
28705 HiOps.push_back(HiOp);
28706 }
28707 }
28708
28709 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28710 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28711 } else if (IsSigned) {
28712 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28713 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28714 } else {
28715 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28716 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28717 }
28718
28719 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28720 // pack back to vXi8.
28721 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28722 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28723 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28724
28725 if (Low)
28726 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28727
28728 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28729}
28730
28731static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28732 SelectionDAG &DAG) {
28733 SDLoc dl(Op);
28734 MVT VT = Op.getSimpleValueType();
28735 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28736 unsigned NumElts = VT.getVectorNumElements();
28737 SDValue A = Op.getOperand(0);
28738 SDValue B = Op.getOperand(1);
28739
28740 // Decompose 256-bit ops into 128-bit ops.
28741 if (VT.is256BitVector() && !Subtarget.hasInt256())
28742 return splitVectorIntBinary(Op, DAG, dl);
28743
28744 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28745 return splitVectorIntBinary(Op, DAG, dl);
28746
28747 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28748 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28749 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28750 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28751
28752 // PMULxD operations multiply each even value (starting at 0) of LHS with
28753 // the related value of RHS and produce a widened result.
28754 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28755 // => <2 x i64> <ae|cg>
28756 //
28757 // In other words, to have all the results, we need to perform two PMULxD:
28758 // 1. one with the even values.
28759 // 2. one with the odd values.
28760 // To achieve #2, we need to place the odd values at an even position.
28761 //
28762 // Place the odd value at an even position (basically, shift all values 1
28763 // step to the left):
28764 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28765 9, -1, 11, -1, 13, -1, 15, -1};
28766 // <a|b|c|d> => <b|undef|d|undef>
28767 SDValue Odd0 =
28768 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28769 // <e|f|g|h> => <f|undef|h|undef>
28770 SDValue Odd1 =
28771 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28772
28773 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28774 // ints.
28775 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28776 unsigned Opcode =
28777 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28778 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28779 // => <2 x i64> <ae|cg>
28780 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28781 DAG.getBitcast(MulVT, A),
28782 DAG.getBitcast(MulVT, B)));
28783 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28784 // => <2 x i64> <bf|dh>
28785 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28786 DAG.getBitcast(MulVT, Odd0),
28787 DAG.getBitcast(MulVT, Odd1)));
28788
28789 // Shuffle it back into the right order.
28790 SmallVector<int, 16> ShufMask(NumElts);
28791 for (int i = 0; i != (int)NumElts; ++i)
28792 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28793
28794 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28795
28796 // If we have a signed multiply but no PMULDQ, fix up the result of an
28797 // unsigned multiply.
28798 if (IsSigned && !Subtarget.hasSSE41()) {
28799 SDValue Zero = DAG.getConstant(0, dl, VT);
28800 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28801 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28802 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28803 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28804
28805 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28806 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28807 }
28808
28809 return Res;
28810 }
28811
28812 // Only i8 vectors should need custom lowering after this.
28813 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28814 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28815 "Unsupported vector type");
28816
28817 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28818 // logical shift down the upper half and pack back to i8.
28819
28820 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28821 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28822
28823 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28824 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28825 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28826 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28827 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28828 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28829 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28830 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28831 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28832 }
28833
28834 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28835}
28836
28837// Custom lowering for SMULO/UMULO.
28838static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28839 SelectionDAG &DAG) {
28840 MVT VT = Op.getSimpleValueType();
28841
28842 // Scalars defer to LowerXALUO.
28843 if (!VT.isVector())
28844 return LowerXALUO(Op, DAG);
28845
28846 SDLoc dl(Op);
28847 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28848 SDValue A = Op.getOperand(0);
28849 SDValue B = Op.getOperand(1);
28850 EVT OvfVT = Op->getValueType(1);
28851
28852 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28853 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28854 // Extract the LHS Lo/Hi vectors
28855 SDValue LHSLo, LHSHi;
28856 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28857
28858 // Extract the RHS Lo/Hi vectors
28859 SDValue RHSLo, RHSHi;
28860 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28861
28862 EVT LoOvfVT, HiOvfVT;
28863 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28864 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28865 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28866
28867 // Issue the split operations.
28868 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28869 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28870
28871 // Join the separate data results and the overflow results.
28872 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28873 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28874 Hi.getValue(1));
28875
28876 return DAG.getMergeValues({Res, Ovf}, dl);
28877 }
28878
28879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28880 EVT SetccVT =
28881 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28882
28883 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28884 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28885 unsigned NumElts = VT.getVectorNumElements();
28886 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28887 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28888 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28889 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28890 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28891
28892 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28893
28894 SDValue Ovf;
28895 if (IsSigned) {
28896 SDValue High, LowSign;
28897 if (OvfVT.getVectorElementType() == MVT::i1 &&
28898 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28899 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28900 // Shift the high down filling with sign bits.
28901 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28902 // Fill all 16 bits with the sign bit from the low.
28903 LowSign =
28904 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28905 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28906 15, DAG);
28907 SetccVT = OvfVT;
28908 if (!Subtarget.hasBWI()) {
28909 // We can't do a vXi16 compare so sign extend to v16i32.
28910 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28911 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28912 }
28913 } else {
28914 // Otherwise do the compare at vXi8.
28915 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28916 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28917 LowSign =
28918 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28919 }
28920
28921 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28922 } else {
28923 SDValue High =
28924 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28925 if (OvfVT.getVectorElementType() == MVT::i1 &&
28926 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28927 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28928 SetccVT = OvfVT;
28929 if (!Subtarget.hasBWI()) {
28930 // We can't do a vXi16 compare so sign extend to v16i32.
28931 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28932 }
28933 } else {
28934 // Otherwise do the compare at vXi8.
28935 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28936 }
28937
28938 Ovf =
28939 DAG.getSetCC(dl, SetccVT, High,
28940 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28941 }
28942
28943 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28944
28945 return DAG.getMergeValues({Low, Ovf}, dl);
28946 }
28947
28948 SDValue Low;
28949 SDValue High =
28950 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28951
28952 SDValue Ovf;
28953 if (IsSigned) {
28954 // SMULO overflows if the high bits don't match the sign of the low.
28955 SDValue LowSign =
28956 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28957 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28958 } else {
28959 // UMULO overflows if the high bits are non-zero.
28960 Ovf =
28961 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28962 }
28963
28964 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28965
28966 return DAG.getMergeValues({Low, Ovf}, dl);
28967}
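// A minimal scalar sketch of the per-element overflow tests used by the vXi8
// SMULO/UMULO lowering above: widen to i16, multiply, then compare the high
// byte of the product against zero (unsigned) or against a splat of the low
// byte's sign bit (signed). The helper names are illustrative only.
namespace {
constexpr bool umulo8Sketch(uint8_t A, uint8_t B) {
  unsigned P = unsigned(A) * unsigned(B);
  return uint8_t(P >> 8) != 0; // UMULO overflows if the high bits are non-zero.
}
constexpr bool smulo8Sketch(int8_t A, int8_t B) {
  int P = int(A) * int(B);                    // exact product
  uint8_t Lo = uint8_t(unsigned(P));          // truncated i8 result
  uint8_t Hi = uint8_t(unsigned(P) >> 8);     // high byte of the i16 product
  uint8_t LoSign = (Lo & 0x80) ? 0xFF : 0x00; // sign bit of Lo splatted to a byte
  return Hi != LoSign;                        // SMULO overflows on a mismatch.
}
static_assert(umulo8Sketch(200, 2) && !umulo8Sketch(100, 2),
              "400 overflows u8, 200 does not");
static_assert(smulo8Sketch(-100, 2) && !smulo8Sketch(-50, 2),
              "-200 overflows i8, -100 does not");
} // namespace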
28968
28969SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28970 assert(Subtarget.isTargetWin64() && "Unexpected target");
28971 EVT VT = Op.getValueType();
28972 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28973 "Unexpected return type for lowering");
28974
28975 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28976 SmallVector<SDValue> Result;
28977 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28978 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28979 }
28980
28981 RTLIB::Libcall LC;
28982 bool isSigned;
28983 switch (Op->getOpcode()) {
28984 // clang-format off
28985 default: llvm_unreachable("Unexpected request for libcall!");
28986 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28987 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28988 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28989 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28990 // clang-format on
28991 }
28992
28993 SDLoc dl(Op);
28994 SDValue InChain = DAG.getEntryNode();
28995
28996 TargetLowering::ArgListTy Args;
28997 TargetLowering::ArgListEntry Entry;
28998 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28999 EVT ArgVT = Op->getOperand(i).getValueType();
29000 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29001 "Unexpected argument type for lowering");
29002 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29003 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29004 MachinePointerInfo MPI =
29005 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29006 Entry.Node = StackPtr;
29007 InChain =
29008 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29009 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
29010 Entry.Ty = PointerType::get(ArgTy,0);
29011 Entry.IsSExt = false;
29012 Entry.IsZExt = false;
29013 Args.push_back(Entry);
29014 }
29015
29016 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29017 getPointerTy(DAG.getDataLayout()));
29018
29019 TargetLowering::CallLoweringInfo CLI(DAG);
29020 CLI.setDebugLoc(dl)
29021 .setChain(InChain)
29022 .setLibCallee(
29023 getLibcallCallingConv(LC),
29024 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29025 std::move(Args))
29026 .setInRegister()
29027 .setSExtResult(isSigned)
29028 .setZExtResult(!isSigned);
29029
29030 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29031 return DAG.getBitcast(VT, CallInfo.first);
29032}
29033
29034SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29035 SelectionDAG &DAG,
29036 SDValue &Chain) const {
29037 assert(Subtarget.isTargetWin64() && "Unexpected target");
29038 EVT VT = Op.getValueType();
29039 bool IsStrict = Op->isStrictFPOpcode();
29040
29041 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29042 EVT ArgVT = Arg.getValueType();
29043
29044 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29045 "Unexpected return type for lowering");
29046
29047 RTLIB::Libcall LC;
29048 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29049 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29050 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29051 else
29052 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29053 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29054
29055 SDLoc dl(Op);
29056 MakeLibCallOptions CallOptions;
29057 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29058
29059 SDValue Result;
29060 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29061 // expected VT (i128).
29062 std::tie(Result, Chain) =
29063 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29064 Result = DAG.getBitcast(VT, Result);
29065 return Result;
29066}
29067
29068SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29069 SelectionDAG &DAG) const {
29070 assert(Subtarget.isTargetWin64() && "Unexpected target");
29071 EVT VT = Op.getValueType();
29072 bool IsStrict = Op->isStrictFPOpcode();
29073
29074 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29075 EVT ArgVT = Arg.getValueType();
29076
29077 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29078 "Unexpected argument type for lowering");
29079
29080 RTLIB::Libcall LC;
29081 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29082 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29083 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29084 else
29085 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29086 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29087
29088 SDLoc dl(Op);
29089 MakeLibCallOptions CallOptions;
29090 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29091
29092 // Pass the i128 argument as an indirect argument on the stack.
29093 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29094 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29095 MachinePointerInfo MPI =
29096 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29097 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29098
29099 SDValue Result;
29100 std::tie(Result, Chain) =
29101 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29102 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29103}
29104
29105// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
29106uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
29107 assert((Amt < 8) && "Shift/Rotation amount out of range");
29108 switch (Opcode) {
29109 case ISD::BITREVERSE:
29110 return 0x8040201008040201ULL;
29111 case ISD::SHL:
29112 return ((0x0102040810204080ULL >> (Amt)) &
29113 (0x0101010101010101ULL * (0xFF >> (Amt))));
29114 case ISD::SRL:
29115 return ((0x0102040810204080ULL << (Amt)) &
29116 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29117 case ISD::SRA:
29118 return (getGFNICtrlImm(ISD::SRL, Amt) |
29119 (0x8080808080808080ULL >> (64 - (8 * Amt))));
29120 case ISD::ROTL:
29121 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29122 case ISD::ROTR:
29123 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29124 }
29125 llvm_unreachable("Unsupported GFNI opcode");
29126}
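// A standalone constexpr mirror of the ISD::SHL case above, with two
// hand-checked values: the base pattern 0x0102040810204080 is shifted down by
// Amt and masked per byte so that no matrix row pulls bits in from the
// neighbouring byte. This sketch only restates the formula; it is not used by
// the lowering.
namespace {
constexpr uint64_t gfniShlCtrlSketch(unsigned Amt) {
  return (0x0102040810204080ULL >> Amt) &
         (0x0101010101010101ULL * (0xFFu >> Amt));
}
static_assert(gfniShlCtrlSketch(0) == 0x0102040810204080ULL, "Amt == 0");
static_assert(gfniShlCtrlSketch(1) == 0x0001020408102040ULL, "Amt == 1");
} // namespace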
29127
29128// Return true if the required (according to Opcode) shift-imm form is natively
29129// supported by the Subtarget
29130static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29131 unsigned Opcode) {
29132 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29133 "Unexpected shift opcode");
29134
29135 if (!VT.isSimple())
29136 return false;
29137
29138 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29139 return false;
29140
29141 if (VT.getScalarSizeInBits() < 16)
29142 return false;
29143
29144 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29145 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29146 return true;
29147
29148 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29149 (VT.is256BitVector() && Subtarget.hasInt256());
29150
29151 bool AShift = LShift && (Subtarget.hasAVX512() ||
29152 (VT != MVT::v2i64 && VT != MVT::v4i64));
29153 return (Opcode == ISD::SRA) ? AShift : LShift;
29154}
29155
29156// The shift amount is a variable, but it is the same for all vector lanes.
29157// These instructions are defined together with shift-immediate.
29158 static
29159 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29160 unsigned Opcode) {
29161 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29162}
29163
29164// Return true if the required (according to Opcode) variable-shift form is
29165// natively supported by the Subtarget
29166static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29167 unsigned Opcode) {
29168 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29169 "Unexpected shift opcode");
29170
29171 if (!VT.isSimple())
29172 return false;
29173
29174 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29175 return false;
29176
29177 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29178 return false;
29179
29180 // vXi16 supported only on AVX-512, BWI
29181 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29182 return false;
29183
29184 if (Subtarget.hasAVX512() &&
29185 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29186 return true;
29187
29188 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29189 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29190 return (Opcode == ISD::SRA) ? AShift : LShift;
29191}
29192
29193 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29194 const X86Subtarget &Subtarget) {
29195 MVT VT = Op.getSimpleValueType();
29196 SDLoc dl(Op);
29197 SDValue R = Op.getOperand(0);
29198 SDValue Amt = Op.getOperand(1);
29199 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29200 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29201
29202 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29203 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29204 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29205 SDValue Ex = DAG.getBitcast(ExVT, R);
29206
29207 // ashr(R, 63) === cmp_slt(R, 0)
29208 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29209 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29210 "Unsupported PCMPGT op");
29211 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29212 }
29213
29214 if (ShiftAmt >= 32) {
29215 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29216 SDValue Upper =
29217 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29218 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29219 ShiftAmt - 32, DAG);
29220 if (VT == MVT::v2i64)
29221 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29222 if (VT == MVT::v4i64)
29223 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29224 {9, 1, 11, 3, 13, 5, 15, 7});
29225 } else {
29226 // SRA upper i32, SRL whole i64 and select lower i32.
29227 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29228 ShiftAmt, DAG);
29229 SDValue Lower =
29230 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29231 Lower = DAG.getBitcast(ExVT, Lower);
29232 if (VT == MVT::v2i64)
29233 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29234 if (VT == MVT::v4i64)
29235 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29236 {8, 1, 10, 3, 12, 5, 14, 7});
29237 }
29238 return DAG.getBitcast(VT, Ex);
29239 };
29240
29241 // Optimize shl/srl/sra with constant shift amount.
29242 APInt APIntShiftAmt;
29243 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29244 return SDValue();
29245
29246 // If the shift amount is out of range, return undef.
29247 if (APIntShiftAmt.uge(EltSizeInBits))
29248 return DAG.getUNDEF(VT);
29249
29250 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29251
29252 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29253 // Hardware support for vector shifts is sparse which makes us scalarize the
29254 // vector operations in many cases. Also, on sandybridge ADD is faster than
29255 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29256 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29257 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29258 // must be 0). (add undef, undef) however can be any value. To make this
29259 // safe, we must freeze R to ensure that register allocation uses the same
29260 // register for an undefined value. This ensures that the result will
29261 // still be even and preserves the original semantics.
29262 R = DAG.getFreeze(R);
29263 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29264 }
29265
29266 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29267 }
29268
29269 // i64 SRA needs to be performed as partial shifts.
29270 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29271 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29272 Op.getOpcode() == ISD::SRA)
29273 return ArithmeticShiftRight64(ShiftAmt);
29274
29275 // If we're logical shifting an all-signbits value then we can just perform it
29276 // as a mask.
29277 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29278 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29279 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29280 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29281 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29282 }
29283
29284 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29285 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29286 unsigned NumElts = VT.getVectorNumElements();
29287 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29288
29289 // Simple i8 add case
29290 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29291 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29292 // must be 0). (add undef, undef) however can be any value. To make this
29293 // safe, we must freeze R to ensure that register allocation uses the same
29294 // register for an undefined value. This ensures that the result will
29295 // still be even and preserves the original semantics.
29296 R = DAG.getFreeze(R);
29297 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29298 }
29299
29300 // ashr(R, 7) === cmp_slt(R, 0)
29301 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29302 SDValue Zeros = DAG.getConstant(0, dl, VT);
29303 if (VT.is512BitVector()) {
29304 assert(VT == MVT::v64i8 && "Unexpected element type!");
29305 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29306 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29307 }
29308 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29309 }
29310
29311 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29312 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29313 return SDValue();
29314
29315 if (Subtarget.hasGFNI()) {
29316 uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
29317 MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29318 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
29319 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29320 DAG.getTargetConstant(0, dl, MVT::i8));
29321 }
29322
29323 if (Op.getOpcode() == ISD::SHL) {
29324 // Make a large shift.
29325 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29326 ShiftAmt, DAG);
29327 SHL = DAG.getBitcast(VT, SHL);
29328 // Zero out the rightmost bits.
29329 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29330 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29331 }
29332 if (Op.getOpcode() == ISD::SRL) {
29333 // Make a large shift.
29334 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29335 ShiftAmt, DAG);
29336 SRL = DAG.getBitcast(VT, SRL);
29337 // Zero out the leftmost bits.
29338 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29339 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29340 }
29341 if (Op.getOpcode() == ISD::SRA) {
29342 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
29343 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29344
29345 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29346 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29347 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29348 return Res;
29349 }
29350 llvm_unreachable("Unknown shift opcode.");
29351 }
29352
29353 return SDValue();
29354}
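// A scalar sketch of the "ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)"
// identity used above for vXi8 SRA, with Mask = 128 >> Amt: the logical shift
// leaves an (8 - Amt)-bit value whose sign bit sits at position 7 - Amt, and
// the xor/sub pair sign-extends it. The values below are hand-checked.
namespace {
constexpr uint8_t ashr8ViaXorSubSketch(uint8_t R, unsigned Amt) {
  unsigned Mask = 0x80u >> Amt;
  return uint8_t(((unsigned(R) >> Amt) ^ Mask) - Mask);
}
static_assert(ashr8ViaXorSubSketch(0x90, 2) == 0xE4, "-112 >> 2 == -28 == 0xE4");
static_assert(ashr8ViaXorSubSketch(0x7F, 3) == 0x0F, "127 >> 3 == 15");
} // namespace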
29355
29356 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29357 const X86Subtarget &Subtarget) {
29358 MVT VT = Op.getSimpleValueType();
29359 SDLoc dl(Op);
29360 SDValue R = Op.getOperand(0);
29361 SDValue Amt = Op.getOperand(1);
29362 unsigned Opcode = Op.getOpcode();
29363 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29364
29365 int BaseShAmtIdx = -1;
29366 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29367 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29368 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29369 Subtarget, DAG);
29370
29371 // vXi8 shifts - shift as v8i16 + mask result.
29372 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29373 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29374 VT == MVT::v64i8) &&
29375 !Subtarget.hasXOP()) {
29376 unsigned NumElts = VT.getVectorNumElements();
29377 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29378 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29379 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29380 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29381
29382 // Create the mask using vXi16 shifts. For shift-rights we need to move
29383 // the upper byte down before splatting the vXi8 mask.
29384 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29385 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29386 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29387 if (Opcode != ISD::SHL)
29388 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29389 8, DAG);
29390 BitMask = DAG.getBitcast(VT, BitMask);
29391 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29392 SmallVector<int, 64>(NumElts, 0));
29393
29394 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29395 DAG.getBitcast(ExtVT, R), BaseShAmt,
29396 BaseShAmtIdx, Subtarget, DAG);
29397 Res = DAG.getBitcast(VT, Res);
29398 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29399
29400 if (Opcode == ISD::SRA) {
29401 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29402 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29403 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29404 SignMask =
29405 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29406 BaseShAmtIdx, Subtarget, DAG);
29407 SignMask = DAG.getBitcast(VT, SignMask);
29408 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29409 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29410 }
29411 return Res;
29412 }
29413 }
29414 }
29415
29416 return SDValue();
29417}
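// A bit-level sketch of the "shift as v8i16 + mask" trick above for a uniform
// vXi8 shift: doing the shift at i16 granularity lets bits leak across the
// byte boundary, and the leaked bits are exactly the ones cleared by a mask
// built by shifting an all-ones vector by the same amount (and splatting one
// byte of it). Two packed bytes (0xFF, 0x08) shifted right by 3 illustrate
// this; the values are hand-checked.
static_assert((0xFF08u >> 3) == 0x1FE1u && (0x1FE1u & 0x1F1Fu) == 0x1F01u,
              "16-bit srl by 3 plus a 0x1F per-byte mask == per-byte srl by 3");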
29418
29419 // Convert a shift/rotate left amount to a multiplication scale factor.
29420 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29421 const X86Subtarget &Subtarget,
29422 SelectionDAG &DAG) {
29423 MVT VT = Amt.getSimpleValueType();
29424 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29425 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29426 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29427 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29428 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29429 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29430 return SDValue();
29431
29432 MVT SVT = VT.getVectorElementType();
29433 unsigned SVTBits = SVT.getSizeInBits();
29434 unsigned NumElems = VT.getVectorNumElements();
29435
29436 APInt UndefElts;
29437 SmallVector<APInt> EltBits;
29438 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29439 APInt One(SVTBits, 1);
29440 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29441 for (unsigned I = 0; I != NumElems; ++I) {
29442 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29443 continue;
29444 uint64_t ShAmt = EltBits[I].getZExtValue();
29445 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29446 }
29447 return DAG.getBuildVector(VT, dl, Elts);
29448 }
29449
29450 // If the target doesn't support variable shifts, use either FP conversion
29451 // or integer multiplication to avoid shifting each element individually.
29452 if (VT == MVT::v4i32) {
29453 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29454 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29455 DAG.getConstant(0x3f800000U, dl, VT));
29456 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29457 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29458 }
29459
29460 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29461 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29462 SDValue Z = DAG.getConstant(0, dl, VT);
29463 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29464 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29465 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29466 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29467 if (Subtarget.hasSSE41())
29468 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29469 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29470 }
29471
29472 return SDValue();
29473}
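// A sketch of the v4i32 branch above: for a shift amount x in [0, 31),
// (x << 23) + 0x3f800000 is the IEEE-754 single-precision encoding of 2^x
// (biased exponent 127 + x, zero mantissa), so the FP_TO_SINT round trip turns
// a vector of shift amounts into a vector of power-of-two multipliers.
static_assert(((5u << 23) + 0x3f800000u) == 0x42000000u,
              "0x42000000 encodes 32.0f, i.e. 1 << 5");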
29474
29475static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29476 SelectionDAG &DAG) {
29477 MVT VT = Op.getSimpleValueType();
29478 SDLoc dl(Op);
29479 SDValue R = Op.getOperand(0);
29480 SDValue Amt = Op.getOperand(1);
29481 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29482 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29483
29484 unsigned Opc = Op.getOpcode();
29485 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29486 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29487
29488 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29489 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29490
29491 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29492 return V;
29493
29494 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29495 return V;
29496
29497 if (supportedVectorVarShift(VT, Subtarget, Opc))
29498 return Op;
29499
29500 // i64 vector arithmetic shift can be emulated with the transform:
29501 // M = lshr(SIGN_MASK, Amt)
29502 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29503 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29504 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29505 Opc == ISD::SRA) {
29506 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29507 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29508 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29509 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29510 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29511 return R;
29512 }
29513
29514 // XOP has 128-bit variable logical/arithmetic shifts.
29515 // +ve/-ve Amt = shift left/right.
29516 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29517 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29518 if (Opc == ISD::SRL || Opc == ISD::SRA)
29519 Amt = DAG.getNegative(Amt, dl, VT);
29520 if (Opc == ISD::SHL || Opc == ISD::SRL)
29521 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29522 if (Opc == ISD::SRA)
29523 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29524 }
29525
29526 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29527 // shifts per-lane and then shuffle the partial results back together.
29528 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29529 // Splat the shift amounts so the scalar shifts above will catch it.
29530 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29531 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29532 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29533 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29534 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29535 }
29536
29537 // If possible, lower this shift as a sequence of two shifts by
29538 // constant plus a BLENDing shuffle instead of scalarizing it.
29539 // Example:
29540 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29541 //
29542 // Could be rewritten as:
29543 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29544 //
29545 // The advantage is that the two shifts from the example would be
29546 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29547 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29548 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29549 SDValue Amt1, Amt2;
29550 unsigned NumElts = VT.getVectorNumElements();
29551 SmallVector<int, 8> ShuffleMask;
29552 for (unsigned i = 0; i != NumElts; ++i) {
29553 SDValue A = Amt->getOperand(i);
29554 if (A.isUndef()) {
29555 ShuffleMask.push_back(SM_SentinelUndef);
29556 continue;
29557 }
29558 if (!Amt1 || Amt1 == A) {
29559 ShuffleMask.push_back(i);
29560 Amt1 = A;
29561 continue;
29562 }
29563 if (!Amt2 || Amt2 == A) {
29564 ShuffleMask.push_back(i + NumElts);
29565 Amt2 = A;
29566 continue;
29567 }
29568 break;
29569 }
29570
29571 // Only perform this blend if we can perform it without loading a mask.
29572 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29573 (VT != MVT::v16i16 ||
29574 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29575 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29576 canWidenShuffleElements(ShuffleMask))) {
29577 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29578 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29579 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29580 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29581 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29582 Cst1->getZExtValue(), DAG);
29583 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29584 Cst2->getZExtValue(), DAG);
29585 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29586 }
29587 }
29588 }
29589
29590 // If possible, lower this packed shift into a vector multiply instead of
29591 // expanding it into a sequence of scalar shifts.
29592 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29593 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29594 Subtarget.canExtendTo512BW())))
29595 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29596 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29597
29598 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29599 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29600 if (Opc == ISD::SRL && ConstantAmt &&
29601 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29602 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29603 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29604 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29605 SDValue Zero = DAG.getConstant(0, dl, VT);
29606 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29607 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29608 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29609 }
29610 }
29611
29612 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29613 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29614 // TODO: Special case handling for shift by 0/1, really we can afford either
29615 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29616 if (Opc == ISD::SRA && ConstantAmt &&
29617 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29618 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29619 !Subtarget.hasAVX512()) ||
29620 DAG.isKnownNeverZero(Amt))) {
29621 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29622 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29623 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29624 SDValue Amt0 =
29625 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29626 SDValue Amt1 =
29627 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29628 SDValue Sra1 =
29629 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29630 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29631 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29632 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29633 }
29634 }
29635
29636 // v4i32 Non Uniform Shifts.
29637 // If the shift amount is constant we can shift each lane using the SSE2
29638 // immediate shifts, else we need to zero-extend each lane to the lower i64
29639 // and shift using the SSE2 variable shifts.
29640 // The separate results can then be blended together.
29641 if (VT == MVT::v4i32) {
29642 SDValue Amt0, Amt1, Amt2, Amt3;
29643 if (ConstantAmt) {
29644 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29645 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29646 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29647 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29648 } else {
29649 // The SSE2 shifts use the lower i64 as the same shift amount for
29650 // all lanes and the upper i64 is ignored. On AVX we're better off
29651 // just zero-extending, but for SSE just duplicating the top 16-bits is
29652 // cheaper and has the same effect for out of range values.
29653 if (Subtarget.hasAVX()) {
29654 SDValue Z = DAG.getConstant(0, dl, VT);
29655 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29656 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29657 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29658 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29659 } else {
29660 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29661 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29662 {4, 5, 6, 7, -1, -1, -1, -1});
29663 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29664 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29665 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29666 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29667 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29668 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29669 }
29670 }
29671
29672 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29673 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29674 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29675 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29676 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29677
29678 // Merge the shifted lane results optimally with/without PBLENDW.
29679 // TODO - ideally shuffle combining would handle this.
29680 if (Subtarget.hasSSE41()) {
29681 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29682 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29683 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29684 }
29685 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29686 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29687 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29688 }
29689
29690 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29691 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29692 // make the existing SSE solution better.
29693 // NOTE: We honor preferred vector width before promoting to 512-bits.
29694 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29695 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29696 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29697 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29698 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29699 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29700 "Unexpected vector type");
29701 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29702 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29703 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29704 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29705 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29706 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29707 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29708 }
29709
29710 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29711 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29712 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29713 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29714 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29715 !Subtarget.hasXOP()) {
29716 int NumElts = VT.getVectorNumElements();
29717 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29718
29719 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29720 // isn't legal).
29721 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29722 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29723 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29724 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29725 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29726 "Constant build vector expected");
29727
29728 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29729 bool IsSigned = Opc == ISD::SRA;
29730 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29731 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29732 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29733 return DAG.getZExtOrTrunc(R, dl, VT);
29734 }
29735
29736 SmallVector<SDValue, 16> LoAmt, HiAmt;
29737 for (int i = 0; i != NumElts; i += 16) {
29738 for (int j = 0; j != 8; ++j) {
29739 LoAmt.push_back(Amt.getOperand(i + j));
29740 HiAmt.push_back(Amt.getOperand(i + j + 8));
29741 }
29742 }
29743
29744 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29745 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29746 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29747
29748 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29749 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29750 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29751 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29752 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29753 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29754 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29755 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29756 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29757 }
29758
29759 if (VT == MVT::v16i8 ||
29760 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29761 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29762 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29763
29764 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29765 if (VT.is512BitVector()) {
29766 // On AVX512BW targets we make use of the fact that VSELECT lowers
29767 // to a masked blend which selects bytes based just on the sign bit
29768 // extracted to a mask.
29769 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29770 V0 = DAG.getBitcast(VT, V0);
29771 V1 = DAG.getBitcast(VT, V1);
29772 Sel = DAG.getBitcast(VT, Sel);
29773 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29774 ISD::SETGT);
29775 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29776 } else if (Subtarget.hasSSE41()) {
29777 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29778 // on the sign bit.
29779 V0 = DAG.getBitcast(VT, V0);
29780 V1 = DAG.getBitcast(VT, V1);
29781 Sel = DAG.getBitcast(VT, Sel);
29782 return DAG.getBitcast(SelVT,
29783 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29784 }
29785 // On pre-SSE41 targets we test for the sign bit by comparing to
29786 // zero - a negative value will set all bits of the lanes to true
29787 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29788 SDValue Z = DAG.getConstant(0, dl, SelVT);
29789 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29790 return DAG.getSelect(dl, SelVT, C, V0, V1);
29791 };
29792
29793 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29794 // We can safely do this using i16 shifts as we're only interested in
29795 // the 3 lower bits of each byte.
29796 Amt = DAG.getBitcast(ExtVT, Amt);
29797 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29798 Amt = DAG.getBitcast(VT, Amt);
29799
29800 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29801 // r = VSELECT(r, shift(r, 4), a);
29802 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29803 R = SignBitSelect(VT, Amt, M, R);
29804
29805 // a += a
29806 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29807
29808 // r = VSELECT(r, shift(r, 2), a);
29809 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29810 R = SignBitSelect(VT, Amt, M, R);
29811
29812 // a += a
29813 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29814
29815 // return VSELECT(r, shift(r, 1), a);
29816 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29817 R = SignBitSelect(VT, Amt, M, R);
29818 return R;
29819 }
29820
29821 if (Opc == ISD::SRA) {
29822 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29823 // so we can correctly sign extend. We don't care what happens to the
29824 // lower byte.
29825 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29826 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29827 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29828 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29829 ALo = DAG.getBitcast(ExtVT, ALo);
29830 AHi = DAG.getBitcast(ExtVT, AHi);
29831 RLo = DAG.getBitcast(ExtVT, RLo);
29832 RHi = DAG.getBitcast(ExtVT, RHi);
29833
29834 // r = VSELECT(r, shift(r, 4), a);
29835 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29836 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29837 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29838 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29839
29840 // a += a
29841 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29842 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29843
29844 // r = VSELECT(r, shift(r, 2), a);
29845 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29846 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29847 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29848 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29849
29850 // a += a
29851 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29852 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29853
29854 // r = VSELECT(r, shift(r, 1), a);
29855 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29856 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29857 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29858 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29859
29860 // Logical shift the result back to the lower byte, leaving a zero upper
29861 // byte meaning that we can safely pack with PACKUSWB.
29862 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29863 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29864 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29865 }
29866 }
29867
29868 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29869 MVT ExtVT = MVT::v8i32;
29870 SDValue Z = DAG.getConstant(0, dl, VT);
29871 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29872 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29873 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29874 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29875 ALo = DAG.getBitcast(ExtVT, ALo);
29876 AHi = DAG.getBitcast(ExtVT, AHi);
29877 RLo = DAG.getBitcast(ExtVT, RLo);
29878 RHi = DAG.getBitcast(ExtVT, RHi);
29879 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29880 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29881 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29882 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29883 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29884 }
29885
29886 if (VT == MVT::v8i16) {
29887 // If we have a constant shift amount, the non-SSE41 path is best as
29888 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29889 bool UseSSE41 = Subtarget.hasSSE41() &&
29890 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29891
29892 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29893 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29894 // the sign bit.
29895 if (UseSSE41) {
29896 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29897 V0 = DAG.getBitcast(ExtVT, V0);
29898 V1 = DAG.getBitcast(ExtVT, V1);
29899 Sel = DAG.getBitcast(ExtVT, Sel);
29900 return DAG.getBitcast(
29901 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29902 }
29903 // On pre-SSE41 targets we splat the sign bit - a negative value will
29904 // set all bits of the lanes to true and VSELECT uses that in
29905 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29906 SDValue C =
29907 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29908 return DAG.getSelect(dl, VT, C, V0, V1);
29909 };
29910
29911 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29912 if (UseSSE41) {
29913 // On SSE41 targets we need to replicate the shift mask in both
29914 // bytes for PBLENDVB.
29915 Amt = DAG.getNode(
29916 ISD::OR, dl, VT,
29917 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29918 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29919 } else {
29920 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29921 }
29922
29923 // r = VSELECT(r, shift(r, 8), a);
29924 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29925 R = SignBitSelect(Amt, M, R);
29926
29927 // a += a
29928 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29929
29930 // r = VSELECT(r, shift(r, 4), a);
29931 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29932 R = SignBitSelect(Amt, M, R);
29933
29934 // a += a
29935 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29936
29937 // r = VSELECT(r, shift(r, 2), a);
29938 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29939 R = SignBitSelect(Amt, M, R);
29940
29941 // a += a
29942 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29943
29944 // return VSELECT(r, shift(r, 1), a);
29945 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29946 R = SignBitSelect(Amt, M, R);
29947 return R;
29948 }
29949
29950 // Decompose 256-bit shifts into 128-bit shifts.
29951 if (VT.is256BitVector())
29952 return splitVectorIntBinary(Op, DAG, dl);
29953
29954 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29955 return splitVectorIntBinary(Op, DAG, dl);
29956
29957 return SDValue();
29958}
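// A scalar sketch of the vXi16 "SRL as MULHU" rewrite used above: for a
// constant amount C in [1, 15], x >> C equals the high half of the 32-bit
// product x * (1 << (16 - C)); C == 0 is handled separately by the select on
// a zero amount. The helper below is illustrative only.
namespace {
constexpr uint16_t srl16ViaMulhuSketch(uint16_t X, unsigned C) {
  return uint16_t((uint32_t(X) * (1u << (16 - C))) >> 16);
}
static_assert(srl16ViaMulhuSketch(0xFFFF, 3) == 0x1FFF, "0xFFFF >> 3");
static_assert(srl16ViaMulhuSketch(0x1234, 4) == 0x0123, "0x1234 >> 4");
} // namespace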
29959
29960 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29961 SelectionDAG &DAG) {
29962 MVT VT = Op.getSimpleValueType();
29963 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29964 "Unexpected funnel shift opcode!");
29965
29966 SDLoc DL(Op);
29967 SDValue Op0 = Op.getOperand(0);
29968 SDValue Op1 = Op.getOperand(1);
29969 SDValue Amt = Op.getOperand(2);
29970 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29971 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29972
29973 if (VT.isVector()) {
29974 APInt APIntShiftAmt;
29975 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29976 unsigned NumElts = VT.getVectorNumElements();
29977
29978 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29979 if (IsFSHR)
29980 std::swap(Op0, Op1);
29981
29982 if (IsCstSplat) {
29983 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29984 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29985 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29986 {Op0, Op1, Imm}, DAG, Subtarget);
29987 }
29988 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29989 {Op0, Op1, Amt}, DAG, Subtarget);
29990 }
29991 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29992 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29993 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29994 "Unexpected funnel shift type!");
29995
29996 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29997 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29998 if (IsCstSplat) {
29999 // TODO: Can't use generic expansion as UNDEF amt elements can be
30000 // converted to other values when folded to shift amounts, losing the
30001 // splat.
30002 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30003 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30004 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30005 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
30006 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30007
30008 if (EltSizeInBits == 8 &&
30009 (Subtarget.hasXOP() ||
30010 (useVPTERNLOG(Subtarget, VT) &&
30011 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
30012 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
30013 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30014 // the original vector width to handle cases where we split.
30015 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30016 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30017 SDValue ShX =
30018 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
30019 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
30020 SDValue ShY =
30021 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
30022 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
30023 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
30024 DAG.getConstant(MaskX, DL, VT));
30025 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
30026 DAG.getConstant(MaskY, DL, VT));
30027 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30028 }
30029
30030 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
30031 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
30032 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
30033 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
30034 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30035 }
30036
30037 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30038 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30039 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30040
30041 // Constant vXi16 funnel shifts can be efficiently handled by default.
30042 if (IsCst && EltSizeInBits == 16)
30043 return SDValue();
30044
30045 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30046 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30047 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30048
30049 // Split 256-bit integers on XOP/pre-AVX2 targets.
30050 // Split 512-bit integers on non 512-bit BWI targets.
30051 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30052 !Subtarget.hasAVX2())) ||
30053 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30054 EltSizeInBits < 32)) {
30055 // Pre-mask the amount modulo using the wider vector.
30056 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30057 return splitVectorOp(Op, DAG, DL);
30058 }
30059
30060 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30061 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30062 int ScalarAmtIdx = -1;
30063 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30064 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30065 if (EltSizeInBits == 16)
30066 return SDValue();
30067
30068 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30069 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30070 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30071 ScalarAmtIdx, Subtarget, DAG);
30072 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30073 ScalarAmtIdx, Subtarget, DAG);
30074 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30075 }
30076 }
30077
30078 MVT WideSVT = MVT::getIntegerVT(
30079 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30080 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30081
30082 // If per-element shifts are legal, fallback to generic expansion.
30083 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30084 return SDValue();
30085
30086 // Attempt to fold as:
30087 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30088 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30089 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30090 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30091 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30092 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30093 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30094 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30095 EltSizeInBits, DAG);
30096 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30097 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30098 if (!IsFSHR)
30099 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30100 EltSizeInBits, DAG);
30101 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30102 }
30103
30104 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30105 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30106 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30107 SDValue Z = DAG.getConstant(0, DL, VT);
30108 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30109 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30110 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30111 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30112 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30113 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30114 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30115 }
30116
30117 // Fallback to generic expansion.
30118 return SDValue();
30119 }
30120 assert(
30121 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30122 "Unexpected funnel shift type!");
30123
30124 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30125 bool OptForSize = DAG.shouldOptForSize();
30126 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30127
30128 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30129 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30130 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30131 !isa<ConstantSDNode>(Amt)) {
30132 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30133 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30134 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30135 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30136 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30137 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30138 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30139 if (IsFSHR) {
30140 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30141 } else {
30142 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30143 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30144 }
30145 return DAG.getZExtOrTrunc(Res, DL, VT);
30146 }
30147
30148 if (VT == MVT::i8 || ExpandFunnel)
30149 return SDValue();
30150
30151 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30152 if (VT == MVT::i16) {
30153 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30154 DAG.getConstant(15, DL, Amt.getValueType()));
30155 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30156 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30157 }
30158
30159 return Op;
30160}
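// A scalar sketch of the widening identity used above for i8/i16 funnel
// shifts: concatenate x and y into a double-width value, shift by the masked
// amount, and take the high half. The hypothetical helper mirrors the FSHL
// direction only; the values are hand-checked.
namespace {
constexpr uint8_t fshl8WidenedSketch(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Wide = (unsigned(X) << 8) | Y; // concat(x, y) as a 16-bit value
  return uint8_t((Wide << (Z & 7)) >> 8); // shift left, keep the high byte
}
static_assert(fshl8WidenedSketch(0xAB, 0xCD, 4) == 0xBC,
              "(0xAB << 4) | (0xCD >> 4) == 0xBC");
static_assert(fshl8WidenedSketch(0xAB, 0xCD, 0) == 0xAB, "amount 0 returns x");
} // namespace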
30161
30162static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30163 SelectionDAG &DAG) {
30164 MVT VT = Op.getSimpleValueType();
30165 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30166
30167 SDLoc DL(Op);
30168 SDValue R = Op.getOperand(0);
30169 SDValue Amt = Op.getOperand(1);
30170 unsigned Opcode = Op.getOpcode();
30171 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30172 int NumElts = VT.getVectorNumElements();
30173 bool IsROTL = Opcode == ISD::ROTL;
30174
30175 // Check for constant splat rotation amount.
30176 APInt CstSplatValue;
30177 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30178
30179 // Check for splat rotate by zero.
30180 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30181 return R;
30182
30183 // AVX512 implicitly uses modulo rotation amounts.
30184 if ((Subtarget.hasVLX() ||
30185 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
30186 32 <= EltSizeInBits) {
30187 // Attempt to rotate by immediate.
30188 if (IsCstSplat) {
30189 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30190 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30191 return DAG.getNode(RotOpc, DL, VT, R,
30192 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30193 }
30194
30195 // Else, fall-back on VPROLV/VPRORV.
30196 return Op;
30197 }
30198
30199 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30200 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30201 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30202 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30203 }
30204
30205 SDValue Z = DAG.getConstant(0, DL, VT);
30206
30207 if (!IsROTL) {
30208 // If the ISD::ROTR amount is constant, we're always better converting to
30209 // ISD::ROTL.
30210 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30211 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30212
30213 // XOP targets always prefer ISD::ROTL.
30214 if (Subtarget.hasXOP())
30215 return DAG.getNode(ISD::ROTL, DL, VT, R,
30216 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30217 }
30218
30219 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
30220 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30221 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30222 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30223 uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
30224 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
30225 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
30226 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30227 DAG.getTargetConstant(0, DL, MVT::i8));
30228 }
30229
30230 // Split 256-bit integers on XOP/pre-AVX2 targets.
30231 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30232 return splitVectorIntBinary(Op, DAG, DL);
30233
30234 // XOP has 128-bit vector variable + immediate rotates.
30235 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30236 // XOP implicitly uses modulo rotation amounts.
30237 if (Subtarget.hasXOP()) {
30238 assert(IsROTL && "Only ROTL expected");
30239 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30240
30241 // Attempt to rotate by immediate.
30242 if (IsCstSplat) {
30243 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30244 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30245 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30246 }
30247
30248 // Use general rotate by variable (per-element).
30249 return Op;
30250 }
30251
30252 // Rotate by a uniform constant - expand back to shifts.
30253 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30254 // to other values when folded to shift amounts, losing the splat.
30255 if (IsCstSplat) {
30256 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30257 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30258 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30259 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30260 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30261 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30262 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30263 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30264 }
30265
30266 // Split 512-bit integers on non 512-bit BWI targets.
30267 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30268 return splitVectorIntBinary(Op, DAG, DL);
30269
30270 assert(
30271 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30272 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30273 Subtarget.hasAVX2()) ||
30274 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30275 "Only vXi32/vXi16/vXi8 vector rotates supported");
30276
30277 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30278 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30279
30280 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30281 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30282
30283 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30284 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30285 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
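// As a rough worked illustration of this fold (values chosen purely for
// illustration): for a vXi8 rotl, unpack(x,x) gives a 16-bit lane holding
// (x << 8) | x, so with x = 0x96 and y = 3 the lane is 0x9696; shifting left
// by 3 gives 0xB4B0 and the packed high byte 0xB4 equals rotl8(0x96, 3).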
30286 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30287 int BaseRotAmtIdx = -1;
30288 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30289 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30290 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30291 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30292 }
30293 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30294 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30295 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30296 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30297 BaseRotAmtIdx, Subtarget, DAG);
30298 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30299 BaseRotAmtIdx, Subtarget, DAG);
30300 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30301 }
30302 }
30303
30304 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30305 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30306
30307 // Attempt to fold as unpack(x,x) << zext(y):
30308 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30309 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30310 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30311 if (!(ConstantAmt && EltSizeInBits != 8) &&
30312 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30313 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30314 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30315 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30316 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30317 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30318 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30319 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30320 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30321 }
30322
30323 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30324 // the amount bit.
30325 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30326 if (EltSizeInBits == 8) {
30327 MVT WideVT =
30328 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30329
30330 // Attempt to fold as:
30331 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30332 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
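// A quick sanity check of this fold with illustrative numbers: for a vXi8
// rotl with x = 0x96, R becomes zext(x) | (zext(x) << 8) = 0x9696 in the wide
// lane; shifting left by (y & 7) = 3 gives 0xB4B0, and the final srl-by-8 plus
// truncate yields 0xB4 = rotl8(0x96, 3). For rotr the plain truncate takes the
// low byte instead: 0x9696 >> 3 = 0x12D2 -> 0xD2 = rotr8(0x96, 3).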
30333 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30334 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30335 // If we're rotating by constant, just use default promotion.
30336 if (ConstantAmt)
30337 return SDValue();
30338 // See if we can perform this by widening to vXi16 or vXi32.
30339 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30340 R = DAG.getNode(
30341 ISD::OR, DL, WideVT, R,
30342 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30343 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30344 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30345 if (IsROTL)
30346 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30347 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30348 }
30349
30350 // We don't need ModuloAmt here as we just peek at individual bits.
30351 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30352 if (Subtarget.hasSSE41()) {
30353 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30354 // on the sign bit.
30355 V0 = DAG.getBitcast(VT, V0);
30356 V1 = DAG.getBitcast(VT, V1);
30357 Sel = DAG.getBitcast(VT, Sel);
30358 return DAG.getBitcast(SelVT,
30359 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30360 }
30361 // On pre-SSE41 targets we test for the sign bit by comparing to
30362 // zero - a negative value will set all bits of the lanes to true
30363 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30364 SDValue Z = DAG.getConstant(0, DL, SelVT);
30365 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30366 return DAG.getSelect(DL, SelVT, C, V0, V1);
30367 };
30368
30369 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30370 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30371 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30372 IsROTL = true;
30373 }
30374
30375 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30376 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30377
30378 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30379 // We can safely do this using i16 shifts as we're only interested in
30380 // the 3 lower bits of each byte.
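// For instance, an amount of 5 (0b101) becomes 0b1010'0000 after the shift:
// the sign bit selects the rot-by-4 step, the next doubling exposes a 0 so
// rot-by-2 is skipped, and the final doubling exposes a 1 so rot-by-1 is
// applied, for a total rotation of 5.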
30381 Amt = DAG.getBitcast(ExtVT, Amt);
30382 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30383 Amt = DAG.getBitcast(VT, Amt);
30384
30385 // r = VSELECT(r, rot(r, 4), a);
30386 SDValue M;
30387 M = DAG.getNode(
30388 ISD::OR, DL, VT,
30389 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30390 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30391 R = SignBitSelect(VT, Amt, M, R);
30392
30393 // a += a
30394 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30395
30396 // r = VSELECT(r, rot(r, 2), a);
30397 M = DAG.getNode(
30398 ISD::OR, DL, VT,
30399 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30400 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30401 R = SignBitSelect(VT, Amt, M, R);
30402
30403 // a += a
30404 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30405
30406 // return VSELECT(r, rot(r, 1), a);
30407 M = DAG.getNode(
30408 ISD::OR, DL, VT,
30409 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30410 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30411 return SignBitSelect(VT, Amt, M, R);
30412 }
30413
30414 bool IsSplatAmt = DAG.isSplatValue(Amt);
30415 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30416 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30417
30418 // Fallback for splats + all supported variable shifts.
30419 // Fallback for non-constants AVX2 vXi16 as well.
30420 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30421 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30422 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30423 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30424 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30425 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30426 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30427 }
30428
30429 // Everything below assumes ISD::ROTL.
30430 if (!IsROTL) {
30431 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30432 IsROTL = true;
30433 }
30434
30435 // ISD::ROT* uses modulo rotate amounts.
30436 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30437
30438 assert(IsROTL && "Only ROTL supported");
30439
30440 // As with shifts, attempt to convert the rotation amount to a multiplication
30441 // factor, fallback to general expansion.
30442 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30443 if (!Scale)
30444 return SDValue();
30445
30446 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
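// A small worked example of the scale trick (illustrative values): for
// rotl16(0x8001, 1) the scale is 1 << 1 = 2, the low half of the product is
// 0x0002, MULHU returns the wrapped top bit as 0x0001, and OR'ing them gives
// 0x0003, the expected rotation.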
30447 if (EltSizeInBits == 16) {
30448 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30449 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30450 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30451 }
30452
30453 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30454 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30455 // that can then be OR'd with the lower 32-bits.
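// For example, rotating the lane 0x80000001 left by 4 uses scale 16: the
// 64-bit PMULUDQ product is 0x8'00000010, whose low half 0x00000010 and high
// half 0x00000008 OR together to 0x00000018 == rotl32(0x80000001, 4).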
30456 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30457 static const int OddMask[] = {1, -1, 3, -1};
30458 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30459 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30460
30461 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30462 DAG.getBitcast(MVT::v2i64, R),
30463 DAG.getBitcast(MVT::v2i64, Scale));
30464 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30465 DAG.getBitcast(MVT::v2i64, R13),
30466 DAG.getBitcast(MVT::v2i64, Scale13));
30467 Res02 = DAG.getBitcast(VT, Res02);
30468 Res13 = DAG.getBitcast(VT, Res13);
30469
30470 return DAG.getNode(ISD::OR, DL, VT,
30471 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30472 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30473}
30474
30475/// Returns true if the operand type is exactly twice the native width, and
30476/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30477/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30478/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30479bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30480 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30481
30482 if (OpWidth == 64)
30483 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30484 if (OpWidth == 128)
30485 return Subtarget.canUseCMPXCHG16B();
30486
30487 return false;
30488}
30489
30490 TargetLowering::AtomicExpansionKind
30491 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30492 Type *MemType = SI->getValueOperand()->getType();
30493
30494 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30495 !Subtarget.useSoftFloat()) {
30496 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30497 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30498 return AtomicExpansionKind::None;
30499
30500 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30501 Subtarget.hasAVX())
30502 return AtomicExpansionKind::None;
30503 }
30504
30505 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30506 : AtomicExpansionKind::None;
30507 }
30508
30509 // Note: this turns large loads into lock cmpxchg8b/16b.
30510 TargetLowering::AtomicExpansionKind
30511 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30512 Type *MemType = LI->getType();
30513
30514 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
30515 !Subtarget.useSoftFloat()) {
30516 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30517 // can use movq to do the load. If we have X87 we can load into an 80-bit
30518 // X87 register and store it to a stack temporary.
30519 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30520 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30521 return AtomicExpansionKind::None;
30522
30523 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
30524 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
30525 Subtarget.hasAVX())
30526 return AtomicExpansionKind::None;
30527 }
30528
30529 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30530 : AtomicExpansionKind::None;
30531 }
30532
30533 enum BitTestKind : unsigned {
30534 UndefBit,
30535 ConstantBit,
30536 NotConstantBit,
30537 ShiftBit,
30538 NotShiftBit
30539 };
30540
30541static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30542 using namespace llvm::PatternMatch;
30543 BitTestKind BTK = UndefBit;
30544 auto *C = dyn_cast<ConstantInt>(V);
30545 if (C) {
30546 // Check if V is a power of 2 or NOT power of 2.
30547 if (isPowerOf2_64(C->getZExtValue()))
30548 BTK = ConstantBit;
30549 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30550 BTK = NotConstantBit;
30551 return {V, BTK};
30552 }
30553
30554 // Check if V is some power of 2 pattern known to be non-zero
30555 auto *I = dyn_cast<Instruction>(V);
30556 if (I) {
30557 bool Not = false;
30558 // Check if we have a NOT
30559 Value *PeekI;
30560 if (match(I, m_Not(m_Value(PeekI))) ||
30561 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30562 Not = true;
30563 I = dyn_cast<Instruction>(PeekI);
30564
30565 // If I is constant, it will fold and we can evaluate later. If it's an
30566 // argument or something of that nature, we can't analyze.
30567 if (I == nullptr)
30568 return {nullptr, UndefBit};
30569 }
30570 // We can only use 1 << X without more sophisticated analysis. C << X where
30571 // C is a power of 2 but not 1 can result in zero which cannot be translated
30572 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30573 if (I->getOpcode() == Instruction::Shl) {
30574 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30575 // -X` and some other provable power of 2 patterns that we can use CTZ on
30576 // may be profitable.
30577 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30578 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30579 // be provably a non-zero power of 2.
30580 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30581 // transformable to bittest.
30582 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30583 if (!ShiftVal)
30584 return {nullptr, UndefBit};
30585 if (ShiftVal->equalsInt(1))
30586 BTK = Not ? NotShiftBit : ShiftBit;
30587
30588 if (BTK == UndefBit)
30589 return {nullptr, UndefBit};
30590
30591 Value *BitV = I->getOperand(1);
30592
30593 Value *AndOp;
30594 const APInt *AndC;
30595 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30596 // Read past a shiftmask instruction to find count
30597 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30598 BitV = AndOp;
30599 }
30600 return {BitV, BTK};
30601 }
30602 }
30603 return {nullptr, UndefBit};
30604}
30605
30606 TargetLowering::AtomicExpansionKind
30607 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30608 using namespace llvm::PatternMatch;
30609 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30610 // prefix to a normal instruction for these operations.
30611 if (AI->use_empty())
30612 return AtomicExpansionKind::None;
30613
30614 if (AI->getOperation() == AtomicRMWInst::Xor) {
30615 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30616 // preferable to both `cmpxchg` and `btc`.
30617 if (match(AI->getOperand(1), m_SignMask()))
30618 return AtomicExpansionKind::None;
30619 }
30620
30621 // If the atomicrmw's result is used by a single bit AND, we may use
30622 // bts/btr/btc instruction for these operations.
30623 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30624 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30625 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30626 // detect it.
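// Roughly, the pattern being recognised here is (with %mask known to be a
// single set bit, e.g. 1 << %n):
//   %old = atomicrmw or ptr %p, i32 %mask seq_cst
//   %bit = and i32 %old, %mask
// which can be selected as a single LOCK BTS whose carry flag already holds
// the old value of the tested bit.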
30627 Instruction *I = AI->user_back();
30628 auto BitChange = FindSingleBitChange(AI->getValOperand());
30629 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30630 I->getOpcode() != Instruction::And ||
30631 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30632 AI->getParent() != I->getParent())
30633 return AtomicExpansionKind::CmpXChg;
30634
30635 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30636
30637 // This is a redundant AND, it should get cleaned up elsewhere.
30638 if (AI == I->getOperand(OtherIdx))
30639 return AtomicExpansionKind::CmpXChg;
30640
30641 // The following instruction must be an AND with a single bit.
30642 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30643 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30644 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30645 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30646 return AtomicExpansionKind::CmpXChg;
30647 }
30648 if (AI->getOperation() == AtomicRMWInst::And) {
30649 return ~C1->getValue() == C2->getValue()
30650 ? AtomicExpansionKind::BitTestIntrinsic
30651 : AtomicExpansionKind::CmpXChg;
30652 }
30653 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30654 : AtomicExpansionKind::CmpXChg;
30655 }
30656
30657 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30658
30659 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30660 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30661 return AtomicExpansionKind::CmpXChg;
30662
30663 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30664
30665 // If shift amounts are not the same we can't use BitTestIntrinsic.
30666 if (BitChange.first != BitTested.first)
30667 return AtomicExpansionKind::CmpXChg;
30668
30669 // For atomic AND we need to be masking all but one bit and testing the
30670 // one bit that is unset in the mask.
30671 if (AI->getOperation() == AtomicRMWInst::And)
30672 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30673 ? AtomicExpansionKind::BitTestIntrinsic
30674 : AtomicExpansionKind::CmpXChg;
30675
30676 // For atomic XOR/OR we need to be setting and testing the same bit.
30677 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30678 ? AtomicExpansionKind::BitTestIntrinsic
30679 : AtomicExpansionKind::CmpXChg;
30680 }
30681
30682void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30683 IRBuilder<> Builder(AI);
30684 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30685 Intrinsic::ID IID_C;
30686 Intrinsic::ID IID_I;
30687 switch (AI->getOperation()) {
30688 default:
30689 llvm_unreachable("Unknown atomic operation");
30690 case AtomicRMWInst::Or:
30691 IID_C = Intrinsic::x86_atomic_bts;
30692 IID_I = Intrinsic::x86_atomic_bts_rm;
30693 break;
30694 case AtomicRMWInst::Xor:
30695 IID_C = Intrinsic::x86_atomic_btc;
30696 IID_I = Intrinsic::x86_atomic_btc_rm;
30697 break;
30698 case AtomicRMWInst::And:
30699 IID_C = Intrinsic::x86_atomic_btr;
30700 IID_I = Intrinsic::x86_atomic_btr_rm;
30701 break;
30702 }
30703 Instruction *I = AI->user_back();
30704 LLVMContext &Ctx = AI->getContext();
30705 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30706 PointerType::getUnqual(Ctx));
30707 Function *BitTest = nullptr;
30708 Value *Result = nullptr;
30709 auto BitTested = FindSingleBitChange(AI->getValOperand());
30710 assert(BitTested.first != nullptr);
30711
30712 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30713 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30714
30715 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30716
30717 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30718 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30719 } else {
30720 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30721
30722 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30723
30724 Value *SI = BitTested.first;
30725 assert(SI != nullptr);
30726
30727 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
30728 // need to mask it.
30729 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30730 Value *BitPos =
30731 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30732 // Todo(1): In many cases it may be provable that SI is less than
30733 // ShiftBits in which case this mask is unnecessary
30734 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30735 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30736 // favor of just a raw BT{S|R|C}.
30737
30738 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30739 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30740
30741 // If the result is only used for zero/non-zero status then we don't need
30742 // to shift the value back. Otherwise do so.
30743 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30744 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30745 if (ICmp->isEquality()) {
30746 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30747 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30748 if (C0 || C1) {
30749 assert(C0 == nullptr || C1 == nullptr);
30750 if ((C0 ? C0 : C1)->isZero())
30751 continue;
30752 }
30753 }
30754 }
30755 Result = Builder.CreateShl(Result, BitPos);
30756 break;
30757 }
30758 }
30759
30760 I->replaceAllUsesWith(Result);
30761 I->eraseFromParent();
30762 AI->eraseFromParent();
30763}
30764
30765 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30766 using namespace llvm::PatternMatch;
30767 if (!AI->hasOneUse())
30768 return false;
30769
30770 Value *Op = AI->getOperand(1);
30771 ICmpInst::Predicate Pred;
30772 Instruction *I = AI->user_back();
30773 AtomicRMWInst::BinOp Opc = AI->getOperation();
30774 if (Opc == AtomicRMWInst::Add) {
30775 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30776 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30777 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30778 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30779 return Pred == CmpInst::ICMP_SLT;
30780 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30781 return Pred == CmpInst::ICMP_SGT;
30782 }
30783 return false;
30784 }
30785 if (Opc == AtomicRMWInst::Sub) {
30786 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30787 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30788 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30789 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30790 return Pred == CmpInst::ICMP_SLT;
30791 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30792 return Pred == CmpInst::ICMP_SGT;
30793 }
30794 return false;
30795 }
30796 if ((Opc == AtomicRMWInst::Or &&
30797 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30798 (Opc == AtomicRMWInst::And &&
30799 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30800 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30801 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30802 Pred == CmpInst::ICMP_SLT;
30803 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30804 return Pred == CmpInst::ICMP_SGT;
30805 return false;
30806 }
30807 if (Opc == AtomicRMWInst::Xor) {
30808 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30809 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30810 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30811 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30812 return Pred == CmpInst::ICMP_SLT;
30813 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30814 return Pred == CmpInst::ICMP_SGT;
30815 }
30816 return false;
30817 }
30818
30819 return false;
30820}
30821
30822void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30823 AtomicRMWInst *AI) const {
30824 IRBuilder<> Builder(AI);
30825 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30826 Instruction *TempI = nullptr;
30827 LLVMContext &Ctx = AI->getContext();
30828 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30829 if (!ICI) {
30830 TempI = AI->user_back();
30831 assert(TempI->hasOneUse() && "Must have one use");
30832 ICI = cast<ICmpInst>(TempI->user_back());
30833 }
30834 X86::CondCode CC = X86::COND_INVALID;
30835 ICmpInst::Predicate Pred = ICI->getPredicate();
30836 switch (Pred) {
30837 default:
30838 llvm_unreachable("Not supported Pred");
30839 case CmpInst::ICMP_EQ:
30840 CC = X86::COND_E;
30841 break;
30842 case CmpInst::ICMP_NE:
30843 CC = X86::COND_NE;
30844 break;
30845 case CmpInst::ICMP_SLT:
30846 CC = X86::COND_S;
30847 break;
30848 case CmpInst::ICMP_SGT:
30849 CC = X86::COND_NS;
30850 break;
30851 }
30852 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30853 switch (AI->getOperation()) {
30854 default:
30855 llvm_unreachable("Unknown atomic operation");
30856 case AtomicRMWInst::Add:
30857 IID = Intrinsic::x86_atomic_add_cc;
30858 break;
30859 case AtomicRMWInst::Sub:
30860 IID = Intrinsic::x86_atomic_sub_cc;
30861 break;
30862 case AtomicRMWInst::Or:
30863 IID = Intrinsic::x86_atomic_or_cc;
30864 break;
30865 case AtomicRMWInst::And:
30866 IID = Intrinsic::x86_atomic_and_cc;
30867 break;
30868 case AtomicRMWInst::Xor:
30869 IID = Intrinsic::x86_atomic_xor_cc;
30870 break;
30871 }
30872 Function *CmpArith =
30873 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30874 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30875 PointerType::getUnqual(Ctx));
30876 Value *Call = Builder.CreateCall(
30877 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30878 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30879 ICI->replaceAllUsesWith(Result);
30880 ICI->eraseFromParent();
30881 if (TempI)
30882 TempI->eraseFromParent();
30883 AI->eraseFromParent();
30884}
30885
30886 TargetLowering::AtomicExpansionKind
30887 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30888 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30889 Type *MemType = AI->getType();
30890
30891 // If the operand is too big, we must see if cmpxchg8/16b is available
30892 // and default to library calls otherwise.
30893 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30894 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30895 : AtomicExpansionKind::None;
30896 }
30897
30898 AtomicRMWInst::BinOp Op = AI->getOperation();
30899 switch (Op) {
30900 case AtomicRMWInst::Xchg:
30901 return AtomicExpansionKind::None;
30902 case AtomicRMWInst::Add:
30903 case AtomicRMWInst::Sub:
30904 if (shouldExpandCmpArithRMWInIR(AI))
30905 return AtomicExpansionKind::CmpArithIntrinsic;
30906 // It's better to use xadd, xsub or xchg for these in other cases.
30907 return AtomicExpansionKind::None;
30908 case AtomicRMWInst::Or:
30909 case AtomicRMWInst::And:
30910 case AtomicRMWInst::Xor:
30911 if (shouldExpandCmpArithRMWInIR(AI))
30912 return AtomicExpansionKind::CmpArithIntrinsic;
30913 return shouldExpandLogicAtomicRMWInIR(AI);
30914 case AtomicRMWInst::Nand:
30915 case AtomicRMWInst::Max:
30916 case AtomicRMWInst::Min:
30917 case AtomicRMWInst::UMax:
30918 case AtomicRMWInst::UMin:
30919 case AtomicRMWInst::FAdd:
30920 case AtomicRMWInst::FSub:
30921 case AtomicRMWInst::FMax:
30922 case AtomicRMWInst::FMin:
30923 case AtomicRMWInst::UIncWrap:
30924 case AtomicRMWInst::UDecWrap:
30925 default:
30926 // These always require a non-trivial set of data operations on x86. We must
30927 // use a cmpxchg loop.
30928 return AtomicExpansionKind::CmpXChg;
30929 }
30930}
30931
30932LoadInst *
30933X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30934 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30935 Type *MemType = AI->getType();
30936 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30937 // there is no benefit in turning such RMWs into loads, and it is actually
30938 // harmful as it introduces a mfence.
30939 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30940 return nullptr;
30941
30942 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30943 // lowering available in lowerAtomicArith.
30944 // TODO: push more cases through this path.
30945 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30946 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30947 AI->use_empty())
30948 return nullptr;
30949
30950 IRBuilder<> Builder(AI);
30951 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30952 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30953 auto SSID = AI->getSyncScopeID();
30954 // We must restrict the ordering to avoid generating loads with Release or
30955 // ReleaseAcquire orderings.
30956 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30957
30958 // Before the load we need a fence. Here is an example lifted from
30959 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30960 // is required:
30961 // Thread 0:
30962 // x.store(1, relaxed);
30963 // r1 = y.fetch_add(0, release);
30964 // Thread 1:
30965 // y.fetch_add(42, acquire);
30966 // r2 = x.load(relaxed);
30967 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30968 // lowered to just a load without a fence. A mfence flushes the store buffer,
30969 // making the optimization clearly correct.
30970 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30971 // otherwise, we might be able to be more aggressive on relaxed idempotent
30972 // rmw. In practice, they do not look useful, so we don't try to be
30973 // especially clever.
30974 if (SSID == SyncScope::SingleThread)
30975 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30976 // the IR level, so we must wrap it in an intrinsic.
30977 return nullptr;
30978
30979 if (!Subtarget.hasMFence())
30980 // FIXME: it might make sense to use a locked operation here but on a
30981 // different cache-line to prevent cache-line bouncing. In practice it
30982 // is probably a small win, and x86 processors without mfence are rare
30983 // enough that we do not bother.
30984 return nullptr;
30985
30986 Function *MFence =
30987 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30988 Builder.CreateCall(MFence, {});
30989
30990 // Finally we can emit the atomic load.
30991 LoadInst *Loaded = Builder.CreateAlignedLoad(
30992 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30993 Loaded->setAtomic(Order, SSID);
30994 AI->replaceAllUsesWith(Loaded);
30995 AI->eraseFromParent();
30996 return Loaded;
30997}
30998
30999/// Emit a locked operation on a stack location which does not change any
31000/// memory location, but does involve a lock prefix. Location is chosen to be
31001/// a) very likely accessed only by a single thread to minimize cache traffic,
31002/// and b) definitely dereferenceable. Returns the new Chain result.
31003 static SDValue emitLockedStackOp(SelectionDAG &DAG,
31004 const X86Subtarget &Subtarget, SDValue Chain,
31005 const SDLoc &DL) {
31006 // Implementation notes:
31007 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31008 // operations issued by the current processor. As such, the location
31009 // referenced is not relevant for the ordering properties of the instruction.
31010 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31011 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31012 // 2) Using an immediate operand appears to be the best encoding choice
31013 // here since it doesn't require an extra register.
31014 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31015 // is small enough it might just be measurement noise.)
31016 // 4) When choosing offsets, there are several contributing factors:
31017 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31018 // line aligned stack object to improve this case.)
31019 // b) To minimize our chances of introducing a false dependence, we prefer
31020 // to offset the stack usage from TOS slightly.
31021 // c) To minimize concerns about cross thread stack usage - in particular,
31022 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31023 // captures state in the TOS frame and accesses it from many threads -
31024 // we want to use an offset such that the offset is in a distinct cache
31025 // line from the TOS frame.
31026 //
31027 // For a general discussion of the tradeoffs and benchmark results, see:
31028 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
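// The end result is typically something like `lock orl $0x0, -64(%rsp)` when
// a red zone is available (offset 0 otherwise): an OR with zero that leaves
// memory unchanged but still carries the LOCK barrier semantics.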
31029
31030 auto &MF = DAG.getMachineFunction();
31031 auto &TFL = *Subtarget.getFrameLowering();
31032 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31033
31034 if (Subtarget.is64Bit()) {
31035 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31036 SDValue Ops[] = {
31037 DAG.getRegister(X86::RSP, MVT::i64), // Base
31038 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31039 DAG.getRegister(0, MVT::i64), // Index
31040 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31041 DAG.getRegister(0, MVT::i16), // Segment.
31042 Zero,
31043 Chain};
31044 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31045 MVT::Other, Ops);
31046 return SDValue(Res, 1);
31047 }
31048
31049 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31050 SDValue Ops[] = {
31051 DAG.getRegister(X86::ESP, MVT::i32), // Base
31052 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31053 DAG.getRegister(0, MVT::i32), // Index
31054 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31055 DAG.getRegister(0, MVT::i16), // Segment.
31056 Zero,
31057 Chain
31058 };
31059 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31060 MVT::Other, Ops);
31061 return SDValue(Res, 1);
31062}
31063
31064 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
31065 SelectionDAG &DAG) {
31066 SDLoc dl(Op);
31067 AtomicOrdering FenceOrdering =
31068 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
31069 SyncScope::ID FenceSSID =
31070 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
31071
31072 // The only fence that needs an instruction is a sequentially-consistent
31073 // cross-thread fence.
31074 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
31075 FenceSSID == SyncScope::System) {
31076 if (Subtarget.hasMFence())
31077 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
31078
31079 SDValue Chain = Op.getOperand(0);
31080 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
31081 }
31082
31083 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31084 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31085}
31086
31087 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
31088 SelectionDAG &DAG) {
31089 MVT T = Op.getSimpleValueType();
31090 SDLoc DL(Op);
31091 unsigned Reg = 0;
31092 unsigned size = 0;
31093 switch(T.SimpleTy) {
31094 default: llvm_unreachable("Invalid value type!");
31095 case MVT::i8: Reg = X86::AL; size = 1; break;
31096 case MVT::i16: Reg = X86::AX; size = 2; break;
31097 case MVT::i32: Reg = X86::EAX; size = 4; break;
31098 case MVT::i64:
31099 assert(Subtarget.is64Bit() && "Node not type legal!");
31100 Reg = X86::RAX; size = 8;
31101 break;
31102 }
31103 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31104 Op.getOperand(2), SDValue());
31105 SDValue Ops[] = { cpIn.getValue(0),
31106 Op.getOperand(1),
31107 Op.getOperand(3),
31108 DAG.getTargetConstant(size, DL, MVT::i8),
31109 cpIn.getValue(1) };
31110 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31111 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31112 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
31113 Ops, T, MMO);
31114
31115 SDValue cpOut =
31116 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
31117 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31118 MVT::i32, cpOut.getValue(2));
31119 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31120
31121 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31122 cpOut, Success, EFLAGS.getValue(1));
31123}
31124
31125// Create MOVMSKB, taking into account whether we need to split for AVX1.
31126 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
31127 const X86Subtarget &Subtarget) {
31128 MVT InVT = V.getSimpleValueType();
31129
31130 if (InVT == MVT::v64i8) {
31131 SDValue Lo, Hi;
31132 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31133 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31134 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31135 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31136 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31137 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31138 DAG.getConstant(32, DL, MVT::i8));
31139 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
31140 }
31141 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31142 SDValue Lo, Hi;
31143 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31144 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31145 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31146 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31147 DAG.getConstant(16, DL, MVT::i8));
31148 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31149 }
31150
31151 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31152}
31153
31154static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31155 SelectionDAG &DAG) {
31156 SDValue Src = Op.getOperand(0);
31157 MVT SrcVT = Src.getSimpleValueType();
31158 MVT DstVT = Op.getSimpleValueType();
31159
31160 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31161 // half to v32i1 and concatenating the result.
31162 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31163 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31164 assert(Subtarget.hasBWI() && "Expected BWI target");
31165 SDLoc dl(Op);
31166 SDValue Lo, Hi;
31167 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31168 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31169 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31170 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31171 }
31172
31173 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31174 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31175 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31176 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31177 SDLoc DL(Op);
31178 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31179 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31180 return DAG.getZExtOrTrunc(V, DL, DstVT);
31181 }
31182
31183 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31184 SrcVT == MVT::i64) && "Unexpected VT!");
31185
31186 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31187 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31188 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31189 // This conversion needs to be expanded.
31190 return SDValue();
31191
31192 SDLoc dl(Op);
31193 if (SrcVT.isVector()) {
31194 // Widen the vector in input in the case of MVT::v2i32.
31195 // Example: from MVT::v2i32 to MVT::v4i32.
31196 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31197 SrcVT.getVectorNumElements() * 2);
31198 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31199 DAG.getUNDEF(SrcVT));
31200 } else {
31201 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31202 "Unexpected source type in LowerBITCAST");
31203 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31204 }
31205
31206 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31207 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31208
31209 if (DstVT == MVT::x86mmx)
31210 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31211
31212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31213 DAG.getIntPtrConstant(0, dl));
31214}
31215
31216/// Compute the horizontal sum of bytes in V for the elements of VT.
31217///
31218/// Requires V to be a byte vector and VT to be an integer vector type with
31219/// wider elements than V's type. The width of the elements of VT determines
31220/// how many bytes of V are summed horizontally to produce each element of the
31221/// result.
31222 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31223 const X86Subtarget &Subtarget,
31224 SelectionDAG &DAG) {
31225 SDLoc DL(V);
31226 MVT ByteVecVT = V.getSimpleValueType();
31227 MVT EltVT = VT.getVectorElementType();
31228 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31229 "Expected value to have byte element type.");
31230 assert(EltVT != MVT::i8 &&
31231 "Horizontal byte sum only makes sense for wider elements!");
31232 unsigned VecSize = VT.getSizeInBits();
31233 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31234
31235 // The PSADBW instruction horizontally adds all bytes and leaves the result
31236 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
31237 if (EltVT == MVT::i64) {
31238 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31239 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31240 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31241 return DAG.getBitcast(VT, V);
31242 }
31243
31244 if (EltVT == MVT::i32) {
31245 // We unpack the low half and high half into i32s interleaved with zeros so
31246 // that we can use PSADBW to horizontally sum them. The most useful part of
31247 // this is that it lines up the results of two PSADBW instructions to be
31248 // two v2i64 vectors which concatenated are the 4 population counts. We can
31249 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
31250 SDValue Zeros = DAG.getConstant(0, DL, VT);
31251 SDValue V32 = DAG.getBitcast(VT, V);
31252 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31253 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31254
31255 // Do the horizontal sums into two v2i64s.
31256 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31257 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31258 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31259 DAG.getBitcast(ByteVecVT, Low), Zeros);
31260 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31261 DAG.getBitcast(ByteVecVT, High), Zeros);
31262
31263 // Merge them together.
31264 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31265 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31266 DAG.getBitcast(ShortVecVT, Low),
31267 DAG.getBitcast(ShortVecVT, High));
31268
31269 return DAG.getBitcast(VT, V);
31270 }
31271
31272 // The only element type left is i16.
31273 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31274
31275 // To obtain pop count for each i16 element starting from the pop count for
31276 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31277 // right by 8. It is important to shift as i16s as i8 vector shift isn't
31278 // directly supported.
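// As a quick illustration: a 16-bit lane whose bytes hold the counts 3 and 5
// is 0x0305; after the shl-by-8 it is 0x0500, the byte-wise add gives 0x0805,
// and the final srl-by-8 leaves 0x0008 = 3 + 5 in the lane.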
31279 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31280 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31281 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31282 DAG.getBitcast(ByteVecVT, V));
31283 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31284}
31285
31286 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31287 const X86Subtarget &Subtarget,
31288 SelectionDAG &DAG) {
31289 MVT VT = Op.getSimpleValueType();
31290 MVT EltVT = VT.getVectorElementType();
31291 int NumElts = VT.getVectorNumElements();
31292 (void)EltVT;
31293 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31294
31295 // Implement a lookup table in register by using an algorithm based on:
31296 // http://wm.ite.pl/articles/sse-popcount.html
31297 //
31298 // The general idea is that every lower byte nibble in the input vector is an
31299 // index into an in-register pre-computed pop count table. We then split up the
31300 // input vector into two new ones: (1) a vector with only the shifted-right
31301 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31302 // masked out higher ones) for each byte. PSHUFB is used separately with both
31303 // to index the in-register table. Next, both are added and the result is an
31304 // i8 vector where each element contains the pop count for the input byte.
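// For example, the byte 0xE7 is split into the high nibble 0xE and the low
// nibble 0x7; the table lookups return 3 and 3 respectively, and their sum 6
// is indeed popcount(0b11100111).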
31305 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31306 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31307 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31308 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
31309
31310 SmallVector<SDValue, 64> LUTVec;
31311 for (int i = 0; i < NumElts; ++i)
31312 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31313 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31314 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31315
31316 // High nibbles
31317 SDValue FourV = DAG.getConstant(4, DL, VT);
31318 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31319
31320 // Low nibbles
31321 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31322
31323 // The input vector is used as the shuffle mask that index elements into the
31324 // LUT. After counting low and high nibbles, add the vector to obtain the
31325 // final pop count per i8 element.
31326 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31327 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31328 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31329}
31330
31331// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31332// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31333 static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31334 const X86Subtarget &Subtarget,
31335 SelectionDAG &DAG) {
31336 MVT VT = Op.getSimpleValueType();
31337 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31338 "Unknown CTPOP type to handle");
31339 SDValue Op0 = Op.getOperand(0);
31340
31341 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31342 if (Subtarget.hasVPOPCNTDQ()) {
31343 unsigned NumElems = VT.getVectorNumElements();
31344 assert((VT.getVectorElementType() == MVT::i8 ||
31345 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31346 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31347 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31348 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31349 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31350 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31351 }
31352 }
31353
31354 // Decompose 256-bit ops into smaller 128-bit ops.
31355 if (VT.is256BitVector() && !Subtarget.hasInt256())
31356 return splitVectorIntUnary(Op, DAG, DL);
31357
31358 // Decompose 512-bit ops into smaller 256-bit ops.
31359 if (VT.is512BitVector() && !Subtarget.hasBWI())
31360 return splitVectorIntUnary(Op, DAG, DL);
31361
31362 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31363 if (VT.getScalarType() != MVT::i8) {
31364 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31365 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31366 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31367 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31368 }
31369
31370 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31371 if (!Subtarget.hasSSSE3())
31372 return SDValue();
31373
31374 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31375}
31376
31377static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31378 SelectionDAG &DAG) {
31379 MVT VT = N.getSimpleValueType();
31380 SDValue Op = N.getOperand(0);
31381 SDLoc DL(N);
31382
31383 if (VT.isScalarInteger()) {
31384 // Compute the lower/upper bounds of the active bits of the value,
31385 // allowing us to shift the active bits down if necessary to fit into the
31386 // special cases below.
31387 KnownBits Known = DAG.computeKnownBits(Op);
31388 unsigned LZ = Known.countMinLeadingZeros();
31389 unsigned TZ = Known.countMinTrailingZeros();
31390 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31391 unsigned ActiveBits = Known.getBitWidth() - LZ;
31392 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31393
31394 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
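// (For a 2-bit value this is exact: 0b11 -> 3 - 1 = 2, 0b10 -> 2 - 1 = 1,
// 0b01 -> 1 - 0 = 1, and 0b00 -> 0.)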
31395 if (ShiftedActiveBits <= 2) {
31396 if (ActiveBits > 2)
31397 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31398 DAG.getShiftAmountConstant(TZ, VT, DL));
31399 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31400 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31401 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31402 DAG.getShiftAmountConstant(1, VT, DL)));
31403 return DAG.getZExtOrTrunc(Op, DL, VT);
31404 }
31405
31406 // i3 CTPOP - perform LUT into i32 integer.
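// The constant below packs the 2-bit popcounts of 0..7 into consecutive bit
// pairs; e.g. for x = 5 the value is shifted left by 1 to form the bit index
// 10, and (LUT >> 10) & 3 = 0b10 = 2 = popcount(0b101).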
31407 if (ShiftedActiveBits <= 3) {
31408 if (ActiveBits > 3)
31409 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31410 DAG.getShiftAmountConstant(TZ, VT, DL));
31411 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31412 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31413 DAG.getShiftAmountConstant(1, VT, DL));
31414 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31415 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31416 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31417 DAG.getConstant(0x3, DL, MVT::i32));
31418 return DAG.getZExtOrTrunc(Op, DL, VT);
31419 }
31420
31421 // i4 CTPOP - perform LUT into i64 integer.
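// Here each nibble of the 64-bit constant holds the popcount of its index;
// e.g. for x = 0xB the multiply-by-4 forms the bit index 44, and
// (LUT >> 44) & 7 = 3 = popcount(0b1011).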
31422 if (ShiftedActiveBits <= 4 &&
31423 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31424 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31425 if (ActiveBits > 4)
31426 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31427 DAG.getShiftAmountConstant(TZ, VT, DL));
31428 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31429 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31430 DAG.getConstant(4, DL, MVT::i32));
31431 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31432 DAG.getShiftAmountOperand(MVT::i64, Op));
31433 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31434 DAG.getConstant(0x7, DL, MVT::i64));
31435 return DAG.getZExtOrTrunc(Op, DL, VT);
31436 }
31437
31438 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
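// The idea (sketched for an 8-bit value x): multiplying by 0x08040201 places
// copies of x at bit offsets 0, 9, 18 and 27 without overlap; after the
// shift-by-3 the 0x11111111 mask leaves exactly one distinct bit of x in each
// of the eight nibbles, the second multiply sums those nibbles into the top
// nibble, and the shift-by-28 extracts that sum (at most 8, so it fits).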
31439 if (ShiftedActiveBits <= 8) {
31440 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31441 if (ActiveBits > 8)
31442 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31443 DAG.getShiftAmountConstant(TZ, VT, DL));
31444 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31445 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31446 DAG.getConstant(0x08040201U, DL, MVT::i32));
31447 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31448 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31449 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31450 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31451 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31452 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31453 return DAG.getZExtOrTrunc(Op, DL, VT);
31454 }
31455
31456 return SDValue(); // fallback to generic expansion.
31457 }
31458
31459 assert(VT.isVector() &&
31460 "We only do custom lowering for vector population count.");
31461 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31462}
31463
31464 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31465 MVT VT = Op.getSimpleValueType();
31466 SDValue In = Op.getOperand(0);
31467 SDLoc DL(Op);
31468
31469 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31470 // perform the BITREVERSE.
31471 if (!VT.isVector()) {
31472 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31473 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31474 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31475 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31476 DAG.getIntPtrConstant(0, DL));
31477 }
31478
31479 int NumElts = VT.getVectorNumElements();
31480 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31481
31482 // Decompose 256-bit ops into smaller 128-bit ops.
31483 if (VT.is256BitVector())
31484 return splitVectorIntUnary(Op, DAG, DL);
31485
31486 assert(VT.is128BitVector() &&
31487 "Only 128-bit vector bitreverse lowering supported.");
31488
31489 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31490 // perform the BSWAP in the shuffle.
31491 // It's best to shuffle using the second operand as this will implicitly
31492 // allow memory folding for multiple vectors.
31493 SmallVector<SDValue, 16> MaskElts;
31494 for (int i = 0; i != NumElts; ++i) {
31495 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31496 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31497 int PermuteByte = SourceByte | (2 << 5);
31498 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31499 }
31500 }
31501
31502 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31503 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31504 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31505 Res, Mask);
31506 return DAG.getBitcast(VT, Res);
31507}
31508
31509 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31510 SelectionDAG &DAG) {
31511 MVT VT = Op.getSimpleValueType();
31512
31513 if (Subtarget.hasXOP() && !VT.is512BitVector())
31514 return LowerBITREVERSE_XOP(Op, DAG);
31515
31516 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31517
31518 SDValue In = Op.getOperand(0);
31519 SDLoc DL(Op);
31520
31521 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31522 if (VT.is512BitVector() && !Subtarget.hasBWI())
31523 return splitVectorIntUnary(Op, DAG, DL);
31524
31525 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31526 if (VT.is256BitVector() && !Subtarget.hasInt256())
31527 return splitVectorIntUnary(Op, DAG, DL);
31528
31529 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31530 if (!VT.isVector()) {
31531 assert(
31532 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31533 "Only tested for i8/i16/i32/i64");
31534 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31535 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31536 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31537 DAG.getBitcast(MVT::v16i8, Res));
31538 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31539 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31540 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31541 }
31542
31543 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31544
31545 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31546 if (VT.getScalarType() != MVT::i8) {
31547 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31548 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31549 Res = DAG.getBitcast(ByteVT, Res);
31550 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31551 return DAG.getBitcast(VT, Res);
31552 }
31553 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31554 "Only byte vector BITREVERSE supported");
31555
31556 unsigned NumElts = VT.getVectorNumElements();
31557
31558 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31559 if (Subtarget.hasGFNI()) {
31560 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31561 SDValue Matrix =
31562 DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31563 Matrix = DAG.getBitcast(VT, Matrix);
31564 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31565 DAG.getTargetConstant(0, DL, MVT::i8));
31566 }
31567
31568 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31569 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31570 // 0-15 value (moved to the other nibble).
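// For example, reversing 0x1E (0b00011110): the low nibble 0xE indexes LoLUT
// to get 0x70 (its reversal placed in the high nibble), the high nibble 0x1
// indexes HiLUT to get 0x08, and OR'ing them gives 0x78 = 0b01111000.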
31571 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31572 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31573 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31574
31575 const int LoLUT[16] = {
31576 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31577 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31578 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31579 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31580 const int HiLUT[16] = {
31581 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31582 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31583 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31584 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31585
31586 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31587 for (unsigned i = 0; i < NumElts; ++i) {
31588 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31589 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31590 }
31591
31592 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31593 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31594 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31595 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31596 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31597}
31598
31599static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31600 SelectionDAG &DAG) {
31601 SDLoc DL(Op);
31602 SDValue X = Op.getOperand(0);
31603 MVT VT = Op.getSimpleValueType();
31604
31605 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31606 if (VT == MVT::i8 ||
31608 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31609 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31610 DAG.getConstant(0, DL, MVT::i8));
31611 // Copy the inverse of the parity flag into a register with setcc.
31612 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31613 // Extend to the original type.
31614 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31615 }
31616
31617 // If we have POPCNT, use the default expansion.
31618 if (Subtarget.hasPOPCNT())
31619 return SDValue();
31620
31621 if (VT == MVT::i64) {
31622 // Xor the high and low 32-bit halves together using a 32-bit operation.
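// (Folding halves preserves the result because parity(a ^ b) equals
// parity(a) XOR parity(b), so the parity of a 64-bit value equals the parity
// of the XOR of its two 32-bit halves.)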
31623 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31624 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31625 DAG.getConstant(32, DL, MVT::i8)));
31626 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31627 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31628 }
31629
31630 if (VT != MVT::i16) {
31631 // Xor the high and low 16-bits together using a 32-bit operation.
31632 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31633 DAG.getConstant(16, DL, MVT::i8));
31634 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31635 } else {
31636 // If the input is 16-bits, we need to extend to use an i32 shift below.
31637 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31638 }
31639
31640 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31641 // This should allow an h-reg to be used to save a shift.
31642 SDValue Hi = DAG.getNode(
31643 ISD::TRUNCATE, DL, MVT::i8,
31644 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31645 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31646 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31647 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31648
31649 // Copy the inverse of the parity flag into a register with setcc.
31650 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31651 // Extend to the original type.
31652 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31653}
31654
31655 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31656 const X86Subtarget &Subtarget) {
31657 unsigned NewOpc = 0;
31658 switch (N->getOpcode()) {
31659 case ISD::ATOMIC_LOAD_ADD:
31660 NewOpc = X86ISD::LADD;
31661 break;
31662 case ISD::ATOMIC_LOAD_SUB:
31663 NewOpc = X86ISD::LSUB;
31664 break;
31665 case ISD::ATOMIC_LOAD_OR:
31666 NewOpc = X86ISD::LOR;
31667 break;
31668 case ISD::ATOMIC_LOAD_XOR:
31669 NewOpc = X86ISD::LXOR;
31670 break;
31671 case ISD::ATOMIC_LOAD_AND:
31672 NewOpc = X86ISD::LAND;
31673 break;
31674 default:
31675 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31676 }
31677
31678 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31679
31680 return DAG.getMemIntrinsicNode(
31681 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31682 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31683 /*MemVT=*/N->getSimpleValueType(0), MMO);
31684}
31685
31686/// Lower atomic_load_ops into LOCK-prefixed operations.
31687 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31688 const X86Subtarget &Subtarget) {
31689 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31690 SDValue Chain = N->getOperand(0);
31691 SDValue LHS = N->getOperand(1);
31692 SDValue RHS = N->getOperand(2);
31693 unsigned Opc = N->getOpcode();
31694 MVT VT = N->getSimpleValueType(0);
31695 SDLoc DL(N);
31696
31697 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31698 // can only be lowered when the result is unused. They should have already
31699 // been transformed into a cmpxchg loop in AtomicExpand.
31700 if (N->hasAnyUseOfValue(0)) {
31701 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31702 // select LXADD if LOCK_SUB can't be selected.
31703 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31704 // can use LXADD as opposed to cmpxchg.
31705 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31706 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
31707 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31708 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31709
31711 "Used AtomicRMW ops other than Add should have been expanded!");
31712 return N;
31713 }
31714
31715 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31716 // The core idea here is that since the memory location isn't actually
31717 // changing, all we need is a lowering for the *ordering* impacts of the
31718 // atomicrmw. As such, we can choose a different operation and memory
31719 // location to minimize impact on other code.
31720 // The above holds unless the node is marked volatile in which
31721 // case it needs to be preserved according to the langref.
31722 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31723 // On X86, the only ordering which actually requires an instruction is a
31724 // seq_cst ordering that isn't SingleThread; everything else just needs to
31725 // be preserved during codegen and then dropped. Note that we expect (but
31726 // don't assume) that orderings other than seq_cst and acq_rel have been
31727 // canonicalized to a store or load.
31728 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31729 AN->getSyncScopeID() == SyncScope::System) {
31730 // Prefer a locked operation against a stack location to minimize cache
31731 // traffic. This assumes that stack locations are very likely to be
31732 // accessed only by the owning thread.
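// Illustrative example (codegen details hedged): an
// "atomicrmw or ptr %p, i32 0 seq_cst" whose result is unused never changes
// the memory at %p, so only its seq_cst ordering matters. The code below
// therefore emits a single LOCK-prefixed RMW against a stack slot, which
// acts as a full fence, instead of touching %p or emitting MFENCE.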
31733 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31734 assert(!N->hasAnyUseOfValue(0));
31735 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31736 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31737 DAG.getUNDEF(VT), NewChain);
31738 }
31739 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31740 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31741 assert(!N->hasAnyUseOfValue(0));
31742 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31743 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31744 DAG.getUNDEF(VT), NewChain);
31745 }
31746
31747 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31748 // RAUW the chain, but don't worry about the result, as it's unused.
31749 assert(!N->hasAnyUseOfValue(0));
31750 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31751 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31752 DAG.getUNDEF(VT), LockOp.getValue(1));
31753}
31754
31755 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31756 const X86Subtarget &Subtarget) {
31757 auto *Node = cast<AtomicSDNode>(Op.getNode());
31758 SDLoc dl(Node);
31759 EVT VT = Node->getMemoryVT();
31760
31761 bool IsSeqCst =
31762 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31763 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31764
31765 // If this store is not sequentially consistent and the type is legal
31766 // we can just keep it.
31767 if (!IsSeqCst && IsTypeLegal)
31768 return Op;
31769
31770 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
31771 !DAG.getMachineFunction().getFunction().hasFnAttribute(
31772 Attribute::NoImplicitFloat)) {
31773 SDValue Chain;
31774 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
31775 // vector store.
31776 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
31777 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
31778 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
31779 Node->getMemOperand());
31780 }
31781
31782 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31783 // is enabled.
31784 if (VT == MVT::i64) {
31785 if (Subtarget.hasSSE1()) {
31786 SDValue SclToVec =
31787 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31788 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31789 SclToVec = DAG.getBitcast(StVT, SclToVec);
31790 SDVTList Tys = DAG.getVTList(MVT::Other);
31791 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31792 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31793 MVT::i64, Node->getMemOperand());
31794 } else if (Subtarget.hasX87()) {
31795 // First load this into an 80-bit X87 register using a stack temporary.
31796 // This will put the whole integer into the significand.
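// (The fild below reads the 64-bit stack temporary and the fistp later
// writes all 64 bits to the target address as one memory operation, which is
// the property that lets this serve as an atomic i64 store on 32-bit targets
// without SSE.)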
31797 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31798 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31799 MachinePointerInfo MPI =
31800 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31801 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31803 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31804 SDValue LdOps[] = {Chain, StackPtr};
31805 SDValue Value = DAG.getMemIntrinsicNode(
31806 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31807 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31808 Chain = Value.getValue(1);
31809
31810 // Now use an FIST to do the atomic store.
31811 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31812 Chain =
31813 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31814 StoreOps, MVT::i64, Node->getMemOperand());
31815 }
31816 }
31817
31818 if (Chain) {
31819 // If this is a sequentially consistent store, also emit an appropriate
31820 // barrier.
31821 if (IsSeqCst)
31822 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31823
31824 return Chain;
31825 }
31826 }
31827
31828 // Convert seq_cst store -> xchg
31829 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31830 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31831 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31832 Node->getOperand(0), Node->getOperand(2),
31833 Node->getOperand(1), Node->getMemOperand());
31834 return Swap.getValue(1);
31835}
31836
31837 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31838 SDNode *N = Op.getNode();
31839 MVT VT = N->getSimpleValueType(0);
31840 unsigned Opc = Op.getOpcode();
31841
31842 // Let legalize expand this if it isn't a legal type yet.
31843 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31844 return SDValue();
31845
31846 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31847 SDLoc DL(N);
31848
31849 // Set the carry flag.
31850 SDValue Carry = Op.getOperand(2);
31851 EVT CarryVT = Carry.getValueType();
31852 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31853 Carry, DAG.getAllOnesConstant(DL, CarryVT));
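// (Adding all-ones, i.e. -1, to the incoming carry value produces a carry-out
// exactly when that value is nonzero, so this recreates EFLAGS.CF for the
// ADC/SBB emitted below.)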
31854
31855 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31856 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31857 Op.getOperand(0), Op.getOperand(1),
31858 Carry.getValue(1));
31859
31860 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31861 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31862 Sum.getValue(1), DL, DAG);
31863 if (N->getValueType(1) == MVT::i1)
31864 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31865
31866 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31867}
31868
31869static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31870 SelectionDAG &DAG) {
31871 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31872
31873 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31874 // which returns the values as { float, float } (in XMM0) or
31875 // { double, double } (which is returned in XMM0, XMM1).
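// Illustrative example (register layout per the comment above): for f32,
// sin(x) comes back in xmm0[31:0] and cos(x) in xmm0[63:32], which the
// EXTRACT_VECTOR_ELT calls at the end of this function pull apart; for f64
// the two results arrive in xmm0 and xmm1 and can be returned directly.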
31876 SDLoc dl(Op);
31877 SDValue Arg = Op.getOperand(0);
31878 EVT ArgVT = Arg.getValueType();
31879 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31880
31881 TargetLowering::ArgListTy Args;
31882 TargetLowering::ArgListEntry Entry;
31883
31884 Entry.Node = Arg;
31885 Entry.Ty = ArgTy;
31886 Entry.IsSExt = false;
31887 Entry.IsZExt = false;
31888 Args.push_back(Entry);
31889
31890 bool isF64 = ArgVT == MVT::f64;
31891 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31892 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31893 // the results are returned via SRet in memory.
31894 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31895 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31896 const char *LibcallName = TLI.getLibcallName(LC);
31897 SDValue Callee =
31898 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31899
31900 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31901 : (Type *)FixedVectorType::get(ArgTy, 4);
31902
31904 CLI.setDebugLoc(dl)
31905 .setChain(DAG.getEntryNode())
31906 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31907
31908 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31909
31910 if (isF64)
31911 // Returned in xmm0 and xmm1.
31912 return CallResult.first;
31913
31914 // Returned in bits 0:31 and 32:63 of xmm0.
31915 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31916 CallResult.first, DAG.getIntPtrConstant(0, dl));
31917 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31918 CallResult.first, DAG.getIntPtrConstant(1, dl));
31919 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31920 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31921}
31922
31923/// Widen a vector input to a vector of NVT. The
31924/// input vector must have the same element type as NVT.
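/// For example (illustrative), widening a v2i32 value to v8i32 places the two
/// source elements in lanes 0-1 and fills the remaining lanes with undef, or
/// with zeroes when FillWithZeroes is set.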
31925 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31926 bool FillWithZeroes = false) {
31927 // Check if InOp already has the right width.
31928 MVT InVT = InOp.getSimpleValueType();
31929 if (InVT == NVT)
31930 return InOp;
31931
31932 if (InOp.isUndef())
31933 return DAG.getUNDEF(NVT);
31934
31936 "input and widen element type must match");
31937
31938 unsigned InNumElts = InVT.getVectorNumElements();
31939 unsigned WidenNumElts = NVT.getVectorNumElements();
31940 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31941 "Unexpected request for vector widening");
31942
31943 SDLoc dl(InOp);
31944 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31945 InOp.getNumOperands() == 2) {
31946 SDValue N1 = InOp.getOperand(1);
31947 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31948 N1.isUndef()) {
31949 InOp = InOp.getOperand(0);
31950 InVT = InOp.getSimpleValueType();
31951 InNumElts = InVT.getVectorNumElements();
31952 }
31953 }
31954 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31955 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31956 SmallVector<SDValue, 16> Ops;
31957 for (unsigned i = 0; i < InNumElts; ++i)
31958 Ops.push_back(InOp.getOperand(i));
31959
31960 EVT EltVT = InOp.getOperand(0).getValueType();
31961
31962 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31963 DAG.getUNDEF(EltVT);
31964 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31965 Ops.push_back(FillVal);
31966 return DAG.getBuildVector(NVT, dl, Ops);
31967 }
31968 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31969 DAG.getUNDEF(NVT);
31970 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31971 InOp, DAG.getIntPtrConstant(0, dl));
31972}
31973
31974 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31975 SelectionDAG &DAG) {
31976 assert(Subtarget.hasAVX512() &&
31977 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31978
31979 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31980 SDValue Src = N->getValue();
31981 MVT VT = Src.getSimpleValueType();
31982 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31983 SDLoc dl(Op);
31984
31985 SDValue Scale = N->getScale();
31986 SDValue Index = N->getIndex();
31987 SDValue Mask = N->getMask();
31988 SDValue Chain = N->getChain();
31989 SDValue BasePtr = N->getBasePtr();
31990
31991 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31992 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31993 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31994 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31995 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31996 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31997 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31998 SDVTList VTs = DAG.getVTList(MVT::Other);
31999 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32000 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32001 N->getMemoryVT(), N->getMemOperand());
32002 }
32003 return SDValue();
32004 }
32005
32006 MVT IndexVT = Index.getSimpleValueType();
32007
32008 // If the index is v2i32, we're being called by type legalization and we
32009 // should just let the default handling take care of it.
32010 if (IndexVT == MVT::v2i32)
32011 return SDValue();
32012
32013 // If we don't have VLX and neither the source data nor the index is 512
32014 // bits, we need to widen until one is.
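// For example (illustrative): a v4i32 scatter with a v4i32 index widens by a
// factor of 4 to v16i32 data, a v16i32 index and a v16i1 mask, so a non-VLX
// 512-bit scatter can be used; the added lanes are masked off.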
32015 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32016 !Index.getSimpleValueType().is512BitVector()) {
32017 // Determine how much we need to widen by to get a 512-bit type.
32018 unsigned Factor = std::min(512/VT.getSizeInBits(),
32019 512/IndexVT.getSizeInBits());
32020 unsigned NumElts = VT.getVectorNumElements() * Factor;
32021
32022 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32023 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32024 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32025
32026 Src = ExtendToType(Src, VT, DAG);
32027 Index = ExtendToType(Index, IndexVT, DAG);
32028 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32029 }
32030
32031 SDVTList VTs = DAG.getVTList(MVT::Other);
32032 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32033 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32034 N->getMemoryVT(), N->getMemOperand());
32035}
32036
32037static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32038 SelectionDAG &DAG) {
32039
32040 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32041 MVT VT = Op.getSimpleValueType();
32042 MVT ScalarVT = VT.getScalarType();
32043 SDValue Mask = N->getMask();
32044 MVT MaskVT = Mask.getSimpleValueType();
32045 SDValue PassThru = N->getPassThru();
32046 SDLoc dl(Op);
32047
32048 // Handle AVX masked loads which don't support passthru other than 0.
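// Illustrative example (assumed typical isel outcome): a v8f32 masked load on
// AVX with a non-zero passthru is rewritten below as a zero-passthru masked
// load (vmaskmovps) followed by a VSELECT blend that substitutes the original
// passthru value in the disabled lanes.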
32049 if (MaskVT.getVectorElementType() != MVT::i1) {
32050 // We also allow undef in the isel pattern.
32051 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32052 return Op;
32053
32054 SDValue NewLoad = DAG.getMaskedLoad(
32055 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32056 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32057 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32058 N->isExpandingLoad());
32059 // Emit a blend.
32060 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32061 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32062 }
32063
32064 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32065 "Expanding masked load is supported on AVX-512 target only!");
32066
32067 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32068 "Expanding masked load is supported for 32 and 64-bit types only!");
32069
32070 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32071 "Cannot lower masked load op.");
32072
32073 assert((ScalarVT.getSizeInBits() >= 32 ||
32074 (Subtarget.hasBWI() &&
32075 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32076 "Unsupported masked load op.");
32077
32078 // This operation is legal for targets with VLX, but without
32079 // VLX the vector should be widened to 512 bits.
32080 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32081 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32082 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32083
32084 // Mask element has to be i1.
32085 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32086 "Unexpected mask type");
32087
32088 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32089
32090 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32091 SDValue NewLoad = DAG.getMaskedLoad(
32092 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32093 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32094 N->getExtensionType(), N->isExpandingLoad());
32095
32096 SDValue Extract =
32097 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32098 DAG.getIntPtrConstant(0, dl));
32099 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32100 return DAG.getMergeValues(RetOps, dl);
32101}
32102
32103static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32104 SelectionDAG &DAG) {
32105 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32106 SDValue DataToStore = N->getValue();
32107 MVT VT = DataToStore.getSimpleValueType();
32108 MVT ScalarVT = VT.getScalarType();
32109 SDValue Mask = N->getMask();
32110 SDLoc dl(Op);
32111
32112 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32113 "Expanding masked load is supported on AVX-512 target only!");
32114
32115 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32116 "Expanding masked load is supported for 32 and 64-bit types only!");
32117
32118 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32119 "Cannot lower masked store op.");
32120
32121 assert((ScalarVT.getSizeInBits() >= 32 ||
32122 (Subtarget.hasBWI() &&
32123 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32124 "Unsupported masked store op.");
32125
32126 // This operation is legal for targets with VLX, but without
32127 // VLX the vector should be widened to 512 bits.
32128 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32129 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32130
32131 // Mask element has to be i1.
32132 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32133 "Unexpected mask type");
32134
32135 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32136
32137 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32138 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32139 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32140 N->getOffset(), Mask, N->getMemoryVT(),
32141 N->getMemOperand(), N->getAddressingMode(),
32142 N->isTruncatingStore(), N->isCompressingStore());
32143}
32144
32145static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32146 SelectionDAG &DAG) {
32147 assert(Subtarget.hasAVX2() &&
32148 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32149
32150 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32151 SDLoc dl(Op);
32152 MVT VT = Op.getSimpleValueType();
32153 SDValue Index = N->getIndex();
32154 SDValue Mask = N->getMask();
32155 SDValue PassThru = N->getPassThru();
32156 MVT IndexVT = Index.getSimpleValueType();
32157
32158 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32159
32160 // If the index is v2i32, we're being called by type legalization.
32161 if (IndexVT == MVT::v2i32)
32162 return SDValue();
32163
32164 // If we don't have VLX and neither the passthru nor the index is 512 bits,
32165 // we need to widen until one is.
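// For example (illustrative): a v4f32 gather with a v4i64 index widens by a
// factor of 2 to a v8f32 passthru and a 512-bit v8i64 index, and the mask
// grows to v8i1 with the added lanes disabled, so a non-VLX AVX-512 gather
// instruction fits.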
32166 MVT OrigVT = VT;
32167 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32168 !IndexVT.is512BitVector()) {
32169 // Determine how much we need to widen by to get a 512-bit type.
32170 unsigned Factor = std::min(512/VT.getSizeInBits(),
32171 512/IndexVT.getSizeInBits());
32172
32173 unsigned NumElts = VT.getVectorNumElements() * Factor;
32174
32175 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32176 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32177 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32178
32179 PassThru = ExtendToType(PassThru, VT, DAG);
32180 Index = ExtendToType(Index, IndexVT, DAG);
32181 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32182 }
32183
32184 // Break dependency on the data register.
32185 if (PassThru.isUndef())
32186 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32187
32188 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32189 N->getScale() };
32190 SDValue NewGather = DAG.getMemIntrinsicNode(
32191 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32192 N->getMemOperand());
32193 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32194 NewGather, DAG.getIntPtrConstant(0, dl));
32195 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32196}
32197
32198 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32199 SDLoc dl(Op);
32200 SDValue Src = Op.getOperand(0);
32201 MVT DstVT = Op.getSimpleValueType();
32202
32203 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32204 unsigned SrcAS = N->getSrcAddressSpace();
32205
32206 assert(SrcAS != N->getDestAddressSpace() &&
32207 "addrspacecast must be between different address spaces");
32208
32209 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32210 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32211 } else if (DstVT == MVT::i64) {
32212 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32213 } else if (DstVT == MVT::i32) {
32214 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32215 } else {
32216 report_fatal_error("Bad address space in addrspacecast");
32217 }
32218 return Op;
32219}
32220
32221SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32222 SelectionDAG &DAG) const {
32223 // TODO: Eventually, the lowering of these nodes should be informed by or
32224 // deferred to the GC strategy for the function in which they appear. For
32225 // now, however, they must be lowered to something. Since they are logically
32226 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32227 // require special handling for these nodes), lower them as literal NOOPs for
32228 // the time being.
32229 SmallVector<SDValue, 2> Ops;
32230 Ops.push_back(Op.getOperand(0));
32231 if (Op->getGluedNode())
32232 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32233
32234 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32235 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32236}
32237
32238// Custom split CVTPS2PH with wide types.
32239 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32240 SDLoc dl(Op);
32241 EVT VT = Op.getValueType();
32242 SDValue Lo, Hi;
32243 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32244 EVT LoVT, HiVT;
32245 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32246 SDValue RC = Op.getOperand(1);
32247 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32248 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32249 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32250}
32251
32252 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32253 SelectionDAG &DAG) {
32254 unsigned IsData = Op.getConstantOperandVal(4);
32255
32256 // We don't support non-data prefetch without PREFETCHI.
32257 // Just preserve the chain.
32258 if (!IsData && !Subtarget.hasPREFETCHI())
32259 return Op.getOperand(0);
32260
32261 return Op;
32262}
32263
32264 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32265 unsigned OpNo) {
32266 const APInt Operand(32, OpNo);
32267 std::string OpNoStr = llvm::toString(Operand, 10, false);
32268 std::string Str(" $");
32269
32270 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32271 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32272
32273 auto I = StringRef::npos;
32274 for (auto &AsmStr : AsmStrs) {
32275 // Match the OpNo string. We must match exactly so that we don't match a
32276 // sub-string, e.g. "$12" contains "$1".
32277 if (AsmStr.ends_with(OpNoStr1))
32278 I = AsmStr.size() - OpNoStr1.size();
32279
32280 // Get the index of operand in AsmStr.
32281 if (I == StringRef::npos)
32282 I = AsmStr.find(OpNoStr1 + ",");
32283 if (I == StringRef::npos)
32284 I = AsmStr.find(OpNoStr2);
32285
32286 if (I == StringRef::npos)
32287 continue;
32288
32289 assert(I > 0 && "Unexpected inline asm string!");
32290 // Remove the operand string and label (if they exist).
32291 // For example:
32292 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32293 // ==>
32294 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32295 // ==>
32296 // "call dword ptr "
32297 auto TmpStr = AsmStr.substr(0, I);
32298 I = TmpStr.rfind(':');
32299 if (I != StringRef::npos)
32300 TmpStr = TmpStr.substr(I + 1);
32301 return TmpStr.take_while(llvm::isAlpha);
32302 }
32303
32304 return StringRef();
32305}
32306
32307 bool X86TargetLowering::isInlineAsmTargetBranch(
32308 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32309 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32310 // changed from indirect TargetLowering::C_Memory to direct
32311 // TargetLowering::C_Address.
32312 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32313 // location.
32314 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32315 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32316}
32317
32318 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
32319 SDValue Mask) {
32320 EVT Ty = MVT::i8;
32321 auto V = DAG.getBitcast(MVT::i1, Mask);
32322 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
32323 auto Zero = DAG.getConstant(0, DL, Ty);
32324 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
32325 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
32326 return SDValue(CmpZero.getNode(), 1);
32327}
32328
32329 SDValue X86TargetLowering::visitMaskedLoad(
32330 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
32331 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
32332 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
32333 // ->
32334 // _, flags = SUB 0, mask
32335 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
32336 // bit_cast_to_vector<res>
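// Illustrative example (hedged; only applies where X86ISD::CLOAD is
// selectable): for %r = @llvm.masked.load.v1i64(ptr %p, i32 8, <1 x i1> %m,
// <1 x i64> %pt), the mask is materialized into EFLAGS via SUB 0, %m, and
// CLOAD performs a conditional 64-bit load of %p that yields the scalar
// passthru when the mask bit is clear; the result is bitcast back to
// <1 x i64>.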
32337 EVT VTy = PassThru.getValueType();
32338 EVT Ty = VTy.getVectorElementType();
32339 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
32340 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
32341 : DAG.getBitcast(Ty, PassThru);
32342 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
32343 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
32344 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
32345 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
32346 return DAG.getBitcast(VTy, NewLoad);
32347}
32348
32349 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
32350 SDValue Chain,
32351 MachineMemOperand *MMO, SDValue Ptr,
32352 SDValue Val, SDValue Mask) const {
32353 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
32354 // ->
32355 // _, flags = SUB 0, mask
32356 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
32357 EVT Ty = Val.getValueType().getVectorElementType();
32358 SDVTList Tys = DAG.getVTList(MVT::Other);
32359 auto ScalarVal = DAG.getBitcast(Ty, Val);
32360 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
32361 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
32362 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
32363 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
32364}
32365
32366/// Provide custom lowering hooks for some operations.
32367 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32368 switch (Op.getOpcode()) {
32369 // clang-format off
32370 default: llvm_unreachable("Should not custom lower this!");
32371 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32373 return LowerCMP_SWAP(Op, Subtarget, DAG);
32374 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
32379 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
32380 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32381 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32382 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32383 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32384 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32385 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32386 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32387 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32388 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32389 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32390 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32391 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32392 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32393 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32394 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32395 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32396 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32397 case ISD::SHL_PARTS:
32398 case ISD::SRA_PARTS:
32399 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32400 case ISD::FSHL:
32401 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32403 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
32405 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
32406 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32407 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32408 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32409 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32412 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32413 case ISD::FP_TO_SINT:
32415 case ISD::FP_TO_UINT:
32416 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
32418 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
32419 case ISD::FP_EXTEND:
32420 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32421 case ISD::FP_ROUND:
32422 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32423 case ISD::FP16_TO_FP:
32424 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32425 case ISD::FP_TO_FP16:
32426 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32427 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32428 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32429 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32430 case ISD::FADD:
32431 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32432 case ISD::FROUND: return LowerFROUND(Op, DAG);
32433 case ISD::FABS:
32434 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32435 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32436 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32437 case ISD::LRINT:
32438 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32439 case ISD::SETCC:
32440 case ISD::STRICT_FSETCC:
32441 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32442 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32443 case ISD::SELECT: return LowerSELECT(Op, DAG);
32444 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32445 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32446 case ISD::VASTART: return LowerVASTART(Op, DAG);
32447 case ISD::VAARG: return LowerVAARG(Op, DAG);
32448 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32449 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32451 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32452 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32453 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32454 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32456 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32457 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32458 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32459 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32460 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32462 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32463 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
32465 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
32466 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32467 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32468 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32469 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32470 case ISD::CTLZ:
32471 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32472 case ISD::CTTZ:
32473 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32474 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32475 case ISD::MULHS:
32476 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32477 case ISD::ROTL:
32478 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32479 case ISD::SRA:
32480 case ISD::SRL:
32481 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32482 case ISD::SADDO:
32483 case ISD::UADDO:
32484 case ISD::SSUBO:
32485 case ISD::USUBO: return LowerXALUO(Op, DAG);
32486 case ISD::SMULO:
32487 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32488 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32489 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32490 case ISD::SADDO_CARRY:
32491 case ISD::SSUBO_CARRY:
32492 case ISD::UADDO_CARRY:
32493 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32494 case ISD::ADD:
32495 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32496 case ISD::UADDSAT:
32497 case ISD::SADDSAT:
32498 case ISD::USUBSAT:
32499 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32500 case ISD::SMAX:
32501 case ISD::SMIN:
32502 case ISD::UMAX:
32503 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32504 case ISD::FMINIMUM:
32505 case ISD::FMAXIMUM:
32506 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32507 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32508 case ISD::ABDS:
32509 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32510 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32511 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32512 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32513 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32514 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32515 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32517 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
32518 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32519 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32520 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32521 // clang-format on
32522 }
32523}
32524
32525/// Replace a node with an illegal result type with a new node built out of
32526/// custom code.
32527 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32528 SmallVectorImpl<SDValue> &Results,
32529 SelectionDAG &DAG) const {
32530 SDLoc dl(N);
32531 switch (N->getOpcode()) {
32532 default:
32533#ifndef NDEBUG
32534 dbgs() << "ReplaceNodeResults: ";
32535 N->dump(&DAG);
32536#endif
32537 llvm_unreachable("Do not know how to custom type legalize this operation!");
32538 case X86ISD::CVTPH2PS: {
32539 EVT VT = N->getValueType(0);
32540 SDValue Lo, Hi;
32541 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32542 EVT LoVT, HiVT;
32543 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32544 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32545 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32546 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32547 Results.push_back(Res);
32548 return;
32549 }
32550 case X86ISD::STRICT_CVTPH2PS: {
32551 EVT VT = N->getValueType(0);
32552 SDValue Lo, Hi;
32553 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32554 EVT LoVT, HiVT;
32555 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32556 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32557 {N->getOperand(0), Lo});
32558 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32559 {N->getOperand(0), Hi});
32560 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32561 Lo.getValue(1), Hi.getValue(1));
32562 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32563 Results.push_back(Res);
32564 Results.push_back(Chain);
32565 return;
32566 }
32567 case X86ISD::CVTPS2PH:
32568 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32569 return;
32570 case ISD::CTPOP: {
32571 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32572 // If we have at most 32 active bits, then perform as i32 CTPOP.
32573 // TODO: Perform this in generic legalizer?
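// For example (illustrative): if the i64 operand is known to have 40 leading
// zeros, then LZ + TZ >= 32, so the value can be shifted right by TZ,
// truncated to i32, popcounted as i32 and zero-extended back to i64 without
// losing any set bits.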
32574 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32575 unsigned LZ = Known.countMinLeadingZeros();
32576 unsigned TZ = Known.countMinTrailingZeros();
32577 if ((LZ + TZ) >= 32) {
32578 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32579 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32580 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32581 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32582 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32583 Results.push_back(Op);
32584 return;
32585 }
32586 // Use a v2i64 if possible.
32587 bool NoImplicitFloatOps =
32589 Attribute::NoImplicitFloat);
32590 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32591 SDValue Wide =
32592 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32593 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32594 // The bit count fits in 32 bits; extract it as an i32 and then zero
32595 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32596 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32597 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32598 DAG.getIntPtrConstant(0, dl));
32599 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32600 Results.push_back(Wide);
32601 }
32602 return;
32603 }
32604 case ISD::MUL: {
32605 EVT VT = N->getValueType(0);
32607 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32608 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32609 // elements are needed.
32610 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32611 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32612 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32613 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32614 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32615 unsigned NumConcats = 16 / VT.getVectorNumElements();
32616 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32617 ConcatOps[0] = Res;
32618 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32619 Results.push_back(Res);
32620 return;
32621 }
32622 case ISD::SMULO:
32623 case ISD::UMULO: {
32624 EVT VT = N->getValueType(0);
32626 VT == MVT::v2i32 && "Unexpected VT!");
32627 bool IsSigned = N->getOpcode() == ISD::SMULO;
32628 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32629 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32630 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32631 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32632 // Extract the high 32 bits from each result using PSHUFD.
32633 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
32634 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32635 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32636 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32637 DAG.getIntPtrConstant(0, dl));
32638
32639 // Truncate the low bits of the result. This will become PSHUFD.
32640 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32641
32642 SDValue HiCmp;
32643 if (IsSigned) {
32644 // SMULO overflows if the high bits don't match the sign of the low.
32645 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32646 } else {
32647 // UMULO overflows if the high bits are non-zero.
32648 HiCmp = DAG.getConstant(0, dl, VT);
32649 }
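// (Illustrative summary: for SMULO the high half must equal the sign
// extension of the low half, i.e. Res >> 31; for UMULO it must be zero. Any
// lane where Hi != HiCmp therefore signals overflow via the SETNE below.)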
32650 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32651
32652 // Widen the result by padding with undef.
32653 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32654 DAG.getUNDEF(VT));
32655 Results.push_back(Res);
32656 Results.push_back(Ovf);
32657 return;
32658 }
32659 case X86ISD::VPMADDWD: {
32660 // Legalize types for X86ISD::VPMADDWD by widening.
32661 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32662
32663 EVT VT = N->getValueType(0);
32664 EVT InVT = N->getOperand(0).getValueType();
32665 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32666 "Expected a VT that divides into 128 bits.");
32668 "Unexpected type action!");
32669 unsigned NumConcat = 128 / InVT.getSizeInBits();
32670
32671 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32672 InVT.getVectorElementType(),
32673 NumConcat * InVT.getVectorNumElements());
32674 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32676 NumConcat * VT.getVectorNumElements());
32677
32678 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32679 Ops[0] = N->getOperand(0);
32680 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32681 Ops[0] = N->getOperand(1);
32682 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32683
32684 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32685 Results.push_back(Res);
32686 return;
32687 }
32688 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32689 case X86ISD::FMINC:
32690 case X86ISD::FMIN:
32691 case X86ISD::FMAXC:
32692 case X86ISD::FMAX: {
32693 EVT VT = N->getValueType(0);
32694 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32695 SDValue UNDEF = DAG.getUNDEF(VT);
32696 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32697 N->getOperand(0), UNDEF);
32698 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32699 N->getOperand(1), UNDEF);
32700 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32701 return;
32702 }
32703 case ISD::SDIV:
32704 case ISD::UDIV:
32705 case ISD::SREM:
32706 case ISD::UREM: {
32707 EVT VT = N->getValueType(0);
32708 if (VT.isVector()) {
32710 "Unexpected type action!");
32711 // If this RHS is a constant splat vector we can widen this and let
32712 // division/remainder by constant optimize it.
32713 // TODO: Can we do something for non-splat?
32714 APInt SplatVal;
32715 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32716 unsigned NumConcats = 128 / VT.getSizeInBits();
32717 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32718 Ops0[0] = N->getOperand(0);
32719 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32720 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32721 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32722 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32723 Results.push_back(Res);
32724 }
32725 return;
32726 }
32727
32728 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32729 Results.push_back(V);
32730 return;
32731 }
32732 case ISD::TRUNCATE: {
32733 MVT VT = N->getSimpleValueType(0);
32734 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32735 return;
32736
32737 // The generic legalizer will try to widen the input type to the same
32738 // number of elements as the widened result type. But this isn't always
32739 // the best thing so do some custom legalization to avoid some cases.
32740 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32741 SDValue In = N->getOperand(0);
32742 EVT InVT = In.getValueType();
32743 EVT InEltVT = InVT.getVectorElementType();
32744 EVT EltVT = VT.getVectorElementType();
32745 unsigned MinElts = VT.getVectorNumElements();
32746 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32747 unsigned InBits = InVT.getSizeInBits();
32748
32749 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32750 unsigned PackOpcode;
32751 if (SDValue Src =
32752 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32753 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32754 dl, DAG, Subtarget)) {
32755 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32756 Results.push_back(Res);
32757 return;
32758 }
32759 }
32760
32761 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32762 // 128-bit and smaller inputs should avoid truncation altogether and
32763 // use a shuffle instead.
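// For example (illustrative): truncating v4i32 to v4i8 with a widened v16i8
// result uses Scale = 4, so the shuffle mask picks bytes {0, 4, 8, 12} of the
// bitcast input, i.e. the low byte of each i32 lane, and leaves the remaining
// lanes undefined.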
32764 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32765 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32766 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32767 for (unsigned I = 0; I < MinElts; ++I)
32768 TruncMask[I] = Scale * I;
32769 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32770 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32771 "Illegal vector type in truncation");
32772 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32773 Results.push_back(
32774 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32775 return;
32776 }
32777 }
32778
32779 // With AVX512 there are some cases that can use a target specific
32780 // truncate node to go from 256/512 to less than 128 with zeros in the
32781 // upper elements of the 128 bit result.
32782 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32783 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
32784 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32785 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32786 return;
32787 }
32788 // There's one case we can widen to 512 bits and use VTRUNC.
32789 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32790 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32791 DAG.getUNDEF(MVT::v4i64));
32792 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32793 return;
32794 }
32795 }
32796 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32797 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32798 isTypeLegal(MVT::v4i64)) {
32799 // Input needs to be split and output needs to widened. Let's use two
32800 // VTRUNCs, and shuffle their results together into the wider type.
32801 SDValue Lo, Hi;
32802 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32803
32804 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32805 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32806 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32807 { 0, 1, 2, 3, 16, 17, 18, 19,
32808 -1, -1, -1, -1, -1, -1, -1, -1 });
32809 Results.push_back(Res);
32810 return;
32811 }
32812
32813 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32814 // this via type legalization.
32815 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32816 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32817 (!Subtarget.hasSSSE3() ||
32818 (!isTypeLegal(InVT) &&
32819 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32820 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32821 InEltVT.getSizeInBits() * WidenNumElts);
32822 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32823 return;
32824 }
32825
32826 return;
32827 }
32828 case ISD::ANY_EXTEND:
32829 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32830 // It's intended to custom handle the input type.
32831 assert(N->getValueType(0) == MVT::v8i8 &&
32832 "Do not know how to legalize this Node");
32833 return;
32834 case ISD::SIGN_EXTEND:
32835 case ISD::ZERO_EXTEND: {
32836 EVT VT = N->getValueType(0);
32837 SDValue In = N->getOperand(0);
32838 EVT InVT = In.getValueType();
32839 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32840 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32842 "Unexpected type action!");
32843 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32844 // Custom split this so we can extend i8/i16->i32 invec. This is better
32845 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
32846 // followed by extending from i32 to i64 using pcmpgt. By custom splitting,
32847 // we allow the sra from the extend to i32 to be shared by the split.
32848 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32849
32850 // Fill a vector with sign bits for each element.
32851 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32852 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32853
32854 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32855 // to v2i64.
32856 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32857 {0, 4, 1, 5});
32858 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32859 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32860 {2, 6, 3, 7});
32861 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32862
32863 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32864 Results.push_back(Res);
32865 return;
32866 }
32867
32868 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32869 if (!InVT.is128BitVector()) {
32870 // Not a 128 bit vector, but maybe type legalization will promote
32871 // it to 128 bits.
32872 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32873 return;
32874 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32875 if (!InVT.is128BitVector())
32876 return;
32877
32878 // Promote the input to 128 bits. Type legalization will turn this into
32879 // zext_inreg/sext_inreg.
32880 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32881 }
32882
32883 // Perform custom splitting instead of the two stage extend we would get
32884 // by default.
32885 EVT LoVT, HiVT;
32886 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32887 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32888
32889 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32890
32891 // We need to shift the input over by half the number of elements.
32892 unsigned NumElts = InVT.getVectorNumElements();
32893 unsigned HalfNumElts = NumElts / 2;
32894 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32895 for (unsigned i = 0; i != HalfNumElts; ++i)
32896 ShufMask[i] = i + HalfNumElts;
32897
32898 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32899 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32900
32901 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32902 Results.push_back(Res);
32903 }
32904 return;
32905 }
32906 case ISD::FP_TO_SINT:
32907 case ISD::STRICT_FP_TO_SINT:
32908 case ISD::FP_TO_UINT:
32909 case ISD::STRICT_FP_TO_UINT: {
32910 bool IsStrict = N->isStrictFPOpcode();
32911 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32912 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32913 EVT VT = N->getValueType(0);
32914 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32915 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32916 EVT SrcVT = Src.getValueType();
32917
32918 SDValue Res;
32919 if (isSoftF16(SrcVT, Subtarget)) {
32920 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32921 if (IsStrict) {
32922 Res =
32923 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32924 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32925 {NVT, MVT::Other}, {Chain, Src})});
32926 Chain = Res.getValue(1);
32927 } else {
32928 Res = DAG.getNode(N->getOpcode(), dl, VT,
32929 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32930 }
32931 Results.push_back(Res);
32932 if (IsStrict)
32933 Results.push_back(Chain);
32934
32935 return;
32936 }
32937
32938 if (VT.isVector() && Subtarget.hasFP16() &&
32939 SrcVT.getVectorElementType() == MVT::f16) {
32940 EVT EleVT = VT.getVectorElementType();
32941 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32942
32943 if (SrcVT != MVT::v8f16) {
32944 SDValue Tmp =
32945 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32946 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32947 Ops[0] = Src;
32948 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32949 }
32950
32951 if (IsStrict) {
32952 unsigned Opc =
32954 Res =
32955 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32956 Chain = Res.getValue(1);
32957 } else {
32958 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32959 Res = DAG.getNode(Opc, dl, ResVT, Src);
32960 }
32961
32962 // TODO: Need to add exception check code for strict FP.
32963 if (EleVT.getSizeInBits() < 16) {
32964 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32965 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32966
32967 // Now widen to 128 bits.
32968 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32969 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32970 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32971 ConcatOps[0] = Res;
32972 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32973 }
32974
32975 Results.push_back(Res);
32976 if (IsStrict)
32977 Results.push_back(Chain);
32978
32979 return;
32980 }
32981
32982 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32984 "Unexpected type action!");
32985
32986 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32987 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32988 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32990 SDValue Res;
32991 SDValue Chain;
32992 if (IsStrict) {
32993 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32994 {N->getOperand(0), Src});
32995 Chain = Res.getValue(1);
32996 } else
32997 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32998
32999 // Preserve what we know about the size of the original result. If the
33000 // result is v2i32, we have to manually widen the assert.
33001 if (PromoteVT == MVT::v2i32)
33002 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33003 DAG.getUNDEF(MVT::v2i32));
33004
33005 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33006 Res.getValueType(), Res,
33008
33009 if (PromoteVT == MVT::v2i32)
33010 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33011 DAG.getIntPtrConstant(0, dl));
33012
33013 // Truncate back to the original width.
33014 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33015
33016 // Now widen to 128 bits.
33017 unsigned NumConcats = 128 / VT.getSizeInBits();
33019 VT.getVectorNumElements() * NumConcats);
33020 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33021 ConcatOps[0] = Res;
33022 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33023 Results.push_back(Res);
33024 if (IsStrict)
33025 Results.push_back(Chain);
33026 return;
33027 }
33028
33029
33030 if (VT == MVT::v2i32) {
33031 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33032 "Strict unsigned conversion requires AVX512");
33033 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33035 "Unexpected type action!");
33036 if (Src.getValueType() == MVT::v2f64) {
33037 if (!IsSigned && !Subtarget.hasAVX512()) {
33038 SDValue Res =
33039 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33040 Results.push_back(Res);
33041 return;
33042 }
33043
33044 unsigned Opc;
33045 if (IsStrict)
33047 else
33048 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33049
33050 // If we have VLX we can emit a target specific FP_TO_UINT node.
33051 if (!IsSigned && !Subtarget.hasVLX()) {
33052 // Otherwise we can defer to the generic legalizer which will widen
33053 // the input as well. This will be further widened during op
33054 // legalization to v8i32<-v8f64.
33055 // For strict nodes we'll need to widen ourselves.
33056 // FIXME: Fix the type legalizer to safely widen strict nodes?
33057 if (!IsStrict)
33058 return;
33059 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33060 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33061 Opc = N->getOpcode();
33062 }
33063 SDValue Res;
33064 SDValue Chain;
33065 if (IsStrict) {
33066 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33067 {N->getOperand(0), Src});
33068 Chain = Res.getValue(1);
33069 } else {
33070 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33071 }
33072 Results.push_back(Res);
33073 if (IsStrict)
33074 Results.push_back(Chain);
33075 return;
33076 }
33077
33078 // Custom widen strict v2f32->v2i32 by padding with zeros.
33079 // FIXME: Should generic type legalizer do this?
33080 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33081 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33082 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33083 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33084 {N->getOperand(0), Src});
33085 Results.push_back(Res);
33086 Results.push_back(Res.getValue(1));
33087 return;
33088 }
33089
33090 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33091 // so early out here.
33092 return;
33093 }
33094
33095 assert(!VT.isVector() && "Vectors should have been handled above!");
33096
33097 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33098 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33099 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33100 assert(!Subtarget.is64Bit() && "i64 should be legal");
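// An i64 result is not legal on this 32-bit target, but AVX512DQ provides
// packed FP->i64 conversions. Insert the scalar source into a vector,
// convert the whole vector, and extract element 0 of the result below.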
33101 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33102 // If we use a 128-bit result we might need to use a target specific node.
33103 unsigned SrcElts =
33104 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33105 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33106 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33107 unsigned Opc = N->getOpcode();
33108 if (NumElts != SrcElts) {
33109 if (IsStrict)
33110 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33111 else
33112 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33113 }
33114
33115 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
33116 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33117 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33118 ZeroIdx);
33119 SDValue Chain;
33120 if (IsStrict) {
33121 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33122 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33123 Chain = Res.getValue(1);
33124 } else
33125 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33126 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33127 Results.push_back(Res);
33128 if (IsStrict)
33129 Results.push_back(Chain);
33130 return;
33131 }
33132
33133 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33134 SDValue Chain;
33135 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33136 Results.push_back(V);
33137 if (IsStrict)
33138 Results.push_back(Chain);
33139 return;
33140 }
33141
33142 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
33143 Results.push_back(V);
33144 if (IsStrict)
33145 Results.push_back(Chain);
33146 }
33147 return;
33148 }
33149 case ISD::LRINT:
33150 case ISD::LLRINT: {
33151 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
33152 Results.push_back(V);
33153 return;
33154 }
33155
33156 case ISD::SINT_TO_FP:
33157 case ISD::STRICT_SINT_TO_FP:
33158 case ISD::UINT_TO_FP:
33159 case ISD::STRICT_UINT_TO_FP: {
33160 bool IsStrict = N->isStrictFPOpcode();
33161 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33162 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33163 EVT VT = N->getValueType(0);
33164 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33165 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
33166 Subtarget.hasVLX()) {
33167 if (Src.getValueType().getVectorElementType() == MVT::i16)
33168 return;
33169
33170 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
33171 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33172 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
33173 : DAG.getUNDEF(MVT::v2i32));
33174 if (IsStrict) {
33175 unsigned Opc =
33176 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
33177 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
33178 {N->getOperand(0), Src});
33179 Results.push_back(Res);
33180 Results.push_back(Res.getValue(1));
33181 } else {
33182 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33183 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
33184 }
33185 return;
33186 }
33187 if (VT != MVT::v2f32)
33188 return;
33189 EVT SrcVT = Src.getValueType();
33190 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
33191 if (IsStrict) {
33192 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
33193 : X86ISD::STRICT_CVTUI2P;
33194 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33195 {N->getOperand(0), Src});
33196 Results.push_back(Res);
33197 Results.push_back(Res.getValue(1));
33198 } else {
33199 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33200 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33201 }
33202 return;
33203 }
33204 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33205 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
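// Unsigned i64 values with the top bit set can't be converted directly with
// the signed conversion instructions. Halve the value as (Src >> 1) |
// (Src & 1) (keeping the low bit so rounding stays correct), convert that
// as signed, double the result with an FADD, and select between the direct
// signed conversion and the doubled one based on the sign of the input.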
33206 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33207 SDValue One = DAG.getConstant(1, dl, SrcVT);
33208 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33209 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33210 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33211 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33212 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33213 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33214 for (int i = 0; i != 2; ++i) {
33215 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33216 SignSrc, DAG.getIntPtrConstant(i, dl));
33217 if (IsStrict)
33218 SignCvts[i] =
33219 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33220 {N->getOperand(0), Elt});
33221 else
33222 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33223 };
33224 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33225 SDValue Slow, Chain;
33226 if (IsStrict) {
33227 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33228 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33229 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33230 {Chain, SignCvt, SignCvt});
33231 Chain = Slow.getValue(1);
33232 } else {
33233 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33234 }
33235 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33236 IsNeg =
33237 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33238 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33239 Results.push_back(Cvt);
33240 if (IsStrict)
33241 Results.push_back(Chain);
33242 return;
33243 }
33244
33245 if (SrcVT != MVT::v2i32)
33246 return;
33247
33248 if (IsSigned || Subtarget.hasAVX512()) {
33249 if (!IsStrict)
33250 return;
33251
33252 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33253 // FIXME: Should generic type legalizer do this?
33254 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33255 DAG.getConstant(0, dl, MVT::v2i32));
33256 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33257 {N->getOperand(0), Src});
33258 Results.push_back(Res);
33259 Results.push_back(Res.getValue(1));
33260 return;
33261 }
33262
33263 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
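// Convert unsigned v2i32 -> v2f32 without unsigned-convert hardware: zero
// extend each element to i64 and OR in the bit pattern of 2^52
// (0x4330000000000000). Reinterpreted as a double, each lane then holds
// exactly 2^52 + x, so subtracting 2^52 recovers x with no rounding error
// (e.g. x = 7 gives 2^52 + 7, and (2^52 + 7) - 2^52 == 7.0). The v2f64
// result is finally rounded down to v2f32.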
33264 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33265 SDValue VBias = DAG.getConstantFP(
33266 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
33267 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33268 DAG.getBitcast(MVT::v2i64, VBias));
33269 Or = DAG.getBitcast(MVT::v2f64, Or);
33270 if (IsStrict) {
33271 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33272 {N->getOperand(0), Or, VBias});
33273 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33274 {MVT::v4f32, MVT::Other},
33275 {Sub.getValue(1), Sub});
33276 Results.push_back(Res);
33277 Results.push_back(Res.getValue(1));
33278 } else {
33279 // TODO: Are there any fast-math-flags to propagate here?
33280 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33281 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33282 }
33283 return;
33284 }
33285 case ISD::STRICT_FP_ROUND:
33286 case ISD::FP_ROUND: {
33287 bool IsStrict = N->isStrictFPOpcode();
33288 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33289 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33290 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33291 EVT SrcVT = Src.getValueType();
33292 EVT VT = N->getValueType(0);
33293 SDValue V;
33294 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33295 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33296 : DAG.getUNDEF(MVT::v2f32);
33297 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33298 }
33299 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33300 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33301 if (SrcVT.getVectorElementType() != MVT::f32)
33302 return;
33303
33304 if (IsStrict)
33305 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33306 {Chain, Src, Rnd});
33307 else
33308 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33309
33310 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33311 if (IsStrict)
33312 Results.push_back(V.getValue(1));
33313 return;
33314 }
33315 if (!isTypeLegal(Src.getValueType()))
33316 return;
33317 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33318 if (IsStrict)
33319 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33320 {Chain, Src});
33321 else
33322 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33323 Results.push_back(V);
33324 if (IsStrict)
33325 Results.push_back(V.getValue(1));
33326 return;
33327 }
33328 case ISD::FP_EXTEND:
33329 case ISD::STRICT_FP_EXTEND: {
33330 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33331 // No other ValueType for FP_EXTEND should reach this point.
33332 assert(N->getValueType(0) == MVT::v2f32 &&
33333 "Do not know how to legalize this Node");
33334 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33335 return;
33336 bool IsStrict = N->isStrictFPOpcode();
33337 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33338 if (Src.getValueType().getVectorElementType() != MVT::f16)
33339 return;
33340 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33341 : DAG.getUNDEF(MVT::v2f16);
33342 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33343 if (IsStrict)
33344 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33345 {N->getOperand(0), V});
33346 else
33347 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33348 Results.push_back(V);
33349 if (IsStrict)
33350 Results.push_back(V.getValue(1));
33351 return;
33352 }
33353 case ISD::INTRINSIC_W_CHAIN: {
33354 unsigned IntNo = N->getConstantOperandVal(1);
33355 switch (IntNo) {
33356 default : llvm_unreachable("Do not know how to custom type "
33357 "legalize this intrinsic operation!");
33358 case Intrinsic::x86_rdtsc:
33359 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33360 Results);
33361 case Intrinsic::x86_rdtscp:
33362 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33363 Results);
33364 case Intrinsic::x86_rdpmc:
33365 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33366 Results);
33367 return;
33368 case Intrinsic::x86_rdpru:
33369 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33370 Results);
33371 return;
33372 case Intrinsic::x86_xgetbv:
33373 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33374 Results);
33375 return;
33376 }
33377 }
33378 case ISD::READCYCLECOUNTER: {
33379 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33380 }
33381 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
33382 EVT T = N->getValueType(0);
33383 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33384 bool Regs64bit = T == MVT::i128;
33385 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33386 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33387 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
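// CMPXCHG8B/CMPXCHG16B use fixed registers: the expected value goes in
// EAX:EDX (RAX:RDX for the 16-byte form), the replacement value in EBX:ECX
// (RBX:RCX), the previous memory contents come back in EAX:EDX (RAX:RDX),
// and ZF reports whether the exchange happened. The CopyToReg/CopyFromReg
// sequence below wires the split halves through those registers.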
33388 SDValue cpInL, cpInH;
33389 std::tie(cpInL, cpInH) =
33390 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33391 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33392 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33393 cpInH =
33394 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33395 cpInH, cpInL.getValue(1));
33396 SDValue swapInL, swapInH;
33397 std::tie(swapInL, swapInH) =
33398 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33399 swapInH =
33400 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33401 swapInH, cpInH.getValue(1));
33402
33403 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33404 // until later. So we keep the RBX input in a vreg and use a custom
33405 // inserter.
33406 // Since RBX will be a reserved register the register allocator will not
33407 // make sure its value will be properly saved and restored around this
33408 // live-range.
33409 SDValue Result;
33410 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33411 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33412 if (Regs64bit) {
33413 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33414 swapInH.getValue(1)};
33415 Result =
33416 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33417 } else {
33418 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33419 swapInH.getValue(1));
33420 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33421 swapInL.getValue(1)};
33422 Result =
33423 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33424 }
33425
33426 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33427 Regs64bit ? X86::RAX : X86::EAX,
33428 HalfT, Result.getValue(1));
33429 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33430 Regs64bit ? X86::RDX : X86::EDX,
33431 HalfT, cpOutL.getValue(2));
33432 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33433
33434 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33435 MVT::i32, cpOutH.getValue(2));
33436 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33437 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33438
33439 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33440 Results.push_back(Success);
33441 Results.push_back(EFLAGS.getValue(1));
33442 return;
33443 }
33444 case ISD::ATOMIC_LOAD: {
33445 assert(
33446 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
33447 "Unexpected VT!");
33448 bool NoImplicitFloatOps =
33449 DAG.getMachineFunction().getFunction().hasFnAttribute(
33450 Attribute::NoImplicitFloat);
33451 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33452 auto *Node = cast<AtomicSDNode>(N);
33453
33454 if (N->getValueType(0) == MVT::i128) {
33455 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
33456 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
33457 Node->getBasePtr(), Node->getMemOperand());
33458 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33459 DAG.getIntPtrConstant(0, dl));
33460 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33461 DAG.getIntPtrConstant(1, dl));
33462 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
33463 {ResL, ResH}));
33464 Results.push_back(Ld.getValue(1));
33465 return;
33466 }
33467 break;
33468 }
33469 if (Subtarget.hasSSE1()) {
33470 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33471 // Then extract the lower 64-bits.
33472 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33473 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33474 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33475 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33476 MVT::i64, Node->getMemOperand());
33477 if (Subtarget.hasSSE2()) {
33478 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33479 DAG.getIntPtrConstant(0, dl));
33480 Results.push_back(Res);
33481 Results.push_back(Ld.getValue(1));
33482 return;
33483 }
33484 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33485 // then casts to i64. This avoids a 128-bit stack temporary being
33486 // created by type legalization if we were to cast v4f32->v2i64.
33487 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33488 DAG.getIntPtrConstant(0, dl));
33489 Res = DAG.getBitcast(MVT::i64, Res);
33490 Results.push_back(Res);
33491 Results.push_back(Ld.getValue(1));
33492 return;
33493 }
33494 if (Subtarget.hasX87()) {
33495 // First load this into an 80-bit X87 register. This will put the whole
33496 // integer into the significand.
33497 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33498 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33499 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33500 dl, Tys, Ops, MVT::i64,
33501 Node->getMemOperand());
33502 SDValue Chain = Result.getValue(1);
33503
33504 // Now store the X87 register to a stack temporary and convert to i64.
33505 // This store is not atomic and doesn't need to be.
33506 // FIXME: We don't need a stack temporary if the result of the load
33507 // is already being stored. We could just directly store there.
33508 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33509 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33510 MachinePointerInfo MPI =
33511 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33512 SDValue StoreOps[] = { Chain, Result, StackPtr };
33513 Chain = DAG.getMemIntrinsicNode(
33514 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33515 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33516
33517 // Finally load the value back from the stack temporary and return it.
33518 // This load is not atomic and doesn't need to be.
33519 // This load will be further type legalized.
33520 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33521 Results.push_back(Result);
33522 Results.push_back(Result.getValue(1));
33523 return;
33524 }
33525 }
33526 // TODO: Use MOVLPS when SSE1 is available?
33527 // Delegate to generic TypeLegalization. Situations we can really handle
33528 // should have already been dealt with by AtomicExpandPass.cpp.
33529 break;
33530 }
33531 case ISD::ATOMIC_SWAP:
33532 case ISD::ATOMIC_LOAD_ADD:
33533 case ISD::ATOMIC_LOAD_SUB:
33534 case ISD::ATOMIC_LOAD_AND:
33535 case ISD::ATOMIC_LOAD_OR:
33536 case ISD::ATOMIC_LOAD_XOR:
33537 case ISD::ATOMIC_LOAD_NAND:
33538 case ISD::ATOMIC_LOAD_MIN:
33539 case ISD::ATOMIC_LOAD_MAX:
33540 case ISD::ATOMIC_LOAD_UMIN:
33541 case ISD::ATOMIC_LOAD_UMAX:
33542 // Delegate to generic TypeLegalization. Situations we can really handle
33543 // should have already been dealt with by AtomicExpandPass.cpp.
33544 break;
33545
33546 case ISD::BITCAST: {
33547 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33548 EVT DstVT = N->getValueType(0);
33549 EVT SrcVT = N->getOperand(0).getValueType();
33550
33551 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33552 // we can split using the k-register rather than memory.
33553 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33554 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33555 SDValue Lo, Hi;
33556 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33557 Lo = DAG.getBitcast(MVT::i32, Lo);
33558 Hi = DAG.getBitcast(MVT::i32, Hi);
33559 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33560 Results.push_back(Res);
33561 return;
33562 }
33563
33564 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33565 // FIXME: Use v4f32 for SSE1?
33566 assert(Subtarget.hasSSE2() && "Requires SSE2");
33567 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33568 "Unexpected type action!");
33569 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33570 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33571 N->getOperand(0));
33572 Res = DAG.getBitcast(WideVT, Res);
33573 Results.push_back(Res);
33574 return;
33575 }
33576
33577 return;
33578 }
33579 case ISD::MGATHER: {
33580 EVT VT = N->getValueType(0);
33581 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33582 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33583 auto *Gather = cast<MaskedGatherSDNode>(N);
33584 SDValue Index = Gather->getIndex();
33585 if (Index.getValueType() != MVT::v2i64)
33586 return;
33588 "Unexpected type action!");
33589 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33590 SDValue Mask = Gather->getMask();
33591 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33592 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33593 Gather->getPassThru(),
33594 DAG.getUNDEF(VT));
33595 if (!Subtarget.hasVLX()) {
33596 // We need to widen the mask, but the instruction will only use 2
33597 // of its elements. So we can use undef.
33598 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33599 DAG.getUNDEF(MVT::v2i1));
33600 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33601 }
33602 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33603 Gather->getBasePtr(), Index, Gather->getScale() };
33604 SDValue Res = DAG.getMemIntrinsicNode(
33605 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33606 Gather->getMemoryVT(), Gather->getMemOperand());
33607 Results.push_back(Res);
33608 Results.push_back(Res.getValue(1));
33609 return;
33610 }
33611 return;
33612 }
33613 case ISD::LOAD: {
33614 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33615 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
33616 // cast since type legalization will try to use an i64 load.
33617 MVT VT = N->getSimpleValueType(0);
33618 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33620 "Unexpected type action!");
33621 if (!ISD::isNON_EXTLoad(N))
33622 return;
33623 auto *Ld = cast<LoadSDNode>(N);
33624 if (Subtarget.hasSSE2()) {
33625 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33626 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33627 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33628 Ld->getMemOperand()->getFlags());
33629 SDValue Chain = Res.getValue(1);
33630 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33631 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33632 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33633 Res = DAG.getBitcast(WideVT, Res);
33634 Results.push_back(Res);
33635 Results.push_back(Chain);
33636 return;
33637 }
33638 assert(Subtarget.hasSSE1() && "Expected SSE");
33639 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33640 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33641 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33642 MVT::i64, Ld->getMemOperand());
33643 Results.push_back(Res);
33644 Results.push_back(Res.getValue(1));
33645 return;
33646 }
33647 case ISD::ADDRSPACECAST: {
33648 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33649 Results.push_back(V);
33650 return;
33651 }
33652 case ISD::BITREVERSE: {
33653 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33654 assert(Subtarget.hasXOP() && "Expected XOP");
33655 // We can use VPPERM by copying to a vector register and back. We'll need
33656 // to move the scalar in two i32 pieces.
33657 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33658 return;
33659 }
33660 case ISD::EXTRACT_VECTOR_ELT: {
33661 // f16 = extract vXf16 %vec, i64 %idx
33662 assert(N->getSimpleValueType(0) == MVT::f16 &&
33663 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33664 assert(Subtarget.hasFP16() && "Expected FP16");
33665 SDValue VecOp = N->getOperand(0);
33666 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33667 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33668 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33669 N->getOperand(1));
33670 Split = DAG.getBitcast(MVT::f16, Split);
33671 Results.push_back(Split);
33672 return;
33673 }
33674 }
33675}
33676
33677const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33678 switch ((X86ISD::NodeType)Opcode) {
33679 case X86ISD::FIRST_NUMBER: break;
33680#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33681 NODE_NAME_CASE(BSF)
33682 NODE_NAME_CASE(BSR)
33683 NODE_NAME_CASE(FSHL)
33684 NODE_NAME_CASE(FSHR)
33685 NODE_NAME_CASE(FAND)
33686 NODE_NAME_CASE(FANDN)
33687 NODE_NAME_CASE(FOR)
33688 NODE_NAME_CASE(FXOR)
33689 NODE_NAME_CASE(FILD)
33690 NODE_NAME_CASE(FIST)
33691 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33692 NODE_NAME_CASE(FLD)
33693 NODE_NAME_CASE(FST)
33694 NODE_NAME_CASE(CALL)
33695 NODE_NAME_CASE(CALL_RVMARKER)
33696 NODE_NAME_CASE(BT)
33697 NODE_NAME_CASE(CMP)
33698 NODE_NAME_CASE(FCMP)
33699 NODE_NAME_CASE(STRICT_FCMP)
33700 NODE_NAME_CASE(STRICT_FCMPS)
33701 NODE_NAME_CASE(COMI)
33702 NODE_NAME_CASE(UCOMI)
33703 NODE_NAME_CASE(CMPM)
33704 NODE_NAME_CASE(CMPMM)
33705 NODE_NAME_CASE(STRICT_CMPM)
33706 NODE_NAME_CASE(CMPMM_SAE)
33707 NODE_NAME_CASE(SETCC)
33708 NODE_NAME_CASE(SETCC_CARRY)
33709 NODE_NAME_CASE(FSETCC)
33710 NODE_NAME_CASE(FSETCCM)
33711 NODE_NAME_CASE(FSETCCM_SAE)
33712 NODE_NAME_CASE(CMOV)
33713 NODE_NAME_CASE(BRCOND)
33714 NODE_NAME_CASE(RET_GLUE)
33715 NODE_NAME_CASE(IRET)
33716 NODE_NAME_CASE(REP_STOS)
33717 NODE_NAME_CASE(REP_MOVS)
33718 NODE_NAME_CASE(GlobalBaseReg)
33719 NODE_NAME_CASE(Wrapper)
33720 NODE_NAME_CASE(WrapperRIP)
33721 NODE_NAME_CASE(MOVQ2DQ)
33722 NODE_NAME_CASE(MOVDQ2Q)
33723 NODE_NAME_CASE(MMX_MOVD2W)
33724 NODE_NAME_CASE(MMX_MOVW2D)
33725 NODE_NAME_CASE(PEXTRB)
33726 NODE_NAME_CASE(PEXTRW)
33727 NODE_NAME_CASE(INSERTPS)
33728 NODE_NAME_CASE(PINSRB)
33729 NODE_NAME_CASE(PINSRW)
33730 NODE_NAME_CASE(PSHUFB)
33731 NODE_NAME_CASE(ANDNP)
33732 NODE_NAME_CASE(BLENDI)
33733 NODE_NAME_CASE(BLENDV)
33734 NODE_NAME_CASE(HADD)
33735 NODE_NAME_CASE(HSUB)
33736 NODE_NAME_CASE(FHADD)
33737 NODE_NAME_CASE(FHSUB)
33738 NODE_NAME_CASE(CONFLICT)
33739 NODE_NAME_CASE(FMAX)
33740 NODE_NAME_CASE(FMAXS)
33741 NODE_NAME_CASE(FMAX_SAE)
33742 NODE_NAME_CASE(FMAXS_SAE)
33743 NODE_NAME_CASE(FMIN)
33744 NODE_NAME_CASE(FMINS)
33745 NODE_NAME_CASE(FMIN_SAE)
33746 NODE_NAME_CASE(FMINS_SAE)
33747 NODE_NAME_CASE(FMAXC)
33748 NODE_NAME_CASE(FMINC)
33749 NODE_NAME_CASE(FRSQRT)
33750 NODE_NAME_CASE(FRCP)
33751 NODE_NAME_CASE(EXTRQI)
33752 NODE_NAME_CASE(INSERTQI)
33753 NODE_NAME_CASE(TLSADDR)
33754 NODE_NAME_CASE(TLSBASEADDR)
33755 NODE_NAME_CASE(TLSCALL)
33756 NODE_NAME_CASE(TLSDESC)
33757 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33758 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33759 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33760 NODE_NAME_CASE(EH_RETURN)
33761 NODE_NAME_CASE(TC_RETURN)
33762 NODE_NAME_CASE(FNSTCW16m)
33763 NODE_NAME_CASE(FLDCW16m)
33764 NODE_NAME_CASE(FNSTENVm)
33765 NODE_NAME_CASE(FLDENVm)
33766 NODE_NAME_CASE(LCMPXCHG_DAG)
33767 NODE_NAME_CASE(LCMPXCHG8_DAG)
33768 NODE_NAME_CASE(LCMPXCHG16_DAG)
33769 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33770 NODE_NAME_CASE(LADD)
33771 NODE_NAME_CASE(LSUB)
33772 NODE_NAME_CASE(LOR)
33773 NODE_NAME_CASE(LXOR)
33774 NODE_NAME_CASE(LAND)
33775 NODE_NAME_CASE(LBTS)
33776 NODE_NAME_CASE(LBTC)
33777 NODE_NAME_CASE(LBTR)
33778 NODE_NAME_CASE(LBTS_RM)
33779 NODE_NAME_CASE(LBTC_RM)
33780 NODE_NAME_CASE(LBTR_RM)
33781 NODE_NAME_CASE(AADD)
33782 NODE_NAME_CASE(AOR)
33783 NODE_NAME_CASE(AXOR)
33784 NODE_NAME_CASE(AAND)
33785 NODE_NAME_CASE(VZEXT_MOVL)
33786 NODE_NAME_CASE(VZEXT_LOAD)
33787 NODE_NAME_CASE(VEXTRACT_STORE)
33788 NODE_NAME_CASE(VTRUNC)
33789 NODE_NAME_CASE(VTRUNCS)
33790 NODE_NAME_CASE(VTRUNCUS)
33791 NODE_NAME_CASE(VMTRUNC)
33792 NODE_NAME_CASE(VMTRUNCS)
33793 NODE_NAME_CASE(VMTRUNCUS)
33794 NODE_NAME_CASE(VTRUNCSTORES)
33795 NODE_NAME_CASE(VTRUNCSTOREUS)
33796 NODE_NAME_CASE(VMTRUNCSTORES)
33797 NODE_NAME_CASE(VMTRUNCSTOREUS)
33798 NODE_NAME_CASE(VFPEXT)
33799 NODE_NAME_CASE(STRICT_VFPEXT)
33800 NODE_NAME_CASE(VFPEXT_SAE)
33801 NODE_NAME_CASE(VFPEXTS)
33802 NODE_NAME_CASE(VFPEXTS_SAE)
33803 NODE_NAME_CASE(VFPROUND)
33804 NODE_NAME_CASE(STRICT_VFPROUND)
33805 NODE_NAME_CASE(VMFPROUND)
33806 NODE_NAME_CASE(VFPROUND_RND)
33807 NODE_NAME_CASE(VFPROUNDS)
33808 NODE_NAME_CASE(VFPROUNDS_RND)
33809 NODE_NAME_CASE(VSHLDQ)
33810 NODE_NAME_CASE(VSRLDQ)
33811 NODE_NAME_CASE(VSHL)
33812 NODE_NAME_CASE(VSRL)
33813 NODE_NAME_CASE(VSRA)
33814 NODE_NAME_CASE(VSHLI)
33815 NODE_NAME_CASE(VSRLI)
33816 NODE_NAME_CASE(VSRAI)
33817 NODE_NAME_CASE(VSHLV)
33818 NODE_NAME_CASE(VSRLV)
33819 NODE_NAME_CASE(VSRAV)
33820 NODE_NAME_CASE(VROTLI)
33821 NODE_NAME_CASE(VROTRI)
33822 NODE_NAME_CASE(VPPERM)
33823 NODE_NAME_CASE(CMPP)
33824 NODE_NAME_CASE(STRICT_CMPP)
33825 NODE_NAME_CASE(PCMPEQ)
33826 NODE_NAME_CASE(PCMPGT)
33827 NODE_NAME_CASE(PHMINPOS)
33828 NODE_NAME_CASE(ADD)
33829 NODE_NAME_CASE(SUB)
33830 NODE_NAME_CASE(ADC)
33831 NODE_NAME_CASE(SBB)
33832 NODE_NAME_CASE(SMUL)
33833 NODE_NAME_CASE(UMUL)
33834 NODE_NAME_CASE(OR)
33835 NODE_NAME_CASE(XOR)
33836 NODE_NAME_CASE(AND)
33837 NODE_NAME_CASE(BEXTR)
33838 NODE_NAME_CASE(BEXTRI)
33839 NODE_NAME_CASE(BZHI)
33840 NODE_NAME_CASE(PDEP)
33841 NODE_NAME_CASE(PEXT)
33842 NODE_NAME_CASE(MUL_IMM)
33843 NODE_NAME_CASE(MOVMSK)
33844 NODE_NAME_CASE(PTEST)
33845 NODE_NAME_CASE(TESTP)
33846 NODE_NAME_CASE(KORTEST)
33847 NODE_NAME_CASE(KTEST)
33848 NODE_NAME_CASE(KADD)
33849 NODE_NAME_CASE(KSHIFTL)
33850 NODE_NAME_CASE(KSHIFTR)
33851 NODE_NAME_CASE(PACKSS)
33852 NODE_NAME_CASE(PACKUS)
33853 NODE_NAME_CASE(PALIGNR)
33854 NODE_NAME_CASE(VALIGN)
33855 NODE_NAME_CASE(VSHLD)
33856 NODE_NAME_CASE(VSHRD)
33857 NODE_NAME_CASE(VSHLDV)
33858 NODE_NAME_CASE(VSHRDV)
33859 NODE_NAME_CASE(PSHUFD)
33860 NODE_NAME_CASE(PSHUFHW)
33861 NODE_NAME_CASE(PSHUFLW)
33862 NODE_NAME_CASE(SHUFP)
33863 NODE_NAME_CASE(SHUF128)
33864 NODE_NAME_CASE(MOVLHPS)
33865 NODE_NAME_CASE(MOVHLPS)
33866 NODE_NAME_CASE(MOVDDUP)
33867 NODE_NAME_CASE(MOVSHDUP)
33868 NODE_NAME_CASE(MOVSLDUP)
33869 NODE_NAME_CASE(MOVSD)
33870 NODE_NAME_CASE(MOVSS)
33871 NODE_NAME_CASE(MOVSH)
33872 NODE_NAME_CASE(UNPCKL)
33873 NODE_NAME_CASE(UNPCKH)
33874 NODE_NAME_CASE(VBROADCAST)
33875 NODE_NAME_CASE(VBROADCAST_LOAD)
33876 NODE_NAME_CASE(VBROADCASTM)
33877 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33878 NODE_NAME_CASE(VPERMILPV)
33879 NODE_NAME_CASE(VPERMILPI)
33880 NODE_NAME_CASE(VPERM2X128)
33881 NODE_NAME_CASE(VPERMV)
33882 NODE_NAME_CASE(VPERMV3)
33883 NODE_NAME_CASE(VPERMI)
33884 NODE_NAME_CASE(VPTERNLOG)
33885 NODE_NAME_CASE(VFIXUPIMM)
33886 NODE_NAME_CASE(VFIXUPIMM_SAE)
33887 NODE_NAME_CASE(VFIXUPIMMS)
33888 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33889 NODE_NAME_CASE(VRANGE)
33890 NODE_NAME_CASE(VRANGE_SAE)
33891 NODE_NAME_CASE(VRANGES)
33892 NODE_NAME_CASE(VRANGES_SAE)
33893 NODE_NAME_CASE(PMULUDQ)
33894 NODE_NAME_CASE(PMULDQ)
33895 NODE_NAME_CASE(PSADBW)
33896 NODE_NAME_CASE(DBPSADBW)
33897 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33898 NODE_NAME_CASE(VAARG_64)
33899 NODE_NAME_CASE(VAARG_X32)
33900 NODE_NAME_CASE(DYN_ALLOCA)
33901 NODE_NAME_CASE(MFENCE)
33902 NODE_NAME_CASE(SEG_ALLOCA)
33903 NODE_NAME_CASE(PROBED_ALLOCA)
33904 NODE_NAME_CASE(RDRAND)
33905 NODE_NAME_CASE(RDSEED)
33906 NODE_NAME_CASE(RDPKRU)
33907 NODE_NAME_CASE(WRPKRU)
33908 NODE_NAME_CASE(VPMADDUBSW)
33909 NODE_NAME_CASE(VPMADDWD)
33910 NODE_NAME_CASE(VPSHA)
33911 NODE_NAME_CASE(VPSHL)
33912 NODE_NAME_CASE(VPCOM)
33913 NODE_NAME_CASE(VPCOMU)
33914 NODE_NAME_CASE(VPERMIL2)
33915 NODE_NAME_CASE(FMSUB)
33916 NODE_NAME_CASE(STRICT_FMSUB)
33917 NODE_NAME_CASE(FNMADD)
33918 NODE_NAME_CASE(STRICT_FNMADD)
33919 NODE_NAME_CASE(FNMSUB)
33920 NODE_NAME_CASE(STRICT_FNMSUB)
33921 NODE_NAME_CASE(FMADDSUB)
33922 NODE_NAME_CASE(FMSUBADD)
33923 NODE_NAME_CASE(FMADD_RND)
33924 NODE_NAME_CASE(FNMADD_RND)
33925 NODE_NAME_CASE(FMSUB_RND)
33926 NODE_NAME_CASE(FNMSUB_RND)
33927 NODE_NAME_CASE(FMADDSUB_RND)
33928 NODE_NAME_CASE(FMSUBADD_RND)
33929 NODE_NAME_CASE(VFMADDC)
33930 NODE_NAME_CASE(VFMADDC_RND)
33931 NODE_NAME_CASE(VFCMADDC)
33932 NODE_NAME_CASE(VFCMADDC_RND)
33933 NODE_NAME_CASE(VFMULC)
33934 NODE_NAME_CASE(VFMULC_RND)
33935 NODE_NAME_CASE(VFCMULC)
33936 NODE_NAME_CASE(VFCMULC_RND)
33937 NODE_NAME_CASE(VFMULCSH)
33938 NODE_NAME_CASE(VFMULCSH_RND)
33939 NODE_NAME_CASE(VFCMULCSH)
33940 NODE_NAME_CASE(VFCMULCSH_RND)
33941 NODE_NAME_CASE(VFMADDCSH)
33942 NODE_NAME_CASE(VFMADDCSH_RND)
33943 NODE_NAME_CASE(VFCMADDCSH)
33944 NODE_NAME_CASE(VFCMADDCSH_RND)
33945 NODE_NAME_CASE(VPMADD52H)
33946 NODE_NAME_CASE(VPMADD52L)
33947 NODE_NAME_CASE(VRNDSCALE)
33948 NODE_NAME_CASE(STRICT_VRNDSCALE)
33949 NODE_NAME_CASE(VRNDSCALE_SAE)
33950 NODE_NAME_CASE(VRNDSCALES)
33951 NODE_NAME_CASE(VRNDSCALES_SAE)
33952 NODE_NAME_CASE(VREDUCE)
33953 NODE_NAME_CASE(VREDUCE_SAE)
33954 NODE_NAME_CASE(VREDUCES)
33955 NODE_NAME_CASE(VREDUCES_SAE)
33956 NODE_NAME_CASE(VGETMANT)
33957 NODE_NAME_CASE(VGETMANT_SAE)
33958 NODE_NAME_CASE(VGETMANTS)
33959 NODE_NAME_CASE(VGETMANTS_SAE)
33960 NODE_NAME_CASE(PCMPESTR)
33961 NODE_NAME_CASE(PCMPISTR)
33962 NODE_NAME_CASE(XTEST)
33963 NODE_NAME_CASE(COMPRESS)
33964 NODE_NAME_CASE(EXPAND)
33965 NODE_NAME_CASE(SELECTS)
33966 NODE_NAME_CASE(ADDSUB)
33967 NODE_NAME_CASE(RCP14)
33968 NODE_NAME_CASE(RCP14S)
33969 NODE_NAME_CASE(RSQRT14)
33970 NODE_NAME_CASE(RSQRT14S)
33971 NODE_NAME_CASE(FADD_RND)
33972 NODE_NAME_CASE(FADDS)
33973 NODE_NAME_CASE(FADDS_RND)
33974 NODE_NAME_CASE(FSUB_RND)
33975 NODE_NAME_CASE(FSUBS)
33976 NODE_NAME_CASE(FSUBS_RND)
33977 NODE_NAME_CASE(FMUL_RND)
33978 NODE_NAME_CASE(FMULS)
33979 NODE_NAME_CASE(FMULS_RND)
33980 NODE_NAME_CASE(FDIV_RND)
33981 NODE_NAME_CASE(FDIVS)
33982 NODE_NAME_CASE(FDIVS_RND)
33983 NODE_NAME_CASE(FSQRT_RND)
33984 NODE_NAME_CASE(FSQRTS)
33985 NODE_NAME_CASE(FSQRTS_RND)
33986 NODE_NAME_CASE(FGETEXP)
33987 NODE_NAME_CASE(FGETEXP_SAE)
33988 NODE_NAME_CASE(FGETEXPS)
33989 NODE_NAME_CASE(FGETEXPS_SAE)
33990 NODE_NAME_CASE(SCALEF)
33991 NODE_NAME_CASE(SCALEF_RND)
33992 NODE_NAME_CASE(SCALEFS)
33993 NODE_NAME_CASE(SCALEFS_RND)
33994 NODE_NAME_CASE(MULHRS)
33995 NODE_NAME_CASE(SINT_TO_FP_RND)
33996 NODE_NAME_CASE(UINT_TO_FP_RND)
33997 NODE_NAME_CASE(CVTTP2SI)
33998 NODE_NAME_CASE(CVTTP2UI)
33999 NODE_NAME_CASE(STRICT_CVTTP2SI)
34000 NODE_NAME_CASE(STRICT_CVTTP2UI)
34001 NODE_NAME_CASE(MCVTTP2SI)
34002 NODE_NAME_CASE(MCVTTP2UI)
34003 NODE_NAME_CASE(CVTTP2SI_SAE)
34004 NODE_NAME_CASE(CVTTP2UI_SAE)
34005 NODE_NAME_CASE(CVTTS2SI)
34006 NODE_NAME_CASE(CVTTS2UI)
34007 NODE_NAME_CASE(CVTTS2SI_SAE)
34008 NODE_NAME_CASE(CVTTS2UI_SAE)
34009 NODE_NAME_CASE(CVTSI2P)
34010 NODE_NAME_CASE(CVTUI2P)
34011 NODE_NAME_CASE(STRICT_CVTSI2P)
34012 NODE_NAME_CASE(STRICT_CVTUI2P)
34013 NODE_NAME_CASE(MCVTSI2P)
34014 NODE_NAME_CASE(MCVTUI2P)
34015 NODE_NAME_CASE(VFPCLASS)
34016 NODE_NAME_CASE(VFPCLASSS)
34017 NODE_NAME_CASE(MULTISHIFT)
34018 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34019 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34020 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34021 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34022 NODE_NAME_CASE(CVTPS2PH)
34023 NODE_NAME_CASE(STRICT_CVTPS2PH)
34024 NODE_NAME_CASE(CVTPS2PH_SAE)
34025 NODE_NAME_CASE(MCVTPS2PH)
34026 NODE_NAME_CASE(MCVTPS2PH_SAE)
34027 NODE_NAME_CASE(CVTPH2PS)
34028 NODE_NAME_CASE(STRICT_CVTPH2PS)
34029 NODE_NAME_CASE(CVTPH2PS_SAE)
34030 NODE_NAME_CASE(CVTP2SI)
34031 NODE_NAME_CASE(CVTP2UI)
34032 NODE_NAME_CASE(MCVTP2SI)
34033 NODE_NAME_CASE(MCVTP2UI)
34034 NODE_NAME_CASE(CVTP2SI_RND)
34035 NODE_NAME_CASE(CVTP2UI_RND)
34036 NODE_NAME_CASE(CVTS2SI)
34037 NODE_NAME_CASE(CVTS2UI)
34038 NODE_NAME_CASE(CVTS2SI_RND)
34039 NODE_NAME_CASE(CVTS2UI_RND)
34040 NODE_NAME_CASE(CVTNE2PS2BF16)
34041 NODE_NAME_CASE(CVTNEPS2BF16)
34042 NODE_NAME_CASE(MCVTNEPS2BF16)
34043 NODE_NAME_CASE(DPBF16PS)
34044 NODE_NAME_CASE(LWPINS)
34045 NODE_NAME_CASE(MGATHER)
34046 NODE_NAME_CASE(MSCATTER)
34047 NODE_NAME_CASE(VPDPBUSD)
34048 NODE_NAME_CASE(VPDPBUSDS)
34049 NODE_NAME_CASE(VPDPWSSD)
34050 NODE_NAME_CASE(VPDPWSSDS)
34051 NODE_NAME_CASE(VPSHUFBITQMB)
34052 NODE_NAME_CASE(GF2P8MULB)
34053 NODE_NAME_CASE(GF2P8AFFINEQB)
34054 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34055 NODE_NAME_CASE(NT_CALL)
34056 NODE_NAME_CASE(NT_BRIND)
34057 NODE_NAME_CASE(UMWAIT)
34058 NODE_NAME_CASE(TPAUSE)
34059 NODE_NAME_CASE(ENQCMD)
34060 NODE_NAME_CASE(ENQCMDS)
34061 NODE_NAME_CASE(VP2INTERSECT)
34062 NODE_NAME_CASE(VPDPBSUD)
34063 NODE_NAME_CASE(VPDPBSUDS)
34064 NODE_NAME_CASE(VPDPBUUD)
34065 NODE_NAME_CASE(VPDPBUUDS)
34066 NODE_NAME_CASE(VPDPBSSD)
34067 NODE_NAME_CASE(VPDPBSSDS)
34068 NODE_NAME_CASE(AESENC128KL)
34069 NODE_NAME_CASE(AESDEC128KL)
34070 NODE_NAME_CASE(AESENC256KL)
34071 NODE_NAME_CASE(AESDEC256KL)
34072 NODE_NAME_CASE(AESENCWIDE128KL)
34073 NODE_NAME_CASE(AESDECWIDE128KL)
34074 NODE_NAME_CASE(AESENCWIDE256KL)
34075 NODE_NAME_CASE(AESDECWIDE256KL)
34076 NODE_NAME_CASE(CMPCCXADD)
34077 NODE_NAME_CASE(TESTUI)
34078 NODE_NAME_CASE(FP80_ADD)
34079 NODE_NAME_CASE(STRICT_FP80_ADD)
34080 NODE_NAME_CASE(CCMP)
34081 NODE_NAME_CASE(CTEST)
34082 NODE_NAME_CASE(CLOAD)
34083 NODE_NAME_CASE(CSTORE)
34084 }
34085 return nullptr;
34086#undef NODE_NAME_CASE
34087}
34088
34089/// Return true if the addressing mode represented by AM is legal for this
34090/// target, for a load/store of the specified type.
34091bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
34092 const AddrMode &AM, Type *Ty,
34093 unsigned AS,
34094 Instruction *I) const {
34095 // X86 supports extremely general addressing modes.
34096 CodeModel::Model M = getTargetMachine().getCodeModel();
34097
34098 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34099 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
34100 return false;
34101
34102 if (AM.BaseGV) {
34103 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
34104
34105 // If a reference to this global requires an extra load, we can't fold it.
34106 if (isGlobalStubReference(GVFlags))
34107 return false;
34108
34109 // If BaseGV requires a register for the PIC base, we cannot also have a
34110 // BaseReg specified.
34111 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
34112 return false;
34113
34114 // If lower 4G is not available, then we must use rip-relative addressing.
34115 if ((M != CodeModel::Small || isPositionIndependent()) &&
34116 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
34117 return false;
34118 }
34119
34120 switch (AM.Scale) {
34121 case 0:
34122 case 1:
34123 case 2:
34124 case 4:
34125 case 8:
34126 // These scales always work.
34127 break;
34128 case 3:
34129 case 5:
34130 case 9:
34131 // These scales are formed with basereg+scalereg. Only accept if there is
34132 // no basereg yet.
34133 if (AM.HasBaseReg)
34134 return false;
34135 break;
34136 default: // Other stuff never works.
34137 return false;
34138 }
34139
34140 return true;
34141}
34142
34143bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
34144 unsigned Bits = Ty->getScalarSizeInBits();
34145
34146 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
34147 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
34148 if (Subtarget.hasXOP() &&
34149 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
34150 return false;
34151
34152 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
34153 // shifts just as cheap as scalar ones.
34154 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
34155 return false;
34156
34157 // AVX512BW has shifts such as vpsllvw.
34158 if (Subtarget.hasBWI() && Bits == 16)
34159 return false;
34160
34161 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
34162 // fully general vector.
34163 return true;
34164}
34165
34166bool X86TargetLowering::isBinOp(unsigned Opcode) const {
34167 switch (Opcode) {
34168 // These are non-commutative binops.
34169 // TODO: Add more X86ISD opcodes once we have test coverage.
34170 case X86ISD::ANDNP:
34171 case X86ISD::PCMPGT:
34172 case X86ISD::FMAX:
34173 case X86ISD::FMIN:
34174 case X86ISD::FANDN:
34175 case X86ISD::VPSHA:
34176 case X86ISD::VPSHL:
34177 case X86ISD::VSHLV:
34178 case X86ISD::VSRLV:
34179 case X86ISD::VSRAV:
34180 return true;
34181 }
34182
34183 return TargetLoweringBase::isBinOp(Opcode);
34184}
34185
34186bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
34187 switch (Opcode) {
34188 // TODO: Add more X86ISD opcodes once we have test coverage.
34189 case X86ISD::PCMPEQ:
34190 case X86ISD::PMULDQ:
34191 case X86ISD::PMULUDQ:
34192 case X86ISD::FMAXC:
34193 case X86ISD::FMINC:
34194 case X86ISD::FAND:
34195 case X86ISD::FOR:
34196 case X86ISD::FXOR:
34197 return true;
34198 }
34199
34200 return TargetLoweringBase::isCommutativeBinOp(Opcode);
34201}
34202
34203bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
34204 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34205 return false;
34206 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34207 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34208 return NumBits1 > NumBits2;
34209}
34210
34211bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34212 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34213 return false;
34214
34215 if (!isTypeLegal(EVT::getEVT(Ty1)))
34216 return false;
34217
34218 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34219
34220 // Assuming the caller doesn't have a zeroext or signext return parameter,
34221 // truncation all the way down to i1 is valid.
34222 return true;
34223}
34224
34225bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34226 return isInt<32>(Imm);
34227}
34228
34229bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34230 // Can also use sub to handle negated immediates.
34231 return isInt<32>(Imm);
34232}
34233
34234bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34235 return isInt<32>(Imm);
34236}
34237
34238bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34239 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34240 return false;
34241 unsigned NumBits1 = VT1.getSizeInBits();
34242 unsigned NumBits2 = VT2.getSizeInBits();
34243 return NumBits1 > NumBits2;
34244}
34245
34246bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34247 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34248 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34249}
34250
34251bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34252 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34253 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34254}
34255
34256bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34257 EVT VT1 = Val.getValueType();
34258 if (isZExtFree(VT1, VT2))
34259 return true;
34260
34261 if (Val.getOpcode() != ISD::LOAD)
34262 return false;
34263
34264 if (!VT1.isSimple() || !VT1.isInteger() ||
34265 !VT2.isSimple() || !VT2.isInteger())
34266 return false;
34267
34268 switch (VT1.getSimpleVT().SimpleTy) {
34269 default: break;
34270 case MVT::i8:
34271 case MVT::i16:
34272 case MVT::i32:
34273 // X86 has 8, 16, and 32-bit zero-extending loads.
34274 return true;
34275 }
34276
34277 return false;
34278}
34279
34280bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34281 SmallVectorImpl<Use *> &Ops) const {
34282 using namespace llvm::PatternMatch;
34283
34284 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34285 if (!VTy)
34286 return false;
34287
34288 if (I->getOpcode() == Instruction::Mul &&
34289 VTy->getElementType()->isIntegerTy(64)) {
34290 for (auto &Op : I->operands()) {
34291 // Make sure we are not already sinking this operand
34292 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34293 continue;
34294
34295 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34296 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
34297 if (Subtarget.hasSSE41() &&
34298 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34299 m_SpecificInt(32)))) {
34300 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34301 Ops.push_back(&Op);
34302 } else if (Subtarget.hasSSE2() &&
34303 match(Op.get(),
34304 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34305 Ops.push_back(&Op);
34306 }
34307 }
34308
34309 return !Ops.empty();
34310 }
34311
34312 // A uniform shift amount in a vector shift or funnel shift may be much
34313 // cheaper than a generic variable vector shift, so make that pattern visible
34314 // to SDAG by sinking the shuffle instruction next to the shift.
34315 int ShiftAmountOpNum = -1;
34316 if (I->isShift())
34317 ShiftAmountOpNum = 1;
34318 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34319 if (II->getIntrinsicID() == Intrinsic::fshl ||
34320 II->getIntrinsicID() == Intrinsic::fshr)
34321 ShiftAmountOpNum = 2;
34322 }
34323
34324 if (ShiftAmountOpNum == -1)
34325 return false;
34326
34327 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34328 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34329 isVectorShiftByScalarCheap(I->getType())) {
34330 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34331 return true;
34332 }
34333
34334 return false;
34335}
34336
34337bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34338 if (!Subtarget.is64Bit())
34339 return false;
34340 return TargetLowering::shouldConvertPhiType(From, To);
34341}
34342
34343bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34344 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34345 return false;
34346
34347 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34348
34349 // There is no extending load for vXi1.
34350 if (SrcVT.getScalarType() == MVT::i1)
34351 return false;
34352
34353 return true;
34354}
34355
34356bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34357 EVT VT) const {
34358 if (!Subtarget.hasAnyFMA())
34359 return false;
34360
34361 VT = VT.getScalarType();
34362
34363 if (!VT.isSimple())
34364 return false;
34365
34366 switch (VT.getSimpleVT().SimpleTy) {
34367 case MVT::f16:
34368 return Subtarget.hasFP16();
34369 case MVT::f32:
34370 case MVT::f64:
34371 return true;
34372 default:
34373 break;
34374 }
34375
34376 return false;
34377}
34378
34379bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34380 // i16 instructions are longer (0x66 prefix) and potentially slower.
34381 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34382}
34383
34385 EVT VT) const {
34386 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34387 // benefit. The transform may also be profitable for scalar code.
34388 if (!Subtarget.hasAVX512())
34389 return false;
34390 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34391 return false;
34392 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34393 return false;
34394
34395 return true;
34396}
34397
34398/// Targets can use this to indicate that they only support *some*
34399/// VECTOR_SHUFFLE operations, those with specific masks.
34400/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34401/// are assumed to be legal.
34402bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34403 if (!VT.isSimple())
34404 return false;
34405
34406 // Not for i1 vectors
34407 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34408 return false;
34409
34410 // Very little shuffling can be done for 64-bit vectors right now.
34411 if (VT.getSimpleVT().getSizeInBits() == 64)
34412 return false;
34413
34414 // We only care that the types being shuffled are legal. The lowering can
34415 // handle any possible shuffle mask that results.
34416 return isTypeLegal(VT.getSimpleVT());
34417}
34418
34419bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34420 EVT VT) const {
34421 // Don't convert an 'and' into a shuffle that we don't directly support.
34422 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34423 if (!Subtarget.hasAVX2())
34424 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34425 return false;
34426
34427 // Just delegate to the generic legality, clear masks aren't special.
34428 return isShuffleMaskLegal(Mask, VT);
34429}
34430
34431bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34432 // If the subtarget is using thunks, we need to not generate jump tables.
34433 if (Subtarget.useIndirectThunkBranches())
34434 return false;
34435
34436 // Otherwise, fallback on the generic logic.
34437 return TargetLowering::areJTsAllowed(Fn);
34438}
34439
34440MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34441 EVT ConditionVT) const {
34442 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34443 // zero-extensions.
34444 if (ConditionVT.getSizeInBits() < 32)
34445 return MVT::i32;
34446 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34447 ConditionVT);
34448}
34449
34450//===----------------------------------------------------------------------===//
34451// X86 Scheduler Hooks
34452//===----------------------------------------------------------------------===//
34453
34454// Returns true if EFLAG is consumed after this iterator in the rest of the
34455// basic block or any successors of the basic block.
34456static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34457 MachineBasicBlock *BB) {
34458 // Scan forward through BB for a use/def of EFLAGS.
34459 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34460 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34461 return true;
34462 // If we found a def, we can stop searching.
34463 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34464 return false;
34465 }
34466
34467 // If we hit the end of the block, check whether EFLAGS is live into a
34468 // successor.
34469 for (MachineBasicBlock *Succ : BB->successors())
34470 if (Succ->isLiveIn(X86::EFLAGS))
34471 return true;
34472
34473 return false;
34474}
34475
34476/// Utility function to emit xbegin specifying the start of an RTM region.
34477static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34478 const TargetInstrInfo *TII) {
34479 const MIMetadata MIMD(MI);
34480
34481 const BasicBlock *BB = MBB->getBasicBlock();
34482 MachineFunction::iterator I = ++MBB->getIterator();
34483
34484 // For the v = xbegin(), we generate
34485 //
34486 // thisMBB:
34487 // xbegin sinkMBB
34488 //
34489 // mainMBB:
34490 // s0 = -1
34491 //
34492 // fallBB:
34493 // eax = # XABORT_DEF
34494 // s1 = eax
34495 //
34496 // sinkMBB:
34497 // v = phi(s0/mainBB, s1/fallBB)
34498
34499 MachineBasicBlock *thisMBB = MBB;
34500 MachineFunction *MF = MBB->getParent();
34501 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34502 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34503 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34504 MF->insert(I, mainMBB);
34505 MF->insert(I, fallMBB);
34506 MF->insert(I, sinkMBB);
34507
34508 if (isEFLAGSLiveAfter(MI, MBB)) {
34509 mainMBB->addLiveIn(X86::EFLAGS);
34510 fallMBB->addLiveIn(X86::EFLAGS);
34511 sinkMBB->addLiveIn(X86::EFLAGS);
34512 }
34513
34514 // Transfer the remainder of BB and its successor edges to sinkMBB.
34515 sinkMBB->splice(sinkMBB->begin(), MBB,
34516 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34517 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34518
34519 MachineRegisterInfo &MRI = MF->getRegInfo();
34520 Register DstReg = MI.getOperand(0).getReg();
34521 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34522 Register mainDstReg = MRI.createVirtualRegister(RC);
34523 Register fallDstReg = MRI.createVirtualRegister(RC);
34524
34525 // thisMBB:
34526 // xbegin fallMBB
34527 // # fallthrough to mainMBB
34528 // # abort to fallMBB
34529 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34530 thisMBB->addSuccessor(mainMBB);
34531 thisMBB->addSuccessor(fallMBB);
34532
34533 // mainMBB:
34534 // mainDstReg := -1
34535 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34536 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34537 mainMBB->addSuccessor(sinkMBB);
34538
34539 // fallMBB:
34540 // ; pseudo instruction to model hardware's definition from XABORT
34541 // EAX := XABORT_DEF
34542 // fallDstReg := EAX
34543 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34544 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34545 .addReg(X86::EAX);
34546 fallMBB->addSuccessor(sinkMBB);
34547
34548 // sinkMBB:
34549 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34550 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34551 .addReg(mainDstReg).addMBB(mainMBB)
34552 .addReg(fallDstReg).addMBB(fallMBB);
34553
34554 MI.eraseFromParent();
34555 return sinkMBB;
34556}
34557
34558MachineBasicBlock *
34559X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34560 MachineBasicBlock *MBB) const {
34561 // Emit va_arg instruction on X86-64.
34562
34563 // Operands to this pseudo-instruction:
34564 // 0 ) Output : destination address (reg)
34565 // 1-5) Input : va_list address (addr, i64mem)
34566 // 6 ) ArgSize : Size (in bytes) of vararg type
34567 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34568 // 8 ) Align : Alignment of type
34569 // 9 ) EFLAGS (implicit-def)
34570
34571 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34572 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34573
34574 Register DestReg = MI.getOperand(0).getReg();
34575 MachineOperand &Base = MI.getOperand(1);
34576 MachineOperand &Scale = MI.getOperand(2);
34577 MachineOperand &Index = MI.getOperand(3);
34578 MachineOperand &Disp = MI.getOperand(4);
34579 MachineOperand &Segment = MI.getOperand(5);
34580 unsigned ArgSize = MI.getOperand(6).getImm();
34581 unsigned ArgMode = MI.getOperand(7).getImm();
34582 Align Alignment = Align(MI.getOperand(8).getImm());
34583
34584 MachineFunction *MF = MBB->getParent();
34585
34586 // Memory Reference
34587 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34588
34589 MachineMemOperand *OldMMO = MI.memoperands().front();
34590
34591 // Clone the MMO into two separate MMOs for loading and storing
34592 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34593 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34594 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34595 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34596
34597 // Machine Information
34598 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34599 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
34600 const TargetRegisterClass *AddrRegClass =
34601 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
34602 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34603 const MIMetadata MIMD(MI);
34604
34605 // struct va_list {
34606 // i32 gp_offset
34607 // i32 fp_offset
34608 // i64 overflow_area (address)
34609 // i64 reg_save_area (address)
34610 // }
34611 // sizeof(va_list) = 24
34612 // alignment(va_list) = 8
34613
34614 unsigned TotalNumIntRegs = 6;
34615 unsigned TotalNumXMMRegs = 8;
34616 bool UseGPOffset = (ArgMode == 1);
34617 bool UseFPOffset = (ArgMode == 2);
34618 unsigned MaxOffset = TotalNumIntRegs * 8 +
34619 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
34620
34621 /* Align ArgSize to a multiple of 8 */
34622 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
34623 bool NeedsAlign = (Alignment > 8);
34624
34625 MachineBasicBlock *thisMBB = MBB;
34626 MachineBasicBlock *overflowMBB;
34627 MachineBasicBlock *offsetMBB;
34628 MachineBasicBlock *endMBB;
34629
34630 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34631 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34632 unsigned OffsetReg = 0;
34633
34634 if (!UseGPOffset && !UseFPOffset) {
34635 // If we only pull from the overflow region, we don't create a branch.
34636 // We don't need to alter control flow.
34637 OffsetDestReg = 0; // unused
34638 OverflowDestReg = DestReg;
34639
34640 offsetMBB = nullptr;
34641 overflowMBB = thisMBB;
34642 endMBB = thisMBB;
34643 } else {
34644 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34645 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34646 // If not, pull from overflow_area. (branch to overflowMBB)
34647 //
34648 // thisMBB
34649 // | .
34650 // | .
34651 // offsetMBB overflowMBB
34652 // | .
34653 // | .
34654 // endMBB
34655
34656 // Registers for the PHI in endMBB
34657 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34658 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34659
34660 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34661 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34662 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34663 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34664
34665 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34666
34667 // Insert the new basic blocks
34668 MF->insert(MBBIter, offsetMBB);
34669 MF->insert(MBBIter, overflowMBB);
34670 MF->insert(MBBIter, endMBB);
34671
34672 // Transfer the remainder of MBB and its successor edges to endMBB.
34673 endMBB->splice(endMBB->begin(), thisMBB,
34674 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34675 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34676
34677 // Make offsetMBB and overflowMBB successors of thisMBB
34678 thisMBB->addSuccessor(offsetMBB);
34679 thisMBB->addSuccessor(overflowMBB);
34680
34681 // endMBB is a successor of both offsetMBB and overflowMBB
34682 offsetMBB->addSuccessor(endMBB);
34683 overflowMBB->addSuccessor(endMBB);
34684
34685 // Load the offset value into a register
34686 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34687 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34688 .add(Base)
34689 .add(Scale)
34690 .add(Index)
34691 .addDisp(Disp, UseFPOffset ? 4 : 0)
34692 .add(Segment)
34693 .setMemRefs(LoadOnlyMMO);
34694
34695 // Check if there is enough room left to pull this argument.
34696 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34697 .addReg(OffsetReg)
34698 .addImm(MaxOffset + 8 - ArgSizeA8);
34699
34700 // Branch to "overflowMBB" if offset >= max
34701 // Fall through to "offsetMBB" otherwise
34702 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34703 .addMBB(overflowMBB).addImm(X86::COND_AE);
34704 }
34705
34706 // In offsetMBB, emit code to use the reg_save_area.
34707 if (offsetMBB) {
34708 assert(OffsetReg != 0);
34709
34710 // Read the reg_save_area address.
34711 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34712 BuildMI(
34713 offsetMBB, MIMD,
34714 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34715 RegSaveReg)
34716 .add(Base)
34717 .add(Scale)
34718 .add(Index)
34719 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34720 .add(Segment)
34721 .setMemRefs(LoadOnlyMMO);
34722
34723 if (Subtarget.isTarget64BitLP64()) {
34724 // Zero-extend the offset
34725 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34726 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34727 .addImm(0)
34728 .addReg(OffsetReg)
34729 .addImm(X86::sub_32bit);
34730
34731 // Add the offset to the reg_save_area to get the final address.
34732 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34733 .addReg(OffsetReg64)
34734 .addReg(RegSaveReg);
34735 } else {
34736 // Add the offset to the reg_save_area to get the final address.
34737 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34738 .addReg(OffsetReg)
34739 .addReg(RegSaveReg);
34740 }
34741
34742 // Compute the offset for the next argument
34743 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34744 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34745 .addReg(OffsetReg)
34746 .addImm(UseFPOffset ? 16 : 8);
34747
34748 // Store it back into the va_list.
34749 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34750 .add(Base)
34751 .add(Scale)
34752 .add(Index)
34753 .addDisp(Disp, UseFPOffset ? 4 : 0)
34754 .add(Segment)
34755 .addReg(NextOffsetReg)
34756 .setMemRefs(StoreOnlyMMO);
34757
34758 // Jump to endMBB
34759 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34760 .addMBB(endMBB);
34761 }
34762
34763 //
34764 // Emit code to use overflow area
34765 //
34766
34767 // Load the overflow_area address into a register.
34768 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34769 BuildMI(overflowMBB, MIMD,
34770 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34771 OverflowAddrReg)
34772 .add(Base)
34773 .add(Scale)
34774 .add(Index)
34775 .addDisp(Disp, 8)
34776 .add(Segment)
34777 .setMemRefs(LoadOnlyMMO);
34778
34779 // If we need to align it, do so. Otherwise, just copy the address
34780 // to OverflowDestReg.
34781 if (NeedsAlign) {
34782 // Align the overflow address
34783 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34784
34785 // aligned_addr = (addr + (align-1)) & ~(align-1)
34786 BuildMI(
34787 overflowMBB, MIMD,
34788 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34789 TmpReg)
34790 .addReg(OverflowAddrReg)
34791 .addImm(Alignment.value() - 1);
34792
34793 BuildMI(
34794 overflowMBB, MIMD,
34795 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34796 OverflowDestReg)
34797 .addReg(TmpReg)
34798 .addImm(~(uint64_t)(Alignment.value() - 1));
34799 } else {
34800 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34801 .addReg(OverflowAddrReg);
34802 }
34803
34804 // Compute the next overflow address after this argument.
34805 // (the overflow address should be kept 8-byte aligned)
34806 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34807 BuildMI(
34808 overflowMBB, MIMD,
34809 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34810 NextAddrReg)
34811 .addReg(OverflowDestReg)
34812 .addImm(ArgSizeA8);
34813
34814 // Store the new overflow address.
34815 BuildMI(overflowMBB, MIMD,
34816 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34817 .add(Base)
34818 .add(Scale)
34819 .add(Index)
34820 .addDisp(Disp, 8)
34821 .add(Segment)
34822 .addReg(NextAddrReg)
34823 .setMemRefs(StoreOnlyMMO);
34824
34825 // If we branched, emit the PHI to the front of endMBB.
34826 if (offsetMBB) {
34827 BuildMI(*endMBB, endMBB->begin(), MIMD,
34828 TII->get(X86::PHI), DestReg)
34829 .addReg(OffsetDestReg).addMBB(offsetMBB)
34830 .addReg(OverflowDestReg).addMBB(overflowMBB);
34831 }
34832
34833 // Erase the pseudo instruction
34834 MI.eraseFromParent();
34835
34836 return endMBB;
34837}
34838
34839// The EFLAGS operand of SelectItr might be missing a kill marker
34840// because there were multiple uses of EFLAGS, and ISel didn't know
34841// which to mark. Figure out whether SelectItr should have had a
34842// kill marker, and set it if it should. Returns the correct kill
34843// marker value.
34844 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34845 MachineBasicBlock* BB,
34846 const TargetRegisterInfo* TRI) {
34847 if (isEFLAGSLiveAfter(SelectItr, BB))
34848 return false;
34849
34850 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34851 // out. SelectMI should have a kill flag on EFLAGS.
34852 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34853 return true;
34854}
34855
34856// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34857// together with other CMOV pseudo-opcodes into a single basic-block with
34858// conditional jump around it.
34859 static bool isCMOVPseudo(MachineInstr &MI) {
34860 switch (MI.getOpcode()) {
34861 case X86::CMOV_FR16:
34862 case X86::CMOV_FR16X:
34863 case X86::CMOV_FR32:
34864 case X86::CMOV_FR32X:
34865 case X86::CMOV_FR64:
34866 case X86::CMOV_FR64X:
34867 case X86::CMOV_GR8:
34868 case X86::CMOV_GR16:
34869 case X86::CMOV_GR32:
34870 case X86::CMOV_RFP32:
34871 case X86::CMOV_RFP64:
34872 case X86::CMOV_RFP80:
34873 case X86::CMOV_VR64:
34874 case X86::CMOV_VR128:
34875 case X86::CMOV_VR128X:
34876 case X86::CMOV_VR256:
34877 case X86::CMOV_VR256X:
34878 case X86::CMOV_VR512:
34879 case X86::CMOV_VK1:
34880 case X86::CMOV_VK2:
34881 case X86::CMOV_VK4:
34882 case X86::CMOV_VK8:
34883 case X86::CMOV_VK16:
34884 case X86::CMOV_VK32:
34885 case X86::CMOV_VK64:
34886 return true;
34887
34888 default:
34889 return false;
34890 }
34891}
34892
34893// Helper function, which inserts PHI functions into SinkMBB:
34894// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34895 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
34896 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for the
34897 // last PHI function inserted.
34898 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34899 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34900 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34901 MachineBasicBlock *SinkMBB) {
34902 MachineFunction *MF = TrueMBB->getParent();
34903 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34904 const MIMetadata MIMD(*MIItBegin);
34905
34906 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34907 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34908
34909 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34910
34911 // As we are creating the PHIs, we have to be careful if there is more than
34912 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34913 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34914 // That also means that PHI construction must work forward from earlier to
34915 // later, and that the code must maintain a mapping from earlier PHI's
34916 // destination registers, and the registers that went into the PHI.
34917 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34918 MachineInstrBuilder MIB;
34919
34920 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34921 Register DestReg = MIIt->getOperand(0).getReg();
34922 Register Op1Reg = MIIt->getOperand(1).getReg();
34923 Register Op2Reg = MIIt->getOperand(2).getReg();
34924
34925 // If this CMOV we are generating is the opposite condition from
34926 // the jump we generated, then we have to swap the operands for the
34927 // PHI that is going to be generated.
34928 if (MIIt->getOperand(3).getImm() == OppCC)
34929 std::swap(Op1Reg, Op2Reg);
34930
34931 if (RegRewriteTable.contains(Op1Reg))
34932 Op1Reg = RegRewriteTable[Op1Reg].first;
34933
34934 if (RegRewriteTable.contains(Op2Reg))
34935 Op2Reg = RegRewriteTable[Op2Reg].second;
34936
34937 MIB =
34938 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34939 .addReg(Op1Reg)
34940 .addMBB(FalseMBB)
34941 .addReg(Op2Reg)
34942 .addMBB(TrueMBB);
34943
34944 // Add this PHI to the rewrite table.
34945 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34946 }
34947
34948 return MIB;
34949}
34950
34951 // Lower cascaded selects in the form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34952 MachineBasicBlock *
34953 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34954 MachineInstr &SecondCascadedCMOV,
34955 MachineBasicBlock *ThisMBB) const {
34956 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34957 const MIMetadata MIMD(FirstCMOV);
34958
34959 // We lower cascaded CMOVs such as
34960 //
34961 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34962 //
34963 // to two successive branches.
34964 //
34965 // Without this, we would add a PHI between the two jumps, which ends up
34966 // creating a few copies all around. For instance, for
34967 //
34968 // (sitofp (zext (fcmp une)))
34969 //
34970 // we would generate:
34971 //
34972 // ucomiss %xmm1, %xmm0
34973 // movss <1.0f>, %xmm0
34974 // movaps %xmm0, %xmm1
34975 // jne .LBB5_2
34976 // xorps %xmm1, %xmm1
34977 // .LBB5_2:
34978 // jp .LBB5_4
34979 // movaps %xmm1, %xmm0
34980 // .LBB5_4:
34981 // retq
34982 //
34983 // because this custom-inserter would have generated:
34984 //
34985 // A
34986 // | \
34987 // | B
34988 // | /
34989 // C
34990 // | \
34991 // | D
34992 // | /
34993 // E
34994 //
34995 // A: X = ...; Y = ...
34996 // B: empty
34997 // C: Z = PHI [X, A], [Y, B]
34998 // D: empty
34999 // E: PHI [X, C], [Z, D]
35000 //
35001 // If we lower both CMOVs in a single step, we can instead generate:
35002 //
35003 // A
35004 // | \
35005 // | C
35006 // | /|
35007 // |/ |
35008 // | |
35009 // | D
35010 // | /
35011 // E
35012 //
35013 // A: X = ...; Y = ...
35014 // D: empty
35015 // E: PHI [X, A], [X, C], [Y, D]
35016 //
35017 // Which, in our sitofp/fcmp example, gives us something like:
35018 //
35019 // ucomiss %xmm1, %xmm0
35020 // movss <1.0f>, %xmm0
35021 // jne .LBB5_4
35022 // jp .LBB5_4
35023 // xorps %xmm0, %xmm0
35024 // .LBB5_4:
35025 // retq
35026 //
35027
35028 // We lower cascaded CMOV into two successive branches to the same block.
35029 // EFLAGS is used by both, so mark it as live in the second.
35030 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35031 MachineFunction *F = ThisMBB->getParent();
35032 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35033 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35034 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35035
35036 MachineFunction::iterator It = ++ThisMBB->getIterator();
35037 F->insert(It, FirstInsertedMBB);
35038 F->insert(It, SecondInsertedMBB);
35039 F->insert(It, SinkMBB);
35040
35041 // For a cascaded CMOV, we lower it to two successive branches to
35042 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35043 // the FirstInsertedMBB.
35044 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35045
35046 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35047 // live into the sink and copy blocks.
35048 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35049 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35050 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35051 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35052 SinkMBB->addLiveIn(X86::EFLAGS);
35053 }
35054
35055 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35056 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35057 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35058 ThisMBB->end());
35059 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35060
35061 // Fallthrough block for ThisMBB.
35062 ThisMBB->addSuccessor(FirstInsertedMBB);
35063 // The true block target of the first branch is always SinkMBB.
35064 ThisMBB->addSuccessor(SinkMBB);
35065 // Fallthrough block for FirstInsertedMBB.
35066 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35067 // The true block for the branch of FirstInsertedMBB.
35068 FirstInsertedMBB->addSuccessor(SinkMBB);
35069 // This is fallthrough.
35070 SecondInsertedMBB->addSuccessor(SinkMBB);
35071
35072 // Create the conditional branch instructions.
35073 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35074 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35075
35076 X86::CondCode SecondCC =
35077 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35078 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35079 .addMBB(SinkMBB)
35080 .addImm(SecondCC);
35081
35082 // SinkMBB:
35083 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35084 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35085 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35086 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35087 MachineInstrBuilder MIB =
35088 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35089 .addReg(Op1Reg)
35090 .addMBB(SecondInsertedMBB)
35091 .addReg(Op2Reg)
35092 .addMBB(ThisMBB);
35093
35094 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
35095 // (the True operand of the SELECT_CC/CMOV nodes).
35096 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35097
35098 // Now remove the CMOVs.
35099 FirstCMOV.eraseFromParent();
35100 SecondCascadedCMOV.eraseFromParent();
35101
35102 return SinkMBB;
35103}
35104
35105 MachineBasicBlock *
35106 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35107 MachineBasicBlock *ThisMBB) const {
35108 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35109 const MIMetadata MIMD(MI);
35110
35111 // To "insert" a SELECT_CC instruction, we actually have to insert the
35112 // diamond control-flow pattern. The incoming instruction knows the
35113 // destination vreg to set, the condition code register to branch on, the
35114 // true/false values to select between and a branch opcode to use.
35115
35116 // ThisMBB:
35117 // ...
35118 // TrueVal = ...
35119 // cmpTY ccX, r1, r2
35120 // bCC copy1MBB
35121 // fallthrough --> FalseMBB
35122
35123 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35124 // as described above, by inserting a BB, and then making a PHI at the join
35125 // point to select the true and false operands of the CMOV in the PHI.
35126 //
35127 // The code also handles two different cases of multiple CMOV opcodes
35128 // in a row.
35129 //
35130 // Case 1:
35131 // In this case, there are multiple CMOVs in a row, all of which are based on
35132 // the same condition setting (or the exact opposite condition setting).
35133 // In this case we can lower all the CMOVs using a single inserted BB, and
35134 // then make a number of PHIs at the join point to model the CMOVs. The only
35135 // trickiness here is that in a case like:
35136 //
35137 // t2 = CMOV cond1 t1, f1
35138 // t3 = CMOV cond1 t2, f2
35139 //
35140 // when rewriting this into PHIs, we have to perform some renaming on the
35141 // temps since you cannot have a PHI operand refer to a PHI result earlier
35142 // in the same block. The "simple" but wrong lowering would be:
35143 //
35144 // t2 = PHI t1(BB1), f1(BB2)
35145 // t3 = PHI t2(BB1), f2(BB2)
35146 //
35147 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35148 // renaming is to note that on the path through BB1, t2 is really just a
35149 // copy of t1, and do that renaming, properly generating:
35150 //
35151 // t2 = PHI t1(BB1), f1(BB2)
35152 // t3 = PHI t1(BB1), f2(BB2)
35153 //
35154 // Case 2:
35155 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35156 // function - EmitLoweredCascadedSelect.
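// For the common single-condition case, the blocks built below form a diamond
// with an empty false block, roughly:
//
//   ThisMBB:   JCC_1 SinkMBB, CC          ; conditional jump to the join block
//   FalseMBB:  (empty, falls through)
//   SinkMBB:   %Result = PHI [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]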
35157
35158 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
35159 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35160 MachineInstr *LastCMOV = &MI;
35161 MachineBasicBlock::iterator NextMIIt = next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
35162
35163 // Check for case 1, where there are multiple CMOVs with the same condition
35164 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
35165 // number of jumps the most.
35166
35167 if (isCMOVPseudo(MI)) {
35168 // See if we have a string of CMOVS with the same condition. Skip over
35169 // intervening debug insts.
35170 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35171 (NextMIIt->getOperand(3).getImm() == CC ||
35172 NextMIIt->getOperand(3).getImm() == OppCC)) {
35173 LastCMOV = &*NextMIIt;
35174 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35175 }
35176 }
35177
35178 // This checks for case 2, but only do this if we didn't already find
35179 // case 1, as indicated by LastCMOV == MI.
35180 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35181 NextMIIt->getOpcode() == MI.getOpcode() &&
35182 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35183 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35184 NextMIIt->getOperand(1).isKill()) {
35185 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
35186 }
35187
35188 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35189 MachineFunction *F = ThisMBB->getParent();
35190 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35191 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35192
35193 MachineFunction::iterator It = ++ThisMBB->getIterator();
35194 F->insert(It, FalseMBB);
35195 F->insert(It, SinkMBB);
35196
35197 // Set the call frame size on entry to the new basic blocks.
35198 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
35199 FalseMBB->setCallFrameSize(CallFrameSize);
35200 SinkMBB->setCallFrameSize(CallFrameSize);
35201
35202 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35203 // live into the sink and copy blocks.
35204 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35205 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35206 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
35207 FalseMBB->addLiveIn(X86::EFLAGS);
35208 SinkMBB->addLiveIn(X86::EFLAGS);
35209 }
35210
35211 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35212 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35213 MachineBasicBlock::iterator(LastCMOV));
35214 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35215 if (MI.isDebugInstr())
35216 SinkMBB->push_back(MI.removeFromParent());
35217
35218 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35219 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35220 std::next(MachineBasicBlock::iterator(LastCMOV)),
35221 ThisMBB->end());
35222 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35223
35224 // Fallthrough block for ThisMBB.
35225 ThisMBB->addSuccessor(FalseMBB);
35226 // The true block target of the first (or only) branch is always a SinkMBB.
35227 ThisMBB->addSuccessor(SinkMBB);
35228 // Fallthrough block for FalseMBB.
35229 FalseMBB->addSuccessor(SinkMBB);
35230
35231 // Create the conditional branch instruction.
35232 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35233
35234 // SinkMBB:
35235 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35236 // ...
35237 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35238 MachineBasicBlock::iterator MIItEnd =
35239 std::next(MachineBasicBlock::iterator(LastCMOV));
35240 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35241
35242 // Now remove the CMOV(s).
35243 ThisMBB->erase(MIItBegin, MIItEnd);
35244
35245 return SinkMBB;
35246}
35247
35248static unsigned getSUBriOpcode(bool IsLP64) {
35249 if (IsLP64)
35250 return X86::SUB64ri32;
35251 else
35252 return X86::SUB32ri;
35253}
35254
35255 MachineBasicBlock *
35256 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35257 MachineBasicBlock *MBB) const {
35258 MachineFunction *MF = MBB->getParent();
35259 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35260 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35261 const MIMetadata MIMD(MI);
35262 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35263
35264 const unsigned ProbeSize = getStackProbeSize(*MF);
35265
35267 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35268 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35269 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35270
35271 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35272 MF->insert(MBBIter, testMBB);
35273 MF->insert(MBBIter, blockMBB);
35274 MF->insert(MBBIter, tailMBB);
35275
35276 Register sizeVReg = MI.getOperand(1).getReg();
35277
35278 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35279
35280 Register TmpStackPtr = MRI.createVirtualRegister(
35281 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35282 Register FinalStackPtr = MRI.createVirtualRegister(
35283 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35284
35285 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35286 .addReg(physSPReg);
35287 {
35288 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35289 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35290 .addReg(TmpStackPtr)
35291 .addReg(sizeVReg);
35292 }
35293
35294 // test rsp size
35295
35296 BuildMI(testMBB, MIMD,
35297 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35298 .addReg(FinalStackPtr)
35299 .addReg(physSPReg);
35300
35301 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35302 .addMBB(tailMBB)
35303 .addImm(X86::COND_GE);
35304 testMBB->addSuccessor(blockMBB);
35305 testMBB->addSuccessor(tailMBB);
35306
35307 // Touch the block then extend it. This is the opposite of the static probe,
35308 // where we allocate then touch; doing it this way avoids having to probe the
35309 // tail of the static alloca. Possible scenarios are:
35310 //
35311 // + ---- <- ------------ <- ------------- <- ------------ +
35312 // | |
35313 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35314 // | |
35315 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35316 //
35317 // The property we want to enforce is to never have more than [page alloc] between two probes.
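// As a sketch, the probing loop emitted below behaves roughly like:
//
//   final = sp - size            ; computed above, before the loop
// test:
//   cmp final, sp
//   jge tail                     ; stop once sp has been lowered to final
// block:
//   xor [sp], 0                  ; touch the newly exposed page
//   sub sp, ProbeSize
//   jmp test
// tail:
//   dest = final                 ; COPY FinalStackPtr into the result vreg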
35318
35319 const unsigned XORMIOpc =
35320 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35321 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35322 .addImm(0);
35323
35324 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35325 physSPReg)
35326 .addReg(physSPReg)
35327 .addImm(ProbeSize);
35328
35329 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35330 blockMBB->addSuccessor(testMBB);
35331
35332 // Replace original instruction by the expected stack ptr
35333 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35334 MI.getOperand(0).getReg())
35335 .addReg(FinalStackPtr);
35336
35337 tailMBB->splice(tailMBB->end(), MBB,
35338 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35339 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35340 MBB->addSuccessor(testMBB);
35341
35342 // Delete the original pseudo instruction.
35343 MI.eraseFromParent();
35344
35345 // And we're done.
35346 return tailMBB;
35347}
35348
35349 MachineBasicBlock *
35350 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35351 MachineBasicBlock *BB) const {
35352 MachineFunction *MF = BB->getParent();
35353 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35354 const MIMetadata MIMD(MI);
35355 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35356
35357 assert(MF->shouldSplitStack());
35358
35359 const bool Is64Bit = Subtarget.is64Bit();
35360 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35361
35362 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35363 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
35364
35365 // BB:
35366 // ... [Till the alloca]
35367 // If stacklet is not large enough, jump to mallocMBB
35368 //
35369 // bumpMBB:
35370 // Allocate by subtracting from RSP
35371 // Jump to continueMBB
35372 //
35373 // mallocMBB:
35374 // Allocate by call to runtime
35375 //
35376 // continueMBB:
35377 // ...
35378 // [rest of original BB]
35379 //
35380
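// The stack limit of the current stacklet lives in thread-local storage and is
// addressed via the TlsReg/TlsOffset pair above (e.g. %fs:0x70 on LP64
// targets). The check emitted below is roughly:
//
//   newSP = sp - size
//   cmp [TlsReg:TlsOffset], newSP
//   jg mallocMBB                 ; stack limit above newSP: need a new stacklet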
35381 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35382 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35383 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35384
35385 MachineRegisterInfo &MRI = MF->getRegInfo();
35386 const TargetRegisterClass *AddrRegClass =
35387 getRegClassFor(getPointerTy(MF->getDataLayout()));
35388
35389 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35390 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35391 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35392 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35393 sizeVReg = MI.getOperand(1).getReg(),
35394 physSPReg =
35395 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35396
35397 MachineFunction::iterator MBBIter = ++BB->getIterator();
35398
35399 MF->insert(MBBIter, bumpMBB);
35400 MF->insert(MBBIter, mallocMBB);
35401 MF->insert(MBBIter, continueMBB);
35402
35403 continueMBB->splice(continueMBB->begin(), BB,
35404 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35405 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35406
35407 // Add code to the main basic block to check if the stack limit has been hit,
35408 // and if so, jump to mallocMBB otherwise to bumpMBB.
35409 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35410 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35411 .addReg(tmpSPVReg).addReg(sizeVReg);
35412 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35413 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35414 .addReg(SPLimitVReg);
35415 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35416
35417 // bumpMBB simply decreases the stack pointer, since we know the current
35418 // stacklet has enough space.
35419 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35420 .addReg(SPLimitVReg);
35421 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35422 .addReg(SPLimitVReg);
35423 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35424
35425 // Calls into a routine in libgcc to allocate more space from the heap.
35426 const uint32_t *RegMask =
35427 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35428 if (IsLP64) {
35429 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35430 .addReg(sizeVReg);
35431 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35432 .addExternalSymbol("__morestack_allocate_stack_space")
35433 .addRegMask(RegMask)
35434 .addReg(X86::RDI, RegState::Implicit)
35435 .addReg(X86::RAX, RegState::ImplicitDefine);
35436 } else if (Is64Bit) {
35437 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35438 .addReg(sizeVReg);
35439 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35440 .addExternalSymbol("__morestack_allocate_stack_space")
35441 .addRegMask(RegMask)
35442 .addReg(X86::EDI, RegState::Implicit)
35443 .addReg(X86::EAX, RegState::ImplicitDefine);
35444 } else {
35445 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35446 .addImm(12);
35447 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35448 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35449 .addExternalSymbol("__morestack_allocate_stack_space")
35450 .addRegMask(RegMask)
35451 .addReg(X86::EAX, RegState::ImplicitDefine);
35452 }
35453
35454 if (!Is64Bit)
35455 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35456 .addImm(16);
35457
35458 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35459 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35460 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35461
35462 // Set up the CFG correctly.
35463 BB->addSuccessor(bumpMBB);
35464 BB->addSuccessor(mallocMBB);
35465 mallocMBB->addSuccessor(continueMBB);
35466 bumpMBB->addSuccessor(continueMBB);
35467
35468 // Take care of the PHI nodes.
35469 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35470 MI.getOperand(0).getReg())
35471 .addReg(mallocPtrVReg)
35472 .addMBB(mallocMBB)
35473 .addReg(bumpSPPtrVReg)
35474 .addMBB(bumpMBB);
35475
35476 // Delete the original pseudo instruction.
35477 MI.eraseFromParent();
35478
35479 // And we're done.
35480 return continueMBB;
35481}
35482
35483 MachineBasicBlock *
35484 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35485 MachineBasicBlock *BB) const {
35486 MachineFunction *MF = BB->getParent();
35487 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35488 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35489 const MIMetadata MIMD(MI);
35490
35491 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
35492 MF->getFunction().getPersonalityFn())) &&
35493 "SEH does not use catchret!");
35494
35495 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35496 if (!Subtarget.is32Bit())
35497 return BB;
35498
35499 // C++ EH creates a new target block to hold the restore code, and wires up
35500 // the new block to the return destination with a normal JMP_4.
35501 MachineBasicBlock *RestoreMBB =
35502 MF->CreateMachineBasicBlock(BB->getBasicBlock());
35503 assert(BB->succ_size() == 1);
35504 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35505 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35506 BB->addSuccessor(RestoreMBB);
35507 MI.getOperand(0).setMBB(RestoreMBB);
35508
35509 // Marking this as an EH pad but not a funclet entry block causes PEI to
35510 // restore stack pointers in the block.
35511 RestoreMBB->setIsEHPad(true);
35512
35513 auto RestoreMBBI = RestoreMBB->begin();
35514 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35515 return BB;
35516}
35517
35518 MachineBasicBlock *
35519 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35520 MachineBasicBlock *BB) const {
35521 // So, here we replace TLSADDR with the sequence:
35522 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35523 // We need this because TLSADDR is lowered into a call
35524 // inside MC; without the two markers, shrink-wrapping
35525 // may push the prologue/epilogue past them.
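// Roughly, the resulting stream around the pseudo looks like:
//
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLSADDR pseudo (left in place, expanded to the real call during MC lowering)
//   ADJCALLSTACKUP 0, 0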
35526 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35527 const MIMetadata MIMD(MI);
35528 MachineFunction &MF = *BB->getParent();
35529
35530 // Emit CALLSEQ_START right before the instruction.
35531 MF.getFrameInfo().setAdjustsStack(true);
35532 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35533 MachineInstrBuilder CallseqStart =
35534 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35535 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35536
35537 // Emit CALLSEQ_END right after the instruction.
35538 // We don't call erase from parent because we want to keep the
35539 // original instruction around.
35540 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35541 MachineInstrBuilder CallseqEnd =
35542 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35543 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35544
35545 return BB;
35546}
35547
35548 MachineBasicBlock *
35549 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35550 MachineBasicBlock *BB) const {
35551 // This is pretty easy. We're taking the value that we received from
35552 // our load from the relocation, sticking it in either RDI (x86-64)
35553 // or EAX and doing an indirect call. The return value will then
35554 // be in the normal return register.
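// For the 64-bit (Darwin) case the emitted sequence is roughly:
//
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)                ; result comes back in %rax
//
// The 32-bit cases do the same through %eax, using either an absolute or a
// PIC-base-relative load of the TLV descriptor.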
35555 MachineFunction *F = BB->getParent();
35556 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35557 const MIMetadata MIMD(MI);
35558
35559 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35560 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35561
35562 // Get a register mask for the lowered call.
35563 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35564 // proper register mask.
35565 const uint32_t *RegMask =
35566 Subtarget.is64Bit() ?
35567 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35568 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35569 if (Subtarget.is64Bit()) {
35570 MachineInstrBuilder MIB =
35571 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35572 .addReg(X86::RIP)
35573 .addImm(0)
35574 .addReg(0)
35575 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35576 MI.getOperand(3).getTargetFlags())
35577 .addReg(0);
35578 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35579 addDirectMem(MIB, X86::RDI);
35580 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35581 } else if (!isPositionIndependent()) {
35582 MachineInstrBuilder MIB =
35583 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35584 .addReg(0)
35585 .addImm(0)
35586 .addReg(0)
35587 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35588 MI.getOperand(3).getTargetFlags())
35589 .addReg(0);
35590 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35591 addDirectMem(MIB, X86::EAX);
35592 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35593 } else {
35594 MachineInstrBuilder MIB =
35595 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35596 .addReg(TII->getGlobalBaseReg(F))
35597 .addImm(0)
35598 .addReg(0)
35599 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35600 MI.getOperand(3).getTargetFlags())
35601 .addReg(0);
35602 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35603 addDirectMem(MIB, X86::EAX);
35604 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35605 }
35606
35607 MI.eraseFromParent(); // The pseudo instruction is gone now.
35608 return BB;
35609}
35610
35611static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35612 switch (RPOpc) {
35613 case X86::INDIRECT_THUNK_CALL32:
35614 return X86::CALLpcrel32;
35615 case X86::INDIRECT_THUNK_CALL64:
35616 return X86::CALL64pcrel32;
35617 case X86::INDIRECT_THUNK_TCRETURN32:
35618 return X86::TCRETURNdi;
35619 case X86::INDIRECT_THUNK_TCRETURN64:
35620 return X86::TCRETURNdi64;
35621 }
35622 llvm_unreachable("not indirect thunk opcode");
35623}
35624
35625static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35626 unsigned Reg) {
35627 if (Subtarget.useRetpolineExternalThunk()) {
35628 // When using an external thunk for retpolines, we pick names that match the
35629 // names GCC happens to use as well. This helps simplify the implementation
35630 // of the thunks for kernels where they have no easy ability to create
35631 // aliases and are doing non-trivial configuration of the thunk's body. For
35632 // example, the Linux kernel will do boot-time hot patching of the thunk
35633 // bodies and cannot easily export aliases of these to loaded modules.
35634 //
35635 // Note that at any point in the future, we may need to change the semantics
35636 // of how we implement retpolines and at that time will likely change the
35637 // name of the called thunk. Essentially, there is no hard guarantee that
35638 // LLVM will generate calls to specific thunks, we merely make a best-effort
35639 // attempt to help out kernels and other systems where duplicating the
35640 // thunks is costly.
35641 switch (Reg) {
35642 case X86::EAX:
35643 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35644 return "__x86_indirect_thunk_eax";
35645 case X86::ECX:
35646 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35647 return "__x86_indirect_thunk_ecx";
35648 case X86::EDX:
35649 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35650 return "__x86_indirect_thunk_edx";
35651 case X86::EDI:
35652 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35653 return "__x86_indirect_thunk_edi";
35654 case X86::R11:
35655 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35656 return "__x86_indirect_thunk_r11";
35657 }
35658 llvm_unreachable("unexpected reg for external indirect thunk");
35659 }
35660
35661 if (Subtarget.useRetpolineIndirectCalls() ||
35662 Subtarget.useRetpolineIndirectBranches()) {
35663 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35664 switch (Reg) {
35665 case X86::EAX:
35666 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35667 return "__llvm_retpoline_eax";
35668 case X86::ECX:
35669 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35670 return "__llvm_retpoline_ecx";
35671 case X86::EDX:
35672 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35673 return "__llvm_retpoline_edx";
35674 case X86::EDI:
35675 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35676 return "__llvm_retpoline_edi";
35677 case X86::R11:
35678 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35679 return "__llvm_retpoline_r11";
35680 }
35681 llvm_unreachable("unexpected reg for retpoline");
35682 }
35683
35684 if (Subtarget.useLVIControlFlowIntegrity()) {
35685 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35686 return "__llvm_lvi_thunk_r11";
35687 }
35688 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35689}
35690
35691 MachineBasicBlock *
35692 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35693 MachineBasicBlock *BB) const {
35694 // Copy the virtual register into the R11 physical register and
35695 // call the retpoline thunk.
35696 const MIMetadata MIMD(MI);
35697 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35698 Register CalleeVReg = MI.getOperand(0).getReg();
35699 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35700
35701 // Find an available scratch register to hold the callee. On 64-bit, we can
35702 // just use R11, but we scan for uses anyway to ensure we don't generate
35703 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35704 // already a register use operand to the call to hold the callee. If none
35705 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35706 // register and ESI is the base pointer to realigned stack frames with VLAs.
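// The end result is a register copy followed by a direct call to the chosen
// thunk, e.g. roughly:
//
//   movq %callee_vreg, %r11
//   callq __llvm_retpoline_r11   ; or __x86_indirect_thunk_r11, etc.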
35707 SmallVector<unsigned, 3> AvailableRegs;
35708 if (Subtarget.is64Bit())
35709 AvailableRegs.push_back(X86::R11);
35710 else
35711 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35712
35713 // Zero out any registers that are already used.
35714 for (const auto &MO : MI.operands()) {
35715 if (MO.isReg() && MO.isUse())
35716 for (unsigned &Reg : AvailableRegs)
35717 if (Reg == MO.getReg())
35718 Reg = 0;
35719 }
35720
35721 // Choose the first remaining non-zero available register.
35722 unsigned AvailableReg = 0;
35723 for (unsigned MaybeReg : AvailableRegs) {
35724 if (MaybeReg) {
35725 AvailableReg = MaybeReg;
35726 break;
35727 }
35728 }
35729 if (!AvailableReg)
35730 report_fatal_error("calling convention incompatible with retpoline, no "
35731 "available registers");
35732
35733 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35734
35735 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35736 .addReg(CalleeVReg);
35737 MI.getOperand(0).ChangeToES(Symbol);
35738 MI.setDesc(TII->get(Opc));
35739 MachineInstrBuilder(*BB->getParent(), &MI)
35740 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35741 return BB;
35742}
35743
35744/// SetJmp implies future control flow change upon calling the corresponding
35745/// LongJmp.
35746/// Instead of using the 'return' instruction, the long jump fixes the stack and
35747/// performs an indirect branch. To do so it uses the registers that were stored
35748/// in the jump buffer (when calling SetJmp).
35749/// In case the shadow stack is enabled we need to fix it as well, because some
35750/// return addresses will be skipped.
35751/// The function will save the SSP for future fixing in the function
35752/// emitLongJmpShadowStackFix.
35753/// \sa emitLongJmpShadowStackFix
35754/// \param [in] MI The temporary Machine Instruction for the builtin.
35755/// \param [in] MBB The Machine Basic Block that will be modified.
35756void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35757 MachineBasicBlock *MBB) const {
35758 const MIMetadata MIMD(MI);
35759 MachineFunction *MF = MBB->getParent();
35760 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35761 MachineRegisterInfo &MRI = MF->getRegInfo();
35762 MachineInstrBuilder MIB;
35763
35764 // Memory Reference.
35765 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35766 MI.memoperands_end());
35767
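// The jump buffer written here (and read back by the longjmp lowering below)
// is treated as an array of pointer-sized slots, roughly:
//
//   slot 0: frame pointer
//   slot 1: resume address   (LabelOffset = 1 * sizeof(pointer))
//   slot 2: stack pointer    (SPOffset    = 2 * sizeof(pointer))
//   slot 3: shadow stack ptr (SSPOffset   = 3 * sizeof(pointer), stored below)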
35768 // Initialize a register with zero.
35769 MVT PVT = getPointerTy(MF->getDataLayout());
35770 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35771 Register ZReg = MRI.createVirtualRegister(PtrRC);
35772 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35773 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35774 .addDef(ZReg)
35775 .addReg(ZReg, RegState::Undef)
35776 .addReg(ZReg, RegState::Undef);
35777
35778 // Read the current SSP Register value to the zeroed register.
35779 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35780 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35781 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35782
35783 // Write the SSP register value to offset 3 in input memory buffer.
35784 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35785 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35786 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35787 const unsigned MemOpndSlot = 1;
35788 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35789 if (i == X86::AddrDisp)
35790 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35791 else
35792 MIB.add(MI.getOperand(MemOpndSlot + i));
35793 }
35794 MIB.addReg(SSPCopyReg);
35795 MIB.setMemRefs(MMOs);
35796}
35797
35798 MachineBasicBlock *
35799 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35800 MachineBasicBlock *MBB) const {
35801 const MIMetadata MIMD(MI);
35802 MachineFunction *MF = MBB->getParent();
35803 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35804 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35805 MachineRegisterInfo &MRI = MF->getRegInfo();
35806
35807 const BasicBlock *BB = MBB->getBasicBlock();
35808 MachineFunction::iterator I = ++MBB->getIterator();
35809
35810 // Memory Reference
35811 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35812 MI.memoperands_end());
35813
35814 unsigned DstReg;
35815 unsigned MemOpndSlot = 0;
35816
35817 unsigned CurOp = 0;
35818
35819 DstReg = MI.getOperand(CurOp++).getReg();
35820 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35821 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35822 (void)TRI;
35823 Register mainDstReg = MRI.createVirtualRegister(RC);
35824 Register restoreDstReg = MRI.createVirtualRegister(RC);
35825
35826 MemOpndSlot = CurOp;
35827
35828 MVT PVT = getPointerTy(MF->getDataLayout());
35829 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35830 "Invalid Pointer Size!");
35831
35832 // For v = setjmp(buf), we generate
35833 //
35834 // thisMBB:
35835 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35836 // SjLjSetup restoreMBB
35837 //
35838 // mainMBB:
35839 // v_main = 0
35840 //
35841 // sinkMBB:
35842 // v = phi(main, restore)
35843 //
35844 // restoreMBB:
35845 // if base pointer being used, load it from frame
35846 // v_restore = 1
35847
35848 MachineBasicBlock *thisMBB = MBB;
35849 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35850 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35851 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35852 MF->insert(I, mainMBB);
35853 MF->insert(I, sinkMBB);
35854 MF->push_back(restoreMBB);
35855 restoreMBB->setMachineBlockAddressTaken();
35856
35858
35859 // Transfer the remainder of BB and its successor edges to sinkMBB.
35860 sinkMBB->splice(sinkMBB->begin(), MBB,
35861 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35862 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35863
35864 // thisMBB:
35865 unsigned PtrStoreOpc = 0;
35866 unsigned LabelReg = 0;
35867 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35868 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35869 !isPositionIndependent();
35870
35871 // Prepare IP either in reg or imm.
35872 if (!UseImmLabel) {
35873 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35874 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35875 LabelReg = MRI.createVirtualRegister(PtrRC);
35876 if (Subtarget.is64Bit()) {
35877 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35878 .addReg(X86::RIP)
35879 .addImm(0)
35880 .addReg(0)
35881 .addMBB(restoreMBB)
35882 .addReg(0);
35883 } else {
35884 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35885 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35886 .addReg(XII->getGlobalBaseReg(MF))
35887 .addImm(0)
35888 .addReg(0)
35889 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35890 .addReg(0);
35891 }
35892 } else
35893 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35894 // Store IP
35895 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35896 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35897 if (i == X86::AddrDisp)
35898 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35899 else
35900 MIB.add(MI.getOperand(MemOpndSlot + i));
35901 }
35902 if (!UseImmLabel)
35903 MIB.addReg(LabelReg);
35904 else
35905 MIB.addMBB(restoreMBB);
35906 MIB.setMemRefs(MMOs);
35907
35908 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35909 emitSetJmpShadowStackFix(MI, thisMBB);
35910 }
35911
35912 // Setup
35913 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35914 .addMBB(restoreMBB);
35915
35916 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35917 MIB.addRegMask(RegInfo->getNoPreservedMask());
35918 thisMBB->addSuccessor(mainMBB);
35919 thisMBB->addSuccessor(restoreMBB);
35920
35921 // mainMBB:
35922 // EAX = 0
35923 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35924 mainMBB->addSuccessor(sinkMBB);
35925
35926 // sinkMBB:
35927 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35928 .addReg(mainDstReg)
35929 .addMBB(mainMBB)
35930 .addReg(restoreDstReg)
35931 .addMBB(restoreMBB);
35932
35933 // restoreMBB:
35934 if (RegInfo->hasBasePointer(*MF)) {
35935 const bool Uses64BitFramePtr =
35936 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35937 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35938 X86FI->setRestoreBasePointer(MF);
35939 Register FramePtr = RegInfo->getFrameRegister(*MF);
35940 Register BasePtr = RegInfo->getBaseRegister();
35941 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35942 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35943 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35944 .setMIFlag(MachineInstr::FrameSetup);
35945 }
35946 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35947 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35948 restoreMBB->addSuccessor(sinkMBB);
35949
35950 MI.eraseFromParent();
35951 return sinkMBB;
35952}
35953
35954/// Fix the shadow stack using the previously saved SSP pointer.
35955/// \sa emitSetJmpShadowStackFix
35956/// \param [in] MI The temporary Machine Instruction for the builtin.
35957/// \param [in] MBB The Machine Basic Block that will be modified.
35958/// \return The sink MBB that will perform the future indirect branch.
35959 MachineBasicBlock *
35960 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35961 MachineBasicBlock *MBB) const {
35962 const MIMetadata MIMD(MI);
35963 MachineFunction *MF = MBB->getParent();
35964 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35965 MachineRegisterInfo &MRI = MF->getRegInfo();
35966
35967 // Memory Reference
35968 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35969 MI.memoperands_end());
35970
35971 MVT PVT = getPointerTy(MF->getDataLayout());
35972 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35973
35974 // checkSspMBB:
35975 // xor vreg1, vreg1
35976 // rdssp vreg1
35977 // test vreg1, vreg1
35978 // je sinkMBB # Jump if Shadow Stack is not supported
35979 // fallMBB:
35980 // mov buf+24/12(%rip), vreg2
35981 // sub vreg1, vreg2
35982 // jbe sinkMBB # No need to fix the Shadow Stack
35983 // fixShadowMBB:
35984 // shr 3/2, vreg2
35985 // incssp vreg2 # fix the SSP according to the lower 8 bits
35986 // shr 8, vreg2
35987 // je sinkMBB
35988 // fixShadowLoopPrepareMBB:
35989 // shl vreg2
35990 // mov 128, vreg3
35991 // fixShadowLoopMBB:
35992 // incssp vreg3
35993 // dec vreg2
35994 // jne fixShadowLoopMBB # Iterate until you finish fixing
35995 // # the Shadow Stack
35996 // sinkMBB:
35997
35998 MachineFunction::iterator I = ++MBB->getIterator();
35999 const BasicBlock *BB = MBB->getBasicBlock();
36000
36001 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36002 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36003 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36004 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36005 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36006 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36007 MF->insert(I, checkSspMBB);
36008 MF->insert(I, fallMBB);
36009 MF->insert(I, fixShadowMBB);
36010 MF->insert(I, fixShadowLoopPrepareMBB);
36011 MF->insert(I, fixShadowLoopMBB);
36012 MF->insert(I, sinkMBB);
36013
36014 // Transfer the remainder of BB and its successor edges to sinkMBB.
36015 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36016 MBB->end());
36017 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36018
36019 MBB->addSuccessor(checkSspMBB);
36020
36021 // Initialize a register with zero.
36022 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36023 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36024
36025 if (PVT == MVT::i64) {
36026 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36027 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36028 .addImm(0)
36029 .addReg(ZReg)
36030 .addImm(X86::sub_32bit);
36031 ZReg = TmpZReg;
36032 }
36033
36034 // Read the current SSP Register value to the zeroed register.
36035 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36036 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36037 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36038
36039 // Check whether the result of the SSP register is zero and jump directly
36040 // to the sink.
36041 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36042 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36043 .addReg(SSPCopyReg)
36044 .addReg(SSPCopyReg);
36045 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36046 .addMBB(sinkMBB)
36047 .addImm(X86::COND_E);
36048 checkSspMBB->addSuccessor(sinkMBB);
36049 checkSspMBB->addSuccessor(fallMBB);
36050
36051 // Reload the previously saved SSP register value.
36052 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36053 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36054 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36055 MachineInstrBuilder MIB =
36056 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36057 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36058 const MachineOperand &MO = MI.getOperand(i);
36059 if (i == X86::AddrDisp)
36060 MIB.addDisp(MO, SPPOffset);
36061 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36062 // preserve kill flags.
36063 MIB.addReg(MO.getReg());
36064 else
36065 MIB.add(MO);
36066 }
36067 MIB.setMemRefs(MMOs);
36068
36069 // Subtract the current SSP from the previous SSP.
36070 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36071 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36072 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36073 .addReg(PrevSSPReg)
36074 .addReg(SSPCopyReg);
36075
36076 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36077 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36078 .addMBB(sinkMBB)
36079 .addImm(X86::COND_BE);
36080 fallMBB->addSuccessor(sinkMBB);
36081 fallMBB->addSuccessor(fixShadowMBB);
36082
36083 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36084 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36085 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36086 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36087 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36088 .addReg(SspSubReg)
36089 .addImm(Offset);
36090
36091 // Increase the SSP, looking only at the lower 8 bits of the delta.
36092 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36093 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36094
36095 // Reset the lower 8 bits.
36096 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36097 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36098 .addReg(SspFirstShrReg)
36099 .addImm(8);
36100
36101 // Jump if the result of the shift is zero.
36102 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36103 .addMBB(sinkMBB)
36104 .addImm(X86::COND_E);
36105 fixShadowMBB->addSuccessor(sinkMBB);
36106 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36107
36108 // Do a single shift left.
36109 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
36110 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36111 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36112 .addReg(SspSecondShrReg)
36113 .addImm(1);
36114
36115 // Save the value 128 to a register (will be used next with incssp).
36116 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36117 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36118 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36119 .addImm(128);
36120 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36121
36122 // Since incssp only looks at the lower 8 bits, we might need to do several
36123 // iterations of incssp until we finish fixing the shadow stack.
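// For example, with a delta of 0x345 slots the incssp above consumes the low
// byte (0x45 slots); the remaining 0x300 slots are handled here as
// (0x3 << 1) = 6 iterations of incssp(128), i.e. 6 * 128 = 0x300 slots.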
36124 Register DecReg = MRI.createVirtualRegister(PtrRC);
36125 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36126 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36127 .addReg(SspAfterShlReg)
36128 .addMBB(fixShadowLoopPrepareMBB)
36129 .addReg(DecReg)
36130 .addMBB(fixShadowLoopMBB);
36131
36132 // Every iteration we increase the SSP by 128.
36133 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36134
36135 // Every iteration we decrement the counter by 1.
36136 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36137 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36138
36139 // Jump if the counter is not zero yet.
36140 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36141 .addMBB(fixShadowLoopMBB)
36142 .addImm(X86::COND_NE);
36143 fixShadowLoopMBB->addSuccessor(sinkMBB);
36144 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36145
36146 return sinkMBB;
36147}
36148
36149 MachineBasicBlock *
36150 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36151 MachineBasicBlock *MBB) const {
36152 const MIMetadata MIMD(MI);
36153 MachineFunction *MF = MBB->getParent();
36154 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36155 MachineRegisterInfo &MRI = MF->getRegInfo();
36156
36157 // Memory Reference
36158 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36159 MI.memoperands_end());
36160
36161 MVT PVT = getPointerTy(MF->getDataLayout());
36162 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36163 "Invalid Pointer Size!");
36164
36165 const TargetRegisterClass *RC =
36166 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36167 Register Tmp = MRI.createVirtualRegister(RC);
36168 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36169 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36170 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36171 Register SP = RegInfo->getStackRegister();
36172
36174
36175 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36176 const int64_t SPOffset = 2 * PVT.getStoreSize();
36177
36178 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36179 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36180
36181 MachineBasicBlock *thisMBB = MBB;
36182
36183 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
36184 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36185 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36186 }
36187
36188 // Reload FP
36189 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36190 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36191 const MachineOperand &MO = MI.getOperand(i);
36192 if (MO.isReg()) // Don't add the whole operand, we don't want to
36193 // preserve kill flags.
36194 MIB.addReg(MO.getReg());
36195 else
36196 MIB.add(MO);
36197 }
36198 MIB.setMemRefs(MMOs);
36199
36200 // Reload IP
36201 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
36202 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36203 const MachineOperand &MO = MI.getOperand(i);
36204 if (i == X86::AddrDisp)
36205 MIB.addDisp(MO, LabelOffset);
36206 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36207 // preserve kill flags.
36208 MIB.addReg(MO.getReg());
36209 else
36210 MIB.add(MO);
36211 }
36212 MIB.setMemRefs(MMOs);
36213
36214 // Reload SP
36215 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36216 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36217 if (i == X86::AddrDisp)
36218 MIB.addDisp(MI.getOperand(i), SPOffset);
36219 else
36220 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36221 // the last instruction of the expansion.
36222 }
36223 MIB.setMemRefs(MMOs);
36224
36225 // Jump
36226 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36227
36228 MI.eraseFromParent();
36229 return thisMBB;
36230}
36231
36232void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36233 MachineBasicBlock *MBB,
36234 MachineBasicBlock *DispatchBB,
36235 int FI) const {
36236 const MIMetadata MIMD(MI);
36237 MachineFunction *MF = MBB->getParent();
36238 MachineRegisterInfo *MRI = &MF->getRegInfo();
36239 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36240
36241 MVT PVT = getPointerTy(MF->getDataLayout());
36242 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36243
36244 unsigned Op = 0;
36245 unsigned VR = 0;
36246
36247 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36248 !isPositionIndependent();
36249
36250 if (UseImmLabel) {
36251 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36252 } else {
36253 const TargetRegisterClass *TRC =
36254 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36255 VR = MRI->createVirtualRegister(TRC);
36256 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36257
36258 if (Subtarget.is64Bit())
36259 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36260 .addReg(X86::RIP)
36261 .addImm(1)
36262 .addReg(0)
36263 .addMBB(DispatchBB)
36264 .addReg(0);
36265 else
36266 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36267 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36268 .addImm(1)
36269 .addReg(0)
36270 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36271 .addReg(0);
36272 }
36273
36274 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36275 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36276 if (UseImmLabel)
36277 MIB.addMBB(DispatchBB);
36278 else
36279 MIB.addReg(VR);
36280}
36281
36282 MachineBasicBlock *
36283 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36284 MachineBasicBlock *BB) const {
36285 const MIMetadata MIMD(MI);
36286 MachineFunction *MF = BB->getParent();
36287 MachineRegisterInfo *MRI = &MF->getRegInfo();
36288 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36289 int FI = MF->getFrameInfo().getFunctionContextIndex();
36290
36291 // Get a mapping of the call site numbers to all of the landing pads they're
36292 // associated with.
36293 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36294 unsigned MaxCSNum = 0;
36295 for (auto &MBB : *MF) {
36296 if (!MBB.isEHPad())
36297 continue;
36298
36299 MCSymbol *Sym = nullptr;
36300 for (const auto &MI : MBB) {
36301 if (MI.isDebugInstr())
36302 continue;
36303
36304 assert(MI.isEHLabel() && "expected EH_LABEL");
36305 Sym = MI.getOperand(0).getMCSymbol();
36306 break;
36307 }
36308
36309 if (!MF->hasCallSiteLandingPad(Sym))
36310 continue;
36311
36312 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36313 CallSiteNumToLPad[CSI].push_back(&MBB);
36314 MaxCSNum = std::max(MaxCSNum, CSI);
36315 }
36316 }
36317
36318 // Get an ordered list of the machine basic blocks for the jump table.
36319 std::vector<MachineBasicBlock *> LPadList;
36320 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
36321 LPadList.reserve(CallSiteNumToLPad.size());
36322
36323 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36324 for (auto &LP : CallSiteNumToLPad[CSI]) {
36325 LPadList.push_back(LP);
36326 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36327 }
36328 }
36329
36330 assert(!LPadList.empty() &&
36331 "No landing pad destinations for the dispatch jump table!");
36332
36333 // Create the MBBs for the dispatch code.
36334
36335 // Shove the dispatch's address into the return slot in the function context.
36336 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36337 DispatchBB->setIsEHPad(true);
36338
36339 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36340 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36341 DispatchBB->addSuccessor(TrapBB);
36342
36343 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36344 DispatchBB->addSuccessor(DispContBB);
36345
36346 // Insert MBBs.
36347 MF->push_back(DispatchBB);
36348 MF->push_back(DispContBB);
36349 MF->push_back(TrapBB);
36350
36351 // Insert code into the entry block that creates and registers the function
36352 // context.
36353 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36354
36355 // Create the jump table and associated information
36356 unsigned JTE = getJumpTableEncoding();
36357 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36358 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36359
36360 const X86RegisterInfo &RI = TII->getRegisterInfo();
36361 // Add a register mask with no preserved registers. This results in all
36362 // registers being marked as clobbered.
36363 if (RI.hasBasePointer(*MF)) {
36364 const bool FPIs64Bit =
36365 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36366 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36367 MFI->setRestoreBasePointer(MF);
36368
36369 Register FP = RI.getFrameRegister(*MF);
36370 Register BP = RI.getBaseRegister();
36371 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36372 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36373 MFI->getRestoreBasePointerOffset())
36374 .addRegMask(RI.getNoPreservedMask());
36375 } else {
36376 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36377 .addRegMask(RI.getNoPreservedMask());
36378 }
36379
36380 // IReg is used as an index in a memory operand and therefore can't be SP
36381 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36382 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36383 Subtarget.is64Bit() ? 8 : 4);
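// Bounds-check the call-site index loaded from the function context; any
// index at or beyond the landing-pad count branches to the trap block.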
36384 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36385 .addReg(IReg)
36386 .addImm(LPadList.size());
36387 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36388 .addMBB(TrapBB)
36389 .addImm(X86::COND_AE);
36390
36391 if (Subtarget.is64Bit()) {
36392 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36393 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36394
36395 // leaq .LJTI0_0(%rip), BReg
36396 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36397 .addReg(X86::RIP)
36398 .addImm(1)
36399 .addReg(0)
36400 .addJumpTableIndex(MJTI)
36401 .addReg(0);
36402 // movzx IReg64, IReg
36403 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36404 .addImm(0)
36405 .addReg(IReg)
36406 .addImm(X86::sub_32bit);
36407
36408 switch (JTE) {
36409 case MachineJumpTableInfo::EK_BlockAddress:
36410 // jmpq *(BReg,IReg64,8)
36411 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36412 .addReg(BReg)
36413 .addImm(8)
36414 .addReg(IReg64)
36415 .addImm(0)
36416 .addReg(0);
36417 break;
36418 case MachineJumpTableInfo::EK_LabelDifference32: {
36419 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36420 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36421 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36422
36423 // movl (BReg,IReg64,4), OReg
36424 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36425 .addReg(BReg)
36426 .addImm(4)
36427 .addReg(IReg64)
36428 .addImm(0)
36429 .addReg(0);
36430 // movsx OReg64, OReg
36431 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36432 .addReg(OReg);
36433 // addq BReg, OReg64, TReg
36434 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36435 .addReg(OReg64)
36436 .addReg(BReg);
36437 // jmpq *TReg
36438 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36439 break;
36440 }
36441 default:
36442 llvm_unreachable("Unexpected jump table encoding");
36443 }
36444 } else {
36445 // jmpl *.LJTI0_0(,IReg,4)
36446 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36447 .addReg(0)
36448 .addImm(4)
36449 .addReg(IReg)
36450 .addJumpTableIndex(MJTI)
36451 .addReg(0);
36452 }
36453
36454 // Add the jump table entries as successors to the MBB.
36455 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36456 for (auto &LP : LPadList)
36457 if (SeenMBBs.insert(LP).second)
36458 DispContBB->addSuccessor(LP);
36459
36460 // N.B. the order the invoke BBs are processed in doesn't matter here.
36461 SmallVector<MachineBasicBlock *, 64> MBBLPads;
36462 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36463 for (MachineBasicBlock *MBB : InvokeBBs) {
36464 // Remove the landing pad successor from the invoke block and replace it
36465 // with the new dispatch block.
36466 // Keep a copy of Successors since it's modified inside the loop.
36467 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
36468 MBB->succ_rend());
36469 // FIXME: Avoid quadratic complexity.
36470 for (auto *MBBS : Successors) {
36471 if (MBBS->isEHPad()) {
36472 MBB->removeSuccessor(MBBS);
36473 MBBLPads.push_back(MBBS);
36474 }
36475 }
36476
36477 MBB->addSuccessor(DispatchBB);
36478
36479 // Find the invoke call and mark all of the callee-saved registers as
36480 // 'implicit defined' so that they're spilled. This prevents code from
36481 // moving instructions to before the EH block, where they will never be
36482 // executed.
36483 for (auto &II : reverse(*MBB)) {
36484 if (!II.isCall())
36485 continue;
36486
36487 DenseMap<unsigned, bool> DefRegs;
36488 for (auto &MOp : II.operands())
36489 if (MOp.isReg())
36490 DefRegs[MOp.getReg()] = true;
36491
36492 MachineInstrBuilder MIB(*MF, &II);
36493 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36494 unsigned Reg = SavedRegs[RegIdx];
36495 if (!DefRegs[Reg])
36496 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36497 }
36498
36499 break;
36500 }
36501 }
36502
36503 // Mark all former landing pads as non-landing pads. The dispatch is the only
36504 // landing pad now.
36505 for (auto &LP : MBBLPads)
36506 LP->setIsEHPad(false);
36507
36508 // The instruction is gone now.
36509 MI.eraseFromParent();
36510 return BB;
36511}
36512
36513MachineBasicBlock *
36514X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
36515 MachineBasicBlock *BB) const {
36516 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
36517 // calls may require proper stack alignment.
36518 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36519 const MIMetadata MIMD(MI);
36520 MachineFunction &MF = *BB->getParent();
36521
36522 // Emit CALLSEQ_START right before the instruction.
36523 MF.getFrameInfo().setAdjustsStack(true);
36524 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36525 MachineInstrBuilder CallseqStart =
36526 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36527 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36528
36529 // Emit CALLSEQ_END right after the instruction.
36530 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36531 MachineInstrBuilder CallseqEnd =
36532 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
36533 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36534
36535 return BB;
36536}
36537
36538MachineBasicBlock *
36539X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36540 MachineBasicBlock *BB) const {
36541 MachineFunction *MF = BB->getParent();
36542 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36543 const MIMetadata MIMD(MI);
36544
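// AMX pseudo instructions carry tile registers as immediate indices; the
// lambda below maps an index in the range 0-7 onto the physical TMM0-TMM7
// register numbers.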
36545 auto TMMImmToTMMReg = [](unsigned Imm) {
36546 assert (Imm < 8 && "Illegal tmm index");
36547 return X86::TMM0 + Imm;
36548 };
36549 switch (MI.getOpcode()) {
36550 default: llvm_unreachable("Unexpected instr type to insert");
36551 case X86::TLS_addr32:
36552 case X86::TLS_addr64:
36553 case X86::TLS_addrX32:
36554 case X86::TLS_base_addr32:
36555 case X86::TLS_base_addr64:
36556 case X86::TLS_base_addrX32:
36557 case X86::TLS_desc32:
36558 case X86::TLS_desc64:
36559 return EmitLoweredTLSAddr(MI, BB);
36560 case X86::INDIRECT_THUNK_CALL32:
36561 case X86::INDIRECT_THUNK_CALL64:
36562 case X86::INDIRECT_THUNK_TCRETURN32:
36563 case X86::INDIRECT_THUNK_TCRETURN64:
36564 return EmitLoweredIndirectThunk(MI, BB);
36565 case X86::CATCHRET:
36566 return EmitLoweredCatchRet(MI, BB);
36567 case X86::SEG_ALLOCA_32:
36568 case X86::SEG_ALLOCA_64:
36569 return EmitLoweredSegAlloca(MI, BB);
36570 case X86::PROBED_ALLOCA_32:
36571 case X86::PROBED_ALLOCA_64:
36572 return EmitLoweredProbedAlloca(MI, BB);
36573 case X86::TLSCall_32:
36574 case X86::TLSCall_64:
36575 return EmitLoweredTLSCall(MI, BB);
36576 case X86::CMOV_FR16:
36577 case X86::CMOV_FR16X:
36578 case X86::CMOV_FR32:
36579 case X86::CMOV_FR32X:
36580 case X86::CMOV_FR64:
36581 case X86::CMOV_FR64X:
36582 case X86::CMOV_GR8:
36583 case X86::CMOV_GR16:
36584 case X86::CMOV_GR32:
36585 case X86::CMOV_RFP32:
36586 case X86::CMOV_RFP64:
36587 case X86::CMOV_RFP80:
36588 case X86::CMOV_VR64:
36589 case X86::CMOV_VR128:
36590 case X86::CMOV_VR128X:
36591 case X86::CMOV_VR256:
36592 case X86::CMOV_VR256X:
36593 case X86::CMOV_VR512:
36594 case X86::CMOV_VK1:
36595 case X86::CMOV_VK2:
36596 case X86::CMOV_VK4:
36597 case X86::CMOV_VK8:
36598 case X86::CMOV_VK16:
36599 case X86::CMOV_VK32:
36600 case X86::CMOV_VK64:
36601 return EmitLoweredSelect(MI, BB);
36602
36603 case X86::FP80_ADDr:
36604 case X86::FP80_ADDm32: {
36605 // Change the floating point control register to use double extended
36606 // precision when performing the addition.
36607 int OrigCWFrameIdx =
36608 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36609 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36610 OrigCWFrameIdx);
36611
36612 // Load the old value of the control word...
36613 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36614 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36615 OrigCWFrameIdx);
36616
36617 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
36618 // precision.
36619 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36620 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36621 .addReg(OldCW, RegState::Kill)
36622 .addImm(0x300);
36623
36624 // Extract to 16 bits.
36625 Register NewCW16 =
36626 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36627 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36628 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36629
36630 // Prepare memory for FLDCW.
36631 int NewCWFrameIdx =
36632 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36633 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36634 NewCWFrameIdx)
36635 .addReg(NewCW16, RegState::Kill);
36636
36637 // Reload the modified control word now...
36638 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36639 NewCWFrameIdx);
36640
36641 // Do the addition.
36642 if (MI.getOpcode() == X86::FP80_ADDr) {
36643 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36644 .add(MI.getOperand(0))
36645 .add(MI.getOperand(1))
36646 .add(MI.getOperand(2));
36647 } else {
36648 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36649 .add(MI.getOperand(0))
36650 .add(MI.getOperand(1))
36651 .add(MI.getOperand(2))
36652 .add(MI.getOperand(3))
36653 .add(MI.getOperand(4))
36654 .add(MI.getOperand(5))
36655 .add(MI.getOperand(6));
36656 }
36657
36658 // Reload the original control word now.
36659 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36660 OrigCWFrameIdx);
36661
36662 MI.eraseFromParent(); // The pseudo instruction is gone now.
36663 return BB;
36664 }
36665
36666 case X86::FP32_TO_INT16_IN_MEM:
36667 case X86::FP32_TO_INT32_IN_MEM:
36668 case X86::FP32_TO_INT64_IN_MEM:
36669 case X86::FP64_TO_INT16_IN_MEM:
36670 case X86::FP64_TO_INT32_IN_MEM:
36671 case X86::FP64_TO_INT64_IN_MEM:
36672 case X86::FP80_TO_INT16_IN_MEM:
36673 case X86::FP80_TO_INT32_IN_MEM:
36674 case X86::FP80_TO_INT64_IN_MEM: {
36675 // Change the floating point control register to use "round towards zero"
36676 // mode when truncating to an integer value.
36677 int OrigCWFrameIdx =
36678 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36679 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36680 OrigCWFrameIdx);
36681
36682 // Load the old value of the control word...
36683 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36684 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36685 OrigCWFrameIdx);
36686
36687 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
36688 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36689 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36690 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36691
36692 // Extract to 16 bits.
36693 Register NewCW16 =
36694 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36695 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36696 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36697
36698 // Prepare memory for FLDCW.
36699 int NewCWFrameIdx =
36700 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36701 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36702 NewCWFrameIdx)
36703 .addReg(NewCW16, RegState::Kill);
36704
36705 // Reload the modified control word now...
36706 addFrameReference(BuildMI(*BB, MI, MIMD,
36707 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36708
36709 // Get the X86 opcode to use.
36710 unsigned Opc;
36711 switch (MI.getOpcode()) {
36712 // clang-format off
36713 default: llvm_unreachable("illegal opcode!");
36714 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36715 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36716 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36717 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36718 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36719 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36720 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36721 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36722 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36723 // clang-format on
36724 }
36725
36726 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36727 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36728 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36729
36730 // Reload the original control word now.
36731 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36732 OrigCWFrameIdx);
36733
36734 MI.eraseFromParent(); // The pseudo instruction is gone now.
36735 return BB;
36736 }
36737
36738 // xbegin
36739 case X86::XBEGIN:
36740 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36741
36742 case X86::VAARG_64:
36743 case X86::VAARG_X32:
36744 return EmitVAARGWithCustomInserter(MI, BB);
36745
36746 case X86::EH_SjLj_SetJmp32:
36747 case X86::EH_SjLj_SetJmp64:
36748 return emitEHSjLjSetJmp(MI, BB);
36749
36750 case X86::EH_SjLj_LongJmp32:
36751 case X86::EH_SjLj_LongJmp64:
36752 return emitEHSjLjLongJmp(MI, BB);
36753
36754 case X86::Int_eh_sjlj_setup_dispatch:
36755 return EmitSjLjDispatchBlock(MI, BB);
36756
36757 case TargetOpcode::STATEPOINT:
36758 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36759 // this point in the process. We diverge later.
36760 return emitPatchPoint(MI, BB);
36761
36762 case TargetOpcode::STACKMAP:
36763 case TargetOpcode::PATCHPOINT:
36764 return emitPatchPoint(MI, BB);
36765
36766 case TargetOpcode::PATCHABLE_EVENT_CALL:
36767 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36768 return emitPatchableEventCall(MI, BB);
36769
36770 case X86::LCMPXCHG8B: {
36771 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36772 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
36773 // requires a memory operand. If the current architecture happens to be
36774 // i686 and the current function needs a base pointer
36775 // - which is ESI for i686 - the register allocator would not be able to
36776 // allocate registers for an address of the form X(%reg, %reg, Y):
36777 // there would never be enough unreserved registers during regalloc
36778 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
36779 // We give the register allocator a hand by precomputing the address in
36780 // a new vreg using LEA.
36781
36782 // If it is not i686 or there is no base pointer - nothing to do here.
36783 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36784 return BB;
36785
36786 // Even though this code does not necessarily need the base pointer to
36787 // be ESI, we check for that. The reason: if this assert fails, some
36788 // changes have happened in the compiler's base pointer handling, which
36789 // most probably have to be addressed somehow here.
36790 assert(TRI->getBaseRegister() == X86::ESI &&
36791 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36792 "base pointer in mind");
36793
36794 MachineRegisterInfo &MRI = MF->getRegInfo();
36795 MVT SPTy = getPointerTy(MF->getDataLayout());
36796 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36797 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36798
36799 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36800 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36801 // does not use index register.
36802 if (AM.IndexReg == X86::NoRegister)
36803 return BB;
36804
36805 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36806 // four operand definitions that are E[ABCD] registers. We skip them and
36807 // then insert the LEA.
36808 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36809 while (RMBBI != BB->rend() &&
36810 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36811 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36812 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36813 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36814 ++RMBBI;
36815 }
36816 MachineBasicBlock::iterator MBBI(RMBBI.base());
36817 addFullAddress(
36818 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36819
36820 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36821
36822 return BB;
36823 }
36824 case X86::LCMPXCHG16B_NO_RBX: {
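// CMPXCHG16B implicitly uses RBX as the low half of the value to store. If
// RBX also serves as the frame base pointer it must be preserved, so the
// incoming value is routed through the LCMPXCHG16B_SAVE_RBX pseudo along
// with a copy of RBX saved in a virtual register; otherwise the value is
// simply copied into RBX and a plain LCMPXCHG16B is emitted.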
36825 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36826 Register BasePtr = TRI->getBaseRegister();
36827 if (TRI->hasBasePointer(*MF) &&
36828 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36829 if (!BB->isLiveIn(BasePtr))
36830 BB->addLiveIn(BasePtr);
36831 // Save RBX into a virtual register.
36832 Register SaveRBX =
36833 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36834 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36835 .addReg(X86::RBX);
36836 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36837 MachineInstrBuilder MIB =
36838 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36839 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36840 MIB.add(MI.getOperand(Idx));
36841 MIB.add(MI.getOperand(X86::AddrNumOperands));
36842 MIB.addReg(SaveRBX);
36843 } else {
36844 // Simple case, just copy the virtual register to RBX.
36845 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36846 .add(MI.getOperand(X86::AddrNumOperands));
36847 MachineInstrBuilder MIB =
36848 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36849 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36850 MIB.add(MI.getOperand(Idx));
36851 }
36852 MI.eraseFromParent();
36853 return BB;
36854 }
36855 case X86::MWAITX: {
36856 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36857 Register BasePtr = TRI->getBaseRegister();
36858 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36859 // If there is no need to save the base pointer, we generate MWAITXrrr;
36860 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
36861 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36862 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36863 .addReg(MI.getOperand(0).getReg());
36864 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36865 .addReg(MI.getOperand(1).getReg());
36866 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36867 .addReg(MI.getOperand(2).getReg());
36868 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36869 MI.eraseFromParent();
36870 } else {
36871 if (!BB->isLiveIn(BasePtr)) {
36872 BB->addLiveIn(BasePtr);
36873 }
36874 // Parameters can be copied into ECX and EAX but not EBX yet.
36875 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36876 .addReg(MI.getOperand(0).getReg());
36877 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36878 .addReg(MI.getOperand(1).getReg());
36879 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36880 // Save RBX into a virtual register.
36881 Register SaveRBX =
36882 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36883 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36884 .addReg(X86::RBX);
36885 // Generate mwaitx pseudo.
36886 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36887 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36888 .addDef(Dst) // Destination tied in with SaveRBX.
36889 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36890 .addUse(SaveRBX); // Save of base pointer.
36891 MI.eraseFromParent();
36892 }
36893 return BB;
36894 }
36895 case TargetOpcode::PREALLOCATED_SETUP: {
36896 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36897 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36898 MFI->setHasPreallocatedCall(true);
36899 int64_t PreallocatedId = MI.getOperand(0).getImm();
36900 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36901 assert(StackAdjustment != 0 && "0 stack adjustment");
36902 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36903 << StackAdjustment << "\n");
36904 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36905 .addReg(X86::ESP)
36906 .addImm(StackAdjustment);
36907 MI.eraseFromParent();
36908 return BB;
36909 }
36910 case TargetOpcode::PREALLOCATED_ARG: {
36911 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36912 int64_t PreallocatedId = MI.getOperand(1).getImm();
36913 int64_t ArgIdx = MI.getOperand(2).getImm();
36914 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36915 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36916 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36917 << ", arg offset " << ArgOffset << "\n");
36918 // stack pointer + offset
36919 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36920 MI.getOperand(0).getReg()),
36921 X86::ESP, false, ArgOffset);
36922 MI.eraseFromParent();
36923 return BB;
36924 }
36925 case X86::PTDPBSSD:
36926 case X86::PTDPBSUD:
36927 case X86::PTDPBUSD:
36928 case X86::PTDPBUUD:
36929 case X86::PTDPBF16PS:
36930 case X86::PTDPFP16PS: {
36931 unsigned Opc;
36932 switch (MI.getOpcode()) {
36933 // clang-format off
36934 default: llvm_unreachable("illegal opcode!");
36935 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36936 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36937 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36938 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36939 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36940 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36941 // clang-format on
36942 }
36943
36944 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36945 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36946 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36947 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36948 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36949
36950 MI.eraseFromParent(); // The pseudo is gone now.
36951 return BB;
36952 }
36953 case X86::PTILEZERO: {
36954 unsigned Imm = MI.getOperand(0).getImm();
36955 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36956 MI.eraseFromParent(); // The pseudo is gone now.
36957 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36958 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
36959 return BB;
36960 }
36961 case X86::PTILEZEROV: {
36962 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
36963 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
36964 return BB;
36965 }
36966 case X86::PTILELOADD:
36967 case X86::PTILELOADDT1:
36968 case X86::PTILESTORED: {
36969 unsigned Opc;
36970 switch (MI.getOpcode()) {
36971 default: llvm_unreachable("illegal opcode!");
36972#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36973 case X86::PTILELOADD:
36974 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36975 break;
36976 case X86::PTILELOADDT1:
36977 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36978 break;
36979 case X86::PTILESTORED:
36980 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36981 break;
36982#undef GET_EGPR_IF_ENABLED
36983 }
36984
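// Expand the tile pseudo into the real AMX instruction: a load defines a
// tile register taken from an immediate operand, the next five operands
// form the x86 memory reference (base, scale, index used as the stride,
// displacement, segment), and a store appends its source tile register last.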
36985 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36986 unsigned CurOp = 0;
36987 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36988 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36989 RegState::Define);
36990
36991 MIB.add(MI.getOperand(CurOp++)); // base
36992 MIB.add(MI.getOperand(CurOp++)); // scale
36993 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36994 MIB.add(MI.getOperand(CurOp++)); // displacement
36995 MIB.add(MI.getOperand(CurOp++)); // segment
36996
36997 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36998 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36999 RegState::Undef);
37000
37001 MI.eraseFromParent(); // The pseudo is gone now.
37002 return BB;
37003 }
37004 case X86::PTCMMIMFP16PS:
37005 case X86::PTCMMRLFP16PS: {
37006 const MIMetadata MIMD(MI);
37007 unsigned Opc;
37008 switch (MI.getOpcode()) {
37009 // clang-format off
37010 default: llvm_unreachable("Unexpected instruction!");
37011 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
37012 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
37013 // clang-format on
37014 }
37015 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37016 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37017 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37018 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37019 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37020 MI.eraseFromParent(); // The pseudo is gone now.
37021 return BB;
37022 }
37023 }
37024}
37025
37026//===----------------------------------------------------------------------===//
37027// X86 Optimization Hooks
37028//===----------------------------------------------------------------------===//
37029
37030bool
37031X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
37032 const APInt &DemandedBits,
37033 const APInt &DemandedElts,
37034 TargetLoweringOpt &TLO) const {
37035 EVT VT = Op.getValueType();
37036 unsigned Opcode = Op.getOpcode();
37037 unsigned EltSize = VT.getScalarSizeInBits();
37038
37039 if (VT.isVector()) {
37040 // If the constant is only all signbits in the active bits, then we should
37041 // extend it to the entire constant to allow it to act as a boolean
37042 // constant vector.
37043 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
37044 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
37045 return false;
37046 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
37047 if (!DemandedElts[i] || V.getOperand(i).isUndef())
37048 continue;
37049 const APInt &Val = V.getConstantOperandAPInt(i);
37050 if (Val.getBitWidth() > Val.getNumSignBits() &&
37051 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
37052 return true;
37053 }
37054 return false;
37055 };
37056 // For vectors - if we have a constant, then try to sign extend.
37057 // TODO: Handle AND cases.
37058 unsigned ActiveBits = DemandedBits.getActiveBits();
37059 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
37060 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
37061 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
37062 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
37063 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
37064 VT.getVectorNumElements());
37065 SDValue NewC =
37066 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
37067 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
37068 SDValue NewOp =
37069 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
37070 return TLO.CombineTo(Op, NewOp);
37071 }
37072 return false;
37073 }
37074
37075 // Only optimize Ands to prevent shrinking a constant that could be
37076 // matched by movzx.
37077 if (Opcode != ISD::AND)
37078 return false;
37079
37080 // Make sure the RHS really is a constant.
37081 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
37082 if (!C)
37083 return false;
37084
37085 const APInt &Mask = C->getAPIntValue();
37086
37087 // Clear all non-demanded bits initially.
37088 APInt ShrunkMask = Mask & DemandedBits;
37089
37090 // Find the width of the shrunk mask.
37091 unsigned Width = ShrunkMask.getActiveBits();
37092
37093 // If the mask is all 0s there's nothing to do here.
37094 if (Width == 0)
37095 return false;
37096
37097 // Find the next power of 2 width, rounding up to a byte.
37098 Width = llvm::bit_ceil(std::max(Width, 8U));
37099 // Truncate the width to size to handle illegal types.
37100 Width = std::min(Width, EltSize);
37101
37102 // Calculate a possible zero extend mask for this constant.
37103 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
37104
37105 // If we aren't changing the mask, just return true to keep it and prevent
37106 // the caller from optimizing.
37107 if (ZeroExtendMask == Mask)
37108 return true;
37109
37110 // Make sure the new mask can be represented by a combination of mask bits
37111 // and non-demanded bits.
37112 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
37113 return false;
37114
37115 // Replace the constant with the zero extend mask.
37116 SDLoc DL(Op);
37117 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
37118 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
37119 return TLO.CombineTo(Op, NewOp);
37120}
37121
37122static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
37123 KnownBits &Known,
37124 const APInt &DemandedElts,
37125 const SelectionDAG &DAG, unsigned Depth) {
37126 KnownBits Known2;
37127 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37128 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37129 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
37130 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
37131 Known = KnownBits::abdu(Known, Known2).zext(16);
37132 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
37133 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37134 Known, Known);
37135 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37136 Known, Known);
37137 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
37138 Known, Known);
37139 Known = Known.zext(64);
37140}
37141
37142static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
37143 KnownBits &Known,
37144 const APInt &DemandedElts,
37145 const SelectionDAG &DAG,
37146 unsigned Depth) {
37147 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37148
37149 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
37150 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
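// Each i32 result lane is LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1], so build
// demanded-element masks that select the even (lo) and odd (hi) i16 source
// lanes separately.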
37151 APInt DemandedLoElts =
37152 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37153 APInt DemandedHiElts =
37154 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37155 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37156 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37157 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37158 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37159 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
37160 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
37161 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
37162 /*NUW=*/false, Lo, Hi);
37163}
37164
37165static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
37166 KnownBits &Known,
37167 const APInt &DemandedElts,
37168 const SelectionDAG &DAG,
37169 unsigned Depth) {
37170 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
37171
37172 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
37173 // pairs.
37174 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
37175 APInt DemandedLoElts =
37176 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
37177 APInt DemandedHiElts =
37178 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
37179 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
37180 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
37181 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
37182 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
37183 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
37184 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
37185 Known = KnownBits::sadd_sat(Lo, Hi);
37186}
37187
37188void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
37189 KnownBits &Known,
37190 const APInt &DemandedElts,
37191 const SelectionDAG &DAG,
37192 unsigned Depth) const {
37193 unsigned BitWidth = Known.getBitWidth();
37194 unsigned NumElts = DemandedElts.getBitWidth();
37195 unsigned Opc = Op.getOpcode();
37196 EVT VT = Op.getValueType();
37197 assert((Opc >= ISD::BUILTIN_OP_END ||
37198 Opc == ISD::INTRINSIC_WO_CHAIN ||
37199 Opc == ISD::INTRINSIC_W_CHAIN ||
37200 Opc == ISD::INTRINSIC_VOID) &&
37201 "Should use MaskedValueIsZero if you don't know whether Op"
37202 " is a target node!");
37203
37204 Known.resetAll();
37205 switch (Opc) {
37206 default: break;
37207 case X86ISD::MUL_IMM: {
37208 KnownBits Known2;
37209 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37210 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37211 Known = KnownBits::mul(Known, Known2);
37212 break;
37213 }
37214 case X86ISD::SETCC:
37215 Known.Zero.setBitsFrom(1);
37216 break;
37217 case X86ISD::MOVMSK: {
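// MOVMSK packs one sign bit per source element into the low bits of the
// result, so every bit above the element count is known zero.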
37218 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
37219 Known.Zero.setBitsFrom(NumLoBits);
37220 break;
37221 }
37222 case X86ISD::PEXTRB:
37223 case X86ISD::PEXTRW: {
37224 SDValue Src = Op.getOperand(0);
37225 EVT SrcVT = Src.getValueType();
37226 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
37227 Op.getConstantOperandVal(1));
37228 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37229 Known = Known.anyextOrTrunc(BitWidth);
37230 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37231 break;
37232 }
37233 case X86ISD::VSRAI:
37234 case X86ISD::VSHLI:
37235 case X86ISD::VSRLI: {
37236 unsigned ShAmt = Op.getConstantOperandVal(1);
37237 if (ShAmt >= VT.getScalarSizeInBits()) {
37238 // Out of range logical bit shifts are guaranteed to be zero.
37239 // Out of range arithmetic bit shifts splat the sign bit.
37240 if (Opc != X86ISD::VSRAI) {
37241 Known.setAllZero();
37242 break;
37243 }
37244
37245 ShAmt = VT.getScalarSizeInBits() - 1;
37246 }
37247
37248 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37249 if (Opc == X86ISD::VSHLI) {
37250 Known.Zero <<= ShAmt;
37251 Known.One <<= ShAmt;
37252 // Low bits are known zero.
37253 Known.Zero.setLowBits(ShAmt);
37254 } else if (Opc == X86ISD::VSRLI) {
37255 Known.Zero.lshrInPlace(ShAmt);
37256 Known.One.lshrInPlace(ShAmt);
37257 // High bits are known zero.
37258 Known.Zero.setHighBits(ShAmt);
37259 } else {
37260 Known.Zero.ashrInPlace(ShAmt);
37261 Known.One.ashrInPlace(ShAmt);
37262 }
37263 break;
37264 }
37265 case X86ISD::PACKUS: {
37266 // PACKUS is just a truncation if the upper half is zero.
37267 APInt DemandedLHS, DemandedRHS;
37268 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37269
37270 Known.One = APInt::getAllOnes(BitWidth * 2);
37271 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37272
37273 KnownBits Known2;
37274 if (!!DemandedLHS) {
37275 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37276 Known = Known.intersectWith(Known2);
37277 }
37278 if (!!DemandedRHS) {
37279 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37280 Known = Known.intersectWith(Known2);
37281 }
37282
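// The truncation view only holds when the upper half of every demanded
// source element is known zero; otherwise PACKUS saturates and nothing is
// known about the result.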
37283 if (Known.countMinLeadingZeros() < BitWidth)
37284 Known.resetAll();
37285 Known = Known.trunc(BitWidth);
37286 break;
37287 }
37288 case X86ISD::PSHUFB: {
37289 SDValue Src = Op.getOperand(0);
37290 SDValue Idx = Op.getOperand(1);
37291
37292 // If the index vector is never negative (MSB is zero), then all elements
37293 // come from the source vector. This is useful for cases where
37294 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
37295 // below will handle the more common constant shuffle mask case.
37296 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
37297 if (KnownIdx.isNonNegative())
37298 Known = DAG.computeKnownBits(Src, Depth + 1);
37299 break;
37300 }
37301 case X86ISD::VBROADCAST: {
37302 SDValue Src = Op.getOperand(0);
37303 if (!Src.getSimpleValueType().isVector()) {
37304 Known = DAG.computeKnownBits(Src, Depth + 1);
37305 return;
37306 }
37307 break;
37308 }
37309 case X86ISD::AND: {
37310 if (Op.getResNo() == 0) {
37311 KnownBits Known2;
37312 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37313 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37314 Known &= Known2;
37315 }
37316 break;
37317 }
37318 case X86ISD::ANDNP: {
37319 KnownBits Known2;
37320 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37321 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37322
37323 // ANDNP = (~X & Y);
37324 Known.One &= Known2.Zero;
37325 Known.Zero |= Known2.One;
37326 break;
37327 }
37328 case X86ISD::FOR: {
37329 KnownBits Known2;
37330 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37331 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37332
37333 Known |= Known2;
37334 break;
37335 }
37336 case X86ISD::PSADBW: {
37337 SDValue LHS = Op.getOperand(0);
37338 SDValue RHS = Op.getOperand(1);
37339 assert(VT.getScalarType() == MVT::i64 &&
37340 LHS.getValueType() == RHS.getValueType() &&
37341 LHS.getValueType().getScalarType() == MVT::i8 &&
37342 "Unexpected PSADBW types");
37343 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37344 break;
37345 }
37346 case X86ISD::PCMPGT:
37347 case X86ISD::PCMPEQ: {
37348 KnownBits KnownLhs =
37349 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37350 KnownBits KnownRhs =
37351 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37352 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37353 ? KnownBits::eq(KnownLhs, KnownRhs)
37354 : KnownBits::sgt(KnownLhs, KnownRhs);
37355 if (Res) {
37356 if (*Res)
37357 Known.setAllOnes();
37358 else
37359 Known.setAllZero();
37360 }
37361 break;
37362 }
37363 case X86ISD::VPMADDWD: {
37364 SDValue LHS = Op.getOperand(0);
37365 SDValue RHS = Op.getOperand(1);
37366 assert(VT.getVectorElementType() == MVT::i32 &&
37367 LHS.getValueType() == RHS.getValueType() &&
37368 LHS.getValueType().getVectorElementType() == MVT::i16 &&
37369 "Unexpected PMADDWD types");
37370 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37371 break;
37372 }
37373 case X86ISD::VPMADDUBSW: {
37374 SDValue LHS = Op.getOperand(0);
37375 SDValue RHS = Op.getOperand(1);
37376 assert(VT.getVectorElementType() == MVT::i16 &&
37377 LHS.getValueType() == RHS.getValueType() &&
37378 LHS.getValueType().getVectorElementType() == MVT::i8 &&
37379 "Unexpected PMADDUBSW types");
37380 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37381 break;
37382 }
37383 case X86ISD::PMULUDQ: {
37384 KnownBits Known2;
37385 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37386 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37387
37388 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37389 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37390 Known = KnownBits::mul(Known, Known2);
37391 break;
37392 }
37393 case X86ISD::CMOV: {
37394 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37395 // If we don't know any bits, early out.
37396 if (Known.isUnknown())
37397 break;
37398 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37399
37400 // Only known if known in both the LHS and RHS.
37401 Known = Known.intersectWith(Known2);
37402 break;
37403 }
37404 case X86ISD::BEXTR:
37405 case X86ISD::BEXTRI: {
37406 SDValue Op0 = Op.getOperand(0);
37407 SDValue Op1 = Op.getOperand(1);
37408
37409 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37410 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37411 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37412
37413 // If the length is 0, the result is 0.
37414 if (Length == 0) {
37415 Known.setAllZero();
37416 break;
37417 }
37418
37419 if ((Shift + Length) <= BitWidth) {
37420 Known = DAG.computeKnownBits(Op0, Depth + 1);
37421 Known = Known.extractBits(Length, Shift);
37422 Known = Known.zextOrTrunc(BitWidth);
37423 }
37424 }
37425 break;
37426 }
37427 case X86ISD::PDEP: {
37428 KnownBits Known2;
37429 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37430 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37431 // Zeros are retained from the mask operand. But not ones.
37432 Known.One.clearAllBits();
37433 // The result will have at least as many trailing zeros as the non-mask
37434 // operand since bits can only map to the same or higher bit position.
37435 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37436 break;
37437 }
37438 case X86ISD::PEXT: {
37439 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37440 // The result has as many leading zeros as the number of zeroes in the mask.
37441 unsigned Count = Known.Zero.popcount();
37442 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37443 Known.One.clearAllBits();
37444 break;
37445 }
37446 case X86ISD::VTRUNC:
37447 case X86ISD::VTRUNCS:
37448 case X86ISD::VTRUNCUS:
37449 case X86ISD::CVTSI2P:
37450 case X86ISD::CVTUI2P:
37451 case X86ISD::CVTP2SI:
37452 case X86ISD::CVTP2UI:
37453 case X86ISD::MCVTP2SI:
37454 case X86ISD::MCVTP2UI:
37455 case X86ISD::CVTTP2SI:
37456 case X86ISD::CVTTP2UI:
37457 case X86ISD::MCVTTP2SI:
37458 case X86ISD::MCVTTP2UI:
37459 case X86ISD::MCVTSI2P:
37460 case X86ISD::MCVTUI2P:
37461 case X86ISD::VFPROUND:
37462 case X86ISD::VMFPROUND:
37463 case X86ISD::CVTPS2PH:
37464 case X86ISD::MCVTPS2PH: {
37465 // Truncations/Conversions - upper elements are known zero.
37466 EVT SrcVT = Op.getOperand(0).getValueType();
37467 if (SrcVT.isVector()) {
37468 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37469 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37470 Known.setAllZero();
37471 }
37472 break;
37473 }
37474 case X86ISD::STRICT_CVTTP2SI:
37475 case X86ISD::STRICT_CVTTP2UI:
37476 case X86ISD::STRICT_CVTSI2P:
37477 case X86ISD::STRICT_CVTUI2P:
37478 case X86ISD::STRICT_VFPROUND:
37479 case X86ISD::STRICT_CVTPS2PH: {
37480 // Strict Conversions - upper elements are known zero.
37481 EVT SrcVT = Op.getOperand(1).getValueType();
37482 if (SrcVT.isVector()) {
37483 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37484 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37485 Known.setAllZero();
37486 }
37487 break;
37488 }
37489 case X86ISD::MOVQ2DQ: {
37490 // Move from MMX to XMM. Upper half of XMM should be 0.
37491 if (DemandedElts.countr_zero() >= (NumElts / 2))
37492 Known.setAllZero();
37493 break;
37494 }
37495 case X86ISD::VBROADCAST_LOAD: {
37496 APInt UndefElts;
37497 SmallVector<APInt, 16> EltBits;
37498 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37499 /*AllowWholeUndefs*/ false,
37500 /*AllowPartialUndefs*/ false)) {
37501 Known.Zero.setAllBits();
37502 Known.One.setAllBits();
37503 for (unsigned I = 0; I != NumElts; ++I) {
37504 if (!DemandedElts[I])
37505 continue;
37506 if (UndefElts[I]) {
37507 Known.resetAll();
37508 break;
37509 }
37510 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37511 Known = Known.intersectWith(Known2);
37512 }
37513 return;
37514 }
37515 break;
37516 }
37517 case ISD::INTRINSIC_WO_CHAIN: {
37518 switch (Op->getConstantOperandVal(0)) {
37519 case Intrinsic::x86_sse2_pmadd_wd:
37520 case Intrinsic::x86_avx2_pmadd_wd:
37521 case Intrinsic::x86_avx512_pmaddw_d_512: {
37522 SDValue LHS = Op.getOperand(1);
37523 SDValue RHS = Op.getOperand(2);
37524 assert(VT.getScalarType() == MVT::i32 &&
37525 LHS.getValueType() == RHS.getValueType() &&
37526 LHS.getValueType().getScalarType() == MVT::i16 &&
37527 "Unexpected PMADDWD types");
37528 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
37529 break;
37530 }
37531 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
37532 case Intrinsic::x86_avx2_pmadd_ub_sw:
37533 case Intrinsic::x86_avx512_pmaddubs_w_512: {
37534 SDValue LHS = Op.getOperand(1);
37535 SDValue RHS = Op.getOperand(2);
37536 assert(VT.getScalarType() == MVT::i16 &&
37537 LHS.getValueType() == RHS.getValueType() &&
37538 LHS.getValueType().getScalarType() == MVT::i8 &&
37539 "Unexpected PMADDUBSW types");
37540 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37541 break;
37542 }
37543 case Intrinsic::x86_sse2_psad_bw:
37544 case Intrinsic::x86_avx2_psad_bw:
37545 case Intrinsic::x86_avx512_psad_bw_512: {
37546 SDValue LHS = Op.getOperand(1);
37547 SDValue RHS = Op.getOperand(2);
37548 assert(VT.getScalarType() == MVT::i64 &&
37549 LHS.getValueType() == RHS.getValueType() &&
37550 LHS.getValueType().getScalarType() == MVT::i8 &&
37551 "Unexpected PSADBW types");
37552 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37553 break;
37554 }
37555 }
37556 break;
37557 }
37558 }
37559
37560 // Handle target shuffles.
37561 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37562 if (isTargetShuffle(Opc)) {
37563 SmallVector<int, 64> Mask;
37564 SmallVector<SDValue, 2> Ops;
37565 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37566 unsigned NumOps = Ops.size();
37567 unsigned NumElts = VT.getVectorNumElements();
37568 if (Mask.size() == NumElts) {
37569 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37570 Known.Zero.setAllBits(); Known.One.setAllBits();
37571 for (unsigned i = 0; i != NumElts; ++i) {
37572 if (!DemandedElts[i])
37573 continue;
37574 int M = Mask[i];
37575 if (M == SM_SentinelUndef) {
37576 // For UNDEF elements, we don't know anything about the common state
37577 // of the shuffle result.
37578 Known.resetAll();
37579 break;
37580 }
37581 if (M == SM_SentinelZero) {
37582 Known.One.clearAllBits();
37583 continue;
37584 }
37585 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37586 "Shuffle index out of range");
37587
37588 unsigned OpIdx = (unsigned)M / NumElts;
37589 unsigned EltIdx = (unsigned)M % NumElts;
37590 if (Ops[OpIdx].getValueType() != VT) {
37591 // TODO - handle target shuffle ops with different value types.
37592 Known.resetAll();
37593 break;
37594 }
37595 DemandedOps[OpIdx].setBit(EltIdx);
37596 }
37597 // Known bits are the values that are shared by every demanded element.
37598 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37599 if (!DemandedOps[i])
37600 continue;
37601 KnownBits Known2 =
37602 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37603 Known = Known.intersectWith(Known2);
37604 }
37605 }
37606 }
37607 }
37608}
37609
37610unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37611 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37612 unsigned Depth) const {
37613 EVT VT = Op.getValueType();
37614 unsigned VTBits = VT.getScalarSizeInBits();
37615 unsigned Opcode = Op.getOpcode();
37616 switch (Opcode) {
37617 case X86ISD::SETCC_CARRY:
37618 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37619 return VTBits;
37620
37621 case X86ISD::VTRUNC: {
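// VTRUNC keeps only the low VTBits of each source element, so sign bits
// survive only to the extent that they exceed the discarded high portion.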
37622 SDValue Src = Op.getOperand(0);
37623 MVT SrcVT = Src.getSimpleValueType();
37624 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37625 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37626 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37627 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37628 if (Tmp > (NumSrcBits - VTBits))
37629 return Tmp - (NumSrcBits - VTBits);
37630 return 1;
37631 }
37632
37633 case X86ISD::PACKSS: {
37634 // PACKSS is just a truncation if the sign bits extend to the packed size.
37635 APInt DemandedLHS, DemandedRHS;
37636 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37637 DemandedRHS);
37638
37639 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37640 // patterns often used to compact vXi64 allsignbit patterns.
37641 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37642 SDValue BC = peekThroughBitcasts(V);
37643 if (BC.getOpcode() == X86ISD::PACKSS &&
37644 BC.getScalarValueSizeInBits() == 16 &&
37645 V.getScalarValueSizeInBits() == 32) {
37646 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37647 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37648 if (BC0.getScalarValueSizeInBits() == 64 &&
37649 BC1.getScalarValueSizeInBits() == 64 &&
37650 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37651 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37652 return 32;
37653 }
37654 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37655 };
37656
37657 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37658 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37659 if (!!DemandedLHS)
37660 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37661 if (!!DemandedRHS)
37662 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37663 unsigned Tmp = std::min(Tmp0, Tmp1);
37664 if (Tmp > (SrcBits - VTBits))
37665 return Tmp - (SrcBits - VTBits);
37666 return 1;
37667 }
37668
37669 case X86ISD::VBROADCAST: {
37670 SDValue Src = Op.getOperand(0);
37671 if (!Src.getSimpleValueType().isVector())
37672 return DAG.ComputeNumSignBits(Src, Depth + 1);
37673 break;
37674 }
37675
37676 case X86ISD::VSHLI: {
37677 SDValue Src = Op.getOperand(0);
37678 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37679 if (ShiftVal.uge(VTBits))
37680 return VTBits; // Shifted all bits out --> zero.
37681 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37682 if (ShiftVal.uge(Tmp))
37683 return 1; // Shifted all sign bits out --> unknown.
37684 return Tmp - ShiftVal.getZExtValue();
37685 }
37686
37687 case X86ISD::VSRAI: {
37688 SDValue Src = Op.getOperand(0);
37689 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37690 if (ShiftVal.uge(VTBits - 1))
37691 return VTBits; // Sign splat.
37692 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37693 ShiftVal += Tmp;
37694 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37695 }
37696
37697 case X86ISD::FSETCC:
37698 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37699 if (VT == MVT::f32 || VT == MVT::f64 ||
37700 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37701 return VTBits;
37702 break;
37703
37704 case X86ISD::PCMPGT:
37705 case X86ISD::PCMPEQ:
37706 case X86ISD::CMPP:
37707 case X86ISD::VPCOM:
37708 case X86ISD::VPCOMU:
37709 // Vector compares return zero/all-bits result values.
37710 return VTBits;
37711
37712 case X86ISD::ANDNP: {
37713 unsigned Tmp0 =
37714 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37715 if (Tmp0 == 1) return 1; // Early out.
37716 unsigned Tmp1 =
37717 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37718 return std::min(Tmp0, Tmp1);
37719 }
37720
37721 case X86ISD::CMOV: {
37722 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37723 if (Tmp0 == 1) return 1; // Early out.
37724 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37725 return std::min(Tmp0, Tmp1);
37726 }
37727 }
37728
37729 // Handle target shuffles.
37730 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37731 if (isTargetShuffle(Opcode)) {
37732 SmallVector<int, 64> Mask;
37733 SmallVector<SDValue, 2> Ops;
37734 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37735 unsigned NumOps = Ops.size();
37736 unsigned NumElts = VT.getVectorNumElements();
37737 if (Mask.size() == NumElts) {
37738 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37739 for (unsigned i = 0; i != NumElts; ++i) {
37740 if (!DemandedElts[i])
37741 continue;
37742 int M = Mask[i];
37743 if (M == SM_SentinelUndef) {
37744 // For UNDEF elements, we don't know anything about the common state
37745 // of the shuffle result.
37746 return 1;
37747 } else if (M == SM_SentinelZero) {
37748 // Zero = all sign bits.
37749 continue;
37750 }
37751 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37752 "Shuffle index out of range");
37753
37754 unsigned OpIdx = (unsigned)M / NumElts;
37755 unsigned EltIdx = (unsigned)M % NumElts;
37756 if (Ops[OpIdx].getValueType() != VT) {
37757 // TODO - handle target shuffle ops with different value types.
37758 return 1;
37759 }
37760 DemandedOps[OpIdx].setBit(EltIdx);
37761 }
37762 unsigned Tmp0 = VTBits;
37763 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37764 if (!DemandedOps[i])
37765 continue;
37766 unsigned Tmp1 =
37767 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37768 Tmp0 = std::min(Tmp0, Tmp1);
37769 }
37770 return Tmp0;
37771 }
37772 }
37773 }
37774
37775 // Fallback case.
37776 return 1;
37777}
37778
37779SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
37780 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37781 return N->getOperand(0);
37782 return N;
37783}
37784
37785// Helper to look for a normal load that can be narrowed into a vzload with the
37786// specified VT and memory VT. Returns SDValue() on failure.
37787static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37788 SelectionDAG &DAG) {
37789 // Can't if the load is volatile or atomic.
37790 if (!LN->isSimple())
37791 return SDValue();
37792
37793 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37794 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37795 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37796 LN->getPointerInfo(), LN->getOriginalAlign(),
37797 LN->getMemOperand()->getFlags());
37798}
37799
37800// Attempt to match a combined shuffle mask against supported unary shuffle
37801// instructions.
37802// TODO: Investigate sharing more of this with shuffle lowering.
37803static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37804 bool AllowFloatDomain, bool AllowIntDomain,
37805 SDValue V1, const SelectionDAG &DAG,
37806 const X86Subtarget &Subtarget, unsigned &Shuffle,
37807 MVT &SrcVT, MVT &DstVT) {
37808 unsigned NumMaskElts = Mask.size();
37809 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37810
37811 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37812 if (Mask[0] == 0 &&
37813 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37814 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37815 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37816 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37817 Shuffle = X86ISD::VZEXT_MOVL;
37818 if (MaskEltSize == 16)
37819 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37820 else
37821 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37822 return true;
37823 }
37824 }
37825
37826 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37827 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37828 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37829 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37830 unsigned MaxScale = 64 / MaskEltSize;
37831 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37832 DAG.ComputeNumSignBits(V1) == MaskEltSize;
37833 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37834 bool MatchAny = true;
37835 bool MatchZero = true;
37836 bool MatchSign = UseSign;
37837 unsigned NumDstElts = NumMaskElts / Scale;
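// Within each group of Scale mask entries, the first must select source
// element i; the remaining entries being undef, zero, or a repeat of element
// i classify the pattern as an any-, zero-, or sign-extension (the last only
// when V1 is already known to consist of sign bits).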
37838 for (unsigned i = 0;
37839 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37840 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37841 MatchAny = MatchSign = MatchZero = false;
37842 break;
37843 }
37844 unsigned Pos = (i * Scale) + 1;
37845 unsigned Len = Scale - 1;
37846 MatchAny &= isUndefInRange(Mask, Pos, Len);
37847 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37848 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37849 }
37850 if (MatchAny || MatchSign || MatchZero) {
37851 assert((MatchSign || MatchZero) &&
37852 "Failed to match sext/zext but matched aext?");
37853 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37854 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37855 : MVT::getIntegerVT(MaskEltSize);
37856 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37857
37858 Shuffle = unsigned(
37859 MatchAny ? ISD::ANY_EXTEND
37860 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37861 if (SrcVT.getVectorNumElements() != NumDstElts)
37862 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37863
37864 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37865 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37866 return true;
37867 }
37868 }
37869 }
37870
37871 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
37872 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37873 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37874 isUndefOrEqual(Mask[0], 0) &&
37875 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37876 Shuffle = X86ISD::VZEXT_MOVL;
37877 if (MaskEltSize == 16)
37878 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37879 else
37880 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37881 return true;
37882 }
37883
37884 // Check if we have SSE3, which lets us use MOVDDUP etc. These
37885 // instructions are no slower than UNPCKLPD but have the option to
37886 // fold the input operand into even an unaligned memory load.
37887 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37888 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37889 Shuffle = X86ISD::MOVDDUP;
37890 SrcVT = DstVT = MVT::v2f64;
37891 return true;
37892 }
37893 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37894 Shuffle = X86ISD::MOVSLDUP;
37895 SrcVT = DstVT = MVT::v4f32;
37896 return true;
37897 }
37898 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37899 Shuffle = X86ISD::MOVSHDUP;
37900 SrcVT = DstVT = MVT::v4f32;
37901 return true;
37902 }
37903 }
37904
37905 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37906 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37907 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37908 Shuffle = X86ISD::MOVDDUP;
37909 SrcVT = DstVT = MVT::v4f64;
37910 return true;
37911 }
37912 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37913 V1)) {
37914 Shuffle = X86ISD::MOVSLDUP;
37915 SrcVT = DstVT = MVT::v8f32;
37916 return true;
37917 }
37918 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37919 V1)) {
37920 Shuffle = X86ISD::MOVSHDUP;
37921 SrcVT = DstVT = MVT::v8f32;
37922 return true;
37923 }
37924 }
37925
37926 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37927 assert(Subtarget.hasAVX512() &&
37928 "AVX512 required for 512-bit vector shuffles");
37929 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37930 V1)) {
37931 Shuffle = X86ISD::MOVDDUP;
37932 SrcVT = DstVT = MVT::v8f64;
37933 return true;
37934 }
37935 if (isTargetShuffleEquivalent(
37936 MaskVT, Mask,
37937 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37938 Shuffle = X86ISD::MOVSLDUP;
37939 SrcVT = DstVT = MVT::v16f32;
37940 return true;
37941 }
37942 if (isTargetShuffleEquivalent(
37943 MaskVT, Mask,
37944 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37945 Shuffle = X86ISD::MOVSHDUP;
37946 SrcVT = DstVT = MVT::v16f32;
37947 return true;
37948 }
37949 }
37950
37951 return false;
37952}
37953
37954// Attempt to match a combined shuffle mask against supported unary immediate
37955// permute instructions.
37956// TODO: Investigate sharing more of this with shuffle lowering.
37957static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37958 const APInt &Zeroable,
37959 bool AllowFloatDomain, bool AllowIntDomain,
37960 const SelectionDAG &DAG,
37961 const X86Subtarget &Subtarget,
37962 unsigned &Shuffle, MVT &ShuffleVT,
37963 unsigned &PermuteImm) {
37964 unsigned NumMaskElts = Mask.size();
37965 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37966 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37967 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37968 bool ContainsZeros = isAnyZero(Mask);
37969
37970 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37971 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37972 // Check for lane crossing permutes.
37973 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37974 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37975 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37976 Shuffle = X86ISD::VPERMI;
37977 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37978 PermuteImm = getV4X86ShuffleImm(Mask);
37979 return true;
37980 }
37981 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37982 SmallVector<int, 4> RepeatedMask;
37983 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37984 Shuffle = X86ISD::VPERMI;
37985 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37986 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37987 return true;
37988 }
37989 }
37990 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37991 // VPERMILPD can permute with a non-repeating shuffle.
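// For example, a v4f64 mask <1,0,3,2> (swap within each 128-bit pair) yields
// PermuteImm 0b0101 from the loop below.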
37992 Shuffle = X86ISD::VPERMILPI;
37993 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37994 PermuteImm = 0;
37995 for (int i = 0, e = Mask.size(); i != e; ++i) {
37996 int M = Mask[i];
37997 if (M == SM_SentinelUndef)
37998 continue;
37999 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
38000 PermuteImm |= (M & 1) << i;
38001 }
38002 return true;
38003 }
38004 }
38005
38006 // We check for both a shuffle match and a shift match. Loop twice so we can
38007 // choose which one to try to match first, depending on target preference.
38008 for (unsigned Order = 0; Order < 2; ++Order) {
38009 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
38010 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
38011 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
38012 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
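// getV4X86ShuffleImm packs each index into 2 bits (element i in bits
// [2*i+1:2*i]); e.g. the reverse mask <3,2,1,0> encodes as 0x1B.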
38013 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
38014 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
38015 SmallVector<int, 4> RepeatedMask;
38016 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38017 // Narrow the repeated mask to create 32-bit element permutes.
38018 SmallVector<int, 4> WordMask = RepeatedMask;
38019 if (MaskScalarSizeInBits == 64)
38020 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
38021
38022 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
38023 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
38024 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
38025 PermuteImm = getV4X86ShuffleImm(WordMask);
38026 return true;
38027 }
38028 }
38029
38030 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
38031 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
38032 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38033 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38034 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38035 SmallVector<int, 4> RepeatedMask;
38036 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38037 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
38038 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
38039
38040 // PSHUFLW: permute lower 4 elements only.
38041 if (isUndefOrInRange(LoMask, 0, 4) &&
38042 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
38043 Shuffle = X86ISD::PSHUFLW;
38044 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38045 PermuteImm = getV4X86ShuffleImm(LoMask);
38046 return true;
38047 }
38048
38049 // PSHUFHW: permute upper 4 elements only.
38050 if (isUndefOrInRange(HiMask, 4, 8) &&
38051 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
38052 // Offset the HiMask so that we can create the shuffle immediate.
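// PSHUFHW's immediate indexes within the upper four elements, so rebase
// indices 4..7 down to 0..3 (undef indices are left as-is).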
38053 int OffsetHiMask[4];
38054 for (int i = 0; i != 4; ++i)
38055 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38056
38057 Shuffle = X86ISD::PSHUFHW;
38058 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38059 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
38060 return true;
38061 }
38062 }
38063 }
38064 } else {
38065 // Attempt to match against bit rotates.
38066 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
38067 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
38068 Subtarget.hasAVX512())) {
38069 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
38070 Subtarget, Mask);
38071 if (0 < RotateAmt) {
38072 Shuffle = X86ISD::VROTLI;
38073 PermuteImm = (unsigned)RotateAmt;
38074 return true;
38075 }
38076 }
38077 }
38078 // Attempt to match against byte/bit shifts.
38079 if (AllowIntDomain &&
38080 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38081 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38082 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38083 int ShiftAmt =
38084 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
38085 Zeroable, Subtarget);
38086 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
38087 32 <= ShuffleVT.getScalarSizeInBits())) {
38088 // Byte shifts can be slower so only match them on second attempt.
38089 if (Order == 0 &&
38090 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
38091 continue;
38092
38093 PermuteImm = (unsigned)ShiftAmt;
38094 return true;
38095 }
38096
38097 }
38098 }
38099
38100 return false;
38101}
38102
38103// Attempt to match a combined unary shuffle mask against supported binary
38104// shuffle instructions.
38105// TODO: Investigate sharing more of this with shuffle lowering.
38106static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38107 bool AllowFloatDomain, bool AllowIntDomain,
38108 SDValue &V1, SDValue &V2, const SDLoc &DL,
38109 SelectionDAG &DAG, const X86Subtarget &Subtarget,
38110 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
38111 bool IsUnary) {
38112 unsigned NumMaskElts = Mask.size();
38113 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38114 unsigned SizeInBits = MaskVT.getSizeInBits();
38115
38116 if (MaskVT.is128BitVector()) {
38117 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
38118 AllowFloatDomain) {
38119 V2 = V1;
38120 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
38121 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
38122 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38123 return true;
38124 }
38125 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
38126 AllowFloatDomain) {
38127 V2 = V1;
38128 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
38129 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38130 return true;
38131 }
38132 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
38133 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
38134 std::swap(V1, V2);
38135 Shuffle = X86ISD::MOVSD;
38136 SrcVT = DstVT = MVT::v2f64;
38137 return true;
38138 }
38139 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
38140 (AllowFloatDomain || !Subtarget.hasSSE41())) {
38141 Shuffle = X86ISD::MOVSS;
38142 SrcVT = DstVT = MVT::v4f32;
38143 return true;
38144 }
38145 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
38146 DAG) &&
38147 Subtarget.hasFP16()) {
38148 Shuffle = X86ISD::MOVSH;
38149 SrcVT = DstVT = MVT::v8f16;
38150 return true;
38151 }
38152 }
38153
38154 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
38155 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
38156 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
38157 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
38158 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
38159 Subtarget)) {
38160 DstVT = MaskVT;
38161 return true;
38162 }
38163 }
38164 // TODO: Can we handle this inside matchShuffleWithPACK?
38165 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
38166 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
38167 V1.getScalarValueSizeInBits() == 64 &&
38168 V2.getScalarValueSizeInBits() == 64) {
38169 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
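// With at least 48 leading zero bits, every 64-bit element fits in 16 bits,
// so packing the dwords with unsigned saturation reproduces the {0,2,4,6}
// dword shuffle.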
38170 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
38171 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
38172 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
38173 SrcVT = MVT::v4i32;
38174 DstVT = MVT::v8i16;
38175 Shuffle = X86ISD::PACKUS;
38176 return true;
38177 }
38178 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
38179 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
38180 SrcVT = MVT::v8i16;
38181 DstVT = MVT::v16i8;
38182 Shuffle = X86ISD::PACKUS;
38183 return true;
38184 }
38185 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
38186 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
38187 SrcVT = MVT::v4i32;
38188 DstVT = MVT::v8i16;
38189 Shuffle = X86ISD::PACKSS;
38190 return true;
38191 }
38192 }
38193
38194 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
38195 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
38196 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38197 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
38198 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38199 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
38200 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
38201 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
38202 Subtarget)) {
38203 SrcVT = DstVT = MaskVT;
38204 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
38205 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
38206 return true;
38207 }
38208 }
38209
38210 // Attempt to match against an OR if we're performing a blend shuffle and the
38211 // non-blended source element is zero in each case.
38212 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
38213 if (SizeInBits == V1.getValueSizeInBits() &&
38214 SizeInBits == V2.getValueSizeInBits() &&
38215 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38216 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
38217 bool IsBlend = true;
38218 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
38219 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
38220 unsigned Scale1 = NumV1Elts / NumMaskElts;
38221 unsigned Scale2 = NumV2Elts / NumMaskElts;
38222 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
38223 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
38224 for (unsigned i = 0; i != NumMaskElts; ++i) {
38225 int M = Mask[i];
38226 if (M == SM_SentinelUndef)
38227 continue;
38228 if (M == SM_SentinelZero) {
38229 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38230 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38231 continue;
38232 }
38233 if (M == (int)i) {
38234 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38235 continue;
38236 }
38237 if (M == (int)(i + NumMaskElts)) {
38238 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38239 continue;
38240 }
38241 IsBlend = false;
38242 break;
38243 }
38244 if (IsBlend) {
38245 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
38246 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
38247 Shuffle = ISD::OR;
38248 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38249 return true;
38250 }
38251 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
38252 // FIXME: handle mismatched sizes?
38253 // TODO: investigate if `ISD::OR` handling in
38254 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
38255 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
38256 unsigned NumElts = V.getValueType().getVectorNumElements();
38257 KnownBits Known(NumElts);
38258 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
38259 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
38260 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
38261 if (PeepholeKnown.isZero())
38262 Known.Zero.setBit(EltIdx);
38263 if (PeepholeKnown.isAllOnes())
38264 Known.One.setBit(EltIdx);
38265 }
38266 return Known;
38267 };
38268
38269 KnownBits V1Known = computeKnownBitsElementWise(V1);
38270 KnownBits V2Known = computeKnownBitsElementWise(V2);
38271
38272 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38273 int M = Mask[i];
38274 if (M == SM_SentinelUndef)
38275 continue;
38276 if (M == SM_SentinelZero) {
38277 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38278 continue;
38279 }
38280 if (M == (int)i) {
38281 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38282 continue;
38283 }
38284 if (M == (int)(i + NumMaskElts)) {
38285 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38286 continue;
38287 }
38288 llvm_unreachable("will not get here.");
38289 }
38290 if (IsBlend) {
38291 Shuffle = ISD::OR;
38292 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38293 return true;
38294 }
38295 }
38296 }
38297 }
38298
38299 return false;
38300}
38301
38302static bool matchBinaryPermuteShuffle(
38303 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38304 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38305 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38306 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38307 unsigned NumMaskElts = Mask.size();
38308 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38309
38310 // Attempt to match against VALIGND/VALIGNQ rotate.
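// VALIGND/VALIGNQ shift the concatenation of the two sources right by a whole
// number of elements (the immediate), so only element rotations with no
// zeroing can be matched here.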
38311 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38312 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38313 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38314 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38315 if (!isAnyZero(Mask)) {
38316 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38317 if (0 < Rotation) {
38318 Shuffle = X86ISD::VALIGN;
38319 if (EltSizeInBits == 64)
38320 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38321 else
38322 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38323 PermuteImm = Rotation;
38324 return true;
38325 }
38326 }
38327 }
38328
38329 // Attempt to match against PALIGNR byte rotate.
38330 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38331 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38332 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38333 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38334 if (0 < ByteRotation) {
38335 Shuffle = X86ISD::PALIGNR;
38336 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38337 PermuteImm = ByteRotation;
38338 return true;
38339 }
38340 }
38341
38342 // Attempt to combine to X86ISD::BLENDI.
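// BLENDI takes an immediate with one bit per element; a set bit selects that
// element from the second source.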
38343 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38344 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38345 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38346 uint64_t BlendMask = 0;
38347 bool ForceV1Zero = false, ForceV2Zero = false;
38348 SmallVector<int, 8> TargetMask(Mask);
38349 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38350 ForceV2Zero, BlendMask)) {
38351 if (MaskVT == MVT::v16i16) {
38352 // We can only use v16i16 PBLENDW if the lanes are repeated.
38353 SmallVector<int, 8> RepeatedMask;
38354 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38355 RepeatedMask)) {
38356 assert(RepeatedMask.size() == 8 &&
38357 "Repeated mask size doesn't match!");
38358 PermuteImm = 0;
38359 for (int i = 0; i < 8; ++i)
38360 if (RepeatedMask[i] >= 8)
38361 PermuteImm |= 1 << i;
38362 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38363 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38364 Shuffle = X86ISD::BLENDI;
38365 ShuffleVT = MaskVT;
38366 return true;
38367 }
38368 } else {
38369 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38370 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38371 PermuteImm = (unsigned)BlendMask;
38372 Shuffle = X86ISD::BLENDI;
38373 ShuffleVT = MaskVT;
38374 return true;
38375 }
38376 }
38377 }
38378
38379 // Attempt to combine to INSERTPS, but only if it has elements that need to
38380 // be set to zero.
38381 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38382 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38383 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38384 Shuffle = X86ISD::INSERTPS;
38385 ShuffleVT = MVT::v4f32;
38386 return true;
38387 }
38388
38389 // Attempt to combine to SHUFPD.
38390 if (AllowFloatDomain && EltSizeInBits == 64 &&
38391 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38392 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38393 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38394 bool ForceV1Zero = false, ForceV2Zero = false;
38395 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38396 PermuteImm, Mask, Zeroable)) {
38397 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38398 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38399 Shuffle = X86ISD::SHUFP;
38400 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38401 return true;
38402 }
38403 }
38404
38405 // Attempt to combine to SHUFPS.
38406 if (AllowFloatDomain && EltSizeInBits == 32 &&
38407 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38408 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38409 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38410 SmallVector<int, 4> RepeatedMask;
38411 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38412 // Match each half of the repeated mask to determine whether it just
38413 // references one of the vectors, is zeroable, or is entirely undef.
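// SHUFPS builds its low two result elements from the first source and its
// high two from the second, so each half of the repeated mask must reference
// a single vector (or be zeroable/undef).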
38414 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38415 int M0 = RepeatedMask[Offset];
38416 int M1 = RepeatedMask[Offset + 1];
38417
38418 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38419 return DAG.getUNDEF(MaskVT);
38420 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38421 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38422 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38423 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38424 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38425 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38426 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38427 return V1;
38428 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38429 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38430 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38431 return V2;
38432 }
38433
38434 return SDValue();
38435 };
38436
38437 int ShufMask[4] = {-1, -1, -1, -1};
38438 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38439 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38440
38441 if (Lo && Hi) {
38442 V1 = Lo;
38443 V2 = Hi;
38444 Shuffle = X86ISD::SHUFP;
38445 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38446 PermuteImm = getV4X86ShuffleImm(ShufMask);
38447 return true;
38448 }
38449 }
38450 }
38451
38452 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38453 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38454 MaskVT.is128BitVector() &&
38455 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38456 Shuffle = X86ISD::INSERTPS;
38457 ShuffleVT = MVT::v4f32;
38458 return true;
38459 }
38460
38461 return false;
38462}
38463
38464static SDValue combineX86ShuffleChainWithExtract(
38465 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38466 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38467 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38468 const X86Subtarget &Subtarget);
38469
38470/// Combine an arbitrary chain of shuffles into a single instruction if
38471/// possible.
38472///
38473/// This is the leaf of the recursive combine below. When we have found some
38474/// chain of single-use x86 shuffle instructions and accumulated the combined
38475/// shuffle mask represented by them, this will try to pattern match that mask
38476/// into either a single instruction if there is a special purpose instruction
38477/// for this operation, or into a PSHUFB instruction which is a fully general
38478/// instruction but should only be used to replace chains over a certain depth.
38479static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38480 ArrayRef<int> BaseMask, int Depth,
38481 bool HasVariableMask,
38482 bool AllowVariableCrossLaneMask,
38483 bool AllowVariablePerLaneMask,
38484 SelectionDAG &DAG,
38485 const X86Subtarget &Subtarget) {
38486 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38487 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38488 "Unexpected number of shuffle inputs!");
38489
38490 SDLoc DL(Root);
38491 MVT RootVT = Root.getSimpleValueType();
38492 unsigned RootSizeInBits = RootVT.getSizeInBits();
38493 unsigned NumRootElts = RootVT.getVectorNumElements();
38494
38495 // Canonicalize shuffle input op to the requested type.
38496 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38497 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38498 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38499 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38500 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38501 return DAG.getBitcast(VT, Op);
38502 };
38503
38504 // Find the inputs that enter the chain. Note that multiple uses are OK
38505 // here; we're not going to remove the operands we find.
38506 bool UnaryShuffle = (Inputs.size() == 1);
38507 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38508 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38509 : peekThroughBitcasts(Inputs[1]));
38510
38511 MVT VT1 = V1.getSimpleValueType();
38512 MVT VT2 = V2.getSimpleValueType();
38513 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38514 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38515
38516 SDValue Res;
38517
38518 unsigned NumBaseMaskElts = BaseMask.size();
38519 if (NumBaseMaskElts == 1) {
38520 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38521 return CanonicalizeShuffleInput(RootVT, V1);
38522 }
38523
38524 bool OptForSize = DAG.shouldOptForSize();
38525 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38526 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38527 (RootVT.isFloatingPoint() && Depth >= 1) ||
38528 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38529
38530 // Don't combine if we are an AVX512/EVEX target and the mask element size
38531 // is different from the root element size - this would prevent writemasks
38532 // from being reused.
38533 bool IsMaskedShuffle = false;
38534 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38535 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38536 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38537 IsMaskedShuffle = true;
38538 }
38539 }
38540
38541 // If we are shuffling a splat (and not introducing zeros) then we can just
38542 // use it directly. This works for smaller elements as well, as they already
38543 // repeat across each mask element.
38544 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38545 V1.getValueSizeInBits() >= RootSizeInBits &&
38546 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38547 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38548 return CanonicalizeShuffleInput(RootVT, V1);
38549 }
38550
38551 SmallVector<int, 64> Mask(BaseMask);
38552
38553 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38554 // etc. can be simplified.
38555 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38556 SmallVector<int> ScaledMask, IdentityMask;
38557 unsigned NumElts = VT1.getVectorNumElements();
38558 if (Mask.size() <= NumElts &&
38559 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38560 for (unsigned i = 0; i != NumElts; ++i)
38561 IdentityMask.push_back(i);
38562 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38563 V2))
38564 return CanonicalizeShuffleInput(RootVT, V1);
38565 }
38566 }
38567
38568 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38569 if (RootVT.is512BitVector() &&
38570 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38571 // If the upper subvectors are zeroable, then an extract+insert is more
38572 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
38573 // to zero the upper subvectors.
38574 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38575 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38576 return SDValue(); // Nothing to do!
38577 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38578 "Unexpected lane shuffle");
38579 Res = CanonicalizeShuffleInput(RootVT, V1);
38580 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38581 bool UseZero = isAnyZero(Mask);
38582 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38583 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38584 }
38585
38586 // Narrow shuffle mask to v4x128.
38587 SmallVector<int, 4> ScaledMask;
38588 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38589 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38590
38591 // Try to lower to vshuf64x2/vshuf32x4.
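// Each 2-bit immediate field selects a 128-bit lane; the low two result lanes
// come from the first source and the high two from the second, hence the
// Ops[i / 2] grouping below.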
38592 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38593 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38594 SelectionDAG &DAG) {
38595 int PermMask[4] = {-1, -1, -1, -1};
38596 // Ensure elements came from the same Op.
38597 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38598 for (int i = 0; i < 4; ++i) {
38599 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38600 if (ScaledMask[i] < 0)
38601 continue;
38602
38603 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38604 unsigned OpIndex = i / 2;
38605 if (Ops[OpIndex].isUndef())
38606 Ops[OpIndex] = Op;
38607 else if (Ops[OpIndex] != Op)
38608 return SDValue();
38609
38610 PermMask[i] = ScaledMask[i] % 4;
38611 }
38612
38613 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38614 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38615 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38616 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38617 };
38618
38619 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38620 // doesn't work because our mask is for 128 bits and we don't have an MVT
38621 // to match that.
38622 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38623 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38624 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38625 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38626 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38627 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38628 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38629 ScaledMask[1] == (ScaledMask[3] % 2));
38630
38631 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38632 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38633 return SDValue(); // Nothing to do!
38634 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38635 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38636 return DAG.getBitcast(RootVT, V);
38637 }
38638 }
38639
38640 // Handle 128-bit lane shuffles of 256-bit vectors.
38641 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38642 // If the upper half is zeroable, then an extract+insert is more optimal
38643 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
38644 // zero the upper half.
38645 if (isUndefOrZero(Mask[1])) {
38646 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38647 return SDValue(); // Nothing to do!
38648 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38649 Res = CanonicalizeShuffleInput(RootVT, V1);
38650 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38651 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38652 256);
38653 }
38654
38655 // If we're inserting the low subvector, an insert-subvector 'concat'
38656 // pattern is quicker than VPERM2X128.
38657 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38658 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38659 !Subtarget.hasAVX2()) {
38660 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38661 return SDValue(); // Nothing to do!
38662 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38663 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38664 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38665 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38666 }
38667
38668 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38669 return SDValue(); // Nothing to do!
38670
38671 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38672 // we need to use the zeroing feature.
38673 // Prefer blends for sequential shuffles unless we are optimizing for size.
38674 if (UnaryShuffle &&
38675 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38676 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
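// VPERM2X128 imm8: bits[1:0] select the source 128-bit lane for the low half
// and bits[5:4] for the high half; setting bit 3 (or bit 7) zeroes that half
// instead.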
38677 unsigned PermMask = 0;
38678 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38679 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38680 return DAG.getNode(
38681 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38682 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38683 }
38684
38685 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38686 return SDValue(); // Nothing to do!
38687
38688 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38689 if (!UnaryShuffle && !IsMaskedShuffle) {
38690 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38691 "Unexpected shuffle sentinel value");
38692 // Prefer blends to X86ISD::VPERM2X128.
38693 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38694 unsigned PermMask = 0;
38695 PermMask |= ((Mask[0] & 3) << 0);
38696 PermMask |= ((Mask[1] & 3) << 4);
38697 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38698 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38699 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38700 CanonicalizeShuffleInput(RootVT, LHS),
38701 CanonicalizeShuffleInput(RootVT, RHS),
38702 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38703 }
38704 }
38705 }
38706
38707 // For masks that have been widened to 128-bit elements or more,
38708 // narrow back down to 64-bit elements.
38709 if (BaseMaskEltSizeInBits > 64) {
38710 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38711 int MaskScale = BaseMaskEltSizeInBits / 64;
38712 SmallVector<int, 64> ScaledMask;
38713 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38714 Mask = std::move(ScaledMask);
38715 }
38716
38717 // For masked shuffles, we're trying to match the root width for better
38718 // writemask folding; attempt to scale the mask.
38719 // TODO - variable shuffles might need this to be widened again.
38720 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38721 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38722 int MaskScale = NumRootElts / Mask.size();
38723 SmallVector<int, 64> ScaledMask;
38724 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38725 Mask = std::move(ScaledMask);
38726 }
38727
38728 unsigned NumMaskElts = Mask.size();
38729 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38730 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38731
38732 // Determine the effective mask value type.
38733 FloatDomain &= (32 <= MaskEltSizeInBits);
38734 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38735 : MVT::getIntegerVT(MaskEltSizeInBits);
38736 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38737
38738 // Only allow legal mask types.
38739 if (!TLI.isTypeLegal(MaskVT))
38740 return SDValue();
38741
38742 // Attempt to match the mask against known shuffle patterns.
38743 MVT ShuffleSrcVT, ShuffleVT;
38744 unsigned Shuffle, PermuteImm;
38745
38746 // Which shuffle domains are permitted?
38747 // Permit domain crossing at higher combine depths.
38748 // TODO: Should we indicate which domain is preferred if both are allowed?
38749 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38750 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38751 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38752
38753 // Determine zeroable mask elements.
38754 APInt KnownUndef, KnownZero;
38755 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38756 APInt Zeroable = KnownUndef | KnownZero;
38757
38758 if (UnaryShuffle) {
38759 // Attempt to match against broadcast-from-vector.
38760 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38761 if ((Subtarget.hasAVX2() ||
38762 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38763 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38764 if (isUndefOrEqual(Mask, 0)) {
38765 if (V1.getValueType() == MaskVT &&
38766 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38767 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38768 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38769 return SDValue(); // Nothing to do!
38770 Res = V1.getOperand(0);
38771 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38772 return DAG.getBitcast(RootVT, Res);
38773 }
38774 if (Subtarget.hasAVX2()) {
38775 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38776 return SDValue(); // Nothing to do!
38777 Res = CanonicalizeShuffleInput(MaskVT, V1);
38778 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38779 return DAG.getBitcast(RootVT, Res);
38780 }
38781 }
38782 }
38783
38784 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38785 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38786 (!IsMaskedShuffle ||
38787 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38788 if (Depth == 0 && Root.getOpcode() == Shuffle)
38789 return SDValue(); // Nothing to do!
38790 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38791 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38792 return DAG.getBitcast(RootVT, Res);
38793 }
38794
38795 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38796 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38797 PermuteImm) &&
38798 (!IsMaskedShuffle ||
38799 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38800 if (Depth == 0 && Root.getOpcode() == Shuffle)
38801 return SDValue(); // Nothing to do!
38802 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38803 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38804 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38805 return DAG.getBitcast(RootVT, Res);
38806 }
38807 }
38808
38809 // Attempt to combine to INSERTPS, but only if the inserted element has come
38810 // from a scalar.
38811 // TODO: Handle other insertions here as well?
38812 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38813 Subtarget.hasSSE41() &&
38814 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38815 if (MaskEltSizeInBits == 32) {
38816 SDValue SrcV1 = V1, SrcV2 = V2;
38817 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38818 DAG) &&
38819 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38820 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38821 return SDValue(); // Nothing to do!
38822 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38823 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38824 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38825 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38826 return DAG.getBitcast(RootVT, Res);
38827 }
38828 }
38829 if (MaskEltSizeInBits == 64 &&
38830 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38831 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38832 V2.getScalarValueSizeInBits() <= 32) {
38833 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38834 return SDValue(); // Nothing to do!
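// INSERTPS imm8: bits[7:6] = source element, bits[5:4] = destination element,
// bits[3:0] = zero mask.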
38835 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38836 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38837 CanonicalizeShuffleInput(MVT::v4f32, V1),
38838 CanonicalizeShuffleInput(MVT::v4f32, V2),
38839 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38840 return DAG.getBitcast(RootVT, Res);
38841 }
38842 }
38843
38844 SDValue NewV1 = V1; // Save operands in case early exit happens.
38845 SDValue NewV2 = V2;
38846 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38847 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38848 ShuffleVT, UnaryShuffle) &&
38849 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38850 if (Depth == 0 && Root.getOpcode() == Shuffle)
38851 return SDValue(); // Nothing to do!
38852 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38853 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38854 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38855 return DAG.getBitcast(RootVT, Res);
38856 }
38857
38858 NewV1 = V1; // Save operands in case early exit happens.
38859 NewV2 = V2;
38860 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38861 AllowIntDomain, NewV1, NewV2, DL, DAG,
38862 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38863 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38864 if (Depth == 0 && Root.getOpcode() == Shuffle)
38865 return SDValue(); // Nothing to do!
38866 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38867 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38868 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38869 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38870 return DAG.getBitcast(RootVT, Res);
38871 }
38872
38873 // Typically from here on, we need an integer version of MaskVT.
38874 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38875 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38876
38877 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38878 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38879 uint64_t BitLen, BitIdx;
38880 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38881 Zeroable)) {
38882 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38883 return SDValue(); // Nothing to do!
38884 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38885 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38886 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38887 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38888 return DAG.getBitcast(RootVT, Res);
38889 }
38890
38891 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38892 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38893 return SDValue(); // Nothing to do!
38894 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38895 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38896 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38897 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38898 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38899 return DAG.getBitcast(RootVT, Res);
38900 }
38901 }
38902
38903 // Match shuffle against TRUNCATE patterns.
38904 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38905 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38906 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38907 Subtarget)) {
38908 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38909 ShuffleSrcVT.getVectorNumElements();
38910 unsigned Opc =
38911 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38912 if (Depth == 0 && Root.getOpcode() == Opc)
38913 return SDValue(); // Nothing to do!
38914 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38915 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38916 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38917 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38918 return DAG.getBitcast(RootVT, Res);
38919 }
38920
38921 // Do we need a more general binary truncation pattern?
38922 if (RootSizeInBits < 512 &&
38923 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38924 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38925 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38926 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38927 // Bail if this was already a truncation or PACK node.
38928 // We sometimes fail to match PACK if we demand known undef elements.
38929 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38930 Root.getOpcode() == X86ISD::PACKSS ||
38931 Root.getOpcode() == X86ISD::PACKUS))
38932 return SDValue(); // Nothing to do!
38933 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38934 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38935 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38936 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38937 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38938 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38939 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38940 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38941 return DAG.getBitcast(RootVT, Res);
38942 }
38943 }
38944
38945 // Don't try to re-form single instruction chains under any circumstances now
38946 // that we've done encoding canonicalization for them.
38947 if (Depth < 1)
38948 return SDValue();
38949
38950 // Depth threshold above which we can efficiently use variable mask shuffles.
38951 int VariableCrossLaneShuffleDepth =
38952 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38953 int VariablePerLaneShuffleDepth =
38954 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38955 AllowVariableCrossLaneMask &=
38956 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38957 AllowVariablePerLaneMask &=
38958 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38959 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38960 // higher depth before combining them.
38961 bool AllowBWIVPERMV3 =
38962 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38963
38964 bool MaskContainsZeros = isAnyZero(Mask);
38965
38966 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38967 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38968 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38969 if (Subtarget.hasAVX2() &&
38970 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38971 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38972 Res = CanonicalizeShuffleInput(MaskVT, V1);
38973 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38974 return DAG.getBitcast(RootVT, Res);
38975 }
38976 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38977 if ((Subtarget.hasAVX512() &&
38978 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38979 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38980 (Subtarget.hasBWI() &&
38981 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38982 (Subtarget.hasVBMI() &&
38983 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38984 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38985 V2 = DAG.getUNDEF(MaskVT);
38986 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38987 return DAG.getBitcast(RootVT, Res);
38988 }
38989 }
38990
38991 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38992 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38993 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38994 ((Subtarget.hasAVX512() &&
38995 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38996 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38997 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38998 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38999 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39000 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39001 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39002 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39003 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39004 for (unsigned i = 0; i != NumMaskElts; ++i)
39005 if (Mask[i] == SM_SentinelZero)
39006 Mask[i] = NumMaskElts + i;
39007 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39008 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
39009 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39010 return DAG.getBitcast(RootVT, Res);
39011 }
39012
39013 // If that failed and either input is extracted then try to combine as a
39014 // shuffle with the larger type.
39015 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39016 Inputs, Root, BaseMask, Depth, HasVariableMask,
39017 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
39018 Subtarget))
39019 return WideShuffle;
39020
39021 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39022 // (non-VLX will pad to 512-bit shuffles).
39023 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
39024 ((Subtarget.hasAVX512() &&
39025 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39026 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39027 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
39028 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
39029 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39030 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39031 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39032 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39033 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39034 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39035 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39036 return DAG.getBitcast(RootVT, Res);
39037 }
39038 return SDValue();
39039 }
39040
39041 // See if we can combine a single input shuffle with zeros to a bit-mask,
39042 // which is much simpler than any shuffle.
39043 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
39044 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
39045 TLI.isTypeLegal(MaskVT)) {
39046 APInt Zero = APInt::getZero(MaskEltSizeInBits);
39047 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
39048 APInt UndefElts(NumMaskElts, 0);
39049 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
39050 for (unsigned i = 0; i != NumMaskElts; ++i) {
39051 int M = Mask[i];
39052 if (M == SM_SentinelUndef) {
39053 UndefElts.setBit(i);
39054 continue;
39055 }
39056 if (M == SM_SentinelZero)
39057 continue;
39058 EltBits[i] = AllOnes;
39059 }
39060 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
39061 Res = CanonicalizeShuffleInput(MaskVT, V1);
39062 unsigned AndOpcode =
39063 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
39064 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
39065 return DAG.getBitcast(RootVT, Res);
39066 }
39067
39068 // If we have a single input shuffle with different shuffle patterns in the
39069 // 128-bit lanes then use a variable mask with VPERMILPS.
39070 // TODO: Combine other mask types at higher depths.
39071 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39072 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
39073 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
39074 SmallVector<SDValue, 16> VPermIdx;
39075 for (int M : Mask) {
39076 SDValue Idx =
39077 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
39078 VPermIdx.push_back(Idx);
39079 }
39080 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
39081 Res = CanonicalizeShuffleInput(MaskVT, V1);
39082 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
39083 return DAG.getBitcast(RootVT, Res);
39084 }
39085
39086 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39087 // to VPERMIL2PD/VPERMIL2PS.
39088 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
39089 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
39090 MaskVT == MVT::v8f32)) {
39091 // VPERMIL2 Operation.
39092 // Bits[3] - Match Bit.
39093 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39094 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
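// Pushing selector value 8 sets the match bit; with the m2z immediate of 2
// those elements are written as zero (per the XOP VPERMIL2 encoding).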
39095 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
39096 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
39097 SmallVector<int, 8> VPerm2Idx;
39098 unsigned M2ZImm = 0;
39099 for (int M : Mask) {
39100 if (M == SM_SentinelUndef) {
39101 VPerm2Idx.push_back(-1);
39102 continue;
39103 }
39104 if (M == SM_SentinelZero) {
39105 M2ZImm = 2;
39106 VPerm2Idx.push_back(8);
39107 continue;
39108 }
39109 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
39110 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
39111 VPerm2Idx.push_back(Index);
39112 }
39113 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39114 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39115 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
39116 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
39117 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
39118 return DAG.getBitcast(RootVT, Res);
39119 }
39120
39121 // If we have 3 or more shuffle instructions or a chain involving a variable
39122 // mask, we can replace them with a single PSHUFB instruction profitably.
39123 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
39124 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
39125 // more aggressive.
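// A PSHUFB control byte with its MSB set (0x80) zeroes the destination byte;
// otherwise its low bits index within the 16-byte lane.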
39126 if (UnaryShuffle && AllowVariablePerLaneMask &&
39127 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39128 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
39129 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
39130 SmallVector<SDValue, 16> PSHUFBMask;
39131 int NumBytes = RootVT.getSizeInBits() / 8;
39132 int Ratio = NumBytes / NumMaskElts;
39133 for (int i = 0; i < NumBytes; ++i) {
39134 int M = Mask[i / Ratio];
39135 if (M == SM_SentinelUndef) {
39136 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
39137 continue;
39138 }
39139 if (M == SM_SentinelZero) {
39140 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39141 continue;
39142 }
39143 M = Ratio * M + i % Ratio;
39144 assert((M / 16) == (i / 16) && "Lane crossing detected");
39145 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39146 }
39147 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
39148 Res = CanonicalizeShuffleInput(ByteVT, V1);
39149 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
39150 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
39151 return DAG.getBitcast(RootVT, Res);
39152 }
39153
39154 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39155 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39156 // slower than PSHUFB on targets that support both.
39157 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
39158 Subtarget.hasXOP()) {
39159 // VPPERM Mask Operation
39160 // Bits[4:0] - Byte Index (0 - 31)
39161 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
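// 0x80 encodes operation 4 (ZERO) with byte index 0, matching the zeroable
// mask elements.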
39162 SmallVector<SDValue, 16> VPPERMMask;
39163 int NumBytes = 16;
39164 int Ratio = NumBytes / NumMaskElts;
39165 for (int i = 0; i < NumBytes; ++i) {
39166 int M = Mask[i / Ratio];
39167 if (M == SM_SentinelUndef) {
39168 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
39169 continue;
39170 }
39171 if (M == SM_SentinelZero) {
39172 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39173 continue;
39174 }
39175 M = Ratio * M + i % Ratio;
39176 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39177 }
39178 MVT ByteVT = MVT::v16i8;
39179 V1 = CanonicalizeShuffleInput(ByteVT, V1);
39180 V2 = CanonicalizeShuffleInput(ByteVT, V2);
39181 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
39182 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
39183 return DAG.getBitcast(RootVT, Res);
39184 }
39185
39186 // If that failed and either input is extracted then try to combine as a
39187 // shuffle with the larger type.
39188 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39189 Inputs, Root, BaseMask, Depth, HasVariableMask,
39190 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
39191 return WideShuffle;
39192
39193 // If we have a dual input shuffle then lower to VPERMV3,
39194 // (non-VLX will pad to 512-bit shuffles)
39195 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39196 ((Subtarget.hasAVX512() &&
39197 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
39198 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
39199 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
39200 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
39201 MaskVT == MVT::v16i32)) ||
39202 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39203 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
39204 MaskVT == MVT::v32i16)) ||
39205 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39206 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
39207 MaskVT == MVT::v64i8)))) {
39208 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39209 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39210 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39211 return DAG.getBitcast(RootVT, Res);
39212 }
39213
39214 // Failed to find any combines.
39215 return SDValue();
39216}
39217
39218// Combine an arbitrary chain of shuffles + extract_subvectors into a single
39219// instruction if possible.
39220//
39221// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
39222// type size to attempt to combine:
39223// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
39224// -->
39225// extract_subvector(shuffle(x,y,m2),0)
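// e.g. a v4f32 shuffle of two halves extracted from the same v8f32 source can
// be rewritten as one v8f32 shuffle of that source followed by extracting the
// low 128 bits of the result.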
39226 static SDValue combineX86ShuffleChainWithExtract(
39227 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39228 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39229 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39230 const X86Subtarget &Subtarget) {
39231 unsigned NumMaskElts = BaseMask.size();
39232 unsigned NumInputs = Inputs.size();
39233 if (NumInputs == 0)
39234 return SDValue();
39235
39236 EVT RootVT = Root.getValueType();
39237 unsigned RootSizeInBits = RootVT.getSizeInBits();
39238 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
39239 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
39240
39241 // Peek through extract_subvector to find widest legal vector.
39242 // TODO: Handle ISD::TRUNCATE
39243 unsigned WideSizeInBits = RootSizeInBits;
39244 for (unsigned I = 0; I != NumInputs; ++I) {
39245 SDValue Input = peekThroughBitcasts(Inputs[I]);
39246 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
39247 Input = peekThroughBitcasts(Input.getOperand(0));
39248 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
39249 WideSizeInBits < Input.getValueSizeInBits())
39250 WideSizeInBits = Input.getValueSizeInBits();
39251 }
39252
39253 // Bail if we fail to find a source larger than the existing root.
39254 unsigned Scale = WideSizeInBits / RootSizeInBits;
39255 if (WideSizeInBits <= RootSizeInBits ||
39256 (WideSizeInBits % RootSizeInBits) != 0)
39257 return SDValue();
39258
39259 // Create new mask for larger type.
39260 SmallVector<int, 64> WideMask(BaseMask);
39261 for (int &M : WideMask) {
39262 if (M < 0)
39263 continue;
39264 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
39265 }
39266 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
39267
39268 // Attempt to peek through inputs and adjust mask when we extract from an
39269 // upper subvector.
39270 int AdjustedMasks = 0;
39271 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
39272 for (unsigned I = 0; I != NumInputs; ++I) {
39273 SDValue &Input = WideInputs[I];
39274 Input = peekThroughBitcasts(Input);
39275 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39276 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39277 uint64_t Idx = Input.getConstantOperandVal(1);
39278 if (Idx != 0) {
39279 ++AdjustedMasks;
39280 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39281 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39282
39283 int lo = I * WideMask.size();
39284 int hi = (I + 1) * WideMask.size();
39285 for (int &M : WideMask)
39286 if (lo <= M && M < hi)
39287 M += Idx;
39288 }
39289 Input = peekThroughBitcasts(Input.getOperand(0));
39290 }
39291 }
39292
39293 // Remove unused/repeated shuffle source ops.
39294 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39295 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39296
39297 // Bail if we're always extracting from the lowest subvectors,
39298 // combineX86ShuffleChain should match this for the current width, or the
39299 // shuffle still references too many inputs.
39300 if (AdjustedMasks == 0 || WideInputs.size() > 2)
39301 return SDValue();
39302
39303 // Minor canonicalization of the accumulated shuffle mask to make it easier
39304 // to match below. All this does is detect masks with sequential pairs of
39305 // elements, and shrink them to the half-width mask. It does this in a loop
39306 // so it will reduce the size of the mask to the minimal width mask which
39307 // performs an equivalent shuffle.
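// e.g. the v8 mask <0,1,2,3,8,9,10,11> pairs up into the v4 mask <0,1,4,5>,
// and again into the v2 mask <0,2>.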
39308 while (WideMask.size() > 1) {
39309 SmallVector<int, 64> WidenedMask;
39310 if (!canWidenShuffleElements(WideMask, WidenedMask))
39311 break;
39312 WideMask = std::move(WidenedMask);
39313 }
39314
39315 // Canonicalization of binary shuffle masks to improve pattern matching by
39316 // commuting the inputs.
39317 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
39318 ShuffleVectorSDNode::commuteMask(WideMask);
39319 std::swap(WideInputs[0], WideInputs[1]);
39320 }
39321
39322 // Increase depth for every upper subvector we've peeked through.
39323 Depth += AdjustedMasks;
39324
39325 // Attempt to combine wider chain.
39326 // TODO: Can we use a better Root?
39327 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39328 WideInputs.back().getValueSizeInBits()
39329 ? WideInputs.front()
39330 : WideInputs.back();
39331 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
39332 "WideRootSize mismatch");
39333
39334 if (SDValue WideShuffle =
39335 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39336 HasVariableMask, AllowVariableCrossLaneMask,
39337 AllowVariablePerLaneMask, DAG, Subtarget)) {
39338 WideShuffle =
39339 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39340 return DAG.getBitcast(RootVT, WideShuffle);
39341 }
39342
39343 return SDValue();
39344}
39345
39346// Canonicalize the combined shuffle mask chain with horizontal ops.
39347// NOTE: This may update the Ops and Mask.
39348 static SDValue canonicalizeShuffleMaskWithHorizOp(
39349 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39350 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39351 const X86Subtarget &Subtarget) {
39352 if (Mask.empty() || Ops.empty())
39353 return SDValue();
39354
39355 SmallVector<SDValue> BC;
39356 for (SDValue Op : Ops)
39357 BC.push_back(Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op);
39358
39359 // All ops must be the same horizop + type.
39360 SDValue BC0 = BC[0];
39361 EVT VT0 = BC0.getValueType();
39362 unsigned Opcode0 = BC0.getOpcode();
39363 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39364 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39365 }))
39366 return SDValue();
39367
39368 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39369 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39370 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39371 if (!isHoriz && !isPack)
39372 return SDValue();
39373
39374 // Do all ops have a single use?
39375 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39376 return Op.hasOneUse() &&
39377 peekThroughBitcasts(Op).hasOneUse();
39378 });
39379
39380 int NumElts = VT0.getVectorNumElements();
39381 int NumLanes = VT0.getSizeInBits() / 128;
39382 int NumEltsPerLane = NumElts / NumLanes;
39383 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39384 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39385 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39386
39387 if (NumEltsPerLane >= 4 &&
39388 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39389 SmallVector<int> LaneMask, ScaledMask;
39390 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39391 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39392 // See if we can remove the shuffle by resorting the HOP chain so that
39393 // the HOP args are pre-shuffled.
39394 // TODO: Generalize to any sized/depth chain.
39395 // TODO: Add support for PACKSS/PACKUS.
39396 if (isHoriz) {
39397 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39398 auto GetHOpSrc = [&](int M) {
39399 if (M == SM_SentinelUndef)
39400 return DAG.getUNDEF(VT0);
39401 if (M == SM_SentinelZero)
39402 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39403 SDValue Src0 = BC[M / 4];
39404 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39405 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39406 return Src1.getOperand(M % 2);
39407 return SDValue();
39408 };
39409 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39410 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39411 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39412 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39413 if (M0 && M1 && M2 && M3) {
39414 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39415 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39416 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39417 }
39418 }
39419 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39420 if (Ops.size() >= 2) {
39421 SDValue LHS, RHS;
39422 auto GetHOpSrc = [&](int M, int &OutM) {
39423 // TODO: Support SM_SentinelZero
39424 if (M < 0)
39425 return M == SM_SentinelUndef;
39426 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39427 if (!LHS || LHS == Src) {
39428 LHS = Src;
39429 OutM = (M % 2);
39430 return true;
39431 }
39432 if (!RHS || RHS == Src) {
39433 RHS = Src;
39434 OutM = (M % 2) + 2;
39435 return true;
39436 }
39437 return false;
39438 };
39439 int PostMask[4] = {-1, -1, -1, -1};
39440 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39441 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39442 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39443 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39444 LHS = DAG.getBitcast(SrcVT, LHS);
39445 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39446 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39447 // Use SHUFPS for the permute so this will work on SSE2 targets,
39448 // shuffle combining and domain handling will simplify this later on.
39449 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39450 Res = DAG.getBitcast(ShuffleVT, Res);
39451 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39452 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39453 }
39454 }
39455 }
39456 }
39457
39458 if (2 < Ops.size())
39459 return SDValue();
39460
39461 SDValue BC1 = BC[BC.size() - 1];
39462 if (Mask.size() == VT0.getVectorNumElements()) {
39463 // Canonicalize binary shuffles of horizontal ops that use the
39464 // same sources to a unary shuffle.
39465 // TODO: Try to perform this fold even if the shuffle remains.
39466 if (Ops.size() == 2) {
39467 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39468 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39469 };
39470 // Commute if all BC0's ops are contained in BC1.
39471 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39472 ContainsOps(BC1, BC0.getOperand(1))) {
39473 ShuffleVectorSDNode::commuteMask(Mask);
39474 std::swap(Ops[0], Ops[1]);
39475 std::swap(BC0, BC1);
39476 }
39477
39478 // If BC1 can be represented by BC0, then convert to unary shuffle.
39479 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39480 ContainsOps(BC0, BC1.getOperand(1))) {
39481 for (int &M : Mask) {
39482 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39483 continue;
39484 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39485 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39486 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39487 M += NumHalfEltsPerLane;
39488 }
39489 }
39490 }
39491
39492 // Canonicalize unary horizontal ops to only refer to lower halves.
39493 for (int i = 0; i != NumElts; ++i) {
39494 int &M = Mask[i];
39495 if (isUndefOrZero(M))
39496 continue;
39497 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39498 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39499 M -= NumHalfEltsPerLane;
39500 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39501 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39502 M -= NumHalfEltsPerLane;
39503 }
39504 }
39505
39506 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39507 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39508 // represents the LHS/RHS inputs for the lower/upper halves.
39509 SmallVector<int, 16> TargetMask128, WideMask128;
39510 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39511 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39512 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39513 bool SingleOp = (Ops.size() == 1);
39514 if (isPack || OneUseOps ||
39515 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39516 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39517 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39518 Lo = Lo.getOperand(WideMask128[0] & 1);
39519 Hi = Hi.getOperand(WideMask128[1] & 1);
39520 if (SingleOp) {
39521 SDValue Undef = DAG.getUNDEF(SrcVT);
39522 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39523 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39524 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39525 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39526 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39527 }
39528 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39529 }
39530 }
39531
39532 // If we are post-shuffling a 256-bit hop and not requiring the upper
39533 // elements, then try to narrow to a 128-bit hop directly.
39534 SmallVector<int, 16> WideMask64;
39535 if (Ops.size() == 1 && NumLanes == 2 &&
39536 scaleShuffleElements(Mask, 4, WideMask64) &&
39537 isUndefInRange(WideMask64, 2, 2)) {
39538 int M0 = WideMask64[0];
39539 int M1 = WideMask64[1];
39540 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39541 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
39542 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39543 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39544 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39545 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39546 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39547 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39548 }
39549 }
39550
39551 return SDValue();
39552}
39553
39554// Attempt to constant fold all of the constant source ops.
39555// Returns true if the entire shuffle is folded to a constant.
39556// TODO: Extend this to merge multiple constant Ops and update the mask.
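// e.g. shuffling two all-constant build vectors with mask <0,4,1,5> simply
// interleaves their constant bits, so the whole chain can be replaced by a
// single constant vector (subject to the size checks below).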
39557 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39558 ArrayRef<int> Mask, SDValue Root,
39559 bool HasVariableMask,
39560 SelectionDAG &DAG,
39561 const X86Subtarget &Subtarget) {
39562 MVT VT = Root.getSimpleValueType();
39563
39564 unsigned SizeInBits = VT.getSizeInBits();
39565 unsigned NumMaskElts = Mask.size();
39566 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39567 unsigned NumOps = Ops.size();
39568
39569 // Extract constant bits from each source op.
39570 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39571 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39572 for (unsigned I = 0; I != NumOps; ++I)
39573 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39574 RawBitsOps[I],
39575 /*AllowWholeUndefs*/ true,
39576 /*AllowPartialUndefs*/ true))
39577 return SDValue();
39578
39579 // If we're optimizing for size, only fold if at least one of the constants is
39580 // only used once or the combined shuffle has included a variable mask
39581 // shuffle, this is to avoid constant pool bloat.
39582 bool IsOptimizingSize = DAG.shouldOptForSize();
39583 if (IsOptimizingSize && !HasVariableMask &&
39584 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39585 return SDValue();
39586
39587 // Shuffle the constant bits according to the mask.
39588 SDLoc DL(Root);
39589 APInt UndefElts(NumMaskElts, 0);
39590 APInt ZeroElts(NumMaskElts, 0);
39591 APInt ConstantElts(NumMaskElts, 0);
39592 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39593 APInt::getZero(MaskSizeInBits));
39594 for (unsigned i = 0; i != NumMaskElts; ++i) {
39595 int M = Mask[i];
39596 if (M == SM_SentinelUndef) {
39597 UndefElts.setBit(i);
39598 continue;
39599 } else if (M == SM_SentinelZero) {
39600 ZeroElts.setBit(i);
39601 continue;
39602 }
39603 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39604
39605 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39606 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39607
39608 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39609 if (SrcUndefElts[SrcMaskIdx]) {
39610 UndefElts.setBit(i);
39611 continue;
39612 }
39613
39614 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39615 APInt &Bits = SrcEltBits[SrcMaskIdx];
39616 if (!Bits) {
39617 ZeroElts.setBit(i);
39618 continue;
39619 }
39620
39621 ConstantElts.setBit(i);
39622 ConstantBitData[i] = Bits;
39623 }
39624 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39625
39626 // Attempt to create a zero vector.
39627 if ((UndefElts | ZeroElts).isAllOnes())
39628 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39629
39630 // Create the constant data.
39631 MVT MaskSVT;
39632 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39633 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39634 else
39635 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39636
39637 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39638 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39639 return SDValue();
39640
39641 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39642 return DAG.getBitcast(VT, CstOp);
39643}
39644
39645namespace llvm {
39646 namespace X86 {
39647 enum {
39648 MaxShuffleCombineDepth = 8
39649 };
39650 } // namespace X86
39651 } // namespace llvm
39652
39653/// Fully generic combining of x86 shuffle instructions.
39654///
39655/// This should be the last combine run over the x86 shuffle instructions. Once
39656/// they have been fully optimized, this will recursively consider all chains
39657/// of single-use shuffle instructions, build a generic model of the cumulative
39658/// shuffle operation, and check for simpler instructions which implement this
39659/// operation. We use this primarily for two purposes:
39660///
39661/// 1) Collapse generic shuffles to specialized single instructions when
39662/// equivalent. In most cases, this is just an encoding size win, but
39663/// sometimes we will collapse multiple generic shuffles into a single
39664/// special-purpose shuffle.
39665/// 2) Look for sequences of shuffle instructions with 3 or more total
39666/// instructions, and replace them with the slightly more expensive SSSE3
39667/// PSHUFB instruction if available. We do this as the last combining step
39668/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39669/// a suitable short sequence of other instructions. The PSHUFB will either
39670/// use a register or have to read from memory and so is slightly (but only
39671/// slightly) more expensive than the other shuffle instructions.
39672///
39673/// Because this is inherently a quadratic operation (for each shuffle in
39674/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39675/// This should never be an issue in practice as the shuffle lowering doesn't
39676/// produce sequences of more than 8 instructions.
39677///
39678/// FIXME: We will currently miss some cases where the redundant shuffling
39679/// would simplify under the threshold for PSHUFB formation because of
39680/// combine-ordering. To fix this, we should do the redundant instruction
39681/// combining in this recursive walk.
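/// As an illustration of the mask merging below: with a single source op and
/// equal mask widths, the accumulated mask composes as Mask[i] = OpMask[RootMask[i]],
/// i.e. the op's shuffle is applied first and the root's shuffle last.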
39682 static SDValue combineX86ShufflesRecursively(
39683 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39684 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39685 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39686 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39687 const X86Subtarget &Subtarget) {
39688 assert(!RootMask.empty() &&
39689 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39690 "Illegal shuffle root mask");
39691 MVT RootVT = Root.getSimpleValueType();
39692 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39693 unsigned RootSizeInBits = RootVT.getSizeInBits();
39694
39695 // Bound the depth of our recursive combine because this is ultimately
39696 // quadratic in nature.
39697 if (Depth >= MaxDepth)
39698 return SDValue();
39699
39700 // Directly rip through bitcasts to find the underlying operand.
39701 SDValue Op = SrcOps[SrcOpIndex];
39702 Op = peekThroughBitcasts(Op);
39703
39704 EVT VT = Op.getValueType();
39705 if (!VT.isVector() || !VT.isSimple())
39706 return SDValue(); // Bail if we hit a non-simple non-vector.
39707
39708 // FIXME: Just bail on f16 for now.
39709 if (VT.getVectorElementType() == MVT::f16)
39710 return SDValue();
39711
39712 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39713 "Can only combine shuffles upto size of the root op.");
39714
39715 // Create a demanded elts mask from the referenced elements of Op.
39716 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39717 for (int M : RootMask) {
39718 int BaseIdx = RootMask.size() * SrcOpIndex;
39719 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39720 OpDemandedElts.setBit(M - BaseIdx);
39721 }
39722 if (RootSizeInBits != VT.getSizeInBits()) {
39723 // Op is smaller than Root - extract the demanded elts for the subvector.
39724 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39725 unsigned NumOpMaskElts = RootMask.size() / Scale;
39726 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39727 assert(OpDemandedElts
39728 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39729 .isZero() &&
39730 "Out of range elements referenced in root mask");
39731 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39732 }
39733 OpDemandedElts =
39734 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
39735
39736 // Extract target shuffle mask and resolve sentinels and inputs.
39737 SmallVector<int, 64> OpMask;
39738 SmallVector<SDValue, 2> OpInputs;
39739 APInt OpUndef, OpZero;
39740 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39741 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39742 OpZero, DAG, Depth, false)) {
39743 // Shuffle inputs must not be larger than the shuffle result.
39744 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39745 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39746 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39747 }))
39748 return SDValue();
39749 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39750 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39751 !isNullConstant(Op.getOperand(1))) {
39752 SDValue SrcVec = Op.getOperand(0);
39753 int ExtractIdx = Op.getConstantOperandVal(1);
39754 unsigned NumElts = VT.getVectorNumElements();
39755 OpInputs.assign({SrcVec});
39756 OpMask.assign(NumElts, SM_SentinelUndef);
39757 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39758 OpZero = OpUndef = APInt::getZero(NumElts);
39759 } else {
39760 return SDValue();
39761 }
39762
39763 // If the shuffle result was smaller than the root, we need to adjust the
39764 // mask indices and pad the mask with undefs.
39765 if (RootSizeInBits > VT.getSizeInBits()) {
39766 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39767 unsigned OpMaskSize = OpMask.size();
39768 if (OpInputs.size() > 1) {
39769 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39770 for (int &M : OpMask) {
39771 if (M < 0)
39772 continue;
39773 int EltIdx = M % OpMaskSize;
39774 int OpIdx = M / OpMaskSize;
39775 M = (PaddedMaskSize * OpIdx) + EltIdx;
39776 }
39777 }
39778 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39779 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39780 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39781 }
39782
39783 SmallVector<int, 64> Mask;
39784 SmallVector<SDValue, 16> Ops;
39785
39786 // We don't need to merge masks if the root is empty.
39787 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39788 if (EmptyRoot) {
39789 // Only resolve zeros if it will remove an input, otherwise we might end
39790 // up in an infinite loop.
39791 bool ResolveKnownZeros = true;
39792 if (!OpZero.isZero()) {
39793 APInt UsedInputs = APInt::getZero(OpInputs.size());
39794 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39795 int M = OpMask[i];
39796 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39797 continue;
39798 UsedInputs.setBit(M / OpMask.size());
39799 if (UsedInputs.isAllOnes()) {
39800 ResolveKnownZeros = false;
39801 break;
39802 }
39803 }
39804 }
39805 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39806 ResolveKnownZeros);
39807
39808 Mask = OpMask;
39809 Ops.append(OpInputs.begin(), OpInputs.end());
39810 } else {
39811 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39812
39813 // Add the inputs to the Ops list, avoiding duplicates.
39814 Ops.append(SrcOps.begin(), SrcOps.end());
39815
39816 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39817 // Attempt to find an existing match.
39818 SDValue InputBC = peekThroughBitcasts(Input);
39819 for (int i = 0, e = Ops.size(); i < e; ++i)
39820 if (InputBC == peekThroughBitcasts(Ops[i]))
39821 return i;
39822 // Match failed - should we replace an existing Op?
39823 if (InsertionPoint >= 0) {
39824 Ops[InsertionPoint] = Input;
39825 return InsertionPoint;
39826 }
39827 // Add to the end of the Ops list.
39828 Ops.push_back(Input);
39829 return Ops.size() - 1;
39830 };
39831
39832 SmallVector<int, 2> OpInputIdx;
39833 for (SDValue OpInput : OpInputs)
39834 OpInputIdx.push_back(
39835 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39836
39837 assert(((RootMask.size() > OpMask.size() &&
39838 RootMask.size() % OpMask.size() == 0) ||
39839 (OpMask.size() > RootMask.size() &&
39840 OpMask.size() % RootMask.size() == 0) ||
39841 OpMask.size() == RootMask.size()) &&
39842 "The smaller number of elements must divide the larger.");
39843
39844 // This function can be performance-critical, so we rely on the power-of-2
39845 // knowledge that we have about the mask sizes to replace div/rem ops with
39846 // bit-masks and shifts.
39847 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39848 "Non-power-of-2 shuffle mask sizes");
39849 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39850 "Non-power-of-2 shuffle mask sizes");
39851 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39852 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39853
39854 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39855 unsigned RootRatio =
39856 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39857 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39858 assert((RootRatio == 1 || OpRatio == 1) &&
39859 "Must not have a ratio for both incoming and op masks!");
39860
39861 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39862 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39863 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39864 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39865 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39866
39867 Mask.resize(MaskWidth, SM_SentinelUndef);
39868
39869 // Merge this shuffle operation's mask into our accumulated mask. Note that
39870 // this shuffle's mask will be the first applied to the input, followed by
39871 // the root mask to get us all the way to the root value arrangement. The
39872 // reason for this order is that we are recursing up the operation chain.
39873 for (unsigned i = 0; i < MaskWidth; ++i) {
39874 unsigned RootIdx = i >> RootRatioLog2;
39875 if (RootMask[RootIdx] < 0) {
39876 // This is a zero or undef lane, we're done.
39877 Mask[i] = RootMask[RootIdx];
39878 continue;
39879 }
39880
39881 unsigned RootMaskedIdx =
39882 RootRatio == 1
39883 ? RootMask[RootIdx]
39884 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39885
39886 // Just insert the scaled root mask value if it references an input other
39887 // than the SrcOp we're currently inserting.
39888 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39889 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39890 Mask[i] = RootMaskedIdx;
39891 continue;
39892 }
39893
39894 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39895 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39896 if (OpMask[OpIdx] < 0) {
39897 // The incoming lanes are zero or undef, it doesn't matter which ones we
39898 // are using.
39899 Mask[i] = OpMask[OpIdx];
39900 continue;
39901 }
39902
39903 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39904 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39905 : (OpMask[OpIdx] << OpRatioLog2) +
39906 (RootMaskedIdx & (OpRatio - 1));
39907
39908 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39909 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39910 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39911 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39912
39913 Mask[i] = OpMaskedIdx;
39914 }
39915 }
39916
39917 // Peek through vector widenings and set out of bounds mask indices to undef.
39918 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39919 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39920 SDValue &Op = Ops[I];
39921 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39922 isNullConstant(Op.getOperand(2))) {
39923 Op = Op.getOperand(1);
39924 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39925 int Lo = I * Mask.size();
39926 int Hi = (I + 1) * Mask.size();
39927 int NewHi = Lo + (Mask.size() / Scale);
39928 for (int &M : Mask) {
39929 if (Lo <= M && NewHi <= M && M < Hi)
39930 M = SM_SentinelUndef;
39931 }
39932 }
39933 }
39934
39935 // Peek through any free extract_subvector nodes back to root size.
39936 for (SDValue &Op : Ops)
39937 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39938 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39939 isNullConstant(Op.getOperand(1)))
39940 Op = Op.getOperand(0);
39941
39942 // Remove unused/repeated shuffle source ops.
39943 resolveTargetShuffleInputsAndMask(Ops, Mask);
39944
39945 // Handle the all undef/zero/ones cases early.
39946 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39947 return DAG.getUNDEF(RootVT);
39948 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39949 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39950 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39951 !llvm::is_contained(Mask, SM_SentinelZero))
39952 return getOnesVector(RootVT, DAG, SDLoc(Root));
39953
39954 assert(!Ops.empty() && "Shuffle with no inputs detected");
39955 HasVariableMask |= IsOpVariableMask;
39956
39957 // Update the list of shuffle nodes that have been combined so far.
39958 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39959 SrcNodes.end());
39960 CombinedNodes.push_back(Op.getNode());
39961
39962 // See if we can recurse into each shuffle source op (if it's a target
39963 // shuffle). The source op should only be generally combined if it either has
39964 // a single use (i.e. current Op) or all its users have already been combined,
39965 // if not then we can still combine but should prevent generation of variable
39966 // shuffles to avoid constant pool bloat.
39967 // Don't recurse if we already have more source ops than we can combine in
39968 // the remaining recursion depth.
39969 if (Ops.size() < (MaxDepth - Depth)) {
39970 for (int i = 0, e = Ops.size(); i < e; ++i) {
39971 // For empty roots, we need to resolve zeroable elements before combining
39972 // them with other shuffles.
39973 SmallVector<int, 64> ResolvedMask = Mask;
39974 if (EmptyRoot)
39975 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39976 bool AllowCrossLaneVar = false;
39977 bool AllowPerLaneVar = false;
39978 if (Ops[i].getNode()->hasOneUse() ||
39979 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39980 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39981 AllowPerLaneVar = AllowVariablePerLaneMask;
39982 }
39983 if (SDValue Res = combineX86ShufflesRecursively(
39984 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39985 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39986 Subtarget))
39987 return Res;
39988 }
39989 }
39990
39991 // Attempt to constant fold all of the constant source ops.
39992 if (SDValue Cst = combineX86ShufflesConstants(
39993 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39994 return Cst;
39995
39996 // If constant fold failed and we only have constants - then we have
39997 // multiple uses by a single non-variable shuffle - just bail.
39998 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39999 APInt UndefElts;
40000 SmallVector<APInt> RawBits;
40001 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40002 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40003 RawBits,
40004 /*AllowWholeUndefs*/ true,
40005 /*AllowPartialUndefs*/ true);
40006 })) {
40007 return SDValue();
40008 }
40009
40010 // Canonicalize the combined shuffle mask chain with horizontal ops.
40011 // NOTE: This will update the Ops and Mask.
40012 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
40013 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
40014 return DAG.getBitcast(RootVT, HOp);
40015
40016 // Try to refine our inputs given our knowledge of target shuffle mask.
40017 for (auto I : enumerate(Ops)) {
40018 int OpIdx = I.index();
40019 SDValue &Op = I.value();
40020
40021 // What range of shuffle mask element values results in picking from Op?
40022 int Lo = OpIdx * Mask.size();
40023 int Hi = Lo + Mask.size();
40024
40025 // Which elements of Op do we demand, given the mask's granularity?
40026 APInt OpDemandedElts(Mask.size(), 0);
40027 for (int MaskElt : Mask) {
40028 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
40029 int OpEltIdx = MaskElt - Lo;
40030 OpDemandedElts.setBit(OpEltIdx);
40031 }
40032 }
40033
40034 // Is the shuffle result smaller than the root?
40035 if (Op.getValueSizeInBits() < RootSizeInBits) {
40036 // We padded the mask with undefs. But we now need to undo that.
40037 unsigned NumExpectedVectorElts = Mask.size();
40038 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
40039 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
40040 assert(!OpDemandedElts.extractBits(
40041 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40042 "Demanding the virtual undef widening padding?");
40043 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
40044 }
40045
40046 // The Op itself may be of different VT, so we need to scale the mask.
40047 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
40048 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
40049
40050 // Can this operand be simplified any further, given it's demanded elements?
40051 if (SDValue NewOp =
40052 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
40053 Op, OpScaledDemandedElts, DAG))
40054 Op = NewOp;
40055 }
40056 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
40057
40058 // Widen any subvector shuffle inputs we've collected.
40059 // TODO: Remove this to avoid generating temporary nodes, we should only
40060 // widen once combineX86ShuffleChain has found a match.
40061 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
40062 return Op.getValueSizeInBits() < RootSizeInBits;
40063 })) {
40064 for (SDValue &Op : Ops)
40065 if (Op.getValueSizeInBits() < RootSizeInBits)
40066 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
40067 RootSizeInBits);
40068 // Reresolve - we might have repeated subvector sources.
40069 resolveTargetShuffleInputsAndMask(Ops, Mask);
40070 }
40071
40072 // We can only combine unary and binary shuffle mask cases.
40073 if (Ops.size() <= 2) {
40074 // Minor canonicalization of the accumulated shuffle mask to make it easier
40075 // to match below. All this does is detect masks with sequential pairs of
40076 // elements, and shrink them to the half-width mask. It does this in a loop
40077 // so it will reduce the size of the mask to the minimal width mask which
40078 // performs an equivalent shuffle.
40079 while (Mask.size() > 1) {
40080 SmallVector<int, 64> WidenedMask;
40081 if (!canWidenShuffleElements(Mask, WidenedMask))
40082 break;
40083 Mask = std::move(WidenedMask);
40084 }
40085
40086 // Canonicalization of binary shuffle masks to improve pattern matching by
40087 // commuting the inputs.
40088 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
40089 ShuffleVectorSDNode::commuteMask(Mask);
40090 std::swap(Ops[0], Ops[1]);
40091 }
40092
40093 // Try to combine into a single shuffle instruction.
40094 if (SDValue Shuffle = combineX86ShuffleChain(
40095 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40096 AllowVariablePerLaneMask, DAG, Subtarget))
40097 return Shuffle;
40098
40099 // If all the operands come from the same larger vector, fallthrough and try
40100 // to use combineX86ShuffleChainWithExtract.
40101 SDValue LHS = peekThroughBitcasts(Ops.front());
40102 SDValue RHS = peekThroughBitcasts(Ops.back());
40103 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
40104 (RootSizeInBits / Mask.size()) != 64 ||
40105 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40106 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40107 LHS.getOperand(0) != RHS.getOperand(0))
40108 return SDValue();
40109 }
40110
40111 // If that failed and any input is extracted then try to combine as a
40112 // shuffle with the larger type.
40113 return combineX86ShuffleChainWithExtract(
40114 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40115 AllowVariablePerLaneMask, DAG, Subtarget);
40116}
40117
40118/// Helper entry wrapper to combineX86ShufflesRecursively.
40119 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
40120 const X86Subtarget &Subtarget) {
40121 return combineX86ShufflesRecursively(
40122 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
40123 /*HasVarMask*/ false,
40124 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
40125 Subtarget);
40126}
40127
40128/// Get the PSHUF-style mask from PSHUF node.
40129///
40130 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
40131/// PSHUF-style masks that can be reused with such instructions.
40132 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
40133 MVT VT = N.getSimpleValueType();
40134 SmallVector<int, 4> Mask;
40135 SmallVector<SDValue, 2> Ops;
40136 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
40137 (void)HaveMask;
40138 assert(HaveMask);
40139
40140 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40141 // matter. Check that the upper masks are repeats and remove them.
40142 if (VT.getSizeInBits() > 128) {
40143 int LaneElts = 128 / VT.getScalarSizeInBits();
40144#ifndef NDEBUG
40145 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
40146 for (int j = 0; j < LaneElts; ++j)
40147 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40148 "Mask doesn't repeat in high 128-bit lanes!");
40149#endif
40150 Mask.resize(LaneElts);
40151 }
40152
40153 switch (N.getOpcode()) {
40154 case X86ISD::PSHUFD:
40155 return Mask;
40156 case X86ISD::PSHUFLW:
40157 Mask.resize(4);
40158 return Mask;
40159 case X86ISD::PSHUFHW:
40160 Mask.erase(Mask.begin(), Mask.begin() + 4);
40161 for (int &M : Mask)
40162 M -= 4;
40163 return Mask;
40164 default:
40165 llvm_unreachable("No valid shuffle instruction found!");
40166 }
40167}
40168
40169/// Search for a combinable shuffle across a chain ending in pshufd.
40170///
40171/// We walk up the chain and look for a combinable shuffle, skipping over
40172/// shuffles that we could hoist this shuffle's transformation past without
40173/// altering anything.
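/// e.g. a PSHUFD that leaves dwords 0/1 in place and only rearranges dwords
/// 2/3 can be hoisted past an intervening PSHUFLW and merged into an earlier
/// PSHUFD, leaving a single dword shuffle in the chain.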
40174 static SDValue combineRedundantDWordShuffle(SDValue N,
40175 MutableArrayRef<int> Mask,
40176 const SDLoc &DL,
40177 SelectionDAG &DAG) {
40178 assert(N.getOpcode() == X86ISD::PSHUFD &&
40179 "Called with something other than an x86 128-bit half shuffle!");
40180
40181 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40182 // of the shuffles in the chain so that we can form a fresh chain to replace
40183 // this one.
40184 SmallVector<SDValue, 8> Chain;
40185 SDValue V = N.getOperand(0);
40186 for (; V.hasOneUse(); V = V.getOperand(0)) {
40187 switch (V.getOpcode()) {
40188 default:
40189 return SDValue(); // Nothing combined!
40190
40191 case ISD::BITCAST:
40192 // Skip bitcasts as we always know the type for the target specific
40193 // instructions.
40194 continue;
40195
40196 case X86ISD::PSHUFD:
40197 // Found another dword shuffle.
40198 break;
40199
40200 case X86ISD::PSHUFLW:
40201 // Check that the low words (being shuffled) are the identity in the
40202 // dword shuffle, and the high words are self-contained.
40203 if (Mask[0] != 0 || Mask[1] != 1 ||
40204 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
40205 return SDValue();
40206
40207 Chain.push_back(V);
40208 continue;
40209
40210 case X86ISD::PSHUFHW:
40211 // Check that the high words (being shuffled) are the identity in the
40212 // dword shuffle, and the low words are self-contained.
40213 if (Mask[2] != 2 || Mask[3] != 3 ||
40214 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
40215 return SDValue();
40216
40217 Chain.push_back(V);
40218 continue;
40219
40220 case X86ISD::UNPCKL:
40221 case X86ISD::UNPCKH:
40222 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40223 // shuffle into a preceding word shuffle.
40224 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
40225 V.getSimpleValueType().getVectorElementType() != MVT::i16)
40226 return SDValue();
40227
40228 // Search for a half-shuffle which we can combine with.
40229 unsigned CombineOp =
40230 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
40231 if (V.getOperand(0) != V.getOperand(1) ||
40232 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40233 return SDValue();
40234 Chain.push_back(V);
40235 V = V.getOperand(0);
40236 do {
40237 switch (V.getOpcode()) {
40238 default:
40239 return SDValue(); // Nothing to combine.
40240
40241 case X86ISD::PSHUFLW:
40242 case X86ISD::PSHUFHW:
40243 if (V.getOpcode() == CombineOp)
40244 break;
40245
40246 Chain.push_back(V);
40247
40248 [[fallthrough]];
40249 case ISD::BITCAST:
40250 V = V.getOperand(0);
40251 continue;
40252 }
40253 break;
40254 } while (V.hasOneUse());
40255 break;
40256 }
40257 // Break out of the loop if we break out of the switch.
40258 break;
40259 }
40260
40261 if (!V.hasOneUse())
40262 // We fell out of the loop without finding a viable combining instruction.
40263 return SDValue();
40264
40265 // Merge this node's mask and our incoming mask.
40266 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40267 for (int &M : Mask)
40268 M = VMask[M];
40269 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
40270 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40271
40272 // Rebuild the chain around this new shuffle.
40273 while (!Chain.empty()) {
40274 SDValue W = Chain.pop_back_val();
40275
40276 if (V.getValueType() != W.getOperand(0).getValueType())
40277 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40278
40279 switch (W.getOpcode()) {
40280 default:
40281 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
40282
40283 case X86ISD::UNPCKL:
40284 case X86ISD::UNPCKH:
40285 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40286 break;
40287
40288 case X86ISD::PSHUFD:
40289 case X86ISD::PSHUFLW:
40290 case X86ISD::PSHUFHW:
40291 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40292 break;
40293 }
40294 }
40295 if (V.getValueType() != N.getValueType())
40296 V = DAG.getBitcast(N.getValueType(), V);
40297
40298 // Return the new chain to replace N.
40299 return V;
40300}
40301
40302// Attempt to commute shufps LHS loads:
40303// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
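// In the SHUFP immediate the low nibble selects elements from the first source
// and the high nibble from the second, so commuting the sources just swaps the
// two nibbles; the imm ^ 0xAA/0x0A/0xA0 tweaks below adjust the outer shuffle's
// 2-bit selectors to compensate.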
40304 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40305 SelectionDAG &DAG) {
40306 // TODO: Add vXf64 support.
40307 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40308 return SDValue();
40309
40310 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40311 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40312 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40313 return SDValue();
40314 SDValue N0 = V.getOperand(0);
40315 SDValue N1 = V.getOperand(1);
40316 unsigned Imm = V.getConstantOperandVal(2);
40317 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40318 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40319 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40320 return SDValue();
40321 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40322 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40323 DAG.getTargetConstant(Imm, DL, MVT::i8));
40324 };
40325
40326 switch (N.getOpcode()) {
40327 case X86ISD::VPERMILPI:
40328 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40329 unsigned Imm = N.getConstantOperandVal(1);
40330 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40331 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40332 }
40333 break;
40334 case X86ISD::SHUFP: {
40335 SDValue N0 = N.getOperand(0);
40336 SDValue N1 = N.getOperand(1);
40337 unsigned Imm = N.getConstantOperandVal(2);
40338 if (N0 == N1) {
40339 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40340 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40341 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40342 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40343 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40344 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40345 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40346 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40347 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40348 }
40349 break;
40350 }
40351 }
40352
40353 return SDValue();
40354}
40355
40356// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
40357// iff we don't demand the same element index for both X and Y.
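// e.g. with X' = permute(X,<2,3,0,1>), Y' = permute(Y,<2,3,0,1>) and blend mask
// <0,1,6,7>, the result <X[2],X[3],Y[0],Y[1]> demands disjoint element indices
// from X and Y, so it can be rebuilt as blend(X,Y,<4,5,2,3>) followed by
// permute(<2,3,0,1>).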
40358static SDValue
40359 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
40360 const APInt &DemandedElts, SelectionDAG &DAG,
40361 const X86Subtarget &Subtarget, const SDLoc &DL) {
40362 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
40363 if (!N0.hasOneUse() || !N1.hasOneUse())
40364 return SDValue();
40365
40366 unsigned NumElts = VT.getVectorNumElements();
40367 SDValue BC0 = peekThroughOneUseBitcasts(N0);
40368 SDValue BC1 = peekThroughOneUseBitcasts(N1);
40369
40370 // See if both operands are shuffles, and that we can scale the shuffle masks
40371 // to the same width as the blend mask.
40372 // TODO: Support SM_SentinelZero?
40373 SmallVector<SDValue, 2> Ops0, Ops1;
40374 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
40375 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
40376 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
40377 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
40378 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
40379 return SDValue();
40380
40381 // Determine the demanded elts from both permutes.
40382 APInt Demanded0, DemandedLHS0, DemandedRHS0;
40383 APInt Demanded1, DemandedLHS1, DemandedRHS1;
40384 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
40385 Demanded1,
40386 /*AllowUndefElts=*/true) ||
40387 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
40388 DemandedRHS0, /*AllowUndefElts=*/true) ||
40389 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
40390 DemandedRHS1, /*AllowUndefElts=*/true))
40391 return SDValue();
40392
40393 // Confirm that we only use a single operand from both permutes and that we
40394 // don't demand the same index from both.
40395 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
40396 DemandedLHS0.intersects(DemandedLHS1))
40397 return SDValue();
40398
40399 // Use the permute demanded elts masks as the new blend mask.
40400 // Create the new permute mask as a blend of the 2 original permute masks.
40401 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
40402 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
40403 for (unsigned I = 0; I != NumElts; ++I) {
40404 if (Demanded0[I]) {
40405 int M = ScaledMask0[I];
40406 if (0 <= M) {
40407 assert(isUndefOrEqual(NewBlendMask[M], M) &&
40408 "BlendMask demands LHS AND RHS");
40409 NewBlendMask[M] = M;
40410 NewPermuteMask[I] = M;
40411 }
40412 } else if (Demanded1[I]) {
40413 int M = ScaledMask1[I];
40414 if (0 <= M) {
40415 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
40416 "BlendMask demands LHS AND RHS");
40417 NewBlendMask[M] = M + NumElts;
40418 NewPermuteMask[I] = M;
40419 }
40420 }
40421 }
40422 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
40423 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
40424
40425 // v16i16 shuffles can explode in complexity very easily, only accept them if
40426 // the blend mask is the same in the 128-bit subvectors (or can widen to
40427 // v8i32) and the permute can be widened as well.
40428 if (VT == MVT::v16i16) {
40429 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
40430 !canWidenShuffleElements(NewBlendMask))
40431 return SDValue();
40432 if (!canWidenShuffleElements(NewPermuteMask))
40433 return SDValue();
40434 }
40435
40436 // Don't introduce lane-crossing permutes without AVX2, unless it can be
40437 // widened to a lane permute (vperm2f128).
40438 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
40439 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
40440 NewPermuteMask) &&
40441 !canScaleShuffleElements(NewPermuteMask, 2))
40442 return SDValue();
40443
40444 SDValue NewBlend =
40445 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
40446 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
40447 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
40448 NewPermuteMask);
40449}
40450
40451// TODO - move this to TLI like isBinOp?
40452static bool isUnaryOp(unsigned Opcode) {
40453 switch (Opcode) {
40454 case ISD::CTLZ:
40455 case ISD::CTTZ:
40456 case ISD::CTPOP:
40457 return true;
40458 }
40459 return false;
40460}
40461
40462// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40463// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
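// e.g. pshufd(add(x, splat_c)) -> add(pshufd(x), pshufd(splat_c)); since the
// shuffle of a splat/constant folds away, the net effect is to move the shuffle
// closer to operands where it can combine further or disappear.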
40464 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40465 const SDLoc &DL) {
40466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40467 EVT ShuffleVT = N.getValueType();
40468 unsigned Opc = N.getOpcode();
40469
40470 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40471 bool FoldLoad = false) {
40472 // AllZeros/AllOnes constants are freely shuffled and will peek through
40473 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40474 // merge with target shuffles if it has one use so shuffle combining is
40475 // likely to kick in. Shuffles of splats are expected to be removed.
40476 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40477 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40478 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40479 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40480 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40481 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40482 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40483 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40484 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40485 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40486 };
40487 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40488 // Ensure we only shuffle whole vector src elements, unless its a logical
40489 // binops where we can more aggressively move shuffles from dst to src.
40490 return isLogicOp(BinOp) ||
40491 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40492 };
40493
40494 switch (Opc) {
40495 // Unary and Unary+Permute Shuffles.
40496 case X86ISD::PSHUFB: {
40497 // Don't merge PSHUFB if it contains zero'd elements.
40498 SmallVector<int> Mask;
40499 SmallVector<SDValue> Ops;
40500 if (!getTargetShuffleMask(N, false, Ops, Mask))
40501 break;
40502 [[fallthrough]];
40503 }
40504 case X86ISD::VBROADCAST:
40505 case X86ISD::MOVDDUP:
40506 case X86ISD::PSHUFD:
40507 case X86ISD::PSHUFHW:
40508 case X86ISD::PSHUFLW:
40509 case X86ISD::VPERMI:
40510 case X86ISD::VPERMILPI: {
40511 if (N.getOperand(0).getValueType() == ShuffleVT &&
40512 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40513 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40514 unsigned SrcOpcode = N0.getOpcode();
40515 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40516 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40517 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40518 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40519 Opc != X86ISD::PSHUFB) ||
40520 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40521 Opc != X86ISD::PSHUFB)) {
40522 SDValue LHS, RHS;
40523 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40524 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40525 if (N.getNumOperands() == 2) {
40526 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40527 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40528 } else {
40529 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40530 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40531 }
40532 EVT OpVT = N0.getValueType();
40533 return DAG.getBitcast(ShuffleVT,
40534 DAG.getNode(SrcOpcode, DL, OpVT,
40535 DAG.getBitcast(OpVT, LHS),
40536 DAG.getBitcast(OpVT, RHS)));
40537 }
40538 }
40539 }
40540 break;
40541 }
40542 // Binary and Binary+Permute Shuffles.
40543 case X86ISD::INSERTPS: {
40544 // Don't merge INSERTPS if it contains zero'd elements.
40545 unsigned InsertPSMask = N.getConstantOperandVal(2);
40546 unsigned ZeroMask = InsertPSMask & 0xF;
40547 if (ZeroMask != 0)
40548 break;
40549 [[fallthrough]];
40550 }
40551 case X86ISD::MOVSD:
40552 case X86ISD::MOVSS:
40553 case X86ISD::BLENDI:
40554 case X86ISD::SHUFP:
40555 case X86ISD::UNPCKH:
40556 case X86ISD::UNPCKL: {
40557 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40558 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40559 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40560 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40561 unsigned SrcOpcode = N0.getOpcode();
40562 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40563 N0.getValueType() == N1.getValueType() &&
40564 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40565 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40566 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40567 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40568 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40569 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40570 // Ensure the total number of shuffles doesn't increase by folding this
40571 // shuffle through to the source ops.
40572 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40573 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40574 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40575 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40576 SDValue LHS, RHS;
40577 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40578 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40579 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40580 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40581 if (N.getNumOperands() == 3) {
40582 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40583 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40584 } else {
40585 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40586 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40587 }
40588 EVT OpVT = N0.getValueType();
40589 return DAG.getBitcast(ShuffleVT,
40590 DAG.getNode(SrcOpcode, DL, OpVT,
40591 DAG.getBitcast(OpVT, LHS),
40592 DAG.getBitcast(OpVT, RHS)));
40593 }
40594 }
40595 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40596 N0.getValueType() == N1.getValueType() &&
40597 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40598 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40599 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40600 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40601 SDValue Res;
40602 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40603 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40604 if (N.getNumOperands() == 3) {
40605 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40606 } else {
40607 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40608 }
40609 EVT OpVT = N0.getValueType();
40610 return DAG.getBitcast(
40611 ShuffleVT,
40612 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40613 }
40614 }
40615 break;
40616 }
40617 }
40618 return SDValue();
40619}
40620
40621/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
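/// e.g. vperm2f128(movddup(x), movddup(y), imm) becomes
/// movddup(vperm2f128(x, y, imm)), so the repeated per-lane op is applied once
/// after the lane shuffle instead of once per source.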
40622 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40623 SelectionDAG &DAG,
40624 const SDLoc &DL) {
40625 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40626
40627 MVT VT = V.getSimpleValueType();
40628 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40629 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40630 unsigned SrcOpc0 = Src0.getOpcode();
40631 unsigned SrcOpc1 = Src1.getOpcode();
40632 EVT SrcVT0 = Src0.getValueType();
40633 EVT SrcVT1 = Src1.getValueType();
40634
40635 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40636 return SDValue();
40637
40638 switch (SrcOpc0) {
40639 case X86ISD::MOVDDUP: {
40640 SDValue LHS = Src0.getOperand(0);
40641 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40642 SDValue Res =
40643 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40644 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40645 return DAG.getBitcast(VT, Res);
40646 }
40647 case X86ISD::VPERMILPI:
40648 // TODO: Handle v4f64 permutes with different low/high lane masks.
40649 if (SrcVT0 == MVT::v4f64) {
40650 uint64_t Mask = Src0.getConstantOperandVal(1);
40651 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40652 break;
40653 }
40654 [[fallthrough]];
40655 case X86ISD::VSHLI:
40656 case X86ISD::VSRLI:
40657 case X86ISD::VSRAI:
40658 case X86ISD::PSHUFD:
40659 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40660 SDValue LHS = Src0.getOperand(0);
40661 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40662 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40663 V.getOperand(2));
40664 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40665 return DAG.getBitcast(VT, Res);
40666 }
40667 break;
40668 }
40669
40670 return SDValue();
40671}
40672
40673/// Try to combine x86 target specific shuffles.
40674static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
40675 SelectionDAG &DAG,
40676 TargetLowering::DAGCombinerInfo &DCI,
40677 const X86Subtarget &Subtarget) {
40678 MVT VT = N.getSimpleValueType();
40679 SmallVector<int, 4> Mask;
40680 unsigned Opcode = N.getOpcode();
40681 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40682
40683 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40684 return R;
40685
40686 // Handle specific target shuffles.
40687 switch (Opcode) {
40688 case X86ISD::MOVDDUP: {
40689 SDValue Src = N.getOperand(0);
40690 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40691 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40692 ISD::isNormalLoad(Src.getNode())) {
40693 LoadSDNode *LN = cast<LoadSDNode>(Src);
40694 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40695 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40696 DCI.CombineTo(N.getNode(), Movddup);
40697 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40698 DCI.recursivelyDeleteUnusedNodes(LN);
40699 return N; // Return N so it doesn't get rechecked!
40700 }
40701 }
40702
40703 return SDValue();
40704 }
40705 case X86ISD::VBROADCAST: {
40706 SDValue Src = N.getOperand(0);
40707 SDValue BC = peekThroughBitcasts(Src);
40708 EVT SrcVT = Src.getValueType();
40709 EVT BCVT = BC.getValueType();
40710
40711 // If broadcasting from another shuffle, attempt to simplify it.
40712 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40713 if (isTargetShuffle(BC.getOpcode()) &&
40714 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40715 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40716 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40717 SM_SentinelUndef);
40718 for (unsigned i = 0; i != Scale; ++i)
40719 DemandedMask[i] = i;
40720 if (SDValue Res = combineX86ShufflesRecursively(
40721 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40722 X86::MaxShuffleCombineDepth,
40723 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40724 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40725 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40726 DAG.getBitcast(SrcVT, Res));
40727 }
40728
40729 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40730 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40731 if (Src.getOpcode() == ISD::BITCAST &&
40732 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40733 TLI.isTypeLegal(BCVT) &&
40734 FixedVectorType::isValidElementType(
40735 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40736 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40737 VT.getVectorNumElements());
40738 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40739 }
40740
40741 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40742 // If we're re-broadcasting a smaller type then broadcast with that type and
40743 // bitcast.
40744 // TODO: Do this for any splat?
40745 if (Src.getOpcode() == ISD::BITCAST &&
40746 (BC.getOpcode() == X86ISD::VBROADCAST ||
40747 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40748 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40749 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40750 MVT NewVT =
40751 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40752 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40753 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40754 }
40755
40756 // Reduce broadcast source vector to lowest 128-bits.
40757 if (SrcVT.getSizeInBits() > 128)
40758 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40759 extract128BitVector(Src, 0, DAG, DL));
40760
40761 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40762 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40763 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40764 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40765
40766 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40767 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40768 isNullConstant(Src.getOperand(1)) &&
40769 Src.getValueType() ==
40770 Src.getOperand(0).getValueType().getScalarType() &&
40771 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40772 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40773
40774 // Share broadcast with the longest vector and extract low subvector (free).
40775 // Ensure the same SDValue from the SDNode use is being used.
40776 for (SDNode *User : Src->uses())
40777 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40778 Src == User->getOperand(0) &&
40779 User->getValueSizeInBits(0).getFixedValue() >
40780 VT.getFixedSizeInBits()) {
40781 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40782 VT.getSizeInBits());
40783 }
40784
40785 // vbroadcast(scalarload X) -> vbroadcast_load X
40786 // For float loads, extract other uses of the scalar from the broadcast.
40787 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40788 ISD::isNormalLoad(Src.getNode())) {
40789 LoadSDNode *LN = cast<LoadSDNode>(Src);
40790 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40791 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40792 SDValue BcastLd =
40793 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40794 LN->getMemoryVT(), LN->getMemOperand());
40795 // If the load value is used only by N, replace it via CombineTo N.
40796 bool NoReplaceExtract = Src.hasOneUse();
40797 DCI.CombineTo(N.getNode(), BcastLd);
40798 if (NoReplaceExtract) {
40799 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40800 DCI.recursivelyDeleteUnusedNodes(LN);
40801 } else {
40802 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40803 DAG.getIntPtrConstant(0, DL));
40804 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40805 }
40806 return N; // Return N so it doesn't get rechecked!
40807 }
40808
40809 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40810 // i16. So shrink it ourselves if we can make a broadcast_load.
40811 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40812 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40813 assert(Subtarget.hasAVX2() && "Expected AVX2");
40814 SDValue TruncIn = Src.getOperand(0);
40815
40816 // If this is a truncate of a non-extending load we can just narrow it to
40817 // use a broadcast_load.
40818 if (ISD::isNormalLoad(TruncIn.getNode())) {
40819 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40820 // Unless it's volatile or atomic.
40821 if (LN->isSimple()) {
40822 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40823 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40824 SDValue BcastLd = DAG.getMemIntrinsicNode(
40825 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40826 LN->getPointerInfo(), LN->getOriginalAlign(),
40827 LN->getMemOperand()->getFlags());
40828 DCI.CombineTo(N.getNode(), BcastLd);
40829 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40830 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40831 return N; // Return N so it doesn't get rechecked!
40832 }
40833 }
40834
40835 // If this is a truncate of an i16 extload, we can directly replace it.
40836 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40837 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40838 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40839 if (LN->getMemoryVT().getSizeInBits() == 16) {
40840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40841 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40842 SDValue BcastLd =
40843 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40844 LN->getMemoryVT(), LN->getMemOperand());
40845 DCI.CombineTo(N.getNode(), BcastLd);
40846 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40847 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40848 return N; // Return N so it doesn't get rechecked!
40849 }
40850 }
40851
40852 // If this is a truncate of a load that has been shifted right, we can
40853 // offset the pointer and use a narrower load.
40854 if (TruncIn.getOpcode() == ISD::SRL &&
40855 TruncIn.getOperand(0).hasOneUse() &&
40856 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40857 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40858 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40859 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40860 // Make sure the shift amount and the load size are divisible by 16.
40861 // Don't do this if the load is volatile or atomic.
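// e.g. broadcasting (i16 (trunc (srl (i32 load), 16))) becomes a
// VBROADCAST_LOAD of the i16 located 2 bytes past the original pointer.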
40862 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40863 LN->isSimple()) {
40864 unsigned Offset = ShiftAmt / 8;
40865 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40866 SDValue Ptr = DAG.getMemBasePlusOffset(
40867 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
40868 SDValue Ops[] = { LN->getChain(), Ptr };
40869 SDValue BcastLd = DAG.getMemIntrinsicNode(
40870 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40871 LN->getPointerInfo().getWithOffset(Offset),
40872 LN->getOriginalAlign(),
40873 LN->getMemOperand()->getFlags());
40874 DCI.CombineTo(N.getNode(), BcastLd);
40875 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40876 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40877 return N; // Return N so it doesn't get rechecked!
40878 }
40879 }
40880 }
40881
40882 // vbroadcast(vzload X) -> vbroadcast_load X
40883 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40884 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40885 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40886 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40887 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40888 SDValue BcastLd =
40889 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40890 LN->getMemoryVT(), LN->getMemOperand());
40891 DCI.CombineTo(N.getNode(), BcastLd);
40892 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40893 DCI.recursivelyDeleteUnusedNodes(LN);
40894 return N; // Return N so it doesn't get rechecked!
40895 }
40896 }
40897
40898 // vbroadcast(vector load X) -> vbroadcast_load
40899 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40900 SrcVT == MVT::v4i32) &&
40901 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40902 LoadSDNode *LN = cast<LoadSDNode>(Src);
40903 // Unless the load is volatile or atomic.
40904 if (LN->isSimple()) {
40905 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40906 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40907 SDValue BcastLd = DAG.getMemIntrinsicNode(
40908 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40909 LN->getPointerInfo(), LN->getOriginalAlign(),
40910 LN->getMemOperand()->getFlags());
40911 DCI.CombineTo(N.getNode(), BcastLd);
40912 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40913 DCI.recursivelyDeleteUnusedNodes(LN);
40914 return N; // Return N so it doesn't get rechecked!
40915 }
40916 }
40917
40918 return SDValue();
40919 }
40920 case X86ISD::VZEXT_MOVL: {
40921 SDValue N0 = N.getOperand(0);
40922
40923 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
40924 // the load is volatile.
40925 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40926 auto *LN = cast<LoadSDNode>(N0);
40927 if (SDValue VZLoad =
40928 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40929 DCI.CombineTo(N.getNode(), VZLoad);
40930 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40931 DCI.recursivelyDeleteUnusedNodes(LN);
40932 return N;
40933 }
40934 }
40935
40936 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40937 // and can just use a VZEXT_LOAD.
40938 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40939 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40940 auto *LN = cast<MemSDNode>(N0);
40941 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40942 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40943 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40944 SDValue VZLoad =
40945 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
40946 LN->getMemoryVT(), LN->getMemOperand());
40947 DCI.CombineTo(N.getNode(), VZLoad);
40948 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40949 DCI.recursivelyDeleteUnusedNodes(LN);
40950 return N;
40951 }
40952 }
40953
40954 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40955 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40956 // if the upper bits of the i64 are zero.
40957 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40958 N0.getOperand(0).hasOneUse() &&
40959 N0.getOperand(0).getValueType() == MVT::i64) {
40960 SDValue In = N0.getOperand(0);
40961 APInt Mask = APInt::getHighBitsSet(64, 32);
40962 if (DAG.MaskedValueIsZero(In, Mask)) {
40963 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40964 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40965 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40966 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40967 return DAG.getBitcast(VT, Movl);
40968 }
40969 }
40970
40971 // Load a scalar integer constant directly to XMM instead of transferring an
40972 // immediate value from GPR.
40973 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40974 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40975 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40976 // Create a vector constant - scalar constant followed by zeros.
40977 EVT ScalarVT = N0.getOperand(0).getValueType();
40978 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40979 unsigned NumElts = VT.getVectorNumElements();
40980 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40981 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40982 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40983
40984 // Load the vector constant from constant pool.
40985 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40986 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40987 MachinePointerInfo MPI =
40988 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
40989 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
40990 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
40991 MachineMemOperand::MOLoad);
40992 }
40993 }
40994
40995 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40996 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40997 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
40998 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
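// e.g. (vzext_movl (insert_subvector undef, X, 0)) becomes
// (insert_subvector (zero vector), (vzext_movl X), 0) on the narrower type.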
40999 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41000 SDValue V = peekThroughOneUseBitcasts(N0);
41001
41002 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41003 isNullConstant(V.getOperand(2))) {
41004 SDValue In = V.getOperand(1);
41005 MVT SubVT = MVT::getVectorVT(VT.getScalarType(),
41006 In.getValueSizeInBits() /
41007 VT.getScalarSizeInBits());
41008 In = DAG.getBitcast(SubVT, In);
41009 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41010 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41011 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41012 V.getOperand(2));
41013 }
41014 }
41015
41016 return SDValue();
41017 }
41018 case X86ISD::BLENDI: {
41019 SDValue N0 = N.getOperand(0);
41020 SDValue N1 = N.getOperand(1);
41021
41022 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41023 // TODO: Handle MVT::v16i16 repeated blend mask.
41024 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41025 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41026 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41027 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41028 SrcVT.getScalarSizeInBits() >= 32) {
41029 unsigned Size = VT.getVectorNumElements();
41030 unsigned NewSize = SrcVT.getVectorNumElements();
41031 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
41032 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
41033 return DAG.getBitcast(
41034 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41035 N1.getOperand(0),
41036 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
41037 DL, MVT::i8)));
41038 }
41039 }
41040 return SDValue();
41041 }
41042 case X86ISD::SHUFP: {
41043 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41044 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41045 // TODO: Support types other than v4f32.
41046 if (VT == MVT::v4f32) {
41047 bool Updated = false;
41048 SmallVector<int> Mask;
41049 SmallVector<SDValue> Ops;
41050 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
41051 for (int i = 0; i != 2; ++i) {
41052 SmallVector<SDValue> SubOps;
41053 SmallVector<int> SubMask, SubScaledMask;
41054 SDValue Sub = peekThroughBitcasts(Ops[i]);
41055 // TODO: Scaling might be easier if we specify the demanded elts.
41056 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41057 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41058 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41059 int Ofs = i * 2;
41060 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41061 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41062 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41063 Updated = true;
41064 }
41065 }
41066 }
41067 if (Updated) {
41068 for (int &M : Mask)
41069 M %= 4;
41070 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41071 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41072 }
41073 }
41074 return SDValue();
41075 }
41076 case X86ISD::VPERMI: {
41077 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41078 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41079 SDValue N0 = N.getOperand(0);
41080 SDValue N1 = N.getOperand(1);
41081 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41082 if (N0.getOpcode() == ISD::BITCAST &&
41083 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41084 SDValue Src = N0.getOperand(0);
41085 EVT SrcVT = Src.getValueType();
41086 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41087 return DAG.getBitcast(VT, Res);
41088 }
41089 return SDValue();
41090 }
41091 case X86ISD::SHUF128: {
41092 // If we're permuting the upper 256-bit subvectors of a concatenation, then
41093 // see if we can peek through and access the subvector directly.
41094 if (VT.is512BitVector()) {
41095 // The 512-bit mask uses 4 x i2 indices - if the MSB is set in both indices
41096 // for a source then only its upper 256-bit subvector is used.
41097 SDValue LHS = N->getOperand(0);
41098 SDValue RHS = N->getOperand(1);
41099 uint64_t Mask = N->getConstantOperandVal(2);
41100 SmallVector<SDValue> LHSOps, RHSOps;
41101 SDValue NewLHS, NewRHS;
41102 if ((Mask & 0x0A) == 0x0A &&
41103 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
41104 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
41105 Mask &= ~0x0A;
41106 }
41107 if ((Mask & 0xA0) == 0xA0 &&
41108 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
41109 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
41110 Mask &= ~0xA0;
41111 }
41112 if (NewLHS || NewRHS)
41113 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
41114 NewRHS ? NewRHS : RHS,
41115 DAG.getTargetConstant(Mask, DL, MVT::i8));
41116 }
41117 return SDValue();
41118 }
41119 case X86ISD::VPERM2X128: {
41120 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41121 SDValue LHS = N->getOperand(0);
41122 SDValue RHS = N->getOperand(1);
41123 if (LHS.getOpcode() == ISD::BITCAST &&
41124 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
41125 EVT SrcVT = LHS.getOperand(0).getValueType();
41126 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
41127 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
41128 DAG.getBitcast(SrcVT, LHS),
41129 DAG.getBitcast(SrcVT, RHS),
41130 N->getOperand(2)));
41131 }
41132 }
41133
41134 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41135 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
41136 return Res;
41137
41138 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
41139 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
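// Each nibble of the VPERM2X128 immediate selects one 128-bit half of the
// result: values 0-1 pick a half of the first source, values 2-3 a half of
// the second source, and bit 3 of the nibble zeroes that half instead.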
41140 auto FindSubVector128 = [&](unsigned Idx) {
41141 if (Idx > 3)
41142 return SDValue();
41143 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
41144 SmallVector<SDValue> SubOps;
41145 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
41146 return SubOps[Idx & 1];
41147 unsigned NumElts = Src.getValueType().getVectorNumElements();
41148 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
41149 Src.getOperand(1).getValueSizeInBits() == 128 &&
41150 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
41151 return Src.getOperand(1);
41152 }
41153 return SDValue();
41154 };
41155 unsigned Imm = N.getConstantOperandVal(2);
41156 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
41157 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
41158 MVT SubVT = VT.getHalfNumVectorElementsVT();
41159 SubLo = DAG.getBitcast(SubVT, SubLo);
41160 SubHi = DAG.getBitcast(SubVT, SubHi);
41161 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
41162 }
41163 }
41164 return SDValue();
41165 }
41166 case X86ISD::PSHUFD:
41167 case X86ISD::PSHUFLW:
41168 case X86ISD::PSHUFHW: {
41169 SDValue N0 = N.getOperand(0);
41170 SDValue N1 = N.getOperand(1);
41171 if (N0->hasOneUse()) {
41172 SDValue V = peekThroughOneUseBitcasts(N0);
41173 switch (V.getOpcode()) {
41174 case X86ISD::VSHL:
41175 case X86ISD::VSRL:
41176 case X86ISD::VSRA:
41177 case X86ISD::VSHLI:
41178 case X86ISD::VSRLI:
41179 case X86ISD::VSRAI:
41180 case X86ISD::VROTLI:
41181 case X86ISD::VROTRI: {
41182 MVT InnerVT = V.getSimpleValueType();
41183 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
41184 SDValue Res = DAG.getNode(Opcode, DL, VT,
41185 DAG.getBitcast(VT, V.getOperand(0)), N1);
41186 Res = DAG.getBitcast(InnerVT, Res);
41187 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
41188 return DAG.getBitcast(VT, Res);
41189 }
41190 break;
41191 }
41192 }
41193 }
41194
41195 Mask = getPSHUFShuffleMask(N);
41196 assert(Mask.size() == 4);
41197 break;
41198 }
41199 case X86ISD::MOVSD:
41200 case X86ISD::MOVSH:
41201 case X86ISD::MOVSS: {
41202 SDValue N0 = N.getOperand(0);
41203 SDValue N1 = N.getOperand(1);
41204
41205 // Canonicalize scalar FPOps:
41206 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41207 // If commutable, allow OP(N1[0], N0[0]).
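// e.g. MOVSS(X, FADD(X, Y)) becomes
// MOVSS(X, SCALAR_TO_VECTOR(FADD(X[0], Y[0]))), so only the scalar FP op
// remains ahead of the blend of the low element.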
41208 unsigned Opcode1 = N1.getOpcode();
41209 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
41210 Opcode1 == ISD::FDIV) {
41211 SDValue N10 = N1.getOperand(0);
41212 SDValue N11 = N1.getOperand(1);
41213 if (N10 == N0 ||
41214 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
41215 if (N10 != N0)
41216 std::swap(N10, N11);
41217 MVT SVT = VT.getVectorElementType();
41218 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
41219 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
41220 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
41221 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
41222 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
41223 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
41224 }
41225 }
41226
41227 return SDValue();
41228 }
41229 case X86ISD::INSERTPS: {
41230 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
41231 SDValue Op0 = N.getOperand(0);
41232 SDValue Op1 = N.getOperand(1);
41233 unsigned InsertPSMask = N.getConstantOperandVal(2);
41234 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
41235 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
41236 unsigned ZeroMask = InsertPSMask & 0xF;
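// INSERTPS control byte: bits [7:6] select the source element of Op1, bits
// [5:4] select the destination element in Op0, and bits [3:0] zero result
// elements regardless of the insertion.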
41237
41238 // If we zero out all elements from Op0 then we don't need to reference it.
41239 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
41240 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
41241 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41242
41243 // If we zero out the element from Op1 then we don't need to reference it.
41244 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
41245 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41246 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41247
41248 // Attempt to merge insertps Op1 with an inner target shuffle node.
41249 SmallVector<int, 8> TargetMask1;
41250 SmallVector<SDValue, 2> Ops1;
41251 APInt KnownUndef1, KnownZero1;
41252 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
41253 KnownZero1)) {
41254 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
41255 // Zero/UNDEF insertion - zero out element and remove dependency.
41256 InsertPSMask |= (1u << DstIdx);
41257 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41258 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41259 }
41260 // Update insertps mask srcidx and reference the source input directly.
41261 int M = TargetMask1[SrcIdx];
41262 assert(0 <= M && M < 8 && "Shuffle index out of range");
41263 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
41264 Op1 = Ops1[M < 4 ? 0 : 1];
41265 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41266 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41267 }
41268
41269 // Attempt to merge insertps Op0 with an inner target shuffle node.
41270 SmallVector<int, 8> TargetMask0;
41271 SmallVector<SDValue, 2> Ops0;
41272 APInt KnownUndef0, KnownZero0;
41273 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41274 KnownZero0)) {
41275 bool Updated = false;
41276 bool UseInput00 = false;
41277 bool UseInput01 = false;
41278 for (int i = 0; i != 4; ++i) {
41279 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41280 // No change if element is already zero or the inserted element.
41281 continue;
41282 }
41283
41284 if (KnownUndef0[i] || KnownZero0[i]) {
41285 // If the target mask is undef/zero then we must zero the element.
41286 InsertPSMask |= (1u << i);
41287 Updated = true;
41288 continue;
41289 }
41290
41291 // The input vector element must be inline.
41292 int M = TargetMask0[i];
41293 if (M != i && M != (i + 4))
41294 return SDValue();
41295
41296 // Determine which inputs of the target shuffle we're using.
41297 UseInput00 |= (0 <= M && M < 4);
41298 UseInput01 |= (4 <= M);
41299 }
41300
41301 // If we're not using both inputs of the target shuffle then use the
41302 // referenced input directly.
41303 if (UseInput00 && !UseInput01) {
41304 Updated = true;
41305 Op0 = Ops0[0];
41306 } else if (!UseInput00 && UseInput01) {
41307 Updated = true;
41308 Op0 = Ops0[1];
41309 }
41310
41311 if (Updated)
41312 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41313 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41314 }
41315
41316 // If we're inserting an element from a vbroadcast load, fold the
41317 // load into the X86insertps instruction. We need to convert the scalar
41318 // load to a vector and clear the source lane of the INSERTPS control.
41319 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41320 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41321 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41322 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41323 MemIntr->getBasePtr(),
41324 MemIntr->getMemOperand());
41325 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41326 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
41327 Load),
41328 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41329 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41330 return Insert;
41331 }
41332 }
41333
41334 return SDValue();
41335 }
41336 default:
41337 return SDValue();
41338 }
41339
41340 // Nuke no-op shuffles that show up after combining.
41341 if (isNoopShuffleMask(Mask))
41342 return N.getOperand(0);
41343
41344 // Look for simplifications involving one or two shuffle instructions.
41345 SDValue V = N.getOperand(0);
41346 switch (N.getOpcode()) {
41347 default:
41348 break;
41349 case X86ISD::PSHUFLW:
41350 case X86ISD::PSHUFHW:
41351 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41352
41353 // See if this reduces to a PSHUFD which is no more expensive and can
41354 // combine with more operations. Note that it has to at least flip the
41355 // dwords as otherwise it would have been removed as a no-op.
41356 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
41357 int DMask[] = {0, 1, 2, 3};
41358 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41359 DMask[DOffset + 0] = DOffset + 1;
41360 DMask[DOffset + 1] = DOffset + 0;
41361 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41362 V = DAG.getBitcast(DVT, V);
41363 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41364 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41365 return DAG.getBitcast(VT, V);
41366 }
41367
41368 // Look for shuffle patterns which can be implemented as a single unpack.
41369 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41370 // only works when we have a PSHUFD followed by two half-shuffles.
41371 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41372 (V.getOpcode() == X86ISD::PSHUFLW ||
41373 V.getOpcode() == X86ISD::PSHUFHW) &&
41374 V.getOpcode() != N.getOpcode() &&
41375 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41376 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41377 if (D.getOpcode() == X86ISD::PSHUFD) {
41378 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41379 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41380 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41381 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41382 int WordMask[8];
41383 for (int i = 0; i < 4; ++i) {
41384 WordMask[i + NOffset] = Mask[i] + NOffset;
41385 WordMask[i + VOffset] = VMask[i] + VOffset;
41386 }
41387 // Map the word mask through the DWord mask.
41388 int MappedMask[8];
41389 for (int i = 0; i < 8; ++i)
41390 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41391 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41392 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41393 // We can replace all three shuffles with an unpack.
41394 V = DAG.getBitcast(VT, D.getOperand(0));
41395 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41396 : X86ISD::UNPCKH,
41397 DL, VT, V, V);
41398 }
41399 }
41400 }
41401
41402 break;
41403
41404 case X86ISD::PSHUFD:
41405 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
41406 return NewN;
41407
41408 break;
41409 }
41410
41411 return SDValue();
41412}
41413
41414/// Checks if the shuffle mask takes subsequent elements
41415/// alternately from two vectors.
41416/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
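/// Concretely, element i must come from position i of one of the two sources,
/// with all even positions drawn from one source and all odd positions from
/// the other.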
41417static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41418
41419 int ParitySrc[2] = {-1, -1};
41420 unsigned Size = Mask.size();
41421 for (unsigned i = 0; i != Size; ++i) {
41422 int M = Mask[i];
41423 if (M < 0)
41424 continue;
41425
41426 // Make sure we are using the matching element from the input.
41427 if ((M % Size) != i)
41428 return false;
41429
41430 // Make sure we use the same input for all elements of the same parity.
41431 int Src = M / Size;
41432 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41433 return false;
41434 ParitySrc[i % 2] = Src;
41435 }
41436
41437 // Make sure each input is used.
41438 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41439 return false;
41440
41441 Op0Even = ParitySrc[0] == 0;
41442 return true;
41443}
41444
41445/// Returns true iff the shuffle node \p N can be replaced with an
41446/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
41447/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
41448///
41449/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
41450/// shuffle nodes so it is easier to generically match. We also insert dummy
41451/// vector shuffle nodes for the operands which explicitly discard the lanes
41452/// which are unused by this operation to try to propagate the fact that they
41453/// are unused through the rest of the combiner.
41454static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41455 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41456 bool &IsSubAdd) {
41457
41458 EVT VT = N->getValueType(0);
41459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41460 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41462 return false;
41463
41464 // We only handle target-independent shuffles.
41465 // FIXME: It would be easy and harmless to use the target shuffle mask
41466 // extraction tool to support more.
41467 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41468 return false;
41469
41470 SDValue V1 = N->getOperand(0);
41471 SDValue V2 = N->getOperand(1);
41472
41473 // Make sure we have an FADD and an FSUB.
41474 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41475 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41476 V1.getOpcode() == V2.getOpcode())
41477 return false;
41478
41479 // If there are other uses of these operations we can't fold them.
41480 if (!V1->hasOneUse() || !V2->hasOneUse())
41481 return false;
41482
41483 // Ensure that both operations have the same operands. Note that we can
41484 // commute the FADD operands.
41485 SDValue LHS, RHS;
41486 if (V1.getOpcode() == ISD::FSUB) {
41487 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41488 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41489 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41490 return false;
41491 } else {
41492 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41493 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41494 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41495 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41496 return false;
41497 }
41498
41499 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41500 bool Op0Even;
41501 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41502 return false;
41503
41504 // It's a subadd if the source supplying the even-parity elements is the FADD.
41505 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41506 : V2->getOpcode() == ISD::FADD;
41507
41508 Opnd0 = LHS;
41509 Opnd1 = RHS;
41510 return true;
41511}
41512
41513/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41514static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
41515 const X86Subtarget &Subtarget,
41516 SelectionDAG &DAG) {
41517 // We only handle target-independent shuffles.
41518 // FIXME: It would be easy and harmless to use the target shuffle mask
41519 // extraction tool to support more.
41520 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41521 return SDValue();
41522
41523 MVT VT = N->getSimpleValueType(0);
41524 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41525 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41526 return SDValue();
41527
41528 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
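// Both sources must compute a*b +/- c with identical operands; the shuffle
// then interleaves the add results and the sub results by parity, which is
// exactly what FMADDSUB/FMSUBADD produce.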
41529 SDValue Op0 = N->getOperand(0);
41530 SDValue Op1 = N->getOperand(1);
41531 SDValue FMAdd = Op0, FMSub = Op1;
41532 if (FMSub.getOpcode() != X86ISD::FMSUB)
41533 std::swap(FMAdd, FMSub);
41534
41535 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41536 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41537 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41538 FMAdd.getOperand(2) != FMSub.getOperand(2))
41539 return SDValue();
41540
41541 // Check for correct shuffle mask.
41542 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41543 bool Op0Even;
41544 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41545 return SDValue();
41546
41547 // FMAddSub takes zeroth operand from FMSub node.
41548 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41549 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41550 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41551 FMAdd.getOperand(2));
41552}
41553
41554/// Try to combine a shuffle into a target-specific add-sub or
41555/// mul-add-sub node.
41556static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
41557 const X86Subtarget &Subtarget,
41558 SelectionDAG &DAG) {
41559 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41560 return V;
41561
41562 SDValue Opnd0, Opnd1;
41563 bool IsSubAdd;
41564 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41565 return SDValue();
41566
41567 MVT VT = N->getSimpleValueType(0);
41568
41569 // Try to generate X86ISD::FMADDSUB node here.
41570 SDValue Opnd2;
41571 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41572 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41573 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41574 }
41575
41576 if (IsSubAdd)
41577 return SDValue();
41578
41579 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41580 // the ADDSUB idiom has been successfully recognized. There are no known
41581 // X86 targets with 512-bit ADDSUB instructions!
41582 if (VT.is512BitVector())
41583 return SDValue();
41584
41585 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41586 // the ADDSUB idiom has been successfully recognized. There are no known
41587 // X86 targets with FP16 ADDSUB instructions!
41588 if (VT.getVectorElementType() == MVT::f16)
41589 return SDValue();
41590
41591 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41592}
41593
41594// We are looking for a shuffle where both sources are concatenated with undef
41595// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41596// if we can express this as a single-source shuffle, that's preferable.
41597static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
41598 SelectionDAG &DAG,
41599 const X86Subtarget &Subtarget) {
41600 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41601 return SDValue();
41602
41603 EVT VT = N->getValueType(0);
41604
41605 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41606 if (!VT.is128BitVector() && !VT.is256BitVector())
41607 return SDValue();
41608
41609 if (VT.getVectorElementType() != MVT::i32 &&
41610 VT.getVectorElementType() != MVT::i64 &&
41611 VT.getVectorElementType() != MVT::f32 &&
41612 VT.getVectorElementType() != MVT::f64)
41613 return SDValue();
41614
41615 SDValue N0 = N->getOperand(0);
41616 SDValue N1 = N->getOperand(1);
41617
41618 // Check that both sources are concats with undef.
41619 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41620 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41621 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41622 !N1.getOperand(1).isUndef())
41623 return SDValue();
41624
41625 // Construct the new shuffle mask. Elements from the first source retain their
41626 // index, but elements from the second source no longer need to skip an undef.
41627 SmallVector<int, 8> Mask;
41628 int NumElts = VT.getVectorNumElements();
41629
41630 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41631 for (int Elt : SVOp->getMask())
41632 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41633
41634 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41635 N1.getOperand(0));
41636 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41637}
41638
41639/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41640/// low half of each source vector and does not set any high half elements in
41641/// the destination vector, narrow the shuffle to half its original size.
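/// For example, a v8f32 shuffle whose mask only references elements 0-3 of
/// each source and leaves the upper half of the result undef can be done as a
/// v4f32 shuffle of the low subvectors and then widened back with undef.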
41642static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41643 EVT VT = Shuf->getValueType(0);
41644 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41645 return SDValue();
41646 if (!VT.is256BitVector() && !VT.is512BitVector())
41647 return SDValue();
41648
41649 // See if we can ignore all of the high elements of the shuffle.
41650 ArrayRef<int> Mask = Shuf->getMask();
41651 if (!isUndefUpperHalf(Mask))
41652 return SDValue();
41653
41654 // Check if the shuffle mask accesses only the low half of each input vector
41655 // (half-index output is 0 or 2).
41656 int HalfIdx1, HalfIdx2;
41657 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41658 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41659 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41660 return SDValue();
41661
41662 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41663 // The trick is knowing that all of the insert/extract are actually free
41664 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41665 // of narrow inputs into a narrow output, and that is always cheaper than
41666 // the wide shuffle that we started with.
41667 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41668 Shuf->getOperand(1), HalfMask, HalfIdx1,
41669 HalfIdx2, false, DAG, /*UseConcat*/ true);
41670}
41671
41672static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41673 TargetLowering::DAGCombinerInfo &DCI,
41674 const X86Subtarget &Subtarget) {
41675 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41676 if (SDValue V = narrowShuffle(Shuf, DAG))
41677 return V;
41678
41679 // If we have legalized the vector types, look for blends of FADD and FSUB
41680 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41681 SDLoc dl(N);
41682 EVT VT = N->getValueType(0);
41683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41684 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41685 if (SDValue AddSub =
41686 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41687 return AddSub;
41688
41689 // Attempt to combine into a vector load/broadcast.
41690 if (SDValue LD = combineToConsecutiveLoads(
41691 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41692 return LD;
41693
41694 // For AVX2, we sometimes want to combine
41695 // (vector_shuffle <mask> (concat_vectors t1, undef)
41696 // (concat_vectors t2, undef))
41697 // Into:
41698 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41699 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41700 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41701 return ShufConcat;
41702
41703 if (isTargetShuffle(N->getOpcode())) {
41704 SDValue Op(N, 0);
41705 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41706 return Shuffle;
41707
41708 // Try recursively combining arbitrary sequences of x86 shuffle
41709 // instructions into higher-order shuffles. We do this after combining
41710 // specific PSHUF instruction sequences into their minimal form so that we
41711 // can evaluate how many specialized shuffle instructions are involved in
41712 // a particular chain.
41713 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41714 return Res;
41715
41716 // Simplify source operands based on shuffle mask.
41717 // TODO - merge this into combineX86ShufflesRecursively.
41718 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41719 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41720 return SDValue(N, 0);
41721
41722 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41723 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41724 // Perform this after other shuffle combines to allow inner shuffles to be
41725 // combined away first.
41726 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41727 return BinOp;
41728 }
41729
41730 return SDValue();
41731}
41732
41733// Simplify variable target shuffle masks based on the demanded elements.
41734// TODO: Handle DemandedBits in mask indices as well?
41735bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41736 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41737 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41738 // If we're demanding all elements don't bother trying to simplify the mask.
41739 unsigned NumElts = DemandedElts.getBitWidth();
41740 if (DemandedElts.isAllOnes())
41741 return false;
41742
41743 SDValue Mask = Op.getOperand(MaskIndex);
41744 if (!Mask.hasOneUse())
41745 return false;
41746
41747 // Attempt to generically simplify the variable shuffle mask.
41748 APInt MaskUndef, MaskZero;
41749 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41750 Depth + 1))
41751 return true;
41752
41753 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41754 // TODO: Support other types from getTargetShuffleMaskIndices?
41755 SDValue BC = peekThroughOneUseBitcasts(Mask);
41756 EVT BCVT = BC.getValueType();
41757 auto *Load = dyn_cast<LoadSDNode>(BC);
41758 if (!Load || !Load->getBasePtr().hasOneUse())
41759 return false;
41760
41761 const Constant *C = getTargetConstantFromNode(Load);
41762 if (!C)
41763 return false;
41764
41765 Type *CTy = C->getType();
41766 if (!CTy->isVectorTy() ||
41767 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41768 return false;
41769
41770 // Handle scaling for i64 elements on 32-bit targets.
41771 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41772 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41773 return false;
41774 unsigned Scale = NumCstElts / NumElts;
41775
41776 // Simplify mask if we have an undemanded element that is not undef.
41777 bool Simplified = false;
41778 SmallVector<Constant *, 32> ConstVecOps;
41779 for (unsigned i = 0; i != NumCstElts; ++i) {
41780 Constant *Elt = C->getAggregateElement(i);
41781 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41782 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41783 Simplified = true;
41784 continue;
41785 }
41786 ConstVecOps.push_back(Elt);
41787 }
41788 if (!Simplified)
41789 return false;
41790
41791 // Generate new constant pool entry + legalize immediately for the load.
41792 SDLoc DL(Op);
41793 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41794 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41795 SDValue NewMask = TLO.DAG.getLoad(
41796 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41797 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41798 Load->getAlign());
41799 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41800}
41801
41802bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41803 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41804 TargetLoweringOpt &TLO, unsigned Depth) const {
41805 int NumElts = DemandedElts.getBitWidth();
41806 unsigned Opc = Op.getOpcode();
41807 EVT VT = Op.getValueType();
41808
41809 // Handle special case opcodes.
41810 switch (Opc) {
41811 case X86ISD::PMULDQ:
41812 case X86ISD::PMULUDQ: {
41813 APInt LHSUndef, LHSZero;
41814 APInt RHSUndef, RHSZero;
41815 SDValue LHS = Op.getOperand(0);
41816 SDValue RHS = Op.getOperand(1);
41817 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41818 Depth + 1))
41819 return true;
41820 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41821 Depth + 1))
41822 return true;
41823 // Multiply by zero.
41824 KnownZero = LHSZero | RHSZero;
41825 break;
41826 }
41827 case X86ISD::VPMADDUBSW:
41828 case X86ISD::VPMADDWD: {
41829 APInt LHSUndef, LHSZero;
41830 APInt RHSUndef, RHSZero;
41831 SDValue LHS = Op.getOperand(0);
41832 SDValue RHS = Op.getOperand(1);
41833 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41834
41835 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41836 Depth + 1))
41837 return true;
41838 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41839 Depth + 1))
41840 return true;
41841
41842 // TODO: Multiply by zero.
41843
41844 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41845 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41846 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41847 Depth + 1))
41848 return true;
41849 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41850 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41851 Depth + 1))
41852 return true;
41853 break;
41854 }
41855 case X86ISD::PSADBW: {
41856 SDValue LHS = Op.getOperand(0);
41857 SDValue RHS = Op.getOperand(1);
41858 assert(VT.getScalarType() == MVT::i64 &&
41859 LHS.getValueType() == RHS.getValueType() &&
41860 LHS.getValueType().getScalarType() == MVT::i8 &&
41861 "Unexpected PSADBW types");
41862
41863 // Aggressively peek through ops to get at the demanded elts.
41864 if (!DemandedElts.isAllOnes()) {
41865 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41866 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41867 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41868 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41869 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41870 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41871 if (NewLHS || NewRHS) {
41872 NewLHS = NewLHS ? NewLHS : LHS;
41873 NewRHS = NewRHS ? NewRHS : RHS;
41874 return TLO.CombineTo(
41875 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41876 }
41877 }
41878 break;
41879 }
41880 case X86ISD::VSHL:
41881 case X86ISD::VSRL:
41882 case X86ISD::VSRA: {
41883 // We only need the bottom 64-bits of the (128-bit) shift amount.
41884 SDValue Amt = Op.getOperand(1);
41885 MVT AmtVT = Amt.getSimpleValueType();
41886 assert(AmtVT.is128BitVector() && "Unexpected value type");
41887
41888 // If the shift amount is only ever reused as an SSE shift amount then we
41889 // know that only the bottom 64 bits are ever used.
41890 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41891 unsigned UseOpc = Use->getOpcode();
41892 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41893 UseOpc == X86ISD::VSRA) &&
41894 Use->getOperand(0) != Amt;
41895 });
41896
41897 APInt AmtUndef, AmtZero;
41898 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41899 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41900 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41901 Depth + 1, AssumeSingleUse))
41902 return true;
41903 [[fallthrough]];
41904 }
41905 case X86ISD::VSHLI:
41906 case X86ISD::VSRLI:
41907 case X86ISD::VSRAI: {
41908 SDValue Src = Op.getOperand(0);
41909 APInt SrcUndef;
41910 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41911 Depth + 1))
41912 return true;
41913
41914 // Fold shift(0,x) -> 0
41915 if (DemandedElts.isSubsetOf(KnownZero))
41916 return TLO.CombineTo(
41917 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41918
41919 // Aggressively peek through ops to get at the demanded elts.
41920 if (!DemandedElts.isAllOnes())
41921 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41922 Src, DemandedElts, TLO.DAG, Depth + 1))
41923 return TLO.CombineTo(
41924 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41925 break;
41926 }
41927 case X86ISD::VPSHA:
41928 case X86ISD::VPSHL:
41929 case X86ISD::VSHLV:
41930 case X86ISD::VSRLV:
41931 case X86ISD::VSRAV: {
41932 APInt LHSUndef, LHSZero;
41933 APInt RHSUndef, RHSZero;
41934 SDValue LHS = Op.getOperand(0);
41935 SDValue RHS = Op.getOperand(1);
41936 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41937 Depth + 1))
41938 return true;
41939
41940 // Fold shift(0,x) -> 0
41941 if (DemandedElts.isSubsetOf(LHSZero))
41942 return TLO.CombineTo(
41943 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41944
41945 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41946 Depth + 1))
41947 return true;
41948
41949 KnownZero = LHSZero;
41950 break;
41951 }
41952 case X86ISD::PCMPEQ:
41953 case X86ISD::PCMPGT: {
41954 APInt LHSUndef, LHSZero;
41955 APInt RHSUndef, RHSZero;
41956 SDValue LHS = Op.getOperand(0);
41957 SDValue RHS = Op.getOperand(1);
41958 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41959 Depth + 1))
41960 return true;
41961 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41962 Depth + 1))
41963 return true;
41964 break;
41965 }
41966 case X86ISD::KSHIFTL: {
41967 SDValue Src = Op.getOperand(0);
41968 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41969 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41970 unsigned ShiftAmt = Amt->getZExtValue();
41971
41972 if (ShiftAmt == 0)
41973 return TLO.CombineTo(Op, Src);
41974
41975 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41976 // single shift. We can do this if the bottom bits (which are shifted
41977 // out) are never demanded.
41978 if (Src.getOpcode() == X86ISD::KSHIFTR) {
41979 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
41980 unsigned C1 = Src.getConstantOperandVal(1);
41981 unsigned NewOpc = X86ISD::KSHIFTL;
41982 int Diff = ShiftAmt - C1;
41983 if (Diff < 0) {
41984 Diff = -Diff;
41985 NewOpc = X86ISD::KSHIFTR;
41986 }
41987
41988 SDLoc dl(Op);
41989 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41990 return TLO.CombineTo(
41991 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41992 }
41993 }
41994
41995 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
41996 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41997 Depth + 1))
41998 return true;
41999
42000 KnownUndef <<= ShiftAmt;
42001 KnownZero <<= ShiftAmt;
42002 KnownZero.setLowBits(ShiftAmt);
42003 break;
42004 }
42005 case X86ISD::KSHIFTR: {
42006 SDValue Src = Op.getOperand(0);
42007 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42008 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42009 unsigned ShiftAmt = Amt->getZExtValue();
42010
42011 if (ShiftAmt == 0)
42012 return TLO.CombineTo(Op, Src);
42013
42014 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42015 // single shift. We can do this if the top bits (which are shifted
42016 // out) are never demanded.
42017 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42018 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42019 unsigned C1 = Src.getConstantOperandVal(1);
42020 unsigned NewOpc = X86ISD::KSHIFTR;
42021 int Diff = ShiftAmt - C1;
42022 if (Diff < 0) {
42023 Diff = -Diff;
42024 NewOpc = X86ISD::KSHIFTL;
42025 }
42026
42027 SDLoc dl(Op);
42028 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42029 return TLO.CombineTo(
42030 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42031 }
42032 }
42033
42034 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42035 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42036 Depth + 1))
42037 return true;
42038
42039 KnownUndef.lshrInPlace(ShiftAmt);
42040 KnownZero.lshrInPlace(ShiftAmt);
42041 KnownZero.setHighBits(ShiftAmt);
42042 break;
42043 }
42044 case X86ISD::ANDNP: {
42045 // ANDNP = (~LHS & RHS);
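// When one operand is constant we can shrink what we demand from the other:
// lanes where the RHS constant is zero (or where the LHS constant is all-ones,
// since it is inverted) contribute nothing to the result.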
42046 SDValue LHS = Op.getOperand(0);
42047 SDValue RHS = Op.getOperand(1);
42048
42049 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42050 APInt UndefElts;
42051 SmallVector<APInt> EltBits;
42052 int NumElts = VT.getVectorNumElements();
42053 int EltSizeInBits = VT.getScalarSizeInBits();
42054 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42055 APInt OpElts = DemandedElts;
42056 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42057 EltBits)) {
42058 OpBits.clearAllBits();
42059 OpElts.clearAllBits();
42060 for (int I = 0; I != NumElts; ++I) {
42061 if (!DemandedElts[I])
42062 continue;
42063 if (UndefElts[I]) {
42064 // We can't assume an undef src element gives an undef dst - the
42065 // other src might be zero.
42066 OpBits.setAllBits();
42067 OpElts.setBit(I);
42068 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42069 (!Invert && !EltBits[I].isZero())) {
42070 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42071 OpElts.setBit(I);
42072 }
42073 }
42074 }
42075 return std::make_pair(OpBits, OpElts);
42076 };
42077 APInt BitsLHS, EltsLHS;
42078 APInt BitsRHS, EltsRHS;
42079 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42080 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42081
42082 APInt LHSUndef, LHSZero;
42083 APInt RHSUndef, RHSZero;
42084 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42085 Depth + 1))
42086 return true;
42087 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42088 Depth + 1))
42089 return true;
42090
42091 if (!DemandedElts.isAllOnes()) {
42092 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42093 TLO.DAG, Depth + 1);
42094 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42095 TLO.DAG, Depth + 1);
42096 if (NewLHS || NewRHS) {
42097 NewLHS = NewLHS ? NewLHS : LHS;
42098 NewRHS = NewRHS ? NewRHS : RHS;
42099 return TLO.CombineTo(
42100 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42101 }
42102 }
42103 break;
42104 }
42105 case X86ISD::CVTSI2P:
42106 case X86ISD::CVTUI2P:
42107 case X86ISD::CVTPH2PS:
42108 case X86ISD::CVTPS2PH: {
42109 SDValue Src = Op.getOperand(0);
42110 EVT SrcVT = Src.getValueType();
42111 APInt SrcUndef, SrcZero;
42112 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42113 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42114 Depth + 1))
42115 return true;
42116 break;
42117 }
42118 case X86ISD::PACKSS:
42119 case X86ISD::PACKUS: {
42120 SDValue N0 = Op.getOperand(0);
42121 SDValue N1 = Op.getOperand(1);
42122
42123 APInt DemandedLHS, DemandedRHS;
42124 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42125
42126 APInt LHSUndef, LHSZero;
42127 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42128 Depth + 1))
42129 return true;
42130 APInt RHSUndef, RHSZero;
42131 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42132 Depth + 1))
42133 return true;
42134
42135 // TODO - pass on known zero/undef.
42136
42137 // Aggressively peek through ops to get at the demanded elts.
42138 // TODO - we should do this for all target/faux shuffles ops.
42139 if (!DemandedElts.isAllOnes()) {
42140 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42141 TLO.DAG, Depth + 1);
42142 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42143 TLO.DAG, Depth + 1);
42144 if (NewN0 || NewN1) {
42145 NewN0 = NewN0 ? NewN0 : N0;
42146 NewN1 = NewN1 ? NewN1 : N1;
42147 return TLO.CombineTo(Op,
42148 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42149 }
42150 }
42151 break;
42152 }
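// For example, a v16i8 PACKSS takes two v8i16 sources, placing the
// saturated LHS elements in result bytes 0-7 and the RHS elements in bytes
// 8-15 (per 128-bit lane). So if only the low 8 result bytes are demanded,
// getPackDemandedElts leaves DemandedRHS empty and the calls above are free
// to ignore N1 entirely.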
42153 case X86ISD::HADD:
42154 case X86ISD::HSUB:
42155 case X86ISD::FHADD:
42156 case X86ISD::FHSUB: {
42157 SDValue N0 = Op.getOperand(0);
42158 SDValue N1 = Op.getOperand(1);
42159
42160 APInt DemandedLHS, DemandedRHS;
42161 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42162
42163 APInt LHSUndef, LHSZero;
42164 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42165 Depth + 1))
42166 return true;
42167 APInt RHSUndef, RHSZero;
42168 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42169 Depth + 1))
42170 return true;
42171
42172 // TODO - pass on known zero/undef.
42173
42174 // Aggressively peek through ops to get at the demanded elts.
42175 // TODO: Handle repeated operands.
42176 if (N0 != N1 && !DemandedElts.isAllOnes()) {
42177 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42178 TLO.DAG, Depth + 1);
42179 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42180 TLO.DAG, Depth + 1);
42181 if (NewN0 || NewN1) {
42182 NewN0 = NewN0 ? NewN0 : N0;
42183 NewN1 = NewN1 ? NewN1 : N1;
42184 return TLO.CombineTo(Op,
42185 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42186 }
42187 }
42188 break;
42189 }
42190 case X86ISD::VTRUNC:
42191 case X86ISD::VTRUNCS:
42192 case X86ISD::VTRUNCUS: {
42193 SDValue Src = Op.getOperand(0);
42194 MVT SrcVT = Src.getSimpleValueType();
42195 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42196 APInt SrcUndef, SrcZero;
42197 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
42198 Depth + 1))
42199 return true;
42200 KnownZero = SrcZero.zextOrTrunc(NumElts);
42201 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
42202 break;
42203 }
42204 case X86ISD::BLENDI: {
42205 SmallVector<int, 16> BlendMask;
42206 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
42207 if (SDValue R = combineBlendOfPermutes(
42208 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
42209 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
42210 return TLO.CombineTo(Op, R);
42211 break;
42212 }
42213 case X86ISD::BLENDV: {
42214 APInt SelUndef, SelZero;
42215 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
42216 SelZero, TLO, Depth + 1))
42217 return true;
42218
42219 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
42220 APInt LHSUndef, LHSZero;
42221 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
42222 LHSZero, TLO, Depth + 1))
42223 return true;
42224
42225 APInt RHSUndef, RHSZero;
42226 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
42227 RHSZero, TLO, Depth + 1))
42228 return true;
42229
42230 KnownZero = LHSZero & RHSZero;
42231 KnownUndef = LHSUndef & RHSUndef;
42232 break;
42233 }
42234 case X86ISD::VZEXT_MOVL: {
42235 // If upper demanded elements are already zero then we have nothing to do.
42236 SDValue Src = Op.getOperand(0);
42237 APInt DemandedUpperElts = DemandedElts;
42238 DemandedUpperElts.clearLowBits(1);
42239 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
42240 return TLO.CombineTo(Op, Src);
42241 break;
42242 }
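// For example, if the caller only demands element 0, DemandedUpperElts is
// empty, MaskedVectorIsZero trivially succeeds and the VZEXT_MOVL folds
// away to its source; the same happens for a source already known zero in
// the demanded upper lanes, e.g. (v4i32 X, 0, 0, 0).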
42243 case X86ISD::VZEXT_LOAD: {
42244 // If the upper elements are not demanded then simplify to a
42245 // scalar_to_vector(load()).
42246 MVT SVT = VT.getSimpleVT().getVectorElementType();
42247 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
42248 SDLoc DL(Op);
42249 auto *Mem = cast<MemSDNode>(Op);
42250 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
42251 Mem->getMemOperand());
42252 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
42253 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
42254 }
42255 break;
42256 }
42257 case X86ISD::VBROADCAST: {
42258 SDValue Src = Op.getOperand(0);
42259 MVT SrcVT = Src.getSimpleValueType();
42260 if (!SrcVT.isVector())
42261 break;
42262 // Don't bother broadcasting if we just need the 0'th element.
42263 if (DemandedElts == 1) {
42264 if (Src.getValueType() != VT)
42265 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
42266 SDLoc(Op));
42267 return TLO.CombineTo(Op, Src);
42268 }
42269 APInt SrcUndef, SrcZero;
42270 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
42271 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42272 Depth + 1))
42273 return true;
42274 // Aggressively peek through src to get at the demanded elt.
42275 // TODO - we should do this for all target/faux shuffles ops.
42276 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42277 Src, SrcElts, TLO.DAG, Depth + 1))
42278 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42279 break;
42280 }
42281 case X86ISD::VPERMV:
42282 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42283 Depth))
42284 return true;
42285 break;
42286 case X86ISD::PSHUFB:
42287 case X86ISD::VPERMV3:
42288 case X86ISD::VPERMILPV:
42289 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42290 Depth))
42291 return true;
42292 break;
42293 case X86ISD::VPPERM:
42294 case X86ISD::VPERMIL2:
42295 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42296 Depth))
42297 return true;
42298 break;
42299 }
42300
42301 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42302 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42303 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42304 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42305 DemandedElts.lshr(NumElts / 2) == 0) {
42306 unsigned SizeInBits = VT.getSizeInBits();
42307 unsigned ExtSizeInBits = SizeInBits / 2;
42308
42309 // See if 512-bit ops only use the bottom 128-bits.
42310 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42311 ExtSizeInBits = SizeInBits / 4;
42312
42313 switch (Opc) {
42314 // Scalar broadcast.
42315 case X86ISD::VBROADCAST: {
42316 SDLoc DL(Op);
42317 SDValue Src = Op.getOperand(0);
42318 if (Src.getValueSizeInBits() > ExtSizeInBits)
42319 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42320 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42321 ExtSizeInBits / VT.getScalarSizeInBits());
42322 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42323 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42324 TLO.DAG, DL, ExtSizeInBits));
42325 }
42326 case X86ISD::VBROADCAST_LOAD: {
42327 SDLoc DL(Op);
42328 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42329 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42330 ExtSizeInBits / VT.getScalarSizeInBits());
42331 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42332 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42333 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42334 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42335 MemIntr->getMemOperand());
42336 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42337 Bcst.getValue(1));
42338 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42339 TLO.DAG, DL, ExtSizeInBits));
42340 }
42341 // Subvector broadcast.
42342 case X86ISD::SUBV_BROADCAST_LOAD: {
42343 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42344 EVT MemVT = MemIntr->getMemoryVT();
42345 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42346 SDLoc DL(Op);
42347 SDValue Ld =
42348 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42349 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42350 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42351 Ld.getValue(1));
42352 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42353 TLO.DAG, DL, ExtSizeInBits));
42354 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42355 SDLoc DL(Op);
42356 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42357 ExtSizeInBits / VT.getScalarSizeInBits());
42358 if (SDValue BcstLd =
42359 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42360 return TLO.CombineTo(Op,
42361 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42362 TLO.DAG, DL, ExtSizeInBits));
42363 }
42364 break;
42365 }
42366 // Byte shifts by immediate.
42367 case X86ISD::VSHLDQ:
42368 case X86ISD::VSRLDQ:
42369 // Shift by uniform.
42370 case X86ISD::VSHL:
42371 case X86ISD::VSRL:
42372 case X86ISD::VSRA:
42373 // Shift by immediate.
42374 case X86ISD::VSHLI:
42375 case X86ISD::VSRLI:
42376 case X86ISD::VSRAI: {
42377 SDLoc DL(Op);
42378 SDValue Ext0 =
42379 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42380 SDValue ExtOp =
42381 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42382 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42383 SDValue Insert =
42384 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42385 return TLO.CombineTo(Op, Insert);
42386 }
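// For example, if only the low 128 bits of (v8i32 VSRLI %ymm, 5) are
// demanded, the code above rebuilds it as
//   insert_subvector undef, (v4i32 VSRLI (extract_subvector %ymm, 0), 5), 0
// so later combines can drop the 256-bit operation entirely.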
42387 case X86ISD::VPERMI: {
42388 // Simplify PERMPD/PERMQ to extract_subvector.
42389 // TODO: This should be done in shuffle combining.
42390 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42391 SmallVector<int, 4> Mask;
42392 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42393 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42394 SDLoc DL(Op);
42395 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42396 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42397 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42398 return TLO.CombineTo(Op, Insert);
42399 }
42400 }
42401 break;
42402 }
42403 case X86ISD::VPERM2X128: {
42404 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
42405 SDLoc DL(Op);
42406 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42407 if (LoMask & 0x8)
42408 return TLO.CombineTo(
42409 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42410 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42411 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42412 SDValue ExtOp =
42413 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42414 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42415 SDValue Insert =
42416 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42417 return TLO.CombineTo(Op, Insert);
42418 }
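// For example, for a v8i32 VPERM2X128 with immediate 0x23, only the low
// half is demanded here, so LoMask == 0x3 and the node becomes
//   insert_subvector undef, (extract_subvector Src1, 4), 0
// while any immediate with bit 3 set in the low nibble (e.g. 0x08) zeroes
// the demanded half and the whole node folds to a zero vector.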
42419 // Zero upper elements.
42420 case X86ISD::VZEXT_MOVL:
42421 // Target unary shuffles by immediate:
42422 case X86ISD::PSHUFD:
42423 case X86ISD::PSHUFLW:
42424 case X86ISD::PSHUFHW:
42425 case X86ISD::VPERMILPI:
42426 // (Non-Lane Crossing) Target Shuffles.
42427 case X86ISD::VPERMILPV:
42428 case X86ISD::VPERMIL2:
42429 case X86ISD::PSHUFB:
42430 case X86ISD::UNPCKL:
42431 case X86ISD::UNPCKH:
42432 case X86ISD::BLENDI:
42433 // Integer ops.
42434 case X86ISD::PACKSS:
42435 case X86ISD::PACKUS:
42436 case X86ISD::PCMPEQ:
42437 case X86ISD::PCMPGT:
42438 case X86ISD::PMULUDQ:
42439 case X86ISD::PMULDQ:
42440 case X86ISD::VSHLV:
42441 case X86ISD::VSRLV:
42442 case X86ISD::VSRAV:
42443 // Float ops.
42444 case X86ISD::FMAX:
42445 case X86ISD::FMIN:
42446 case X86ISD::FMAXC:
42447 case X86ISD::FMINC:
42448 // Horizontal Ops.
42449 case X86ISD::HADD:
42450 case X86ISD::HSUB:
42451 case X86ISD::FHADD:
42452 case X86ISD::FHSUB: {
42453 SDLoc DL(Op);
42454 SmallVector<SDValue, 4> Ops;
42455 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42456 SDValue SrcOp = Op.getOperand(i);
42457 EVT SrcVT = SrcOp.getValueType();
42458 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42459 "Unsupported vector size");
42460 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42461 ExtSizeInBits)
42462 : SrcOp);
42463 }
42464 MVT ExtVT = VT.getSimpleVT();
42465 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42466 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42467 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42468 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42469 SDValue Insert =
42470 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42471 return TLO.CombineTo(Op, Insert);
42472 }
42473 }
42474 }
42475
42476 // For splats, unless we *only* demand the 0'th element,
42477 // stop attempts at simplification here: we aren't going to improve things,
42478 // and this is better than any potential shuffle.
42479 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42480 return false;
42481
42482 // Get target/faux shuffle mask.
42483 APInt OpUndef, OpZero;
42484 SmallVector<int, 64> OpMask;
42485 SmallVector<SDValue, 2> OpInputs;
42486 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42487 OpZero, TLO.DAG, Depth, false))
42488 return false;
42489
42490 // Shuffle inputs must be the same size as the result.
42491 if (OpMask.size() != (unsigned)NumElts ||
42492 llvm::any_of(OpInputs, [VT](SDValue V) {
42493 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42494 !V.getValueType().isVector();
42495 }))
42496 return false;
42497
42498 KnownZero = OpZero;
42499 KnownUndef = OpUndef;
42500
42501 // Check if shuffle mask can be simplified to undef/zero/identity.
42502 int NumSrcs = OpInputs.size();
42503 for (int i = 0; i != NumElts; ++i)
42504 if (!DemandedElts[i])
42505 OpMask[i] = SM_SentinelUndef;
42506
42507 if (isUndefInRange(OpMask, 0, NumElts)) {
42508 KnownUndef.setAllBits();
42509 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42510 }
42511 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42512 KnownZero.setAllBits();
42513 return TLO.CombineTo(
42514 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42515 }
42516 for (int Src = 0; Src != NumSrcs; ++Src)
42517 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42518 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42519
42520 // Attempt to simplify inputs.
42521 for (int Src = 0; Src != NumSrcs; ++Src) {
42522 // TODO: Support inputs of different types.
42523 if (OpInputs[Src].getValueType() != VT)
42524 continue;
42525
42526 int Lo = Src * NumElts;
42527 APInt SrcElts = APInt::getZero(NumElts);
42528 for (int i = 0; i != NumElts; ++i)
42529 if (DemandedElts[i]) {
42530 int M = OpMask[i] - Lo;
42531 if (0 <= M && M < NumElts)
42532 SrcElts.setBit(M);
42533 }
42534
42535 // TODO - Propagate input undef/zero elts.
42536 APInt SrcUndef, SrcZero;
42537 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42538 TLO, Depth + 1))
42539 return true;
42540 }
42541
42542 // If we don't demand all elements, then attempt to combine to a simpler
42543 // shuffle.
42544 // We need to convert the depth to something combineX86ShufflesRecursively
42545 // can handle - so pretend its Depth == 0 again, and reduce the max depth
42546 // to match. This prevents combineX86ShuffleChain from returning a
42547 // combined shuffle that's the same as the original root, causing an
42548 // infinite loop.
42549 if (!DemandedElts.isAllOnes()) {
42550 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42551
42552 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42553 for (int i = 0; i != NumElts; ++i)
42554 if (DemandedElts[i])
42555 DemandedMask[i] = i;
42556
42557 SDValue NewShuffle = combineX86ShufflesRecursively(
42558 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42559 /*HasVarMask*/ false,
42560 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42561 Subtarget);
42562 if (NewShuffle)
42563 return TLO.CombineTo(Op, NewShuffle);
42564 }
42565
42566 return false;
42567}
42568
42569 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42570 SDValue Op, const APInt &OriginalDemandedBits,
42571 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42572 unsigned Depth) const {
42573 EVT VT = Op.getValueType();
42574 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42575 unsigned Opc = Op.getOpcode();
42576 switch(Opc) {
42577 case X86ISD::VTRUNC: {
42578 KnownBits KnownOp;
42579 SDValue Src = Op.getOperand(0);
42580 MVT SrcVT = Src.getSimpleValueType();
42581
42582 // Simplify the input, using demanded bit information.
42583 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42584 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42585 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42586 return true;
42587 break;
42588 }
42589 case X86ISD::PMULDQ:
42590 case X86ISD::PMULUDQ: {
42591 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42592 KnownBits KnownLHS, KnownRHS;
42593 SDValue LHS = Op.getOperand(0);
42594 SDValue RHS = Op.getOperand(1);
42595
42596 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42597 // FIXME: Can we bound this better?
42598 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42599 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42600 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42601
42602 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42603 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42604 DemandedMaskLHS = DemandedMask;
42605 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42606 DemandedMaskRHS = DemandedMask;
42607
42608 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42609 KnownLHS, TLO, Depth + 1))
42610 return true;
42611 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42612 KnownRHS, TLO, Depth + 1))
42613 return true;
42614
42615 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42616 KnownRHS = KnownRHS.trunc(32);
42617 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42618 KnownRHS.getConstant().isOne()) {
42619 SDLoc DL(Op);
42620 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42621 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42622 }
42623
42624 // Aggressively peek through ops to get at the demanded low bits.
42625 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42626 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42627 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42628 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42629 if (DemandedLHS || DemandedRHS) {
42630 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42631 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42632 return TLO.CombineTo(
42633 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42634 }
42635 break;
42636 }
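// For example, (PMULUDQ X, <1, 1>) multiplies by one, so the KnownRHS check
// above rewrites it as (AND X, <0xFFFFFFFF, 0xFFFFFFFF>), i.e. a plain
// zero-extend-in-register of the low 32 bits of each i64 element, which
// folds much more readily than the multiply.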
42637 case X86ISD::ANDNP: {
42638 KnownBits Known2;
42639 SDValue Op0 = Op.getOperand(0);
42640 SDValue Op1 = Op.getOperand(1);
42641
42642 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42643 Known, TLO, Depth + 1))
42644 return true;
42645
42646 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42647 OriginalDemandedElts, Known2, TLO, Depth + 1))
42648 return true;
42649
42650 // If the RHS is a constant, see if we can simplify it.
42651 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42652 OriginalDemandedElts, TLO))
42653 return true;
42654
42655 // ANDNP = (~Op0 & Op1);
42656 Known.One &= Known2.Zero;
42657 Known.Zero |= Known2.One;
42658 break;
42659 }
42660 case X86ISD::VSHLI: {
42661 SDValue Op0 = Op.getOperand(0);
42662
42663 unsigned ShAmt = Op.getConstantOperandVal(1);
42664 if (ShAmt >= BitWidth)
42665 break;
42666
42667 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42668
42669 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42670 // single shift. We can do this if the bottom bits (which are shifted
42671 // out) are never demanded.
42672 if (Op0.getOpcode() == X86ISD::VSRLI &&
42673 OriginalDemandedBits.countr_zero() >= ShAmt) {
42674 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42675 if (Shift2Amt < BitWidth) {
42676 int Diff = ShAmt - Shift2Amt;
42677 if (Diff == 0)
42678 return TLO.CombineTo(Op, Op0.getOperand(0));
42679
42680 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42681 SDValue NewShift = TLO.DAG.getNode(
42682 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42683 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42684 return TLO.CombineTo(Op, NewShift);
42685 }
42686 }
42687
42688 // If we are only demanding sign bits then we can use the shift source directly.
42689 unsigned NumSignBits =
42690 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42691 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42692 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42693 return TLO.CombineTo(Op, Op0);
42694
42695 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42696 TLO, Depth + 1))
42697 return true;
42698
42699 Known.Zero <<= ShAmt;
42700 Known.One <<= ShAmt;
42701
42702 // Low bits known zero.
42703 Known.Zero.setLowBits(ShAmt);
42704 return false;
42705 }
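// For example, for (VSHLI (VSRLI X, 2), 5) where the low 5 result bits are
// not demanded, Diff == 3 > 0 selects VSHLI and the pair merges into
// (VSHLI X, 3); with equal shift amounts the node folds directly to X.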
42706 case X86ISD::VSRLI: {
42707 unsigned ShAmt = Op.getConstantOperandVal(1);
42708 if (ShAmt >= BitWidth)
42709 break;
42710
42711 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42712
42713 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42714 OriginalDemandedElts, Known, TLO, Depth + 1))
42715 return true;
42716
42717 Known.Zero.lshrInPlace(ShAmt);
42718 Known.One.lshrInPlace(ShAmt);
42719
42720 // High bits known zero.
42721 Known.Zero.setHighBits(ShAmt);
42722 return false;
42723 }
42724 case X86ISD::VSRAI: {
42725 SDValue Op0 = Op.getOperand(0);
42726 SDValue Op1 = Op.getOperand(1);
42727
42728 unsigned ShAmt = Op1->getAsZExtVal();
42729 if (ShAmt >= BitWidth)
42730 break;
42731
42732 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42733
42734 // If we just want the sign bit then we don't need to shift it.
42735 if (OriginalDemandedBits.isSignMask())
42736 return TLO.CombineTo(Op, Op0);
42737
42738 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42739 if (Op0.getOpcode() == X86ISD::VSHLI &&
42740 Op.getOperand(1) == Op0.getOperand(1)) {
42741 SDValue Op00 = Op0.getOperand(0);
42742 unsigned NumSignBits =
42743 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42744 if (ShAmt < NumSignBits)
42745 return TLO.CombineTo(Op, Op00);
42746 }
42747
42748 // If any of the demanded bits are produced by the sign extension, we also
42749 // demand the input sign bit.
42750 if (OriginalDemandedBits.countl_zero() < ShAmt)
42751 DemandedMask.setSignBit();
42752
42753 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42754 TLO, Depth + 1))
42755 return true;
42756
42757 Known.Zero.lshrInPlace(ShAmt);
42758 Known.One.lshrInPlace(ShAmt);
42759
42760 // If the input sign bit is known to be zero, or if none of the top bits
42761 // are demanded, turn this into an unsigned shift right.
42762 if (Known.Zero[BitWidth - ShAmt - 1] ||
42763 OriginalDemandedBits.countl_zero() >= ShAmt)
42764 return TLO.CombineTo(
42765 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42766
42767 // High bits are known one.
42768 if (Known.One[BitWidth - ShAmt - 1])
42769 Known.One.setHighBits(ShAmt);
42770 return false;
42771 }
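// For example, (VSRAI (VSHLI X, 24), 24) on v4i32 folds straight back to X
// when X has more than 24 sign bits (e.g. X was sign-extended from i8), and
// when none of the top 24 result bits are demanded the arithmetic shift is
// turned into a logical VSRLI instead.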
42772 case X86ISD::BLENDV: {
42773 SDValue Sel = Op.getOperand(0);
42774 SDValue LHS = Op.getOperand(1);
42775 SDValue RHS = Op.getOperand(2);
42776
42777 APInt SignMask = APInt::getSignMask(BitWidth);
42778 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42779 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42780 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42781 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42782 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42783 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42784
42785 if (NewSel || NewLHS || NewRHS) {
42786 NewSel = NewSel ? NewSel : Sel;
42787 NewLHS = NewLHS ? NewLHS : LHS;
42788 NewRHS = NewRHS ? NewRHS : RHS;
42789 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42790 NewSel, NewLHS, NewRHS));
42791 }
42792 break;
42793 }
42794 case X86ISD::PEXTRB:
42795 case X86ISD::PEXTRW: {
42796 SDValue Vec = Op.getOperand(0);
42797 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42798 MVT VecVT = Vec.getSimpleValueType();
42799 unsigned NumVecElts = VecVT.getVectorNumElements();
42800
42801 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42802 unsigned Idx = CIdx->getZExtValue();
42803 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42804
42805 // If we demand no bits from the vector then we must have demanded
42806 // bits from the implicit zext - simplify to zero.
42807 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42808 if (DemandedVecBits == 0)
42809 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42810
42811 APInt KnownUndef, KnownZero;
42812 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42813 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42814 KnownZero, TLO, Depth + 1))
42815 return true;
42816
42817 KnownBits KnownVec;
42818 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42819 KnownVec, TLO, Depth + 1))
42820 return true;
42821
42822 if (SDValue V = SimplifyMultipleUseDemandedBits(
42823 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42824 return TLO.CombineTo(
42825 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42826
42827 Known = KnownVec.zext(BitWidth);
42828 return false;
42829 }
42830 break;
42831 }
42832 case X86ISD::PINSRB:
42833 case X86ISD::PINSRW: {
42834 SDValue Vec = Op.getOperand(0);
42835 SDValue Scl = Op.getOperand(1);
42836 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42837 MVT VecVT = Vec.getSimpleValueType();
42838
42839 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42840 unsigned Idx = CIdx->getZExtValue();
42841 if (!OriginalDemandedElts[Idx])
42842 return TLO.CombineTo(Op, Vec);
42843
42844 KnownBits KnownVec;
42845 APInt DemandedVecElts(OriginalDemandedElts);
42846 DemandedVecElts.clearBit(Idx);
42847 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42848 KnownVec, TLO, Depth + 1))
42849 return true;
42850
42851 KnownBits KnownScl;
42852 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42853 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42854 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42855 return true;
42856
42857 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42858 Known = KnownVec.intersectWith(KnownScl);
42859 return false;
42860 }
42861 break;
42862 }
42863 case X86ISD::PACKSS:
42864 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42865 // sign bit then we can just ask for the source operands' sign bits.
42866 // TODO - add known bits handling.
42867 if (OriginalDemandedBits.isSignMask()) {
42868 APInt DemandedLHS, DemandedRHS;
42869 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42870
42871 KnownBits KnownLHS, KnownRHS;
42872 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42873 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42874 KnownLHS, TLO, Depth + 1))
42875 return true;
42876 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42877 KnownRHS, TLO, Depth + 1))
42878 return true;
42879
42880 // Attempt to avoid multi-use ops if we don't need anything from them.
42881 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42882 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42883 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42884 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42885 if (DemandedOp0 || DemandedOp1) {
42886 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42887 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42888 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42889 }
42890 }
42891 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42892 break;
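// For example, (MOVMSK (PACKSS A, B)) only demands the sign bit of each
// packed element, and PACKSS preserves the sign of its sources while
// saturating, so the queries above forward the sign-bit demand straight to
// A and B rather than to the narrowed packed values.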
42893 case X86ISD::VBROADCAST: {
42894 SDValue Src = Op.getOperand(0);
42895 MVT SrcVT = Src.getSimpleValueType();
42896 APInt DemandedElts = APInt::getOneBitSet(
42897 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42898 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42899 TLO, Depth + 1))
42900 return true;
42901 // If we don't need the upper bits, attempt to narrow the broadcast source.
42902 // Don't attempt this on AVX512 as it might affect broadcast folding.
42903 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
42904 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42905 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42906 Src->hasOneUse()) {
42907 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42908 SDValue NewSrc =
42909 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42910 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42911 SDValue NewBcst =
42912 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42913 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42914 }
42915 break;
42916 }
42917 case X86ISD::PCMPGT:
42918 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42919 // iff we only need the sign bit then we can use R directly.
42920 if (OriginalDemandedBits.isSignMask() &&
42921 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42922 return TLO.CombineTo(Op, Op.getOperand(1));
42923 break;
42924 case X86ISD::MOVMSK: {
42925 SDValue Src = Op.getOperand(0);
42926 MVT SrcVT = Src.getSimpleValueType();
42927 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42928 unsigned NumElts = SrcVT.getVectorNumElements();
42929
42930 // If we don't need the sign bits at all just return zero.
42931 if (OriginalDemandedBits.countr_zero() >= NumElts)
42932 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42933
42934 // See if we only demand bits from the lower 128-bit vector.
42935 if (SrcVT.is256BitVector() &&
42936 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42937 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42938 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42939 }
42940
42941 // Only demand the vector elements of the sign bits we need.
42942 APInt KnownUndef, KnownZero;
42943 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
42944 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
42945 TLO, Depth + 1))
42946 return true;
42947
42948 Known.Zero = KnownZero.zext(BitWidth);
42949 Known.Zero.setHighBits(BitWidth - NumElts);
42950
42951 // MOVMSK only uses the MSB from each vector element.
42952 KnownBits KnownSrc;
42953 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
42954 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
42955 Depth + 1))
42956 return true;
42957
42958 if (KnownSrc.One[SrcBits - 1])
42959 Known.One.setLowBits(NumElts);
42960 else if (KnownSrc.Zero[SrcBits - 1])
42961 Known.Zero.setLowBits(NumElts);
42962
42963 // Attempt to avoid multi-use ops if we don't need anything from them.
42964 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
42965 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
42966 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42967 return false;
42968 }
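// For example, for (and (MOVMSK (v8f32 X)), 0xF) only the low four mask
// bits are demanded, so the code above first narrows X to its low 128 bits,
// and later iterations only ask for the sign bit of each remaining element.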
42969 case X86ISD::TESTP: {
42970 SDValue Op0 = Op.getOperand(0);
42971 SDValue Op1 = Op.getOperand(1);
42972 MVT OpVT = Op0.getSimpleValueType();
42973 assert((OpVT.getVectorElementType() == MVT::f32 ||
42974 OpVT.getVectorElementType() == MVT::f64) &&
42975 "Illegal vector type for X86ISD::TESTP");
42976
42977 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
42978 KnownBits KnownSrc;
42979 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
42980 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
42981 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
42982 AssumeSingleUse) ||
42983 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
42984 AssumeSingleUse);
42985 }
42986 case X86ISD::CMOV: {
42987 KnownBits Known2;
42988 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
42989 OriginalDemandedElts, Known2, TLO, Depth + 1))
42990 return true;
42991 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
42992 OriginalDemandedElts, Known, TLO, Depth + 1))
42993 return true;
42994
42995 // Only known if known in both the LHS and RHS.
42996 Known = Known.intersectWith(Known2);
42997 break;
42998 }
42999 case X86ISD::BEXTR:
43000 case X86ISD::BEXTRI: {
43001 SDValue Op0 = Op.getOperand(0);
43002 SDValue Op1 = Op.getOperand(1);
43003
43004 // Only bottom 16-bits of the control bits are required.
43005 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43006 // NOTE: SimplifyDemandedBits won't do this for constants.
43007 uint64_t Val1 = Cst1->getZExtValue();
43008 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43009 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43010 SDLoc DL(Op);
43011 return TLO.CombineTo(
43012 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43013 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43014 }
43015
43016 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43017 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43018
43019 // If the length is 0, the result is 0.
43020 if (Length == 0) {
43021 Known.setAllZero();
43022 return false;
43023 }
43024
43025 if ((Shift + Length) <= BitWidth) {
43026 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43027 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43028 return true;
43029
43030 Known = Known.extractBits(Length, Shift);
43031 Known = Known.zextOrTrunc(BitWidth);
43032 return false;
43033 }
43034 } else {
43035 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43036 KnownBits Known1;
43037 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43038 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43039 return true;
43040
43041 // If the length is 0, replace with 0.
43042 KnownBits LengthBits = Known1.extractBits(8, 8);
43043 if (LengthBits.isZero())
43044 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43045 }
43046
43047 break;
43048 }
43049 case X86ISD::PDEP: {
43050 SDValue Op0 = Op.getOperand(0);
43051 SDValue Op1 = Op.getOperand(1);
43052
43053 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43054 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43055
43056 // If the demanded bits have leading zeroes, we don't demand those from the
43057 // mask.
43058 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43059 return true;
43060
43061 // The number of possible 1s in the mask determines the number of LSBs of
43062 // operand 0 used. Undemanded bits from the mask don't matter so filter
43063 // them before counting.
43064 KnownBits Known2;
43065 uint64_t Count = (~Known.Zero & LoMask).popcount();
43066 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43067 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43068 return true;
43069
43070 // Zeroes are retained from the mask, but not ones.
43071 Known.One.clearAllBits();
43072 // The result will have at least as many trailing zeros as the non-mask
43073 // operand since bits can only map to the same or higher bit position.
43074 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43075 return false;
43076 }
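// For example, a PDEP with mask 0b1010 can produce at most two set bits, so
// Count == 2 and only the two lowest bits of the non-mask operand are
// demanded; similarly, if the caller ignores the upper result bits, the
// corresponding upper mask bits are dropped through LoMask first.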
43077 }
43078
43079 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43080 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43081}
43082
43083 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43084 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43085 SelectionDAG &DAG, unsigned Depth) const {
43086 int NumElts = DemandedElts.getBitWidth();
43087 unsigned Opc = Op.getOpcode();
43088 EVT VT = Op.getValueType();
43089
43090 switch (Opc) {
43091 case X86ISD::PINSRB:
43092 case X86ISD::PINSRW: {
43093 // If we don't demand the inserted element, return the base vector.
43094 SDValue Vec = Op.getOperand(0);
43095 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43096 MVT VecVT = Vec.getSimpleValueType();
43097 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43098 !DemandedElts[CIdx->getZExtValue()])
43099 return Vec;
43100 break;
43101 }
43102 case X86ISD::VSHLI: {
43103 // If we are only demanding sign bits then we can use the shift source
43104 // directly.
43105 SDValue Op0 = Op.getOperand(0);
43106 unsigned ShAmt = Op.getConstantOperandVal(1);
43107 unsigned BitWidth = DemandedBits.getBitWidth();
43108 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43109 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43110 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43111 return Op0;
43112 break;
43113 }
43114 case X86ISD::VSRAI:
43115 // iff we only need the sign bit then we can use the source directly.
43116 // TODO: generalize where we only demand extended signbits.
43117 if (DemandedBits.isSignMask())
43118 return Op.getOperand(0);
43119 break;
43120 case X86ISD::PCMPGT:
43121 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43122 // iff we only need the sign bit then we can use R directly.
43123 if (DemandedBits.isSignMask() &&
43124 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43125 return Op.getOperand(1);
43126 break;
43127 case X86ISD::BLENDV: {
43128 // BLENDV: Cond (MSB) ? LHS : RHS
43129 SDValue Cond = Op.getOperand(0);
43130 SDValue LHS = Op.getOperand(1);
43131 SDValue RHS = Op.getOperand(2);
43132
43133 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
43134 if (CondKnown.isNegative())
43135 return LHS;
43136 if (CondKnown.isNonNegative())
43137 return RHS;
43138 break;
43139 }
43140 case X86ISD::ANDNP: {
43141 // ANDNP = (~LHS & RHS);
43142 SDValue LHS = Op.getOperand(0);
43143 SDValue RHS = Op.getOperand(1);
43144
43145 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43146 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43147
43148 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43149 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43150 // this context, so return RHS.
43151 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43152 return RHS;
43153 break;
43154 }
43155 }
43156
43157 APInt ShuffleUndef, ShuffleZero;
43158 SmallVector<int, 16> ShuffleMask;
43159 SmallVector<SDValue, 2> ShuffleOps;
43160 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43161 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43162 // If all the demanded elts are from one operand and are inline,
43163 // then we can use the operand directly.
43164 int NumOps = ShuffleOps.size();
43165 if (ShuffleMask.size() == (unsigned)NumElts &&
43166 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43167 return VT.getSizeInBits() == V.getValueSizeInBits();
43168 })) {
43169
43170 if (DemandedElts.isSubsetOf(ShuffleUndef))
43171 return DAG.getUNDEF(VT);
43172 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43173 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43174
43175 // Bitmask that indicates which ops have only been accessed 'inline'.
43176 APInt IdentityOp = APInt::getAllOnes(NumOps);
43177 for (int i = 0; i != NumElts; ++i) {
43178 int M = ShuffleMask[i];
43179 if (!DemandedElts[i] || ShuffleUndef[i])
43180 continue;
43181 int OpIdx = M / NumElts;
43182 int EltIdx = M % NumElts;
43183 if (M < 0 || EltIdx != i) {
43184 IdentityOp.clearAllBits();
43185 break;
43186 }
43187 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43188 if (IdentityOp == 0)
43189 break;
43190 }
43191 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43192 "Multiple identity shuffles detected");
43193
43194 if (IdentityOp != 0)
43195 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43196 }
43197 }
43198
43199 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43200 Op, DemandedBits, DemandedElts, DAG, Depth);
43201}
43202
43203 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43204 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43205 bool PoisonOnly, unsigned Depth) const {
43206 unsigned NumElts = DemandedElts.getBitWidth();
43207
43208 switch (Op.getOpcode()) {
43209 case X86ISD::PSHUFD:
43210 case X86ISD::VPERMILPI: {
43211 SmallVector<int, 8> Mask;
43212 SmallVector<SDValue, 2> Ops;
43213 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
43214 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
43215 APInt::getZero(NumElts));
43216 for (auto M : enumerate(Mask)) {
43217 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
43218 continue;
43219 if (M.value() == SM_SentinelUndef)
43220 return false;
43221 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
43222 "Shuffle mask index out of range");
43223 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
43224 }
43225 for (auto Op : enumerate(Ops))
43226 if (!DemandedSrcElts[Op.index()].isZero() &&
43227 !DAG.isGuaranteedNotToBeUndefOrPoison(
43228 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
43229 return false;
43230 return true;
43231 }
43232 break;
43233 }
43234 }
43235 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43236 Op, DemandedElts, DAG, PoisonOnly, Depth);
43237}
43238
43239 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43240 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43241 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43242
43243 switch (Op.getOpcode()) {
43244 // SSE vector shifts handle out of bounds shift amounts.
43245 case X86ISD::VSHLI:
43246 case X86ISD::VSRLI:
43247 case X86ISD::VSRAI:
43248 return false;
43249 case X86ISD::PSHUFD:
43250 case X86ISD::VPERMILPI:
43251 case X86ISD::UNPCKH:
43252 case X86ISD::UNPCKL:
43253 return false;
43254 }
43255 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43256 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43257}
43258
43259 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43260 const APInt &DemandedElts,
43261 APInt &UndefElts,
43262 const SelectionDAG &DAG,
43263 unsigned Depth) const {
43264 unsigned NumElts = DemandedElts.getBitWidth();
43265 unsigned Opc = Op.getOpcode();
43266
43267 switch (Opc) {
43268 case X86ISD::VBROADCAST:
43269 case X86ISD::VBROADCAST_LOAD:
43270 UndefElts = APInt::getZero(NumElts);
43271 return true;
43272 }
43273
43274 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
43275 DAG, Depth);
43276}
43277
43278// Helper to peek through bitops/trunc/setcc to determine size of source vector.
43279// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
43280static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
43281 bool AllowTruncate) {
43282 switch (Src.getOpcode()) {
43283 case ISD::TRUNCATE:
43284 if (!AllowTruncate)
43285 return false;
43286 [[fallthrough]];
43287 case ISD::SETCC:
43288 return Src.getOperand(0).getValueSizeInBits() == Size;
43289 case ISD::FREEZE:
43290 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
43291 case ISD::AND:
43292 case ISD::XOR:
43293 case ISD::OR:
43294 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43295 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43296 case ISD::SELECT:
43297 case ISD::VSELECT:
43298 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43299 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43300 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43301 case ISD::BUILD_VECTOR:
43302 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
43303 ISD::isBuildVectorAllOnes(Src.getNode());
43304 }
43305 return false;
43306}
43307
43308// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43309static unsigned getAltBitOpcode(unsigned Opcode) {
43310 switch(Opcode) {
43311 // clang-format off
43312 case ISD::AND: return X86ISD::FAND;
43313 case ISD::OR: return X86ISD::FOR;
43314 case ISD::XOR: return X86ISD::FXOR;
43315 case X86ISD::ANDNP: return X86ISD::FANDN;
43316 // clang-format on
43317 }
43318 llvm_unreachable("Unknown bitwise opcode");
43319}
43320
43321// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43322 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43323 const SDLoc &DL) {
43324 EVT SrcVT = Src.getValueType();
43325 if (SrcVT != MVT::v4i1)
43326 return SDValue();
43327
43328 switch (Src.getOpcode()) {
43329 case ISD::SETCC:
43330 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43331 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43332 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43333 SDValue Op0 = Src.getOperand(0);
43334 if (ISD::isNormalLoad(Op0.getNode()))
43335 return DAG.getBitcast(MVT::v4f32, Op0);
43336 if (Op0.getOpcode() == ISD::BITCAST &&
43337 Op0.getOperand(0).getValueType() == MVT::v4f32)
43338 return Op0.getOperand(0);
43339 }
43340 break;
43341 case ISD::AND:
43342 case ISD::XOR:
43343 case ISD::OR: {
43344 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43345 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43346 if (Op0 && Op1)
43347 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43348 Op1);
43349 break;
43350 }
43351 }
43352 return SDValue();
43353}
43354
43355// Helper to push sign extension of vXi1 SETCC result through bitops.
43356 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43357 SDValue Src, const SDLoc &DL) {
43358 switch (Src.getOpcode()) {
43359 case ISD::SETCC:
43360 case ISD::FREEZE:
43361 case ISD::TRUNCATE:
43362 case ISD::BUILD_VECTOR:
43363 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43364 case ISD::AND:
43365 case ISD::XOR:
43366 case ISD::OR:
43367 return DAG.getNode(
43368 Src.getOpcode(), DL, SExtVT,
43369 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43370 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43371 case ISD::SELECT:
43372 case ISD::VSELECT:
43373 return DAG.getSelect(
43374 DL, SExtVT, Src.getOperand(0),
43375 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43376 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43377 }
43378 llvm_unreachable("Unexpected node type for vXi1 sign extension");
43379}
43380
43381// Try to match patterns such as
43382// (i16 bitcast (v16i1 x))
43383// ->
43384// (i16 movmsk (16i8 sext (v16i1 x)))
43385// before the illegal vector is scalarized on subtargets that don't have legal
43386// vxi1 types.
43387 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43388 const SDLoc &DL,
43389 const X86Subtarget &Subtarget) {
43390 EVT SrcVT = Src.getValueType();
43391 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43392 return SDValue();
43393
43394 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43395 // legalization destroys the v4i32 type.
43396 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43397 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43398 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43399 DAG.getBitcast(MVT::v4f32, V));
43400 return DAG.getZExtOrTrunc(V, DL, VT);
43401 }
43402 }
43403
43404 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
43405 // movmskb even with avx512. This will be better than truncating to vXi1 and
43406 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43407 // vpcmpeqb/vpcmpgtb.
43408 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43409 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43410 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43411 Src.getOperand(0).getValueType() == MVT::v64i8);
43412
43413 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43414 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43415 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43416 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43417 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43418 EVT CmpVT = Src.getOperand(0).getValueType();
43419 EVT EltVT = CmpVT.getVectorElementType();
43420 if (CmpVT.getSizeInBits() <= 256 &&
43421 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43422 PreferMovMsk = true;
43423 }
43424
43425 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43426 // MOVMSK is supported in SSE2 or later.
43427 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43428 return SDValue();
43429
43430 // If the upper ops of a concatenation are undef, then try to bitcast the
43431 // lower op and extend.
43432 SmallVector<SDValue, 4> SubSrcOps;
43433 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
43434 SubSrcOps.size() >= 2) {
43435 SDValue LowerOp = SubSrcOps[0];
43436 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
43437 if (LowerOp.getOpcode() == ISD::SETCC &&
43438 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
43439 EVT SubVT = VT.getIntegerVT(
43440 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
43441 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
43442 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
43443 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
43444 }
43445 }
43446 }
43447
43448 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
43449 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43450 // v8i16 and v16i16.
43451 // For these two cases, we can shuffle the upper element bytes to a
43452 // consecutive sequence at the start of the vector and treat the results as
43453 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43454 // for v16i16 this is not the case, because the shuffle is expensive, so we
43455 // avoid sign-extending to this type entirely.
43456 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43457 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43458 MVT SExtVT;
43459 bool PropagateSExt = false;
43460 switch (SrcVT.getSimpleVT().SimpleTy) {
43461 default:
43462 return SDValue();
43463 case MVT::v2i1:
43464 SExtVT = MVT::v2i64;
43465 break;
43466 case MVT::v4i1:
43467 SExtVT = MVT::v4i32;
43468 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43469 // sign-extend to a 256-bit operation to avoid truncation.
43470 if (Subtarget.hasAVX() &&
43471 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43472 SExtVT = MVT::v4i64;
43473 PropagateSExt = true;
43474 }
43475 break;
43476 case MVT::v8i1:
43477 SExtVT = MVT::v8i16;
43478 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43479 // sign-extend to a 256-bit operation to match the compare.
43480 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43481 // 256-bit because the shuffle is cheaper than sign extending the result of
43482 // the compare.
43483 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43484 checkBitcastSrcVectorSize(Src, 512, true))) {
43485 SExtVT = MVT::v8i32;
43486 PropagateSExt = true;
43487 }
43488 break;
43489 case MVT::v16i1:
43490 SExtVT = MVT::v16i8;
43491 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43492 // it is not profitable to sign-extend to 256-bit because this will
43493 // require an extra cross-lane shuffle which is more expensive than
43494 // truncating the result of the compare to 128-bits.
43495 break;
43496 case MVT::v32i1:
43497 SExtVT = MVT::v32i8;
43498 break;
43499 case MVT::v64i1:
43500 // If we have AVX512F but not AVX512BW, the input was a truncate from
43501 // v64i8 (checked earlier); split the input and make two pmovmskbs.
43502 if (Subtarget.hasAVX512()) {
43503 if (Subtarget.hasBWI())
43504 return SDValue();
43505 SExtVT = MVT::v64i8;
43506 break;
43507 }
43508 // Split if this is a <64 x i8> comparison result.
43509 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43510 SExtVT = MVT::v64i8;
43511 break;
43512 }
43513 return SDValue();
43514 };
43515
43516 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43517 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43518
43519 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43520 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43521 } else {
43522 if (SExtVT == MVT::v8i16) {
43523 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43524 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43525 }
43526 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43527 }
43528
43529 EVT IntVT =
43530 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43531 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43532 return DAG.getBitcast(VT, V);
43533}
43534
43535// Convert a vXi1 constant build vector to the same width scalar integer.
43536 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43537 EVT SrcVT = Op.getValueType();
43538 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43539 "Expected a vXi1 vector");
43541 "Expected a constant build vector");
43542
43543 APInt Imm(SrcVT.getVectorNumElements(), 0);
43544 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43545 SDValue In = Op.getOperand(Idx);
43546 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43547 Imm.setBit(Idx);
43548 }
43549 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43550 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43551}
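// For example, the v4i1 constant <1, 0, 1, 1> becomes the i4 immediate
// 0b1101 (element 0 maps to bit 0); undef elements are treated as 0.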
43552
43553 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43554 TargetLowering::DAGCombinerInfo &DCI,
43555 const X86Subtarget &Subtarget) {
43556 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43557
43558 if (!DCI.isBeforeLegalizeOps())
43559 return SDValue();
43560
43561 // Only do this if we have k-registers.
43562 if (!Subtarget.hasAVX512())
43563 return SDValue();
43564
43565 EVT DstVT = N->getValueType(0);
43566 SDValue Op = N->getOperand(0);
43567 EVT SrcVT = Op.getValueType();
43568
43569 if (!Op.hasOneUse())
43570 return SDValue();
43571
43572 // Look for logic ops.
43573 if (Op.getOpcode() != ISD::AND &&
43574 Op.getOpcode() != ISD::OR &&
43575 Op.getOpcode() != ISD::XOR)
43576 return SDValue();
43577
43578 // Make sure we have a bitcast between mask registers and a scalar type.
43579 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43580 DstVT.isScalarInteger()) &&
43581 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43582 SrcVT.isScalarInteger()))
43583 return SDValue();
43584
43585 SDValue LHS = Op.getOperand(0);
43586 SDValue RHS = Op.getOperand(1);
43587
43588 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43589 LHS.getOperand(0).getValueType() == DstVT)
43590 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43591 DAG.getBitcast(DstVT, RHS));
43592
43593 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43594 RHS.getOperand(0).getValueType() == DstVT)
43595 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43596 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43597
43598 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43599 // Most of these have to move a constant from the scalar domain anyway.
43600 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43601 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43602 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43603 DAG.getBitcast(DstVT, LHS), RHS);
43604 }
43605
43606 return SDValue();
43607}
43608
43609 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43610 const X86Subtarget &Subtarget) {
43611 SDLoc DL(BV);
43612 unsigned NumElts = BV->getNumOperands();
43613 SDValue Splat = BV->getSplatValue();
43614
43615 // Build MMX element from integer GPR or SSE float values.
43616 auto CreateMMXElement = [&](SDValue V) {
43617 if (V.isUndef())
43618 return DAG.getUNDEF(MVT::x86mmx);
43619 if (V.getValueType().isFloatingPoint()) {
43620 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43621 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43622 V = DAG.getBitcast(MVT::v2i64, V);
43623 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43624 }
43625 V = DAG.getBitcast(MVT::i32, V);
43626 } else {
43627 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43628 }
43629 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43630 };
43631
43632 // Convert build vector ops to MMX data in the bottom elements.
43634
43635 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43636
43637 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43638 if (Splat) {
43639 if (Splat.isUndef())
43640 return DAG.getUNDEF(MVT::x86mmx);
43641
43642 Splat = CreateMMXElement(Splat);
43643
43644 if (Subtarget.hasSSE1()) {
43645 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43646 if (NumElts == 8)
43647 Splat = DAG.getNode(
43648 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43649 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43650 TLI.getPointerTy(DAG.getDataLayout())),
43651 Splat, Splat);
43652
43653 // Use PSHUFW to repeat 16-bit elements.
43654 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43655 return DAG.getNode(
43656 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43657 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43658 TLI.getPointerTy(DAG.getDataLayout())),
43659 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43660 }
43661 Ops.append(NumElts, Splat);
43662 } else {
43663 for (unsigned i = 0; i != NumElts; ++i)
43664 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43665 }
43666
43667 // Use tree of PUNPCKLs to build up general MMX vector.
43668 while (Ops.size() > 1) {
43669 unsigned NumOps = Ops.size();
43670 unsigned IntrinOp =
43671 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43672 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43673 : Intrinsic::x86_mmx_punpcklbw));
43674 SDValue Intrin = DAG.getTargetConstant(
43675 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43676 for (unsigned i = 0; i != NumOps; i += 2)
43677 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43678 Ops[i], Ops[i + 1]);
43679 Ops.resize(NumOps / 2);
43680 }
43681
43682 return Ops[0];
43683}
43684
43685// Recursive function that attempts to find if a bool vector node was originally
43686// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43687// integer. If so, replace the scalar ops with bool vector equivalents back down
43688// the chain.
43689 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43690 SelectionDAG &DAG,
43691 const X86Subtarget &Subtarget,
43692 unsigned Depth = 0) {
43693 if (Depth >= SelectionDAG::MaxRecursionDepth)
43694 return SDValue(); // Limit search depth.
43695
43696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43697 unsigned Opc = V.getOpcode();
43698 switch (Opc) {
43699 case ISD::BITCAST: {
43700 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43701 SDValue Src = V.getOperand(0);
43702 EVT SrcVT = Src.getValueType();
43703 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43704 return DAG.getBitcast(VT, Src);
43705 break;
43706 }
43707 case ISD::Constant: {
43708 auto *C = cast<ConstantSDNode>(V);
43709 if (C->isZero())
43710 return DAG.getConstant(0, DL, VT);
43711 if (C->isAllOnes())
43712 return DAG.getAllOnesConstant(DL, VT);
43713 break;
43714 }
43715 case ISD::TRUNCATE: {
43716 // If we find a suitable source, a truncated scalar becomes a subvector.
43717 SDValue Src = V.getOperand(0);
43718 EVT NewSrcVT =
43719 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43720 if (TLI.isTypeLegal(NewSrcVT))
43721 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
43722 Subtarget, Depth + 1))
43723 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43724 DAG.getIntPtrConstant(0, DL));
43725 break;
43726 }
43727 case ISD::ANY_EXTEND:
43728 case ISD::ZERO_EXTEND: {
43729 // If we find a suitable source, an extended scalar becomes a subvector.
43730 SDValue Src = V.getOperand(0);
43731 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43732 Src.getScalarValueSizeInBits());
43733 if (TLI.isTypeLegal(NewSrcVT))
43734 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
43735 Subtarget, Depth + 1))
43736 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43737 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43738 : DAG.getConstant(0, DL, VT),
43739 N0, DAG.getIntPtrConstant(0, DL));
43740 break;
43741 }
43742 case ISD::OR:
43743 case ISD::XOR: {
43744 // If we find suitable sources, we can just move the op to the vector
43745 // domain.
43746 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
43747 Subtarget, Depth + 1))
43748 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
43749 Subtarget, Depth + 1))
43750 return DAG.getNode(Opc, DL, VT, N0, N1);
43751 break;
43752 }
43753 case ISD::SHL: {
43754 // If we find a suitable source, a SHL becomes a KSHIFTL.
43755 SDValue Src0 = V.getOperand(0);
43756 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43757 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43758 break;
43759
43760 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43761 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
43762 Depth + 1))
43763 return DAG.getNode(
43764 X86ISD::KSHIFTL, DL, VT, N0,
43765 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43766 break;
43767 }
43768 }
43769
43770 // Does the inner bitcast already exist?
43771 if (Depth > 0)
43772 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
43773 return SDValue(Alt, 0);
43774
43775 return SDValue();
43776}
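// Illustrative sketch, not used by the code above: the SHL -> KSHIFTL case
// relies on bit I of the scalar corresponding to element I of the vXi1 mask,
// so a scalar shift-left by Amt moves every mask element up by Amt and
// zero-fills the low elements. A plain-integer model (hypothetical helper):
static bool maskElementAfterShiftLeft(unsigned long long Mask, unsigned Amt,
                                      unsigned Elt) {
  // Element Elt of (Mask << Amt) is element (Elt - Amt) of Mask, or zero when
  // the index underflows - which is what KSHIFTL does to a k-register.
  if (Elt < Amt)
    return false;
  return ((Mask >> (Elt - Amt)) & 1) != 0;
}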
43777
43778 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43779                               TargetLowering::DAGCombinerInfo &DCI,
43780                               const X86Subtarget &Subtarget) {
43781 SDValue N0 = N->getOperand(0);
43782 EVT VT = N->getValueType(0);
43783 EVT SrcVT = N0.getValueType();
43784 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43785
43786 // Try to match patterns such as
43787 // (i16 bitcast (v16i1 x))
43788 // ->
43789 // (i16 movmsk (16i8 sext (v16i1 x)))
43790 // before the setcc result is scalarized on subtargets that don't have legal
43791 // vxi1 types.
43792 if (DCI.isBeforeLegalize()) {
43793 SDLoc dl(N);
43794 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43795 return V;
43796
43797 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43798 // type, widen both sides to avoid a trip through memory.
43799 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43800 Subtarget.hasAVX512()) {
43801 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43802 N0 = DAG.getBitcast(MVT::v8i1, N0);
43803 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43804 DAG.getIntPtrConstant(0, dl));
43805 }
43806
43807 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43808 // type, widen both sides to avoid a trip through memory.
43809 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43810 Subtarget.hasAVX512()) {
43811       // Use zeros for the widening if we already have some zeros. This can
43812       // allow SimplifyDemandedBits to remove scalar ANDs that may be
43813       // downstream of this.
43814 // FIXME: It might make sense to detect a concat_vectors with a mix of
43815 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43816 // a separate combine. What we can't do is canonicalize the operands of
43817 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43818 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43819 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43820 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43821 SrcVT = LastOp.getValueType();
43822 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43823 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43824 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43825 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43826 N0 = DAG.getBitcast(MVT::i8, N0);
43827 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43828 }
43829 }
43830
43831 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43832 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43833 Ops[0] = N0;
43834 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43835 N0 = DAG.getBitcast(MVT::i8, N0);
43836 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43837 }
43838 } else {
43839 // If we're bitcasting from iX to vXi1, see if the integer originally
43840 // began as a vXi1 and whether we can remove the bitcast entirely.
43841 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43842 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43843 if (SDValue V =
43844 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43845 return V;
43846 }
43847 }
43848
43849 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43850 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43851 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43852 // we can help with known bits propagation from the vXi1 domain to the
43853 // scalar domain.
43854 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43855 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43856 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43857       isNullConstant(N0.getOperand(1)))
43858     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43859 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43860
43861 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43862 // and the vbroadcast_load are both integer or both fp. In some cases this
43863 // will remove the bitcast entirely.
43864 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43865 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43866 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43867 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43868 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43869     // Don't swap i8/i16 since we don't have fp types of that size.
43870 if (MemSize >= 32) {
43871 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43872 : MVT::getIntegerVT(MemSize);
43873 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43874 : MVT::getIntegerVT(SrcVTSize);
43875 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43876
43877 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43878 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43879 SDValue ResNode =
43880           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43881                                   MemVT, BCast->getMemOperand());
43882 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43883 return DAG.getBitcast(VT, ResNode);
43884 }
43885 }
43886
43887 // Since MMX types are special and don't usually play with other vector types,
43888 // it's better to handle them early to be sure we emit efficient code by
43889 // avoiding store-load conversions.
43890 if (VT == MVT::x86mmx) {
43891 // Detect MMX constant vectors.
43892 APInt UndefElts;
43893 SmallVector<APInt, 1> EltBits;
43894 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43895 /*AllowWholeUndefs*/ true,
43896 /*AllowPartialUndefs*/ true)) {
43897 SDLoc DL(N0);
43898 // Handle zero-extension of i32 with MOVD.
43899 if (EltBits[0].countl_zero() >= 32)
43900 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43901 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43902 // Else, bitcast to a double.
43903 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43904 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43905 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43906 }
43907
43908 // Detect bitcasts to x86mmx low word.
43909 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43910 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43911 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43912 bool LowUndef = true, AllUndefOrZero = true;
43913 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43914 SDValue Op = N0.getOperand(i);
43915 LowUndef &= Op.isUndef() || (i >= e/2);
43916 AllUndefOrZero &= isNullConstantOrUndef(Op);
43917 }
43918 if (AllUndefOrZero) {
43919 SDValue N00 = N0.getOperand(0);
43920 SDLoc dl(N00);
43921 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43922 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43923 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43924 }
43925 }
43926
43927 // Detect bitcasts of 64-bit build vectors and convert to a
43928 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43929 // lowest element.
43930 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43931 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43932 SrcVT == MVT::v8i8))
43933 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43934
43935 // Detect bitcasts between element or subvector extraction to x86mmx.
43936 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43937          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43938         isNullConstant(N0.getOperand(1))) {
43939 SDValue N00 = N0.getOperand(0);
43940 if (N00.getValueType().is128BitVector())
43941 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43942 DAG.getBitcast(MVT::v2i64, N00));
43943 }
43944
43945 // Detect bitcasts from FP_TO_SINT to x86mmx.
43946 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43947 SDLoc DL(N0);
43948 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43949 DAG.getUNDEF(MVT::v2i32));
43950 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43951 DAG.getBitcast(MVT::v2i64, Res));
43952 }
43953 }
43954
43955 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43956 // most of these to scalar anyway.
43957 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43958 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43959       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43960     return combinevXi1ConstantToInteger(N0, DAG);
43961 }
43962
43963 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
43964 VT.getVectorElementType() == MVT::i1) {
43965 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
43966 if (C->isAllOnes())
43967 return DAG.getConstant(1, SDLoc(N0), VT);
43968 if (C->isZero())
43969 return DAG.getConstant(0, SDLoc(N0), VT);
43970 }
43971 }
43972
43973 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43974 // Turn it into a sign bit compare that produces a k-register. This avoids
43975 // a trip through a GPR.
43976 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43977 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43978       isPowerOf2_32(VT.getVectorNumElements())) {
43979     unsigned NumElts = VT.getVectorNumElements();
43980 SDValue Src = N0;
43981
43982 // Peek through truncate.
43983 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43984 Src = N0.getOperand(0);
43985
43986 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43987 SDValue MovmskIn = Src.getOperand(0);
43988 MVT MovmskVT = MovmskIn.getSimpleValueType();
43989 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43990
43991 // We allow extra bits of the movmsk to be used since they are known zero.
43992 // We can't convert a VPMOVMSKB without avx512bw.
43993 if (MovMskElts <= NumElts &&
43994 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43995 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43996 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43997 SDLoc dl(N);
43998 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43999 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44000 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44001 if (EVT(CmpVT) == VT)
44002 return Cmp;
44003
44004 // Pad with zeroes up to original VT to replace the zeroes that were
44005 // being used from the MOVMSK.
44006 unsigned NumConcats = NumElts / MovMskElts;
44007 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44008 Ops[0] = Cmp;
44009 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44010 }
44011 }
44012 }
44013
44014 // Try to remove bitcasts from input and output of mask arithmetic to
44015 // remove GPR<->K-register crossings.
44016 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44017 return V;
44018
44019 // Convert a bitcasted integer logic operation that has one bitcasted
44020 // floating-point operand into a floating-point logic operation. This may
44021 // create a load of a constant, but that is cheaper than materializing the
44022 // constant in an integer register and transferring it to an SSE register or
44023 // transferring the SSE operand to integer register and back.
44024 unsigned FPOpcode;
44025 switch (N0.getOpcode()) {
44026 // clang-format off
44027 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44028 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44029 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44030 default: return SDValue();
44031 // clang-format on
44032 }
44033
44034 // Check if we have a bitcast from another integer type as well.
44035 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44036 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44037 (Subtarget.hasFP16() && VT == MVT::f16) ||
44038 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44039 TLI.isTypeLegal(VT))))
44040 return SDValue();
44041
44042 SDValue LogicOp0 = N0.getOperand(0);
44043 SDValue LogicOp1 = N0.getOperand(1);
44044 SDLoc DL0(N0);
44045
44046 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44047 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44048 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44049 LogicOp0.getOperand(0).getValueType() == VT &&
44050 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44051 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44052 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44053 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44054 }
44055 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44056 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44057 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44058 LogicOp1.getOperand(0).getValueType() == VT &&
44059 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44060 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44061 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44062 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44063 }
44064
44065 return SDValue();
44066}
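// Illustrative sketch, not part of the combine above (assumes <cstring> and a
// 32-bit unsigned): the integer-logic-to-FP-logic fold is sound because
// FAND/FOR/FXOR operate on the raw bit pattern of the register, so "bitcast,
// then integer XOR" and "FP-domain XOR, then bitcast" produce identical bits.
// A scalar f32/i32 model with a hypothetical helper name:
static float xorThroughFloatDomain(float X, unsigned YBits) {
  unsigned XBits;
  std::memcpy(&XBits, &X, sizeof(XBits));   // bitcast f32 -> i32
  unsigned ResBits = XBits ^ YBits;         // the integer logic op
  float Res;
  std::memcpy(&Res, &ResBits, sizeof(Res)); // bitcast i32 -> f32
  return Res;                               // same bits as FXOR(X, bitcast(YBits))
}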
44067
44068 // (mul (zext a), (sext b))
44069static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44070 SDValue &Op1) {
44071 Op0 = Mul.getOperand(0);
44072 Op1 = Mul.getOperand(1);
44073
44074   // Operand 1 should be the sign-extended value.
44075 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44076 std::swap(Op0, Op1);
44077
44078 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44079 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44080 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44081 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44082 return true;
44083
44084 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44085 return (BV && BV->isConstant());
44086 };
44087
44088   // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
44089   // value, we need to check that Op0 is a zero-extended value. Op1 should be a
44090   // signed value, so we just check its number of significant bits.
44091 if ((IsFreeTruncation(Op0) &&
44092 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44093 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44094 return true;
44095
44096 return false;
44097}
44098
44099 // Given an ABS node, detect the following pattern:
44100// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44101// This is useful as it is the input into a SAD pattern.
44102static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44103 SDValue AbsOp1 = Abs->getOperand(0);
44104 if (AbsOp1.getOpcode() != ISD::SUB)
44105 return false;
44106
44107 Op0 = AbsOp1.getOperand(0);
44108 Op1 = AbsOp1.getOperand(1);
44109
44110 // Check if the operands of the sub are zero-extended from vectors of i8.
44111 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44112 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44113 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44114 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44115 return false;
44116
44117 return true;
44118}
44119
44120 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44121                               unsigned &LogBias, const SDLoc &DL,
44122 const X86Subtarget &Subtarget) {
44123 // Extend or truncate to MVT::i8 first.
44124 MVT Vi8VT =
44125 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44126 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44127 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44128
44129 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
44130 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
44131 // The src A, B element type is i8, but the dst C element type is i32.
44132   // When we calculate the reduction stages, we use the src vector type vXi8,
44133   // so we need a log-bias of 2 to avoid 2 extra reduction stages.
44134 LogBias = 2;
44135
44136 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44137 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44138 RegSize = std::max(512u, RegSize);
44139
44140 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44141 // fill in the missing vector elements with 0.
44142 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44143 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44144 Ops[0] = LHS;
44145 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44146 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44147 Ops[0] = RHS;
44148 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44149
44150 // Actually build the DotProduct, split as 256/512 bits for
44151 // AVXVNNI/AVX512VNNI.
44152 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44153 ArrayRef<SDValue> Ops) {
44154 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44155 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44156 };
44157 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44158 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44159
44160 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44161 DpBuilder, false);
44162}
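// Illustrative sketch, not used by the code above: a scalar model of a single
// VPDPBUSD destination element as described in the comment in createVPDPBUSD -
// four unsigned-byte times signed-byte products accumulated into one i32 lane
// (wraparound/saturation details ignored; names are hypothetical).
static int vpdpbusdLane(int Acc, const unsigned char *A, const signed char *B) {
  for (unsigned J = 0; J != 4; ++J)
    Acc += int(A[J]) * int(B[J]);
  return Acc;
}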
44163
44164// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44165// to these zexts.
44166static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44167 const SDValue &Zext1, const SDLoc &DL,
44168 const X86Subtarget &Subtarget) {
44169 // Find the appropriate width for the PSADBW.
44170 EVT InVT = Zext0.getOperand(0).getValueType();
44171 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44172
44173 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44174 // fill in the missing vector elements with 0.
44175 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44176 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44177 Ops[0] = Zext0.getOperand(0);
44178 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44179 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44180 Ops[0] = Zext1.getOperand(0);
44181 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44182
44183 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44184 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44185 ArrayRef<SDValue> Ops) {
44186 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44187 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44188 };
44189 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44190 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44191 PSADBWBuilder);
44192}
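// Illustrative sketch, not used by the code above: a scalar model of one
// 64-bit PSADBW lane - the sum of absolute differences of eight unsigned
// bytes - which is why the zext'd i8 abs-diff reduction can be funnelled
// through it (hypothetical helper).
static unsigned long long psadbwLane(const unsigned char *A,
                                     const unsigned char *B) {
  unsigned long long Sum = 0;
  for (unsigned I = 0; I != 8; ++I)
    Sum += (A[I] > B[I]) ? unsigned(A[I] - B[I]) : unsigned(B[I] - A[I]);
  return Sum;
}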
44193
44194 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44195 // PHMINPOSUW.
44196 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44197                                       const X86Subtarget &Subtarget) {
44198 // Bail without SSE41.
44199 if (!Subtarget.hasSSE41())
44200 return SDValue();
44201
44202 EVT ExtractVT = Extract->getValueType(0);
44203 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44204 return SDValue();
44205
44206 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44207 ISD::NodeType BinOp;
44208 SDValue Src = DAG.matchBinOpReduction(
44209 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44210 if (!Src)
44211 return SDValue();
44212
44213 EVT SrcVT = Src.getValueType();
44214 EVT SrcSVT = SrcVT.getScalarType();
44215 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44216 return SDValue();
44217
44218 SDLoc DL(Extract);
44219 SDValue MinPos = Src;
44220
44221 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44222 while (SrcVT.getSizeInBits() > 128) {
44223 SDValue Lo, Hi;
44224 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44225 SrcVT = Lo.getValueType();
44226 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44227 }
44228 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44229 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44230 "Unexpected value type");
44231
44232 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
44233 // to flip the value accordingly.
44234 SDValue Mask;
44235 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44236 if (BinOp == ISD::SMAX)
44237 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44238 else if (BinOp == ISD::SMIN)
44239 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44240 else if (BinOp == ISD::UMAX)
44241 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44242
44243 if (Mask)
44244 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44245
44246 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44247 // shuffling each upper element down and insert zeros. This means that the
44248 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44249 // ready for the PHMINPOS.
44250 if (ExtractVT == MVT::i8) {
44251     SDValue Upper = DAG.getVectorShuffle(
44252         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44253 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44254 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44255 }
44256
44257   // Perform the PHMINPOS on a v8i16 vector.
44258 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44259 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44260 MinPos = DAG.getBitcast(SrcVT, MinPos);
44261
44262 if (Mask)
44263 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44264
44265 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44266 DAG.getIntPtrConstant(0, DL));
44267}
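// Illustrative sketch, not used by the code above: the XOR-mask trick in
// scalar i16 form. SMIN, SMAX and UMAX reductions can all be expressed through
// an unsigned-min reduction (what PHMINPOSUW provides) by XOR-ing with a fixed
// bias before and after:
//   smin(a, b) == 0x8000 ^ umin(a ^ 0x8000, b ^ 0x8000)
//   smax(a, b) == 0x7FFF ^ umin(a ^ 0x7FFF, b ^ 0x7FFF)
//   umax(a, b) == 0xFFFF ^ umin(a ^ 0xFFFF, b ^ 0xFFFF)
// A hypothetical check of the SMIN identity on raw 16-bit patterns:
static unsigned short sminViaUmin(unsigned short A, unsigned short B) {
  unsigned short BiasedA = A ^ 0x8000, BiasedB = B ^ 0x8000;
  unsigned short UMin = BiasedA < BiasedB ? BiasedA : BiasedB;
  return UMin ^ 0x8000; // bit pattern of smin(A, B) when viewed as signed i16
}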
44268
44269// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44270 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44271                                          const X86Subtarget &Subtarget) {
44272 // Bail without SSE2.
44273 if (!Subtarget.hasSSE2())
44274 return SDValue();
44275
44276 EVT ExtractVT = Extract->getValueType(0);
44277 unsigned BitWidth = ExtractVT.getSizeInBits();
44278 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44279 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44280 return SDValue();
44281
44282 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44283 ISD::NodeType BinOp;
44284 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44285 if (!Match && ExtractVT == MVT::i1)
44286 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44287 if (!Match)
44288 return SDValue();
44289
44290 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44291 // which we can't support here for now.
44292 if (Match.getScalarValueSizeInBits() != BitWidth)
44293 return SDValue();
44294
44295 SDValue Movmsk;
44296 SDLoc DL(Extract);
44297 EVT MatchVT = Match.getValueType();
44298 unsigned NumElts = MatchVT.getVectorNumElements();
44299 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
44300 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44301 LLVMContext &Ctx = *DAG.getContext();
44302
44303 if (ExtractVT == MVT::i1) {
44304 // Special case for (pre-legalization) vXi1 reductions.
44305 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44306 return SDValue();
44307 if (Match.getOpcode() == ISD::SETCC) {
44308 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
44309 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
44310 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
44311 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
44312 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
44313 X86::CondCode X86CC;
44314 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
44315 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
44316 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
44317 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
44318 DAG, X86CC))
44319 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
44320 getSETCC(X86CC, V, DL, DAG));
44321 }
44322 }
44323 if (TLI.isTypeLegal(MatchVT)) {
44324 // If this is a legal AVX512 predicate type then we can just bitcast.
44325 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44326 Movmsk = DAG.getBitcast(MovmskVT, Match);
44327 } else {
44328 // Use combineBitcastvxi1 to create the MOVMSK.
44329 while (NumElts > MaxElts) {
44330 SDValue Lo, Hi;
44331 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44332 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44333 NumElts /= 2;
44334 }
44335 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44336 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44337 }
44338 if (!Movmsk)
44339 return SDValue();
44340 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44341 } else {
44342 // FIXME: Better handling of k-registers or 512-bit vectors?
44343 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44344 if (!(MatchSizeInBits == 128 ||
44345 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44346 return SDValue();
44347
44348 // Make sure this isn't a vector of 1 element. The perf win from using
44349   // MOVMSK diminishes with fewer elements in the reduction, but it is
44350 // generally better to get the comparison over to the GPRs as soon as
44351 // possible to reduce the number of vector ops.
44352 if (Match.getValueType().getVectorNumElements() < 2)
44353 return SDValue();
44354
44355 // Check that we are extracting a reduction of all sign bits.
44356 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44357 return SDValue();
44358
44359 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44360 SDValue Lo, Hi;
44361 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44362 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44363 MatchSizeInBits = Match.getValueSizeInBits();
44364 }
44365
44366 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44367 MVT MaskSrcVT;
44368 if (64 == BitWidth || 32 == BitWidth)
44369     MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44370                                  MatchSizeInBits / BitWidth);
44371 else
44372 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44373
44374 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44375 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44376 NumElts = MaskSrcVT.getVectorNumElements();
44377 }
44378 assert((NumElts <= 32 || NumElts == 64) &&
44379 "Not expecting more than 64 elements");
44380
44381 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44382 if (BinOp == ISD::XOR) {
44383 // parity -> (PARITY(MOVMSK X))
44384 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44385 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44386 }
44387
44388 SDValue CmpC;
44389 ISD::CondCode CondCode;
44390 if (BinOp == ISD::OR) {
44391 // any_of -> MOVMSK != 0
44392 CmpC = DAG.getConstant(0, DL, CmpVT);
44393 CondCode = ISD::CondCode::SETNE;
44394 } else {
44395 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44396 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44397 DL, CmpVT);
44398 CondCode = ISD::CondCode::SETEQ;
44399 }
44400
44401 // The setcc produces an i8 of 0/1, so extend that to the result width and
44402 // negate to get the final 0/-1 mask value.
44403 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
44404 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44405 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44406 return DAG.getNegative(Zext, DL, ExtractVT);
44407}
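// Illustrative sketch, not used by the code above: once the reduction input
// has been turned into a MOVMSK-style bitmask with one bit per element, the
// three reductions handled here collapse to plain integer tests (hypothetical
// helpers; assumes 0 < NumElts < 32 so the shift is well defined).
static bool anyOfMask(unsigned Mask) { return Mask != 0; }
static bool allOfMask(unsigned Mask, unsigned NumElts) {
  return Mask == ((1u << NumElts) - 1); // all NumElts low bits set
}
static bool parityOfMask(unsigned Mask) {
  bool Odd = false;
  for (; Mask; Mask &= Mask - 1) // clear one set bit per iteration
    Odd = !Odd;
  return Odd;
}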
44408
44409 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44410                                       const X86Subtarget &Subtarget) {
44411 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44412 return SDValue();
44413
44414 EVT ExtractVT = Extract->getValueType(0);
44415 // Verify the type we're extracting is i32, as the output element type of
44416 // vpdpbusd is i32.
44417 if (ExtractVT != MVT::i32)
44418 return SDValue();
44419
44420 EVT VT = Extract->getOperand(0).getValueType();
44421   if (!isPowerOf2_32(VT.getVectorNumElements()))
44422     return SDValue();
44423
44424 // Match shuffle + add pyramid.
44425 ISD::NodeType BinOp;
44426 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44427
44428 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44429 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
44430 // before adding into the accumulator.
44431 // TODO:
44432 // We also need to verify that the multiply has at least 2x the number of bits
44433 // of the input. We shouldn't match
44434 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
44435 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44436 // Root = Root.getOperand(0);
44437
44438 // If there was a match, we want Root to be a mul.
44439 if (!Root || Root.getOpcode() != ISD::MUL)
44440 return SDValue();
44441
44442 // Check whether we have an extend and mul pattern
44443 SDValue LHS, RHS;
44444 if (!detectExtMul(DAG, Root, LHS, RHS))
44445 return SDValue();
44446
44447 // Create the dot product instruction.
44448 SDLoc DL(Extract);
44449 unsigned StageBias;
44450 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44451
44452 // If the original vector was wider than 4 elements, sum over the results
44453 // in the DP vector.
44454 unsigned Stages = Log2_32(VT.getVectorNumElements());
44455 EVT DpVT = DP.getValueType();
44456
44457 if (Stages > StageBias) {
44458 unsigned DpElems = DpVT.getVectorNumElements();
44459
44460 for (unsigned i = Stages - StageBias; i > 0; --i) {
44461 SmallVector<int, 16> Mask(DpElems, -1);
44462 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44463 Mask[j] = MaskEnd + j;
44464
44465 SDValue Shuffle =
44466 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44467 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44468 }
44469 }
44470
44471 // Return the lowest ExtractSizeInBits bits.
44472 EVT ResVT =
44473 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44474 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44475 DP = DAG.getBitcast(ResVT, DP);
44476 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44477 Extract->getOperand(1));
44478}
44479
44480 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44481                                       const X86Subtarget &Subtarget) {
44482 // PSADBW is only supported on SSE2 and up.
44483 if (!Subtarget.hasSSE2())
44484 return SDValue();
44485
44486 EVT ExtractVT = Extract->getValueType(0);
44487 // Verify the type we're extracting is either i32 or i64.
44488 // FIXME: Could support other types, but this is what we have coverage for.
44489 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44490 return SDValue();
44491
44492 EVT VT = Extract->getOperand(0).getValueType();
44493   if (!isPowerOf2_32(VT.getVectorNumElements()))
44494     return SDValue();
44495
44496 // Match shuffle + add pyramid.
44497 ISD::NodeType BinOp;
44498 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44499
44500 // The operand is expected to be zero extended from i8
44501 // (verified in detectZextAbsDiff).
44502 // In order to convert to i64 and above, additional any/zero/sign
44503 // extend is expected.
44504   // The zero extend from 32 bits has no mathematical effect on the result.
44505   // The sign extend also behaves like a zero extend here
44506   // (it extends the sign bit, which is zero),
44507   // so it is correct to skip the sign/zero extend instruction.
44508 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44509 Root.getOpcode() == ISD::ZERO_EXTEND ||
44510 Root.getOpcode() == ISD::ANY_EXTEND))
44511 Root = Root.getOperand(0);
44512
44513   // If there was a match, we want Root to be the ABS node at the root of an
44514   // abs-diff pattern.
44515 if (!Root || Root.getOpcode() != ISD::ABS)
44516 return SDValue();
44517
44518 // Check whether we have an abs-diff pattern feeding into the select.
44519 SDValue Zext0, Zext1;
44520 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44521 return SDValue();
44522
44523 // Create the SAD instruction.
44524 SDLoc DL(Extract);
44525 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44526
44527 // If the original vector was wider than 8 elements, sum over the results
44528 // in the SAD vector.
44529 unsigned Stages = Log2_32(VT.getVectorNumElements());
44530 EVT SadVT = SAD.getValueType();
44531 if (Stages > 3) {
44532 unsigned SadElems = SadVT.getVectorNumElements();
44533
44534 for(unsigned i = Stages - 3; i > 0; --i) {
44535 SmallVector<int, 16> Mask(SadElems, -1);
44536 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44537 Mask[j] = MaskEnd + j;
44538
44539 SDValue Shuffle =
44540 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44541 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44542 }
44543 }
44544
44545 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44546 // Return the lowest ExtractSizeInBits bits.
44547 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44548 SadVT.getSizeInBits() / ExtractSizeInBits);
44549 SAD = DAG.getBitcast(ResVT, SAD);
44550 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44551 Extract->getOperand(1));
44552}
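// Illustrative sketch, not used by the code above: the shuffle+add loop in
// scalar form. After the PSADBW there is one partial sum per 64-bit lane; each
// pass adds the upper half of the live lanes onto the lower half until a
// single lane remains (hypothetical helper, NumLanes assumed to be a power of
// two).
static unsigned long long sumSadLanes(unsigned long long *Lanes,
                                      unsigned NumLanes) {
  for (unsigned Live = NumLanes; Live > 1; Live /= 2)
    for (unsigned I = 0; I != Live / 2; ++I)
      Lanes[I] += Lanes[I + Live / 2];
  return Lanes[0];
}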
44553
44554// If this extract is from a loaded vector value and will be used as an
44555// integer, that requires a potentially expensive XMM -> GPR transfer.
44556// Additionally, if we can convert to a scalar integer load, that will likely
44557// be folded into a subsequent integer op.
44558// Note: SrcVec might not have a VecVT type, but it must be the same size.
44559// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44560// to a single-use of the loaded vector. For the reasons above, we
44561// expect this to be profitable even if it creates an extra load.
44562 static SDValue
44563 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44564                              const SDLoc &dl, SelectionDAG &DAG,
44565                              TargetLowering::DAGCombinerInfo &DCI) {
44566 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44567 "Only EXTRACT_VECTOR_ELT supported so far");
44568
44569 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44570 EVT VT = N->getValueType(0);
44571
44572 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44573 return Use->getOpcode() == ISD::STORE ||
44574 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44575 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44576 });
44577
44578 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44579 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44580 VecVT.getVectorElementType() == VT &&
44581 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44582 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44583 SDValue NewPtr = TLI.getVectorElementPointer(
44584 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44585 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44586 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44587 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44588 SDValue Load =
44589 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44590 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44591 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44592 return Load;
44593 }
44594
44595 return SDValue();
44596}
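// Illustrative sketch, not used by the code above: the narrowed load reads the
// element straight from memory instead of paying for an XMM -> GPR transfer.
// The address arithmetic it relies on mirrors PtrOff = VT.getSizeInBits() *
// Idx / 8 (hypothetical helper):
static const unsigned char *elementPointer(const unsigned char *VecBase,
                                           unsigned EltSizeInBits,
                                           unsigned Idx) {
  return VecBase + (EltSizeInBits * Idx) / 8; // byte offset of element Idx
}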
44597
44598// Attempt to peek through a target shuffle and extract the scalar from the
44599// source.
44600 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44601                                          TargetLowering::DAGCombinerInfo &DCI,
44602                                          const X86Subtarget &Subtarget) {
44603 if (DCI.isBeforeLegalizeOps())
44604 return SDValue();
44605
44606 SDLoc dl(N);
44607 SDValue Src = N->getOperand(0);
44608 SDValue Idx = N->getOperand(1);
44609
44610 EVT VT = N->getValueType(0);
44611 EVT SrcVT = Src.getValueType();
44612 EVT SrcSVT = SrcVT.getVectorElementType();
44613 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44614 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44615
44616 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44617 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44618 return SDValue();
44619
44620 const APInt &IdxC = N->getConstantOperandAPInt(1);
44621 if (IdxC.uge(NumSrcElts))
44622 return SDValue();
44623
44624 SDValue SrcBC = peekThroughBitcasts(Src);
44625
44626 // Handle extract(bitcast(broadcast(scalar_value))).
44627 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44628 SDValue SrcOp = SrcBC.getOperand(0);
44629 EVT SrcOpVT = SrcOp.getValueType();
44630 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44631 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44632 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44633 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44634 // TODO support non-zero offsets.
44635 if (Offset == 0) {
44636 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44637 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44638 return SrcOp;
44639 }
44640 }
44641 }
44642
44643 // If we're extracting a single element from a broadcast load and there are
44644 // no other users, just create a single load.
44645 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44646 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44647 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44648 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44649 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44650 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44651 MemIntr->getBasePtr(),
44652 MemIntr->getPointerInfo(),
44653 MemIntr->getOriginalAlign(),
44654 MemIntr->getMemOperand()->getFlags());
44655 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44656 return Load;
44657 }
44658 }
44659
44660 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44661 // TODO: Move to DAGCombine?
44662 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44663 SrcBC.getValueType().isInteger() &&
44664 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44665 SrcBC.getScalarValueSizeInBits() ==
44666 SrcBC.getOperand(0).getValueSizeInBits()) {
44667 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44668 if (IdxC.ult(Scale)) {
44669 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44670 SDValue Scl = SrcBC.getOperand(0);
44671 EVT SclVT = Scl.getValueType();
44672 if (Offset) {
44673 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44674 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44675 }
44676 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44677 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44678 return Scl;
44679 }
44680 }
44681
44682 // Handle extract(truncate(x)) for 0'th index.
44683 // TODO: Treat this as a faux shuffle?
44684 // TODO: When can we use this for general indices?
44685 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44686 (SrcVT.getSizeInBits() % 128) == 0) {
44687 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44688 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44689 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44690 Idx);
44691 }
44692
44693 // We can only legally extract other elements from 128-bit vectors and in
44694 // certain circumstances, depending on SSE-level.
44695 // TODO: Investigate float/double extraction if it will be just stored.
44696 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44697 unsigned Idx) {
44698 EVT VecSVT = VecVT.getScalarType();
44699 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44700 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44701 VecSVT == MVT::i64)) {
44702 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44703 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44704 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44705 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44706 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44707 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44708 Idx &= (NumEltsPerLane - 1);
44709 }
44710 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44711 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44712 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44713 DAG.getBitcast(VecVT, Vec),
44714 DAG.getIntPtrConstant(Idx, dl));
44715 }
44716 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44717 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44718 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44719 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44720 DAG.getTargetConstant(Idx, dl, MVT::i8));
44721 }
44722 return SDValue();
44723 };
44724
44725 // Resolve the target shuffle inputs and mask.
44726   SmallVector<SDValue, 2> Ops;
44727   SmallVector<int, 16> Mask;
44728   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44729 return SDValue();
44730
44731 // Shuffle inputs must be the same size as the result.
44732 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44733 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44734 }))
44735 return SDValue();
44736
44737 // Attempt to narrow/widen the shuffle mask to the correct size.
44738 if (Mask.size() != NumSrcElts) {
44739 if ((NumSrcElts % Mask.size()) == 0) {
44740 SmallVector<int, 16> ScaledMask;
44741 int Scale = NumSrcElts / Mask.size();
44742 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44743 Mask = std::move(ScaledMask);
44744 } else if ((Mask.size() % NumSrcElts) == 0) {
44745 // Simplify Mask based on demanded element.
44746 int ExtractIdx = (int)IdxC.getZExtValue();
44747 int Scale = Mask.size() / NumSrcElts;
44748 int Lo = Scale * ExtractIdx;
44749 int Hi = Scale * (ExtractIdx + 1);
44750 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44751 if (i < Lo || Hi <= i)
44752 Mask[i] = SM_SentinelUndef;
44753
44754 SmallVector<int, 16> WidenedMask;
44755 while (Mask.size() > NumSrcElts &&
44756 canWidenShuffleElements(Mask, WidenedMask))
44757 Mask = std::move(WidenedMask);
44758 }
44759 }
44760
44761 // If narrowing/widening failed, see if we can extract+zero-extend.
44762 int ExtractIdx;
44763 EVT ExtractVT;
44764 if (Mask.size() == NumSrcElts) {
44765 ExtractIdx = Mask[IdxC.getZExtValue()];
44766 ExtractVT = SrcVT;
44767 } else {
44768 unsigned Scale = Mask.size() / NumSrcElts;
44769 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44770 return SDValue();
44771 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44772 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44773 return SDValue();
44774 ExtractIdx = Mask[ScaledIdx];
44775 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44776 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44777 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44778 "Failed to widen vector type");
44779 }
44780
44781 // If the shuffle source element is undef/zero then we can just accept it.
44782 if (ExtractIdx == SM_SentinelUndef)
44783 return DAG.getUNDEF(VT);
44784
44785 if (ExtractIdx == SM_SentinelZero)
44786 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44787 : DAG.getConstant(0, dl, VT);
44788
44789 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44790 ExtractIdx = ExtractIdx % Mask.size();
44791 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44792 return DAG.getZExtOrTrunc(V, dl, VT);
44793
44794 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44795     if (SDValue V = combineExtractFromVectorLoad(
44796             N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44797 return V;
44798
44799 return SDValue();
44800}
44801
44802/// Extracting a scalar FP value from vector element 0 is free, so extract each
44803/// operand first, then perform the math as a scalar op.
44804 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44805                                  const X86Subtarget &Subtarget) {
44806 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44807 SDValue Vec = ExtElt->getOperand(0);
44808 SDValue Index = ExtElt->getOperand(1);
44809 EVT VT = ExtElt->getValueType(0);
44810 EVT VecVT = Vec.getValueType();
44811
44812 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44813 // non-zero element because the shuffle+scalar op will be cheaper?
44814 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44815 return SDValue();
44816
44817 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44818 // extract, the condition code), so deal with those as a special-case.
44819 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44820 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44821 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44822 return SDValue();
44823
44824 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44825 SDLoc DL(ExtElt);
44826 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44827 Vec.getOperand(0), Index);
44828 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44829 Vec.getOperand(1), Index);
44830 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44831 }
44832
44833 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44834 VT != MVT::f64)
44835 return SDValue();
44836
44837 // Vector FP selects don't fit the pattern of FP math ops (because the
44838 // condition has a different type and we have to change the opcode), so deal
44839 // with those here.
44840 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44841 // has i1 elements. If we loosen this we need to convert vector bool to a
44842 // scalar bool.
44843 if (Vec.getOpcode() == ISD::VSELECT &&
44844 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44845 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44846 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44847 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44848 SDLoc DL(ExtElt);
44849     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44850                                Vec.getOperand(0).getValueType().getScalarType(),
44851                                Vec.getOperand(0), Index);
44852 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44853 Vec.getOperand(1), Index);
44854 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44855 Vec.getOperand(2), Index);
44856 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44857 }
44858
44859 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44860 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44861 // missed load folding and fma+fneg combining.
44862 switch (Vec.getOpcode()) {
44863 case ISD::FMA: // Begin 3 operands
44864 case ISD::FMAD:
44865 case ISD::FADD: // Begin 2 operands
44866 case ISD::FSUB:
44867 case ISD::FMUL:
44868 case ISD::FDIV:
44869 case ISD::FREM:
44870 case ISD::FCOPYSIGN:
44871 case ISD::FMINNUM:
44872 case ISD::FMAXNUM:
44873 case ISD::FMINNUM_IEEE:
44874 case ISD::FMAXNUM_IEEE:
44875 case ISD::FMAXIMUM:
44876 case ISD::FMINIMUM:
44877 case X86ISD::FMAX:
44878 case X86ISD::FMIN:
44879 case ISD::FABS: // Begin 1 operand
44880 case ISD::FSQRT:
44881 case ISD::FRINT:
44882 case ISD::FCEIL:
44883 case ISD::FTRUNC:
44884 case ISD::FNEARBYINT:
44885 case ISD::FROUNDEVEN:
44886 case ISD::FROUND:
44887 case ISD::FFLOOR:
44888 case X86ISD::FRCP:
44889 case X86ISD::FRSQRT: {
44890 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44891 SDLoc DL(ExtElt);
44892     SmallVector<SDValue, 4> ExtOps;
44893     for (SDValue Op : Vec->ops())
44894 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44895 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44896 }
44897 default:
44898 return SDValue();
44899 }
44900 llvm_unreachable("All opcodes should return within switch");
44901}
44902
44903/// Try to convert a vector reduction sequence composed of binops and shuffles
44904/// into horizontal ops.
44905 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44906                                      const X86Subtarget &Subtarget) {
44907 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44908
44909 // We need at least SSE2 to anything here.
44910 if (!Subtarget.hasSSE2())
44911 return SDValue();
44912
44913 ISD::NodeType Opc;
44914 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44915 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44916 if (!Rdx)
44917 return SDValue();
44918
44919 SDValue Index = ExtElt->getOperand(1);
44920   assert(isNullConstant(Index) &&
44921          "Reduction doesn't end in an extract from index 0");
44922
44923 EVT VT = ExtElt->getValueType(0);
44924 EVT VecVT = Rdx.getValueType();
44925 if (VecVT.getScalarType() != VT)
44926 return SDValue();
44927
44928 SDLoc DL(ExtElt);
44929 unsigned NumElts = VecVT.getVectorNumElements();
44930 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44931
44932 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44933 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44934 if (V.getValueType() == MVT::v4i8) {
44935 if (ZeroExtend && Subtarget.hasSSE41()) {
44936 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44937 DAG.getConstant(0, DL, MVT::v4i32),
44938 DAG.getBitcast(MVT::i32, V),
44939 DAG.getIntPtrConstant(0, DL));
44940 return DAG.getBitcast(MVT::v16i8, V);
44941 }
44942 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44943 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44944 : DAG.getUNDEF(MVT::v4i8));
44945 }
44946 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44947 DAG.getUNDEF(MVT::v8i8));
44948 };
44949
44950 // vXi8 mul reduction - promote to vXi16 mul reduction.
44951 if (Opc == ISD::MUL) {
44952 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44953 return SDValue();
44954 if (VecVT.getSizeInBits() >= 128) {
44955 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44956 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44957 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44958 Lo = DAG.getBitcast(WideVT, Lo);
44959 Hi = DAG.getBitcast(WideVT, Hi);
44960 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44961 while (Rdx.getValueSizeInBits() > 128) {
44962 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44963 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44964 }
44965 } else {
44966 Rdx = WidenToV16I8(Rdx, false);
44967 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44968 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44969 }
44970 if (NumElts >= 8)
44971 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44972 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44973 {4, 5, 6, 7, -1, -1, -1, -1}));
44974 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44975 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44976 {2, 3, -1, -1, -1, -1, -1, -1}));
44977 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44978 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44979 {1, -1, -1, -1, -1, -1, -1, -1}));
44980 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44981 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44982 }
44983
44984 // vXi8 add reduction - sub 128-bit vector.
44985 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44986 Rdx = WidenToV16I8(Rdx, true);
44987 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44988 DAG.getConstant(0, DL, MVT::v16i8));
44989 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44991 }
44992
44993 // Must be a >=128-bit vector with pow2 elements.
44994 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44995 return SDValue();
44996
44997 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44998 if (VT == MVT::i8) {
44999 while (Rdx.getValueSizeInBits() > 128) {
45000 SDValue Lo, Hi;
45001 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45002 VecVT = Lo.getValueType();
45003 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45004 }
45005 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45006
45007     SDValue Hi = DAG.getVectorShuffle(
45008         MVT::v16i8, DL, Rdx, Rdx,
45009 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45010 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45011 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45012 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45013 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45014 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45015 }
45016
45017 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45018 // If the source vector values are 0-255, then we can use PSADBW to
45019 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45020   // TODO: See if it's worth avoiding vXi16/i32 truncations?
45021 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45022 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45023 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45024 Subtarget.hasAVX512())) {
45025 if (Rdx.getValueType() == MVT::v8i16) {
45026 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
45027 DAG.getUNDEF(MVT::v8i16));
45028 } else {
45029 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45030 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45031 if (ByteVT.getSizeInBits() < 128)
45032 Rdx = WidenToV16I8(Rdx, true);
45033 }
45034
45035 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45036 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45037 ArrayRef<SDValue> Ops) {
45038 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45039 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45040 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45041 };
45042 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45043 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45044
45045 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45046 while (Rdx.getValueSizeInBits() > 128) {
45047 SDValue Lo, Hi;
45048 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45049 VecVT = Lo.getValueType();
45050 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45051 }
45052 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45053
45054 if (NumElts > 8) {
45055 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45056 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45057 }
45058
45059 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45060 Rdx = DAG.getBitcast(VecVT, Rdx);
45061 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45062 }
45063
45064 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
45065 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45066 return SDValue();
45067
45068 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45069
45070 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45071 // across the whole vector, so we need an extract + hop preliminary stage.
45072 // This is the only step where the operands of the hop are not the same value.
45073 // TODO: We could extend this to handle 512-bit or even longer vectors.
45074 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45075 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45076 unsigned NumElts = VecVT.getVectorNumElements();
45077 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45078 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45079 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45080 VecVT = Rdx.getValueType();
45081 }
45082 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45083 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45084 return SDValue();
45085
45086 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45087 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45088 for (unsigned i = 0; i != ReductionSteps; ++i)
45089 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45090
45091 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45092}
45093
45094/// Detect vector gather/scatter index generation and convert it from being a
45095/// bunch of shuffles and extracts into a somewhat faster sequence.
45096/// For i686, the best sequence is apparently storing the value and loading
45097/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45098 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45099 TargetLowering::DAGCombinerInfo &DCI,
45100 const X86Subtarget &Subtarget) {
45101 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45102 return NewOp;
45103
45104 SDValue InputVector = N->getOperand(0);
45105 SDValue EltIdx = N->getOperand(1);
45106 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45107
45108 EVT SrcVT = InputVector.getValueType();
45109 EVT VT = N->getValueType(0);
45110 SDLoc dl(InputVector);
45111 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45112 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45113 unsigned NumEltBits = VT.getScalarSizeInBits();
45114 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45115
45116 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45117 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45118
45119 // Integer Constant Folding.
45120 if (CIdx && VT.isInteger()) {
45121 APInt UndefVecElts;
45122 SmallVector<APInt, 16> EltBits;
45123 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45124 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45125 EltBits, /*AllowWholeUndefs*/ true,
45126 /*AllowPartialUndefs*/ false)) {
45127 uint64_t Idx = CIdx->getZExtValue();
45128 if (UndefVecElts[Idx])
45129 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45130 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45131 }
45132
45133 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45134 // Improves lowering of bool masks on Rust, which splits them into a byte array.
45135 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45136 SDValue Src = peekThroughBitcasts(InputVector);
45137 if (Src.getValueType().getScalarType() == MVT::i1 &&
45138 TLI.isTypeLegal(Src.getValueType())) {
45139 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45140 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45141 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45142 return DAG.getBitcast(VT, Sub);
45143 }
45144 }
45145 }
45146
45147 if (IsPextr) {
45148 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45149 DCI))
45150 return SDValue(N, 0);
45151
45152 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45153 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45154 InputVector.getOpcode() == X86ISD::PINSRW) &&
45155 InputVector.getOperand(2) == EltIdx) {
45156 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45157 "Vector type mismatch");
45158 SDValue Scl = InputVector.getOperand(1);
45159 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45160 return DAG.getZExtOrTrunc(Scl, dl, VT);
45161 }
45162
45163 // TODO - Remove this once we can handle the implicit zero-extension of
45164 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45165 // combineBasicSADPattern.
45166 return SDValue();
45167 }
45168
45169 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
45170 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45171 InputVector.getOpcode() == ISD::BITCAST &&
45172 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45173 isNullConstant(EltIdx) && InputVector.hasOneUse())
45174 return DAG.getBitcast(VT, InputVector);
45175
45176 // Detect mmx to i32 conversion through a v2i32 elt extract.
45177 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45178 InputVector.getOpcode() == ISD::BITCAST &&
45179 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45180 isNullConstant(EltIdx) && InputVector.hasOneUse())
45181 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45182 InputVector.getOperand(0));
45183
45184 // Check whether this extract is the root of a sum of absolute differences
45185 // pattern. This has to be done here because we really want it to happen
45186 // pre-legalization.
45187 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45188 return SAD;
45189
45190 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45191 return VPDPBUSD;
45192
45193 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45194 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45195 return Cmp;
45196
45197 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45198 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45199 return MinMax;
45200
45201 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45202 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45203 return V;
45204
45205 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45206 return V;
45207
45208 if (CIdx)
45209 if (SDValue V = combineExtractFromVectorLoad(
45210 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
45211 dl, DAG, DCI))
45212 return V;
45213
45214 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
45215 // and then testing the relevant element.
45216 //
45217 // Note that we only combine extracts on the *same* result number, i.e.
45218 // t0 = merge_values a0, a1, a2, a3
45219 // i1 = extract_vector_elt t0, Constant:i64<2>
45220 // i1 = extract_vector_elt t0, Constant:i64<3>
45221 // but not
45222 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45223 // since the latter would need its own MOVMSK.
45224 if (SrcVT.getScalarType() == MVT::i1) {
45225 bool IsVar = !CIdx;
45226 SmallVector<SDNode *, 16> BoolExtracts;
45227 unsigned ResNo = InputVector.getResNo();
45228 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45229 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45230 Use->getOperand(0).getResNo() == ResNo &&
45231 Use->getValueType(0) == MVT::i1) {
45232 BoolExtracts.push_back(Use);
45233 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45234 return true;
45235 }
45236 return false;
45237 };
45238 // TODO: Can we drop the oneuse check for constant extracts?
45239 if (all_of(InputVector->uses(), IsBoolExtract) &&
45240 (IsVar || BoolExtracts.size() > 1)) {
45241 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45242 if (SDValue BC =
45243 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45244 for (SDNode *Use : BoolExtracts) {
45245 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45246 // Mask = 1 << MaskIdx
45247 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45248 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45249 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45250 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45251 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45252 DCI.CombineTo(Use, Res);
45253 }
45254 return SDValue(N, 0);
45255 }
45256 }
45257 }
45258
45259 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
45260 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
45261 SDValue TruncSrc = InputVector.getOperand(0);
45262 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
45263 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
45264 SDValue NewExt =
45265 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
45266 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
45267 }
45268 }
45269
45270 return SDValue();
45271}
45272
45273// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45274// This is more or less the reverse of combineBitcastvxi1.
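// For example (illustrative): (v8i16 sext (v8i1 bitcast (i8 X))) becomes,
// roughly, "broadcast X to all 8 lanes; AND lane i with (1 << i); SETEQ
// against that same bit mask; sign-extend". For ZERO_EXTEND/ANY_EXTEND a
// final logical shift right by (EltSizeInBits - 1) recovers 0/1 values.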
45275 static SDValue combineToExtendBoolVectorInReg(
45276 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45277 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45278 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45279 Opcode != ISD::ANY_EXTEND)
45280 return SDValue();
45281 if (!DCI.isBeforeLegalizeOps())
45282 return SDValue();
45283 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45284 return SDValue();
45285
45286 EVT SVT = VT.getScalarType();
45287 EVT InSVT = N0.getValueType().getScalarType();
45288 unsigned EltSizeInBits = SVT.getSizeInBits();
45289
45290 // Input type must be extending a bool vector (bit-casted from a scalar
45291 // integer) to legal integer types.
45292 if (!VT.isVector())
45293 return SDValue();
45294 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45295 return SDValue();
45296 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45297 return SDValue();
45298
45299 SDValue N00 = N0.getOperand(0);
45300 EVT SclVT = N00.getValueType();
45301 if (!SclVT.isScalarInteger())
45302 return SDValue();
45303
45304 SDValue Vec;
45305 SmallVector<int> ShuffleMask;
45306 unsigned NumElts = VT.getVectorNumElements();
45307 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45308
45309 // Broadcast the scalar integer to the vector elements.
45310 if (NumElts > EltSizeInBits) {
45311 // If the scalar integer is greater than the vector element size, then we
45312 // must split it down into sub-sections for broadcasting. For example:
45313 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45314 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45315 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45316 unsigned Scale = NumElts / EltSizeInBits;
45317 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45318 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45319 Vec = DAG.getBitcast(VT, Vec);
45320
45321 for (unsigned i = 0; i != Scale; ++i)
45322 ShuffleMask.append(EltSizeInBits, i);
45323 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45324 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45325 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45326 // If we have register broadcast instructions, use the scalar size as the
45327 // element type for the shuffle. Then cast to the wider element type. The
45328 // widened bits won't be used, and this might allow the use of a broadcast
45329 // load.
45330 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45331 unsigned Scale = EltSizeInBits / NumElts;
45332 EVT BroadcastVT =
45333 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45334 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45335 ShuffleMask.append(NumElts * Scale, 0);
45336 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45337 Vec = DAG.getBitcast(VT, Vec);
45338 } else {
45339 // For smaller scalar integers, we can simply any-extend it to the vector
45340 // element size (we don't care about the upper bits) and broadcast it to all
45341 // elements.
45342 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45343 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45344 ShuffleMask.append(NumElts, 0);
45345 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45346 }
45347
45348 // Now, mask the relevant bit in each element.
45349 SmallVector<SDValue, 32> Bits;
45350 for (unsigned i = 0; i != NumElts; ++i) {
45351 int BitIdx = (i % EltSizeInBits);
45352 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45353 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45354 }
45355 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45356 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45357
45358 // Compare against the bitmask and extend the result.
45359 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
45360 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45361 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45362
45363 // For SEXT we are now done; otherwise, shift the result down for
45364 // zero-extension.
45365 if (Opcode == ISD::SIGN_EXTEND)
45366 return Vec;
45367 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45368 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45369}
45370
45371/// If a vector select has an operand that is -1 or 0, try to simplify the
45372/// select to a bitwise logic operation.
45373/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
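/// For example (illustrative), with a condition M known to be a 0/all-ones
/// sign splat per element:
///   vselect M, all-ones, Y --> or M, Y
///   vselect M, X, zero     --> and M, X
///   vselect M, zero, Y     --> andnp M, Y (or and(not M, Y) for vXi1 masks)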
45374static SDValue
45375 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
45376 TargetLowering::DAGCombinerInfo &DCI,
45377 const X86Subtarget &Subtarget) {
45378 SDValue Cond = N->getOperand(0);
45379 SDValue LHS = N->getOperand(1);
45380 SDValue RHS = N->getOperand(2);
45381 EVT VT = LHS.getValueType();
45382 EVT CondVT = Cond.getValueType();
45383 SDLoc DL(N);
45384 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45385
45386 if (N->getOpcode() != ISD::VSELECT)
45387 return SDValue();
45388
45389 assert(CondVT.isVector() && "Vector select expects a vector selector!");
45390
45391 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45392 // TODO: Can we assert that both operands are not zeros (because that should
45393 // get simplified at node creation time)?
45394 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45395 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45396
45397 // If both inputs are 0/undef, create a complete zero vector.
45398 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45399 if (TValIsAllZeros && FValIsAllZeros) {
45400 if (VT.isFloatingPoint())
45401 return DAG.getConstantFP(0.0, DL, VT);
45402 return DAG.getConstant(0, DL, VT);
45403 }
45404
45405 // To use the condition operand as a bitwise mask, it must have elements that
45406 // are the same size as the select elements. I.e., the condition operand must
45407 // have already been promoted from the IR select condition type <N x i1>.
45408 // Don't check if the types themselves are equal because that excludes
45409 // vector floating-point selects.
45410 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45411 return SDValue();
45412
45413 // Try to invert the condition if true value is not all 1s and false value is
45414 // not all 0s. Only do this if the condition has one use.
45415 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45416 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45417 // Check if the selector will be produced by CMPP*/PCMP*.
45418 Cond.getOpcode() == ISD::SETCC &&
45419 // Check if SETCC has already been promoted.
45420 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45421 CondVT) {
45422 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45423
45424 if (TValIsAllZeros || FValIsAllOnes) {
45425 SDValue CC = Cond.getOperand(2);
45426 ISD::CondCode NewCC = ISD::getSetCCInverse(
45427 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45428 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45429 NewCC);
45430 std::swap(LHS, RHS);
45431 TValIsAllOnes = FValIsAllOnes;
45432 FValIsAllZeros = TValIsAllZeros;
45433 }
45434 }
45435
45436 // Cond value must be 'sign splat' to be converted to a logical op.
45437 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45438 return SDValue();
45439
45440 // vselect Cond, 111..., 000... -> Cond
45441 if (TValIsAllOnes && FValIsAllZeros)
45442 return DAG.getBitcast(VT, Cond);
45443
45444 if (!TLI.isTypeLegal(CondVT))
45445 return SDValue();
45446
45447 // vselect Cond, 111..., X -> or Cond, X
45448 if (TValIsAllOnes) {
45449 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45450 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45451 return DAG.getBitcast(VT, Or);
45452 }
45453
45454 // vselect Cond, X, 000... -> and Cond, X
45455 if (FValIsAllZeros) {
45456 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45457 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45458 return DAG.getBitcast(VT, And);
45459 }
45460
45461 // vselect Cond, 000..., X -> andn Cond, X
45462 if (TValIsAllZeros) {
45463 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45464 SDValue AndN;
45465 // The canonical form differs for i1 vectors - x86andnp is not used
45466 if (CondVT.getScalarType() == MVT::i1)
45467 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45468 CastRHS);
45469 else
45470 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45471 return DAG.getBitcast(VT, AndN);
45472 }
45473
45474 return SDValue();
45475}
45476
45477/// If both arms of a vector select are concatenated vectors, split the select,
45478/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45479/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45480/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45481 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45482 const X86Subtarget &Subtarget) {
45483 unsigned Opcode = N->getOpcode();
45484 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45485 return SDValue();
45486
45487 // TODO: Split 512-bit vectors too?
45488 EVT VT = N->getValueType(0);
45489 if (!VT.is256BitVector())
45490 return SDValue();
45491
45492 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45493 SDValue Cond = N->getOperand(0);
45494 SDValue TVal = N->getOperand(1);
45495 SDValue FVal = N->getOperand(2);
45496 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45497 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45498 !isFreeToSplitVector(FVal.getNode(), DAG))
45499 return SDValue();
45500
45501 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45502 ArrayRef<SDValue> Ops) {
45503 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45504 };
45505 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45506 makeBlend, /*CheckBWI*/ false);
45507}
45508
45509 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45510 SDValue Cond = N->getOperand(0);
45511 SDValue LHS = N->getOperand(1);
45512 SDValue RHS = N->getOperand(2);
45513 SDLoc DL(N);
45514
45515 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45516 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45517 if (!TrueC || !FalseC)
45518 return SDValue();
45519
45520 // Don't do this for crazy integer types.
45521 EVT VT = N->getValueType(0);
45522 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45523 return SDValue();
45524
45525 // We're going to use the condition bit in math or logic ops. We could allow
45526 // this with a wider condition value (post-legalization it becomes an i8),
45527 // but if nothing is creating selects that late, it doesn't matter.
45528 if (Cond.getValueType() != MVT::i1)
45529 return SDValue();
45530
45531 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45532 // 3, 5, or 9 with i32/i64, so those get transformed too.
45533 // TODO: For constants that overflow or do not differ by power-of-2 or small
45534 // multiplier, convert to 'and' + 'add'.
45535 const APInt &TrueVal = TrueC->getAPIntValue();
45536 const APInt &FalseVal = FalseC->getAPIntValue();
45537
45538 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45539 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45540 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45541 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45542 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45543 return SDValue();
45544 }
45545
45546 bool OV;
45547 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45548 if (OV)
45549 return SDValue();
45550
45551 APInt AbsDiff = Diff.abs();
45552 if (AbsDiff.isPowerOf2() ||
45553 ((VT == MVT::i32 || VT == MVT::i64) &&
45554 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45555
45556 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45557 // of the condition can usually be folded into a compare predicate, but even
45558 // without that, the sequence should be cheaper than a CMOV alternative.
45559 if (TrueVal.slt(FalseVal)) {
45560 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45561 std::swap(TrueC, FalseC);
45562 }
45563
45564 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
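// e.g. (illustrative) select Cond, 7, 3 --> (zext(Cond) << 2) + 3, and
// select Cond, 10, 1 --> (9 * zext(Cond)) + 1, which fits in a single LEA.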
45565 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45566
45567 // Multiply condition by the difference if non-one.
45568 if (!AbsDiff.isOne())
45569 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45570
45571 // Add the base if non-zero.
45572 if (!FalseC->isZero())
45573 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45574
45575 return R;
45576 }
45577
45578 return SDValue();
45579}
45580
45581/// If this is a *dynamic* select (non-constant condition) and we can match
45582/// this node with one of the variable blend instructions, restructure the
45583/// condition so that blends can use the high (sign) bit of each element.
45584/// This function will also call SimplifyDemandedBits on already created
45585/// BLENDV to perform additional simplifications.
45586 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45587 TargetLowering::DAGCombinerInfo &DCI,
45588 const X86Subtarget &Subtarget) {
45589 SDValue Cond = N->getOperand(0);
45590 if ((N->getOpcode() != ISD::VSELECT &&
45591 N->getOpcode() != X86ISD::BLENDV) ||
45592 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45593 return SDValue();
45594
45595 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45596 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45597 EVT VT = N->getValueType(0);
45598
45599 // We can only handle the cases where VSELECT is directly legal on the
45600 // subtarget. We custom lower VSELECT nodes with constant conditions and
45601 // this makes it hard to see whether a dynamic VSELECT will correctly
45602 // lower, so we both check the operation's status and explicitly handle the
45603 // cases where a *dynamic* blend will fail even though a constant-condition
45604 // blend could be custom lowered.
45605 // FIXME: We should find a better way to handle this class of problems.
45606 // Potentially, we should combine constant-condition vselect nodes
45607 // pre-legalization into shuffles and not mark as many types as custom
45608 // lowered.
45609 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45610 return SDValue();
45611 // FIXME: We don't support i16-element blends currently. We could and
45612 // should support them by making *all* the bits in the condition be set
45613 // rather than just the high bit and using an i8-element blend.
45614 if (VT.getVectorElementType() == MVT::i16)
45615 return SDValue();
45616 // Dynamic blending was only available from SSE4.1 onward.
45617 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45618 return SDValue();
45619 // Byte blends are only available in AVX2
45620 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45621 return SDValue();
45622 // There are no 512-bit blend instructions that use sign bits.
45623 if (VT.is512BitVector())
45624 return SDValue();
45625
45626 // Don't optimize before the condition has been transformed to a legal type
45627 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45628 if (BitWidth < 8 || BitWidth > 64)
45629 return SDValue();
45630
45631 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45632 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45633 UI != UE; ++UI)
45634 if ((UI->getOpcode() != ISD::VSELECT &&
45635 UI->getOpcode() != X86ISD::BLENDV) ||
45636 UI.getOperandNo() != 0)
45637 return false;
45638
45639 return true;
45640 };
45641
45642 APInt DemandedBits(APInt::getSignMask(BitWidth));
45643
45644 if (OnlyUsedAsSelectCond(Cond)) {
45645 KnownBits Known;
45646 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45647 !DCI.isBeforeLegalizeOps());
45648 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45649 return SDValue();
45650
45651 // If we changed the computation somewhere in the DAG, this change will
45652 // affect all users of Cond. Update all the nodes so that we do not use
45653 // the generic VSELECT anymore. Otherwise, we may perform wrong
45654 // optimizations as we messed with the actual expectation for the vector
45655 // boolean values.
45656 for (SDNode *U : Cond->uses()) {
45657 if (U->getOpcode() == X86ISD::BLENDV)
45658 continue;
45659
45660 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45661 Cond, U->getOperand(1), U->getOperand(2));
45662 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45663 DCI.AddToWorklist(U);
45664 }
45665 DCI.CommitTargetLoweringOpt(TLO);
45666 return SDValue(N, 0);
45667 }
45668
45669 // Otherwise we can still at least try to simplify multiple use bits.
45670 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45671 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45672 N->getOperand(1), N->getOperand(2));
45673
45674 return SDValue();
45675}
45676
45677// Try to match:
45678// (or (and (M, (sub 0, X)), (pandn M, X)))
45679// which is a special case of:
45680// (select M, (sub 0, X), X)
45681// Per:
45682// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45683// We know that, if fNegate is 0 or 1:
45684// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45685//
45686// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45687// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45688// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45689// This lets us transform our vselect to:
45690// (add (xor X, M), (and M, 1))
45691// And further to:
45692// (sub (xor X, M), M)
45693 static SDValue combineLogicBlendIntoConditionalNegate(
45694 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45695 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45696 EVT MaskVT = Mask.getValueType();
45697 assert(MaskVT.isInteger() &&
45698 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45699 "Mask must be zero/all-bits");
45700
45701 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45702 return SDValue();
45703 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45704 return SDValue();
45705
45706 auto IsNegV = [](SDNode *N, SDValue V) {
45707 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45708 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45709 };
45710
45711 SDValue V;
45712 if (IsNegV(Y.getNode(), X))
45713 V = X;
45714 else if (IsNegV(X.getNode(), Y))
45715 V = Y;
45716 else
45717 return SDValue();
45718
45719 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45720 SDValue SubOp2 = Mask;
45721
45722 // If the negate was on the false side of the select, then
45723 // the operands of the SUB need to be swapped. PR 27251.
45724 // This is because the pattern being matched above is
45725 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45726 // but if the pattern matched was
45727 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45728 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45729 // pattern also needs to be a negation of the replacement pattern above.
45730 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45731 // sub accomplishes the negation of the replacement pattern.
45732 if (V == Y)
45733 std::swap(SubOp1, SubOp2);
45734
45735 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45736 return DAG.getBitcast(VT, Res);
45737}
45738
45739 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
45740 const X86Subtarget &Subtarget) {
45741 if (!Subtarget.hasAVX512())
45742 return SDValue();
45743 if (N->getOpcode() != ISD::VSELECT)
45744 return SDValue();
45745
45746 SDLoc DL(N);
45747 SDValue Cond = N->getOperand(0);
45748 SDValue LHS = N->getOperand(1);
45749 SDValue RHS = N->getOperand(2);
45750
45751 if (canCombineAsMaskOperation(LHS, Subtarget))
45752 return SDValue();
45753
45754 if (!canCombineAsMaskOperation(RHS, Subtarget))
45755 return SDValue();
45756
45757 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45758 return SDValue();
45759
45760 // Commute LHS and RHS to create opportunity to select mask instruction.
45761 // (vselect M, L, R) -> (vselect ~M, R, L)
45762 ISD::CondCode NewCC =
45763 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45764 Cond.getOperand(0).getValueType());
45765 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45766 Cond.getOperand(1), NewCC);
45767 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45768}
45769
45770/// Do target-specific dag combines on SELECT and VSELECT nodes.
45771 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45772 TargetLowering::DAGCombinerInfo &DCI,
45773 const X86Subtarget &Subtarget) {
45774 SDLoc DL(N);
45775 SDValue Cond = N->getOperand(0);
45776 SDValue LHS = N->getOperand(1);
45777 SDValue RHS = N->getOperand(2);
45778
45779 // Try simplification again because we use this function to optimize
45780 // BLENDV nodes that are not handled by the generic combiner.
45781 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45782 return V;
45783
45784 // When AVX512 is available, the LHS operand of a select instruction can be
45785 // folded with a mask instruction, while the RHS operand can't. Commute the
45786 // LHS and RHS of the select instruction to create the opportunity for
45787 // folding.
45788 if (SDValue V = commuteSelect(N, DAG, Subtarget))
45789 return V;
45790
45791 EVT VT = LHS.getValueType();
45792 EVT CondVT = Cond.getValueType();
45793 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45794 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45795
45796 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45797 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45798 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45799 if (CondVT.isVector() && CondVT.isInteger() &&
45800 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45801 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45802 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45803 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45804 DL, DAG, Subtarget))
45805 return V;
45806
45807 // Convert vselects with constant condition into shuffles.
45808 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45809 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45810 SmallVector<int, 64> Mask;
45811 if (createShuffleMaskFromVSELECT(Mask, Cond,
45812 N->getOpcode() == X86ISD::BLENDV))
45813 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45814 }
45815
45816 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45817 // by forcing the unselected elements to zero.
45818 // TODO: Can we handle more shuffles with this?
45819 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45820 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45821 LHS.hasOneUse() && RHS.hasOneUse()) {
45822 MVT SimpleVT = VT.getSimpleVT();
45823 SmallVector<SDValue, 1> LHSOps, RHSOps;
45824 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45825 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45826 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45827 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45828 int NumElts = VT.getVectorNumElements();
45829 for (int i = 0; i != NumElts; ++i) {
45830 // getConstVector sets negative shuffle mask values as undef, so ensure
45831 // we hardcode SM_SentinelZero values to zero (0x80).
45832 if (CondMask[i] < NumElts) {
45833 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45834 RHSMask[i] = 0x80;
45835 } else {
45836 LHSMask[i] = 0x80;
45837 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45838 }
45839 }
45840 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45841 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45842 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45843 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45844 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45845 }
45846 }
45847
45848 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45849 // instructions match the semantics of the common C idiom x<y?x:y but not
45850 // x<=y?x:y, because of how they handle negative zero (which can be
45851 // ignored in unsafe-math mode).
45852 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45853 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45854 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45855 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45856 (Subtarget.hasSSE2() ||
45857 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45858 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45859
45860 unsigned Opcode = 0;
45861 // Check for x CC y ? x : y.
45862 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45863 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45864 switch (CC) {
45865 default: break;
45866 case ISD::SETULT:
45867 // Converting this to a min would handle NaNs incorrectly, and swapping
45868 // the operands would cause it to handle comparisons between positive
45869 // and negative zero incorrectly.
45870 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45871 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45872 !(DAG.isKnownNeverZeroFloat(LHS) ||
45873 DAG.isKnownNeverZeroFloat(RHS)))
45874 break;
45875 std::swap(LHS, RHS);
45876 }
45877 Opcode = X86ISD::FMIN;
45878 break;
45879 case ISD::SETOLE:
45880 // Converting this to a min would handle comparisons between positive
45881 // and negative zero incorrectly.
45882 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45883 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45884 break;
45885 Opcode = X86ISD::FMIN;
45886 break;
45887 case ISD::SETULE:
45888 // Converting this to a min would handle both negative zeros and NaNs
45889 // incorrectly, but we can swap the operands to fix both.
45890 std::swap(LHS, RHS);
45891 [[fallthrough]];
45892 case ISD::SETOLT:
45893 case ISD::SETLT:
45894 case ISD::SETLE:
45895 Opcode = X86ISD::FMIN;
45896 break;
45897
45898 case ISD::SETOGE:
45899 // Converting this to a max would handle comparisons between positive
45900 // and negative zero incorrectly.
45901 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45902 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45903 break;
45904 Opcode = X86ISD::FMAX;
45905 break;
45906 case ISD::SETUGT:
45907 // Converting this to a max would handle NaNs incorrectly, and swapping
45908 // the operands would cause it to handle comparisons between positive
45909 // and negative zero incorrectly.
45910 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45911 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45912 !(DAG.isKnownNeverZeroFloat(LHS) ||
45913 DAG.isKnownNeverZeroFloat(RHS)))
45914 break;
45915 std::swap(LHS, RHS);
45916 }
45917 Opcode = X86ISD::FMAX;
45918 break;
45919 case ISD::SETUGE:
45920 // Converting this to a max would handle both negative zeros and NaNs
45921 // incorrectly, but we can swap the operands to fix both.
45922 std::swap(LHS, RHS);
45923 [[fallthrough]];
45924 case ISD::SETOGT:
45925 case ISD::SETGT:
45926 case ISD::SETGE:
45927 Opcode = X86ISD::FMAX;
45928 break;
45929 }
45930 // Check for x CC y ? y : x -- a min/max with reversed arms.
45931 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45932 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45933 switch (CC) {
45934 default: break;
45935 case ISD::SETOGE:
45936 // Converting this to a min would handle comparisons between positive
45937 // and negative zero incorrectly, and swapping the operands would
45938 // cause it to handle NaNs incorrectly.
45939 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45940 !(DAG.isKnownNeverZeroFloat(LHS) ||
45941 DAG.isKnownNeverZeroFloat(RHS))) {
45942 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45943 break;
45944 std::swap(LHS, RHS);
45945 }
45946 Opcode = X86ISD::FMIN;
45947 break;
45948 case ISD::SETUGT:
45949 // Converting this to a min would handle NaNs incorrectly.
45950 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45951 break;
45952 Opcode = X86ISD::FMIN;
45953 break;
45954 case ISD::SETUGE:
45955 // Converting this to a min would handle both negative zeros and NaNs
45956 // incorrectly, but we can swap the operands to fix both.
45957 std::swap(LHS, RHS);
45958 [[fallthrough]];
45959 case ISD::SETOGT:
45960 case ISD::SETGT:
45961 case ISD::SETGE:
45962 Opcode = X86ISD::FMIN;
45963 break;
45964
45965 case ISD::SETULT:
45966 // Converting this to a max would handle NaNs incorrectly.
45967 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45968 break;
45969 Opcode = X86ISD::FMAX;
45970 break;
45971 case ISD::SETOLE:
45972 // Converting this to a max would handle comparisons between positive
45973 // and negative zero incorrectly, and swapping the operands would
45974 // cause it to handle NaNs incorrectly.
45975 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45976 !DAG.isKnownNeverZeroFloat(LHS) &&
45977 !DAG.isKnownNeverZeroFloat(RHS)) {
45978 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45979 break;
45980 std::swap(LHS, RHS);
45981 }
45982 Opcode = X86ISD::FMAX;
45983 break;
45984 case ISD::SETULE:
45985 // Converting this to a max would handle both negative zeros and NaNs
45986 // incorrectly, but we can swap the operands to fix both.
45987 std::swap(LHS, RHS);
45988 [[fallthrough]];
45989 case ISD::SETOLT:
45990 case ISD::SETLT:
45991 case ISD::SETLE:
45992 Opcode = X86ISD::FMAX;
45993 break;
45994 }
45995 }
45996
45997 if (Opcode)
45998 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45999 }
46000
46001 // Some mask scalar intrinsics rely on checking if only one bit is set
46002 // and implement it in C code like this:
46003 // A[0] = (U & 1) ? A[0] : W[0];
46004 // This creates some redundant instructions that break pattern matching.
46005 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46006 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46007 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46008 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46009 SDValue AndNode = Cond.getOperand(0);
46010 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46011 isNullConstant(Cond.getOperand(1)) &&
46012 isOneConstant(AndNode.getOperand(1))) {
46013 // LHS and RHS swapped due to
46014 // setcc outputting 1 when AND resulted in 0 and vice versa.
46015 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46016 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46017 }
46018 }
46019
46020 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46021 // lowering on KNL. In this case we convert it to
46022 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46023 // The same situation applies to all vectors of i8 and i16 without BWI.
46024 // Make sure we extend these even before type legalization gets a chance to
46025 // split wide vectors.
46026 // Since SKX these selects have a proper lowering.
46027 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46028 CondVT.getVectorElementType() == MVT::i1 &&
46029 (VT.getVectorElementType() == MVT::i8 ||
46030 VT.getVectorElementType() == MVT::i16)) {
46031 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46032 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46033 }
46034
46035 // AVX512 - Extend select with zero to merge with target shuffle.
46036 // select(mask, extract_subvector(shuffle(x)), zero) -->
46037 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46038 // TODO - support non target shuffles as well.
46039 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46040 CondVT.getVectorElementType() == MVT::i1) {
46041 auto SelectableOp = [&TLI](SDValue Op) {
46042 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46043 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46044 isNullConstant(Op.getOperand(1)) &&
46045 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46046 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46047 };
46048
46049 bool SelectableLHS = SelectableOp(LHS);
46050 bool SelectableRHS = SelectableOp(RHS);
46051 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46052 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46053
46054 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46055 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46056 : RHS.getOperand(0).getValueType();
46057 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46058 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46059 VT.getSizeInBits());
46060 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46061 VT.getSizeInBits());
46062 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46063 DAG.getUNDEF(SrcCondVT), Cond,
46064 DAG.getIntPtrConstant(0, DL));
46065 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46066 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46067 }
46068 }
46069
46070 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46071 return V;
46072
46073 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46074 Cond.hasOneUse()) {
46075 EVT CondVT = Cond.getValueType();
46076 SDValue Cond0 = Cond.getOperand(0);
46077 SDValue Cond1 = Cond.getOperand(1);
46078 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46079
46080 // Canonicalize min/max:
46081 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46082 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46083 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46084 // the need for an extra compare against zero. e.g.
46085 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
46086 // subl %esi, %edi
46087 // testl %edi, %edi
46088 // movl $0, %eax
46089 // cmovgl %edi, %eax
46090 // =>
46091 // xorl %eax, %eax
46092 // subl %esi, $edi
46093 // cmovsl %eax, %edi
46094 //
46095 // We can also canonicalize
46096 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46097 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46098 // This allows the use of a test instruction for the compare.
46099 if (LHS == Cond0 && RHS == Cond1) {
46100 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46101 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46102 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46103 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46104 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46105 }
46106 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46107 ISD::CondCode NewCC = ISD::SETUGE;
46108 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46109 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46110 }
46111 }
46112
46113 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46114 // fold eq + gt/lt nested selects into ge/le selects
46115 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46116 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46117 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46118 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46119 // .. etc ..
46120 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46121 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46122 SDValue InnerSetCC = RHS.getOperand(0);
46123 ISD::CondCode InnerCC =
46124 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46125 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46126 Cond0 == InnerSetCC.getOperand(0) &&
46127 Cond1 == InnerSetCC.getOperand(1)) {
46128 ISD::CondCode NewCC;
46129 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46130 // clang-format off
46131 case ISD::SETGT: NewCC = ISD::SETGE; break;
46132 case ISD::SETLT: NewCC = ISD::SETLE; break;
46133 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46134 case ISD::SETULT: NewCC = ISD::SETULE; break;
46135 default: NewCC = ISD::SETCC_INVALID; break;
46136 // clang-format on
46137 }
46138 if (NewCC != ISD::SETCC_INVALID) {
46139 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46140 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46141 }
46142 }
46143 }
46144 }
46145
46146 // Check if the first operand is all zeros and Cond type is vXi1.
46147 // If this is an AVX512 target, we can improve the use of zero masking by
46148 // swapping the operands and inverting the condition.
46149 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46150 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46151 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46152 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46153 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46154 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46155 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46156 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46157 }
46158
46159 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46160 // get split by legalization.
46161 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46162 CondVT.getVectorElementType() == MVT::i1 &&
46163 TLI.isTypeLegal(VT.getScalarType())) {
46164 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46165 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46166 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46167 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46168 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46169 }
46170 }
46171
46172 // Early exit check
46173 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
46174 return SDValue();
46175
46176 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46177 return V;
46178
46179 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46180 return V;
46181
46182 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46183 return V;
46184
46185 // select(~Cond, X, Y) -> select(Cond, Y, X)
46186 if (CondVT.getScalarType() != MVT::i1) {
46187 if (SDValue CondNot = IsNOT(Cond, DAG))
46188 return DAG.getNode(N->getOpcode(), DL, VT,
46189 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46190
46191 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46192 // signbit.
46193 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46194 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46195 Cond.hasOneUse()) {
46196 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46197 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46198 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46199 }
46200 }
46201
46202 // Try to optimize vXi1 selects if both operands are either all constants or
46203 // bitcasts from scalar integer type. In that case we can convert the operands
46204 // to integer and use an integer select which will be converted to a CMOV.
46205 // We need to take a little bit of care to avoid creating an i64 type after
46206 // type legalization.
46207 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46208 VT.getVectorElementType() == MVT::i1 &&
46209 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46210 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46211 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46212 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46213 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46214
46215 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46216 LHS.getOperand(0).getValueType() == IntVT)) &&
46217 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46218 RHS.getOperand(0).getValueType() == IntVT))) {
46219 if (LHSIsConst)
46220 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46221 else
46222 LHS = LHS.getOperand(0);
46223
46224 if (RHSIsConst)
46225 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46226 else
46227 RHS = RHS.getOperand(0);
46228
46229 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46230 return DAG.getBitcast(VT, Select);
46231 }
46232 }
46233 }
46234
46235 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46236 // single bits, then invert the predicate and swap the select operands.
46237 // This can lower using a vector shift bit-hack rather than mask and compare.
46238 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46239 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46240 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46241 Cond.getOperand(0).getOpcode() == ISD::AND &&
46242 isNullOrNullSplat(Cond.getOperand(1)) &&
46243 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46244 Cond.getOperand(0).getValueType() == VT) {
46245 // The 'and' mask must be composed of power-of-2 constants.
46246 SDValue And = Cond.getOperand(0);
46247 auto *C = isConstOrConstSplat(And.getOperand(1));
46248 if (C && C->getAPIntValue().isPowerOf2()) {
46249 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46250 SDValue NotCond =
46251 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46252 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46253 }
46254
46255 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46256 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46257 // 16-bit lacks a proper blendv.
46258 unsigned EltBitWidth = VT.getScalarSizeInBits();
46259 bool CanShiftBlend =
46260 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46261 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46262 (Subtarget.hasXOP()));
46263 if (CanShiftBlend &&
46264 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46265 return C->getAPIntValue().isPowerOf2();
46266 })) {
46267 // Create a left-shift constant to get the mask bits over to the sign-bit.
46268 SDValue Mask = And.getOperand(1);
46269 SmallVector<int, 32> ShlVals;
46270 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46271 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46272 ShlVals.push_back(EltBitWidth - 1 -
46273 MaskVal->getAPIntValue().exactLogBase2());
46274 }
46275 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46276 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46277 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46278 SDValue NewCond =
46279 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46280 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46281 }
46282 }
46283
46284 return SDValue();
46285}
46286
46287/// Combine:
46288/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46289/// to:
46290/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46291/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46292/// Note that this is only legal for some op/cc combinations.
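/// For example (illustrative), for "if (atomic_fetch_add(&x, 1) < 0)" the old
/// value never needs to be materialized: "lock add dword ptr [x], 1" sets
/// flags for the new value, and old < 0 is equivalent to new <= 0, so the
/// branch can simply test COND_LE on those flags.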
46293 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46294 SelectionDAG &DAG,
46295 const X86Subtarget &Subtarget) {
46296 // This combine only operates on CMP-like nodes.
46297 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46298 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46299 return SDValue();
46300
46301 // Can't replace the cmp if it has more uses than the one we're looking at.
46302 // FIXME: We would like to be able to handle this, but would need to make sure
46303 // all uses were updated.
46304 if (!Cmp.hasOneUse())
46305 return SDValue();
46306
46307 // This only applies to variations of the common case:
46308 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46309 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46310 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46311 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46312 // Using the proper condcodes (see below), overflow is checked for.
46313
46314 // FIXME: We can generalize both constraints:
46315 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46316 // - LHS != 1
46317 // if the result is compared.
46318
46319 SDValue CmpLHS = Cmp.getOperand(0);
46320 SDValue CmpRHS = Cmp.getOperand(1);
46321 EVT CmpVT = CmpLHS.getValueType();
46322
46323 if (!CmpLHS.hasOneUse())
46324 return SDValue();
46325
46326 unsigned Opc = CmpLHS.getOpcode();
46327 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46328 return SDValue();
46329
46330 SDValue OpRHS = CmpLHS.getOperand(2);
46331 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46332 if (!OpRHSC)
46333 return SDValue();
46334
46335 APInt Addend = OpRHSC->getAPIntValue();
46336 if (Opc == ISD::ATOMIC_LOAD_SUB)
46337 Addend = -Addend;
46338
46339 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46340 if (!CmpRHSC)
46341 return SDValue();
46342
46343 APInt Comparison = CmpRHSC->getAPIntValue();
46344 APInt NegAddend = -Addend;
46345
46346 // See if we can adjust the CC to make the comparison match the negated
46347 // addend.
46348 if (Comparison != NegAddend) {
46349 APInt IncComparison = Comparison + 1;
46350 if (IncComparison == NegAddend) {
46351 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46352 Comparison = IncComparison;
46353 CC = X86::COND_AE;
46354 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46355 Comparison = IncComparison;
46356 CC = X86::COND_L;
46357 }
46358 }
46359 APInt DecComparison = Comparison - 1;
46360 if (DecComparison == NegAddend) {
46361 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46362 Comparison = DecComparison;
46363 CC = X86::COND_A;
46364 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46365 Comparison = DecComparison;
46366 CC = X86::COND_LE;
46367 }
46368 }
46369 }
46370
46371 // If the addend is the negation of the comparison value, then we can do
46372 // a full comparison by emitting the atomic arithmetic as a locked sub.
46373 if (Comparison == NegAddend) {
46374 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46375 // atomic sub.
46376 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46377 auto AtomicSub = DAG.getAtomic(
46378 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46379 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46380 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46381 AN->getMemOperand());
46382 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46383 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46384 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46385 return LockOp;
46386 }
46387
46388 // We can handle comparisons with zero in a number of cases by manipulating
46389 // the CC used.
46390 if (!Comparison.isZero())
46391 return SDValue();
46392
46393 if (CC == X86::COND_S && Addend == 1)
46394 CC = X86::COND_LE;
46395 else if (CC == X86::COND_NS && Addend == 1)
46396 CC = X86::COND_G;
46397 else if (CC == X86::COND_G && Addend == -1)
46398 CC = X86::COND_GE;
46399 else if (CC == X86::COND_LE && Addend == -1)
46400 CC = X86::COND_L;
46401 else
46402 return SDValue();
46403
46404 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46405 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46406 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46407 return LockOp;
46408}
46409
46410// Check whether a boolean test is testing a boolean value generated by
46411// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46412// code.
46413//
46414// Simplify the following patterns:
46415// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46416// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46417// to (Op EFLAGS Cond)
46418//
46419// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46420// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46421// to (Op EFLAGS !Cond)
46422//
46423// where Op could be BRCOND or CMOV.
46424//
46425 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46426 // This combine only operates on CMP-like nodes.
46427 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46428 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46429 return SDValue();
46430
46431 // Quit if not used as a boolean value.
46432 if (CC != X86::COND_E && CC != X86::COND_NE)
46433 return SDValue();
46434
46435 // Check CMP operands. One of them should be 0 or 1 and the other should be
46436 // a SetCC or extended from it.
46437 SDValue Op1 = Cmp.getOperand(0);
46438 SDValue Op2 = Cmp.getOperand(1);
46439
46440 SDValue SetCC;
46441 const ConstantSDNode* C = nullptr;
46442 bool needOppositeCond = (CC == X86::COND_E);
46443 bool checkAgainstTrue = false; // Is it a comparison against 1?
46444
46445 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46446 SetCC = Op2;
46447 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46448 SetCC = Op1;
46449 else // Quit if neither operand is a constant.
46450 return SDValue();
46451
46452 if (C->getZExtValue() == 1) {
46453 needOppositeCond = !needOppositeCond;
46454 checkAgainstTrue = true;
46455 } else if (C->getZExtValue() != 0)
46456 // Quit if the constant is neither 0 nor 1.
46457 return SDValue();
46458
46459 bool truncatedToBoolWithAnd = false;
46460 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46461 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46462 SetCC.getOpcode() == ISD::TRUNCATE ||
46463 SetCC.getOpcode() == ISD::AND) {
46464 if (SetCC.getOpcode() == ISD::AND) {
46465 int OpIdx = -1;
46466 if (isOneConstant(SetCC.getOperand(0)))
46467 OpIdx = 1;
46468 if (isOneConstant(SetCC.getOperand(1)))
46469 OpIdx = 0;
46470 if (OpIdx < 0)
46471 break;
46472 SetCC = SetCC.getOperand(OpIdx);
46473 truncatedToBoolWithAnd = true;
46474 } else
46475 SetCC = SetCC.getOperand(0);
46476 }
46477
46478 switch (SetCC.getOpcode()) {
46479 case X86ISD::SETCC_CARRY:
46480 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46481 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46482 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46483 // truncated to i1 using 'and'.
46484 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46485 break;
46486 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46487 "Invalid use of SETCC_CARRY!");
46488 [[fallthrough]];
46489 case X86ISD::SETCC:
46490 // Set the condition code or opposite one if necessary.
46491 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46492 if (needOppositeCond)
46493 CC = X86::GetOppositeBranchCondition(CC);
46494 return SetCC.getOperand(1);
46495 case X86ISD::CMOV: {
46496 // Check whether false/true value has canonical one, i.e. 0 or 1.
46497 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46498 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46499 // Quit if true value is not a constant.
46500 if (!TVal)
46501 return SDValue();
46502 // Quit if false value is not a constant.
46503 if (!FVal) {
46504 SDValue Op = SetCC.getOperand(0);
46505 // Skip 'zext' or 'trunc' node.
46506 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46507 Op.getOpcode() == ISD::TRUNCATE)
46508 Op = Op.getOperand(0);
46509 // A special case for rdrand/rdseed, where 0 is set if false cond is
46510 // found.
46511 if ((Op.getOpcode() != X86ISD::RDRAND &&
46512 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46513 return SDValue();
46514 }
46515 // Quit if false value is not the constant 0 or 1.
46516 bool FValIsFalse = true;
46517 if (FVal && FVal->getZExtValue() != 0) {
46518 if (FVal->getZExtValue() != 1)
46519 return SDValue();
46520 // If FVal is 1, opposite cond is needed.
46521 needOppositeCond = !needOppositeCond;
46522 FValIsFalse = false;
46523 }
46524 // Quit if TVal is not the constant opposite of FVal.
46525 if (FValIsFalse && TVal->getZExtValue() != 1)
46526 return SDValue();
46527 if (!FValIsFalse && TVal->getZExtValue() != 0)
46528 return SDValue();
46529 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46530 if (needOppositeCond)
46531 CC = X86::GetOppositeBranchCondition(CC);
46532 return SetCC.getOperand(3);
46533 }
46534 }
46535
46536 return SDValue();
46537}
46538
46539/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46540/// Match:
46541/// (X86or (X86setcc) (X86setcc))
46542/// (X86cmp (and (X86setcc) (X86setcc)), 0)
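///
/// For example, (X86or (X86setcc COND_E, EFLAGS) (X86setcc COND_B, EFLAGS))
/// yields CC0 == COND_E, CC1 == COND_B, Flags == EFLAGS and isAnd == false,
/// which lets combineCMov below rewrite one CMOV on the merged condition
/// into two CMOVs on the individual conditions.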
46543static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46544 X86::CondCode &CC1, SDValue &Flags,
46545 bool &isAnd) {
46546 if (Cond->getOpcode() == X86ISD::CMP) {
46547 if (!isNullConstant(Cond->getOperand(1)))
46548 return false;
46549
46550 Cond = Cond->getOperand(0);
46551 }
46552
46553 isAnd = false;
46554
46555 SDValue SetCC0, SetCC1;
46556 switch (Cond->getOpcode()) {
46557 default: return false;
46558 case ISD::AND:
46559 case X86ISD::AND:
46560 isAnd = true;
46561 [[fallthrough]];
46562 case ISD::OR:
46563 case X86ISD::OR:
46564 SetCC0 = Cond->getOperand(0);
46565 SetCC1 = Cond->getOperand(1);
46566 break;
46567 };
46568
46569 // Make sure we have SETCC nodes, using the same flags value.
46570 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46571 SetCC1.getOpcode() != X86ISD::SETCC ||
46572 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46573 return false;
46574
46575 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46576 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46577 Flags = SetCC0->getOperand(1);
46578 return true;
46579}
46580
46581// When legalizing carry, we create carries via add X, -1
46582// If that comes from an actual carry, via setcc, we use the
46583// carry directly.
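// For example, (add (zext (setcc COND_B, Flags)), -1) sets CF exactly when
// the setcc produced 1, i.e. exactly when COND_B already held on Flags, so
// a COND_B user of this ADD's flags can simply be handed Flags instead.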
46584static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46585 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46586 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46587 bool FoundAndLSB = false;
46588 SDValue Carry = EFLAGS.getOperand(0);
46589 while (Carry.getOpcode() == ISD::TRUNCATE ||
46590 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46591 (Carry.getOpcode() == ISD::AND &&
46592 isOneConstant(Carry.getOperand(1)))) {
46593 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46594 Carry = Carry.getOperand(0);
46595 }
46596 if (Carry.getOpcode() == X86ISD::SETCC ||
46597 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46598 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46599 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46600 SDValue CarryOp1 = Carry.getOperand(1);
46601 if (CarryCC == X86::COND_B)
46602 return CarryOp1;
46603 if (CarryCC == X86::COND_A) {
46604 // Try to convert COND_A into COND_B in an attempt to facilitate
46605 // materializing "setb reg".
46606 //
46607 // Do not flip "e > c", where "c" is a constant, because Cmp
46608 // instruction cannot take an immediate as its first operand.
46609 //
46610 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46611 CarryOp1.getNode()->hasOneUse() &&
46612 CarryOp1.getValueType().isInteger() &&
46613 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46614 SDValue SubCommute =
46615 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46616 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46617 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46618 }
46619 }
46620 // If this is a check of the z flag of an add with 1, switch to the
46621 // C flag.
46622 if (CarryCC == X86::COND_E &&
46623 CarryOp1.getOpcode() == X86ISD::ADD &&
46624 isOneConstant(CarryOp1.getOperand(1)))
46625 return CarryOp1;
46626 } else if (FoundAndLSB) {
46627 SDLoc DL(Carry);
46628 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46629 if (Carry.getOpcode() == ISD::SRL) {
46630 BitNo = Carry.getOperand(1);
46631 Carry = Carry.getOperand(0);
46632 }
46633 return getBT(Carry, BitNo, DL, DAG);
46634 }
46635 }
46636 }
46637
46638 return SDValue();
46639}
46640
46641/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
46642/// to avoid the inversion.
46643static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46644 SelectionDAG &DAG,
46645 const X86Subtarget &Subtarget) {
46646 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46647 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46648 EFLAGS.getOpcode() != X86ISD::TESTP)
46649 return SDValue();
46650
46651 // PTEST/TESTP sets EFLAGS as:
46652 // TESTZ: ZF = (Op0 & Op1) == 0
46653 // TESTC: CF = (~Op0 & Op1) == 0
46654 // TESTNZC: ZF == 0 && CF == 0
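// E.g. CF = (~Op0 & Op1) == 0 asks "is Op1 covered by Op0?"; substituting
// Op0 = ~X turns that into (X & Op1) == 0, which is the TESTZ question on
// (X, Op1). That is why the NOT-peeling below swaps COND_B/COND_AE with
// COND_E/COND_NE.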
46655 MVT VT = EFLAGS.getSimpleValueType();
46656 SDValue Op0 = EFLAGS.getOperand(0);
46657 SDValue Op1 = EFLAGS.getOperand(1);
46658 MVT OpVT = Op0.getSimpleValueType();
46659 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46660
46661 // TEST*(~X,Y) == TEST*(X,Y)
46662 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46663 X86::CondCode InvCC;
46664 switch (CC) {
46665 case X86::COND_B:
46666 // testc -> testz.
46667 InvCC = X86::COND_E;
46668 break;
46669 case X86::COND_AE:
46670 // !testc -> !testz.
46671 InvCC = X86::COND_NE;
46672 break;
46673 case X86::COND_E:
46674 // testz -> testc.
46675 InvCC = X86::COND_B;
46676 break;
46677 case X86::COND_NE:
46678 // !testz -> !testc.
46679 InvCC = X86::COND_AE;
46680 break;
46681 case X86::COND_A:
46682 case X86::COND_BE:
46683 // testnzc -> testnzc (no change).
46684 InvCC = CC;
46685 break;
46686 default:
46687 InvCC = X86::COND_INVALID;
46688 break;
46689 }
46690
46691 if (InvCC != X86::COND_INVALID) {
46692 CC = InvCC;
46693 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46694 DAG.getBitcast(OpVT, NotOp0), Op1);
46695 }
46696 }
46697
46698 if (CC == X86::COND_B || CC == X86::COND_AE) {
46699 // TESTC(X,~X) == TESTC(X,-1)
46700 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46701 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46702 SDLoc DL(EFLAGS);
46703 return DAG.getNode(
46704 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46705 DAG.getBitcast(OpVT,
46706 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46707 }
46708 }
46709 }
46710
46711 if (CC == X86::COND_E || CC == X86::COND_NE) {
46712 // TESTZ(X,~Y) == TESTC(Y,X)
46713 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46714 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46715 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46716 DAG.getBitcast(OpVT, NotOp1), Op0);
46717 }
46718
46719 if (Op0 == Op1) {
46720 SDValue BC = peekThroughBitcasts(Op0);
46721 EVT BCVT = BC.getValueType();
46722
46723 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46724 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46725 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46726 DAG.getBitcast(OpVT, BC.getOperand(0)),
46727 DAG.getBitcast(OpVT, BC.getOperand(1)));
46728 }
46729
46730 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46731 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46732 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46733 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46734 DAG.getBitcast(OpVT, BC.getOperand(0)),
46735 DAG.getBitcast(OpVT, BC.getOperand(1)));
46736 }
46737
46738 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46739 // to more efficiently extract the sign bits and compare that.
46740 // TODO: Handle TESTC with comparison inversion.
46741 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46742 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
46743 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46744 unsigned EltBits = BCVT.getScalarSizeInBits();
46745 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46746 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46747 APInt SignMask = APInt::getSignMask(EltBits);
46748 if (SDValue Res =
46749 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46750 // For vXi16 cases we need to use pmovmskb and extract every other
46751 // sign bit.
46752 SDLoc DL(EFLAGS);
46753 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46754 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46755 MVT FloatVT =
46756 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46757 Res = DAG.getBitcast(FloatVT, Res);
46758 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46759 } else if (EltBits == 16) {
46760 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46761 Res = DAG.getBitcast(MovmskVT, Res);
46762 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46763 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46764 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46765 } else {
46766 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46767 }
46768 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46769 DAG.getConstant(0, DL, MVT::i32));
46770 }
46771 }
46772 }
46773 }
46774
46775 // TESTZ(-1,X) == TESTZ(X,X)
46776 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46777 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46778
46779 // TESTZ(X,-1) == TESTZ(X,X)
46780 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46781 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46782
46783 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46784 // TODO: Add COND_NE handling?
46785 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46786 SDValue Src0 = peekThroughBitcasts(Op0);
46787 SDValue Src1 = peekThroughBitcasts(Op1);
46788 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46789 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46790 peekThroughBitcasts(Src0.getOperand(1)), true);
46791 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46792 peekThroughBitcasts(Src1.getOperand(1)), true);
46793 if (Src0 && Src1) {
46794 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46795 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46796 DAG.getBitcast(OpVT2, Src0),
46797 DAG.getBitcast(OpVT2, Src1));
46798 }
46799 }
46800 }
46801 }
46802
46803 return SDValue();
46804}
46805
46806// Attempt to simplify the MOVMSK input based on the comparison type.
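// For example, with a v16i8 PCMPEQB result V, "MOVMSK(V) == 0xFFFF" is an
// all_of test (every lane compared equal) and "MOVMSK(V) != 0" is an any_of
// test (at least one lane compared equal); the folds below try to express
// such tests directly as cheaper PTEST/TESTP-style flag tests.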
46807static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46808 SelectionDAG &DAG,
46809 const X86Subtarget &Subtarget) {
46810 // Handle eq/ne against zero (any_of).
46811 // Handle eq/ne against -1 (all_of).
46812 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46813 return SDValue();
46814 if (EFLAGS.getValueType() != MVT::i32)
46815 return SDValue();
46816 unsigned CmpOpcode = EFLAGS.getOpcode();
46817 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46818 return SDValue();
46819 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46820 if (!CmpConstant)
46821 return SDValue();
46822 const APInt &CmpVal = CmpConstant->getAPIntValue();
46823
46824 SDValue CmpOp = EFLAGS.getOperand(0);
46825 unsigned CmpBits = CmpOp.getValueSizeInBits();
46826 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46827
46828 // Peek through any truncate.
46829 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46830 CmpOp = CmpOp.getOperand(0);
46831
46832 // Bail if we don't find a MOVMSK.
46833 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46834 return SDValue();
46835
46836 SDValue Vec = CmpOp.getOperand(0);
46837 MVT VecVT = Vec.getSimpleValueType();
46838 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46839 "Unexpected MOVMSK operand");
46840 unsigned NumElts = VecVT.getVectorNumElements();
46841 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46842
46843 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46844 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46845 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46846 if (!IsAnyOf && !IsAllOf)
46847 return SDValue();
46848
46849 // TODO: Check for more combining cases.
46850 // Here we check the number of uses of the CMP to decide whether to combine.
46851 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" patterns
46852 // have test coverage showing they benefit under this one-use constraint.
46853 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46854
46855 // See if we can peek through to a vector with a wider element type, if the
46856 // signbits extend down to all the sub-elements as well.
46857 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46858 // potential SimplifyDemandedBits/Elts cases.
46859 // If we looked through a truncate that discards bits, we can't do this
46860 // transform.
46861 // FIXME: We could do this transform for truncates that discarded bits by
46862 // inserting an AND mask between the new MOVMSK and the CMP.
46863 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46864 SDValue BC = peekThroughBitcasts(Vec);
46865 MVT BCVT = BC.getSimpleValueType();
46866 unsigned BCNumElts = BCVT.getVectorNumElements();
46867 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46868 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46869 BCNumEltBits > NumEltBits &&
46870 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46871 SDLoc DL(EFLAGS);
46872 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46873 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46874 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46875 DAG.getConstant(CmpMask, DL, MVT::i32));
46876 }
46877 }
46878
46879 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46880 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46881 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46882 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
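// E.g. for v32i8, "!= 0" only asks whether any sign bit is set in either
// 128-bit half, so one MOVMSK of OR(X,Y) suffices; the "== -1" form needs
// every sign bit set in both halves, hence AND(X,Y).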
46883 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46884 SmallVector<SDValue> Ops;
46885 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46886 Ops.size() == 2) {
46887 SDLoc DL(EFLAGS);
46888 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46889 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46890 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46891 DAG.getBitcast(SubVT, Ops[0]),
46892 DAG.getBitcast(SubVT, Ops[1]));
46893 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46894 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46895 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46896 DAG.getConstant(CmpMask, DL, MVT::i32));
46897 }
46898 }
46899
46900 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46901 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46902 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
46903 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
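// E.g. a 16-byte "all bytes equal" test lowered as pcmpeqb+pmovmskb+cmp
// 0xFFFF becomes pxor+ptest, since every lane of X equals Y exactly when
// XOR(X,Y) is all zero.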
46904 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46905 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46906 SDValue BC = peekThroughBitcasts(Vec);
46907 // Ensure MOVMSK was testing every signbit of BC.
46908 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46909 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46910 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
46911 BC.getOperand(0), BC.getOperand(1));
46912 V = DAG.getBitcast(TestVT, V);
46913 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46914 }
46915 // Check for 256-bit split vector cases.
46916 if (BC.getOpcode() == ISD::AND &&
46917 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46918 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46919 SDValue LHS = BC.getOperand(0);
46920 SDValue RHS = BC.getOperand(1);
46921 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
46922 LHS.getOperand(0), LHS.getOperand(1));
46923 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
46924 RHS.getOperand(0), RHS.getOperand(1));
46925 LHS = DAG.getBitcast(TestVT, LHS);
46926 RHS = DAG.getBitcast(TestVT, RHS);
46927 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46928 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46929 }
46930 }
46931 }
46932
46933 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46934 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46935 // sign bits prior to the comparison with zero unless we know that
46936 // the vXi16 splats the sign bit down to the lower i8 half.
46937 // TODO: Handle all_of patterns.
46938 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46939 SDValue VecOp0 = Vec.getOperand(0);
46940 SDValue VecOp1 = Vec.getOperand(1);
46941 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46942 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46943 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46944 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46945 SDLoc DL(EFLAGS);
46946 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46947 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46948 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46949 if (!SignExt0) {
46950 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46951 DAG.getConstant(0xAAAA, DL, MVT::i16));
46952 }
46953 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46954 DAG.getConstant(0, DL, MVT::i16));
46955 }
46956 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46957 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46958 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46959 (IsAnyOf || (SignExt0 && SignExt1))) {
46960 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46961 SDLoc DL(EFLAGS);
46962 SDValue Result = peekThroughBitcasts(Src);
46963 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46964 Result.getValueType().getVectorNumElements() <= NumElts) {
46965 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
46966 Result.getOperand(0), Result.getOperand(1));
46967 V = DAG.getBitcast(MVT::v4i64, V);
46968 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46969 }
46970 Result = DAG.getBitcast(MVT::v32i8, Result);
46971 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46972 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46973 if (!SignExt0 || !SignExt1) {
46974 assert(IsAnyOf &&
46975 "Only perform v16i16 signmasks for any_of patterns");
46976 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46977 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46978 }
46979 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46980 DAG.getConstant(CmpMask, DL, MVT::i32));
46981 }
46982 }
46983 }
46984
46985 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46986 // Since we peek through a bitcast, we need to be careful if the base vector
46987 // type has smaller elements than the MOVMSK type. In that case, even if
46988 // all the elements are demanded by the shuffle mask, only the "high"
46989 // elements which have highbits that align with highbits in the MOVMSK vec
46990 // elements are actually demanded. A simplification of spurious operations
46991 // on the "low" elements take place during other simplifications.
46992 //
46993 // For example:
46994 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
46995 // demanded, the result can change because we are swapping elements around.
46996 //
46997 // To address this, we check that we can scale the shuffle mask to MOVMSK
46998 // element width (this will ensure "high" elements match). Its slightly overly
46999 // conservative, but fine for an edge case fold.
47000 SmallVector<int, 32> ShuffleMask;
47001 SmallVector<SDValue, 2> ShuffleInputs;
47002 if (NumElts <= CmpBits &&
47003 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47004 ShuffleMask, DAG) &&
47005 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
47006 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
47007 canScaleShuffleElements(ShuffleMask, NumElts)) {
47008 SDLoc DL(EFLAGS);
47009 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47010 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47011 Result =
47012 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47013 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
47014 }
47015
47016 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47017 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47018 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
47019 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
47020 // iff every element is referenced.
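// E.g. the all_of test "MOVMSKPS(V) == -1" becomes TESTPS(V, all-ones) with
// the condition remapped to COND_B: CF is set when the all-ones operand has
// no sign bit outside V, i.e. when every sign bit of V is set.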
47021 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
47022 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
47023 (NumEltBits == 32 || NumEltBits == 64)) {
47024 SDLoc DL(EFLAGS);
47025 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47026 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47027 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
47028 SDValue LHS = Vec;
47029 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
47030 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47031 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
47032 DAG.getBitcast(FloatVT, LHS),
47033 DAG.getBitcast(FloatVT, RHS));
47034 }
47035
47036 return SDValue();
47037}
47038
47039/// Optimize an EFLAGS definition used according to the condition code \p CC
47040/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47041/// uses of chain values.
47042static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47043 SelectionDAG &DAG,
47044 const X86Subtarget &Subtarget) {
47045 if (CC == X86::COND_B)
47046 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47047 return Flags;
47048
47049 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47050 return R;
47051
47052 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47053 return R;
47054
47055 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47056 return R;
47057
47058 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47059}
47060
47061/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47062static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47063 TargetLowering::DAGCombinerInfo &DCI,
47064 const X86Subtarget &Subtarget) {
47065 SDLoc DL(N);
47066
47067 SDValue FalseOp = N->getOperand(0);
47068 SDValue TrueOp = N->getOperand(1);
47069 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47070 SDValue Cond = N->getOperand(3);
47071
47072 // cmov X, X, ?, ? --> X
47073 if (TrueOp == FalseOp)
47074 return TrueOp;
47075
47076 // Try to simplify the EFLAGS and condition code operands.
47077 // We can't always do this as FCMOV only supports a subset of X86 cond.
47078 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47079 if (!(FalseOp.getValueType() == MVT::f80 ||
47080 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47081 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47082 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47083 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47084 Flags};
47085 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47086 }
47087 }
47088
47089 // If this is a select between two integer constants, try to do some
47090 // optimizations. Note that the operands are ordered the opposite of SELECT
47091 // operands.
47092 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47093 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47094 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47095 // larger than FalseC (the false value).
47096 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47097 CC = X86::GetOppositeBranchCondition(CC);
47098 std::swap(TrueC, FalseC);
47099 std::swap(TrueOp, FalseOp);
47100 }
47101
47102 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47103 // This is efficient for any integer data type (including i8/i16) and
47104 // shift amount.
47105 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47106 Cond = getSETCC(CC, Cond, DL, DAG);
47107
47108 // Zero extend the condition if needed.
47109 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47110
47111 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47112 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47113 DAG.getConstant(ShAmt, DL, MVT::i8));
47114 return Cond;
47115 }
47116
47117 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
47118 // for any integer data type, including i8/i16.
47119 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47120 Cond = getSETCC(CC, Cond, DL, DAG);
47121
47122 // Zero extend the condition if needed.
47123 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47124 FalseC->getValueType(0), Cond);
47125 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47126 SDValue(FalseC, 0));
47127 return Cond;
47128 }
47129
47130 // Optimize cases that will turn into an LEA instruction. This requires
47131 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47132 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47133 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47134 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47135 "Implicit constant truncation");
47136
47137 bool isFastMultiplier = false;
47138 if (Diff.ult(10)) {
47139 switch (Diff.getZExtValue()) {
47140 default: break;
47141 case 1: // result = add base, cond
47142 case 2: // result = lea base( , cond*2)
47143 case 3: // result = lea base(cond, cond*2)
47144 case 4: // result = lea base( , cond*4)
47145 case 5: // result = lea base(cond, cond*4)
47146 case 8: // result = lea base( , cond*8)
47147 case 9: // result = lea base(cond, cond*8)
47148 isFastMultiplier = true;
47149 break;
47150 }
47151 }
47152
47153 if (isFastMultiplier) {
47154 Cond = getSETCC(CC, Cond, DL ,DAG);
47155 // Zero extend the condition if needed.
47156 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47157 Cond);
47158 // Scale the condition by the difference.
47159 if (Diff != 1)
47160 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47161 DAG.getConstant(Diff, DL, Cond.getValueType()));
47162
47163 // Add the base if non-zero.
47164 if (FalseC->getAPIntValue() != 0)
47165 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47166 SDValue(FalseC, 0));
47167 return Cond;
47168 }
47169 }
47170 }
47171 }
47172
47173 // Handle these cases:
47174 // (select (x != c), e, c) -> (select (x != c), e, x),
47175 // (select (x == c), c, e) -> (select (x == c), x, e)
47176 // where the c is an integer constant, and the "select" is the combination
47177 // of CMOV and CMP.
47178 //
47179 // The rationale for this change is that the conditional-move from a constant
47180 // needs two instructions; however, a conditional-move from a register needs
47181 // only one instruction.
47182 //
47183 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47184 // some instruction-combining opportunities. This opt needs to be
47185 // postponed as late as possible.
47186 //
47187 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47188 // the DCI.xxxx conditions are provided to postpone the optimization as
47189 // late as possible.
47190
47191 ConstantSDNode *CmpAgainst = nullptr;
47192 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47193 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47194 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47195
47196 if (CC == X86::COND_NE &&
47197 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47198 CC = X86::GetOppositeBranchCondition(CC);
47199 std::swap(TrueOp, FalseOp);
47200 }
47201
47202 if (CC == X86::COND_E &&
47203 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47204 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47205 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47206 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47207 }
47208 }
47209 }
47210
47211 // Transform:
47212 //
47213 // (cmov 1 T (uge T 2))
47214 //
47215 // to:
47216 //
47217 // (adc T 0 (sub T 1))
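//
// E.g. for "t >= 2 ? t : 1" (clamp t below at 1): "sub t, 1" borrows only
// when t == 0, so "adc t, 0" adds the 1 back exactly in that case, giving
// the same result without a cmov.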
47218 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47219 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47220 SDValue Cond0 = Cond.getOperand(0);
47221 if (Cond0.getOpcode() == ISD::TRUNCATE)
47222 Cond0 = Cond0.getOperand(0);
47223 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47224 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47225 EVT CondVT = Cond->getValueType(0);
47226 EVT OuterVT = N->getValueType(0);
47227 // Subtract 1 and generate a carry.
47228 SDValue NewSub =
47229 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47230 DAG.getConstant(1, DL, CondVT));
47231 SDValue EFLAGS(NewSub.getNode(), 1);
47232 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47233 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47234 }
47235 }
47236
47237 // Fold and/or of setcc's to double CMOV:
47238 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47239 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47240 //
47241 // This combine lets us generate:
47242 // cmovcc1 (jcc1 if we don't have CMOV)
47243 // cmovcc2 (same)
47244 // instead of:
47245 // setcc1
47246 // setcc2
47247 // and/or
47248 // cmovne (jne if we don't have CMOV)
47249 // When we can't use the CMOV instruction, it might increase branch
47250 // mispredicts.
47251 // When we can use CMOV, or when there is no mispredict, this improves
47252 // throughput and reduces register pressure.
47253 //
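// For example, "(a < b) && (c != d) ? T : F" is legalized to one CMOV on
// (and (setcc), (setcc)) != 0; the rewrite below turns it into two chained
// CMOVs on the original EFLAGS with both conditions inverted (the AND case
// of the second pattern above).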
47254 if (CC == X86::COND_NE) {
47255 SDValue Flags;
47256 X86::CondCode CC0, CC1;
47257 bool isAndSetCC;
47258 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47259 if (isAndSetCC) {
47260 std::swap(FalseOp, TrueOp);
47261 CC0 = X86::GetOppositeBranchCondition(CC0);
47262 CC1 = X86::GetOppositeBranchCondition(CC1);
47263 }
47264
47265 SDValue LOps[] = {FalseOp, TrueOp,
47266 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47267 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47268 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47269 Flags};
47270 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47271 return CMOV;
47272 }
47273 }
47274
47275 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47276 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47277 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47278 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
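// E.g. "x ? cttz(x) + 1 : 32" (a 1-based bit-index idiom) has C1 == 32 and
// C2 == 1, so it is rewritten to (add (cmov 31, (cttz x), (x != 0)), 1),
// hoisting the +1 out of the conditional.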
47279 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47280 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47281 SDValue Add = TrueOp;
47282 SDValue Const = FalseOp;
47283 // Canonicalize the condition code for easier matching and output.
47284 if (CC == X86::COND_E)
47285 std::swap(Add, Const);
47286
47287 // We might have replaced the constant in the cmov with the LHS of the
47288 // compare. If so change it to the RHS of the compare.
47289 if (Const == Cond.getOperand(0))
47290 Const = Cond.getOperand(1);
47291
47292 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47293 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47294 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47295 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47296 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47297 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47298 EVT VT = N->getValueType(0);
47299 // This should constant fold.
47300 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47301 SDValue CMov =
47302 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47303 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47304 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47305 }
47306 }
47307
47308 return SDValue();
47309}
47310
47311/// Different mul shrinking modes.
47312enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47313
47314static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47315 EVT VT = N->getOperand(0).getValueType();
47316 if (VT.getScalarSizeInBits() != 32)
47317 return false;
47318
47319 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47320 unsigned SignBits[2] = {1, 1};
47321 bool IsPositive[2] = {false, false};
47322 for (unsigned i = 0; i < 2; i++) {
47323 SDValue Opd = N->getOperand(i);
47324
47325 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47326 IsPositive[i] = DAG.SignBitIsZero(Opd);
47327 }
47328
47329 bool AllPositive = IsPositive[0] && IsPositive[1];
47330 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47331 // When ranges are from -128 ~ 127, use MULS8 mode.
47332 if (MinSignBits >= 25)
47333 Mode = ShrinkMode::MULS8;
47334 // When ranges are from 0 ~ 255, use MULU8 mode.
47335 else if (AllPositive && MinSignBits >= 24)
47336 Mode = ShrinkMode::MULU8;
47337 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47338 else if (MinSignBits >= 17)
47339 Mode = ShrinkMode::MULS16;
47340 // When ranges are from 0 ~ 65535, use MULU16 mode.
47341 else if (AllPositive && MinSignBits >= 16)
47342 Mode = ShrinkMode::MULU16;
47343 else
47344 return false;
47345 return true;
47346}
47347
47348/// When the operands of vector mul are extended from smaller size values,
47349/// like i8 and i16, the type of mul may be shrunk to generate more
47350/// efficient code. Two typical patterns are handled:
47351/// Pattern1:
47352/// %2 = sext/zext <N x i8> %1 to <N x i32>
47353/// %4 = sext/zext <N x i8> %3 to <N x i32>
47354/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47355/// %5 = mul <N x i32> %2, %4
47356///
47357/// Pattern2:
47358/// %2 = zext/sext <N x i16> %1 to <N x i32>
47359/// %4 = zext/sext <N x i16> %3 to <N x i32>
47360/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47361/// %5 = mul <N x i32> %2, %4
47362///
47363/// There are four mul shrinking modes:
47364/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47365/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47366/// generate pmullw+sext32 for it (MULS8 mode).
47367/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47368/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47369/// generate pmullw+zext32 for it (MULU8 mode).
47370/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47371/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47372/// generate pmullw+pmulhw for it (MULS16 mode).
47373/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47374/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47375/// generate pmullw+pmulhuw for it (MULU16 mode).
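///
/// For example, multiplying two <8 x i32> vectors whose elements are all in
/// [0, 65535] (MULU16) takes one pmullw plus one pmulhuw on <8 x i16>, after
/// which the low/high 16-bit halves are interleaved back into <8 x i32> via
/// punpcklwd/punpckhwd-style shuffles, instead of a full 32-bit multiply.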
47376static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47377 const X86Subtarget &Subtarget) {
47378 // Check for legality
47379 // pmullw/pmulhw are not supported by SSE.
47380 if (!Subtarget.hasSSE2())
47381 return SDValue();
47382
47383 // Check for profitability
47384 // pmulld is supported since SSE41. It is better to use pmulld
47385 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47386 // the expansion.
47387 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47388 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47389 return SDValue();
47390
47391 ShrinkMode Mode;
47392 if (!canReduceVMulWidth(N, DAG, Mode))
47393 return SDValue();
47394
47395 SDValue N0 = N->getOperand(0);
47396 SDValue N1 = N->getOperand(1);
47397 EVT VT = N->getOperand(0).getValueType();
47398 unsigned NumElts = VT.getVectorNumElements();
47399 if ((NumElts % 2) != 0)
47400 return SDValue();
47401
47402 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47403
47404 // Shrink the operands of mul.
47405 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47406 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47407
47408 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47409 // lower part is needed.
47410 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47411 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47412 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47413 : ISD::SIGN_EXTEND,
47414 DL, VT, MulLo);
47415
47416 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47417 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47418 // the higher part is also needed.
47419 SDValue MulHi =
47420 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47421 ReducedVT, NewN0, NewN1);
47422
47423 // Repack the lower part and higher part result of mul into a wider
47424 // result.
47425 // Generate shuffle functioning as punpcklwd.
47426 SmallVector<int, 16> ShuffleMask(NumElts);
47427 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47428 ShuffleMask[2 * i] = i;
47429 ShuffleMask[2 * i + 1] = i + NumElts;
47430 }
47431 SDValue ResLo =
47432 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47433 ResLo = DAG.getBitcast(ResVT, ResLo);
47434 // Generate shuffle functioning as punpckhwd.
47435 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47436 ShuffleMask[2 * i] = i + NumElts / 2;
47437 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47438 }
47439 SDValue ResHi =
47440 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47441 ResHi = DAG.getBitcast(ResVT, ResHi);
47442 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47443}
47444
47445static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47446 EVT VT, const SDLoc &DL) {
47447
47448 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47449 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47450 DAG.getConstant(Mult, DL, VT));
47451 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47452 DAG.getConstant(Shift, DL, MVT::i8));
47453 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47454 N->getOperand(0));
47455 return Result;
47456 };
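// As a machine-level sketch (assuming the usual LEA selection): case 11
// below, add ((shl (mul x, 5), 1), x), can become "lea t, [x + 4*x]"
// followed by "lea r, [x + 2*t]", i.e. two LEAs in place of an imul.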
47457
47458 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47459 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47460 DAG.getConstant(Mul1, DL, VT));
47461 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47462 DAG.getConstant(Mul2, DL, VT));
47463 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47464 N->getOperand(0));
47465 return Result;
47466 };
47467
47468 switch (MulAmt) {
47469 default:
47470 break;
47471 case 11:
47472 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47473 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47474 case 21:
47475 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47476 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47477 case 41:
47478 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47479 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47480 case 22:
47481 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47482 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47483 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47484 case 19:
47485 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47486 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47487 case 37:
47488 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47489 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47490 case 73:
47491 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47492 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47493 case 13:
47494 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47495 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47496 case 23:
47497 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47498 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47499 case 26:
47500 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47501 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47502 case 28:
47503 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47504 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47505 case 29:
47506 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47507 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47508 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47509 }
47510
47511 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47512 // by a single LEA.
47513 // First check if this is a sum of two powers of 2 because that's easy. Then
47514 // count how many zeros there are up to the first set bit.
47515 // TODO: We can do this even without LEA at a cost of two shifts and an add.
47516 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47517 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47518 if (ScaleShift >= 1 && ScaleShift < 4) {
47519 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47520 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47521 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47522 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47523 DAG.getConstant(ScaleShift, DL, MVT::i8));
47524 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47525 }
47526 }
47527
47528 return SDValue();
47529}
47530
47531// If the upper 17 bits of one operand are zero and the upper bits of the
47532// other operand are all zero/sign bits, then we can use PMADDWD, which is always at least as quick as
47533// PMULLD, except on KNL.
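// For example, if both operands were zero-extended from v4i8, every 32-bit
// lane holds a value in [0, 255]: the odd 16-bit halves are all zero, so
// pmaddwd's per-lane a[2i]*b[2i] + a[2i+1]*b[2i+1] reduces to the single
// 16x16->32 product of the low halves, which equals the i32 multiply.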
47534static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
47535 SelectionDAG &DAG,
47536 const X86Subtarget &Subtarget) {
47537 if (!Subtarget.hasSSE2())
47538 return SDValue();
47539
47540 if (Subtarget.isPMADDWDSlow())
47541 return SDValue();
47542
47543 EVT VT = N->getValueType(0);
47544
47545 // Only support vXi32 vectors.
47546 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47547 return SDValue();
47548
47549 // Make sure the type is legal or can split/widen to a legal type.
47550 // With AVX512 but without BWI, we would need to split v32i16.
47551 unsigned NumElts = VT.getVectorNumElements();
47552 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47553 return SDValue();
47554
47555 // With AVX512 but without BWI, we would need to split v32i16.
47556 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47557 return SDValue();
47558
47559 SDValue N0 = N->getOperand(0);
47560 SDValue N1 = N->getOperand(1);
47561
47562 // If we are zero/sign extending two steps without SSE4.1, it's better to
47563 // reduce the vmul width instead.
47564 if (!Subtarget.hasSSE41() &&
47565 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47566 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47567 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47568 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47569 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47570 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47571 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47572 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47573 return SDValue();
47574
47575 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47576 // the vmul width instead.
47577 if (!Subtarget.hasSSE41() &&
47578 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47579 N0.getOperand(0).getValueSizeInBits() > 128) &&
47580 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47581 N1.getOperand(0).getValueSizeInBits() > 128))
47582 return SDValue();
47583
47584 // Sign bits must extend down to the lowest i16.
47585 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47586 DAG.ComputeMaxSignificantBits(N0) > 16)
47587 return SDValue();
47588
47589 // At least one of the elements must be zero in the upper 17 bits, or can be
47590 // safely made zero without altering the final result.
47591 auto GetZeroableOp = [&](SDValue Op) {
47592 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47593 if (DAG.MaskedValueIsZero(Op, Mask17))
47594 return Op;
47595 // Mask off upper 16-bits of sign-extended constants.
47596 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47597 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
47598 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47599 SDValue Src = Op.getOperand(0);
47600 // Convert sext(vXi16) to zext(vXi16).
47601 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47602 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47603 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47604 // which will expand the extension.
47605 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47606 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47607 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
47608 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47609 }
47610 }
47611 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
47612 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47613 N->isOnlyUserOf(Op.getNode())) {
47614 SDValue Src = Op.getOperand(0);
47615 if (Src.getScalarValueSizeInBits() == 16)
47616 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
47617 }
47618 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47619 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47620 N->isOnlyUserOf(Op.getNode())) {
47621 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
47622 Op.getOperand(1));
47623 }
47624 return SDValue();
47625 };
47626 SDValue ZeroN0 = GetZeroableOp(N0);
47627 SDValue ZeroN1 = GetZeroableOp(N1);
47628 if (!ZeroN0 && !ZeroN1)
47629 return SDValue();
47630 N0 = ZeroN0 ? ZeroN0 : N0;
47631 N1 = ZeroN1 ? ZeroN1 : N1;
47632
47633 // Use SplitOpsAndApply to handle AVX splitting.
47634 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47635 ArrayRef<SDValue> Ops) {
47636 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47637 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47638 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47639 DAG.getBitcast(OpVT, Ops[0]),
47640 DAG.getBitcast(OpVT, Ops[1]));
47641 };
47642 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
47643}
47644
47645static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47646 const X86Subtarget &Subtarget) {
47647 if (!Subtarget.hasSSE2())
47648 return SDValue();
47649
47650 EVT VT = N->getValueType(0);
47651
47652 // Only support vXi64 vectors.
47653 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47654 VT.getVectorNumElements() < 2 ||
47655 !isPowerOf2_32(VT.getVectorNumElements()))
47656 return SDValue();
47657
47658 SDValue N0 = N->getOperand(0);
47659 SDValue N1 = N->getOperand(1);
47660
47661 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47662 // 32-bits. We can lower with this if the sign bits stretch that far.
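// E.g. two v2i64 vectors that were sign-extended from v2i32 have more than
// 32 sign bits per element, so each 64-bit product depends only on the low
// 32 bits of each operand and a single pmuldq already gives the exact result.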
47663 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47664 DAG.ComputeNumSignBits(N1) > 32) {
47665 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47666 ArrayRef<SDValue> Ops) {
47667 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47668 };
47669 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
47670 /*CheckBWI*/ false);
47671 }
47672
47673 // If the upper bits are zero we can use a single pmuludq.
47674 APInt Mask = APInt::getHighBitsSet(64, 32);
47675 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47676 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47677 ArrayRef<SDValue> Ops) {
47678 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47679 };
47680 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
47681 /*CheckBWI*/ false);
47682 }
47683
47684 return SDValue();
47685}
47686
47687static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47688 TargetLowering::DAGCombinerInfo &DCI,
47689 const X86Subtarget &Subtarget) {
47690 EVT VT = N->getValueType(0);
47691 SDLoc DL(N);
47692
47693 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
47694 return V;
47695
47696 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
47697 return V;
47698
47699 if (DCI.isBeforeLegalize() && VT.isVector())
47700 return reduceVMULWidth(N, DL, DAG, Subtarget);
47701
47702 // Optimize a single multiply with constant into two operations in order to
47703 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47704 if (!MulConstantOptimization)
47705 return SDValue();
47706
47707 // An imul is usually smaller than the alternative sequence.
47708 if (DAG.getMachineFunction().getFunction().hasMinSize())
47709 return SDValue();
47710
47711 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47712 return SDValue();
47713
47714 if (VT != MVT::i64 && VT != MVT::i32 &&
47715 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47716 return SDValue();
47717
47718 ConstantSDNode *CNode = isConstOrConstSplat(
47719 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47720 const APInt *C = nullptr;
47721 if (!CNode) {
47722 if (VT.isVector())
47723 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47724 if (auto *SplatC = RawC->getSplatValue())
47725 C = &(SplatC->getUniqueInteger());
47726
47727 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47728 return SDValue();
47729 } else {
47730 C = &(CNode->getAPIntValue());
47731 }
47732
47733 if (isPowerOf2_64(C->getZExtValue()))
47734 return SDValue();
47735
47736 int64_t SignMulAmt = C->getSExtValue();
47737 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47738 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47739
47740 SDValue NewMul = SDValue();
47741 if (VT == MVT::i64 || VT == MVT::i32) {
47742 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47743 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47744 DAG.getConstant(AbsMulAmt, DL, VT));
47745 if (SignMulAmt < 0)
47746 NewMul = DAG.getNegative(NewMul, DL, VT);
47747
47748 return NewMul;
47749 }
47750
47751 uint64_t MulAmt1 = 0;
47752 uint64_t MulAmt2 = 0;
47753 if ((AbsMulAmt % 9) == 0) {
47754 MulAmt1 = 9;
47755 MulAmt2 = AbsMulAmt / 9;
47756 } else if ((AbsMulAmt % 5) == 0) {
47757 MulAmt1 = 5;
47758 MulAmt2 = AbsMulAmt / 5;
47759 } else if ((AbsMulAmt % 3) == 0) {
47760 MulAmt1 = 3;
47761 MulAmt2 = AbsMulAmt / 3;
47762 }
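// E.g. AbsMulAmt == 45 splits as 9 * 5 (two LEAs), while AbsMulAmt == 48
// splits as 3 * 16 (one LEA plus one shift).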
47763
47764 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47765 if (MulAmt2 &&
47766 (isPowerOf2_64(MulAmt2) ||
47767 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47768
47769 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47770 N->use_begin()->getOpcode() == ISD::ADD))
47771 // If the second multiplier is pow2, issue it first. We want the multiply
47772 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47773 // use is an add. Only do this for positive multiply amounts since the
47774 // negate would prevent it from being used as an address mode anyway.
47775 std::swap(MulAmt1, MulAmt2);
47776
47777 if (isPowerOf2_64(MulAmt1))
47778 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47779 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47780 else
47781 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47782 DAG.getConstant(MulAmt1, DL, VT));
47783
47784 if (isPowerOf2_64(MulAmt2))
47785 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47786 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47787 else
47788 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47789 DAG.getConstant(MulAmt2, DL, VT));
47790
47791 // Negate the result.
47792 if (SignMulAmt < 0)
47793 NewMul = DAG.getNegative(NewMul, DL, VT);
47794 } else if (!Subtarget.slowLEA())
47795 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47796 }
47797 if (!NewMul) {
47798 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47799 assert(C->getZExtValue() != 0 &&
47800 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47801 "Both cases that could cause potential overflows should have "
47802 "already been handled.");
47803 if (isPowerOf2_64(AbsMulAmt - 1)) {
47804 // (mul x, 2^N + 1) => (add (shl x, N), x)
47805 NewMul = DAG.getNode(
47806 ISD::ADD, DL, VT, N->getOperand(0),
47807 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47808 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47809 if (SignMulAmt < 0)
47810 NewMul = DAG.getNegative(NewMul, DL, VT);
47811 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47812 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47813 NewMul =
47814 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47815 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47816 // To negate, reverse the operands of the subtract.
47817 if (SignMulAmt < 0)
47818 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47819 else
47820 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47821 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47822 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47823 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47824 NewMul =
47825 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47826 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47827 NewMul = DAG.getNode(
47828 ISD::ADD, DL, VT, NewMul,
47829 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47830 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47831 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47832 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47833 NewMul =
47834 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47835 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
47836 NewMul = DAG.getNode(
47837 ISD::SUB, DL, VT, NewMul,
47838 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47839 } else if (SignMulAmt >= 0 && VT.isVector() &&
47840 Subtarget.fastImmVectorShift()) {
47841 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
47842 uint64_t ShiftAmt1;
47843 std::optional<unsigned> Opc;
47844 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
47845 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
47846 Opc = ISD::ADD;
47847 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
47848 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
47849 Opc = ISD::SUB;
47850 }
47851
47852 if (Opc) {
47853 SDValue Shift1 =
47854 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47855 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
47856 SDValue Shift2 =
47857 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47858 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
47859 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
47860 }
47861 }
47862 }
47863
47864 return NewMul;
47865}
47866
47867// Try to form a MULHU or MULHS node by looking for
47868// (srl (mul ext, ext), 16)
47869// TODO: This is X86 specific because we want to be able to handle wide types
47870// before type legalization. But we can only do it if the vector will be
47871// legalized via widening/splitting. Type legalization can't handle promotion
47872// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47873// combiner.
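// For example, (srl (mul (zext v8i16:X), (zext v8i16:Y)), 16) keeps only the
// high 16 bits of each unsigned 16x16 product, which is MULHU on the original
// v8i16 values followed by a zero extend, i.e. a single pmulhuw instead of
// widening multiplies.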
47874static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47875 const X86Subtarget &Subtarget) {
47876 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47877 "SRL or SRA node is required here!");
47878 SDLoc DL(N);
47879
47880 if (!Subtarget.hasSSE2())
47881 return SDValue();
47882
47883 // The operation feeding into the shift must be a multiply.
47884 SDValue ShiftOperand = N->getOperand(0);
47885 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47886 return SDValue();
47887
47888 // Input type should be at least vXi32.
47889 EVT VT = N->getValueType(0);
47890 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47891 return SDValue();
47892
47893 // Need a shift by 16.
47894 APInt ShiftAmt;
47895 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47896 ShiftAmt != 16)
47897 return SDValue();
47898
47899 SDValue LHS = ShiftOperand.getOperand(0);
47900 SDValue RHS = ShiftOperand.getOperand(1);
47901
47902 unsigned ExtOpc = LHS.getOpcode();
47903 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47904 RHS.getOpcode() != ExtOpc)
47905 return SDValue();
47906
47907 // Peek through the extends.
47908 LHS = LHS.getOperand(0);
47909 RHS = RHS.getOperand(0);
47910
47911 // Ensure the input types match.
47912 EVT MulVT = LHS.getValueType();
47913 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47914 return SDValue();
47915
47916 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47917 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47918
47919 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47920 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47921}
47922
47923static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47924 SDValue N0 = N->getOperand(0);
47925 SDValue N1 = N->getOperand(1);
47926 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47927 EVT VT = N0.getValueType();
47928
47929 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47930 // since the result of setcc_c is all zero's or all ones.
47931 if (VT.isInteger() && !VT.isVector() &&
47932 N1C && N0.getOpcode() == ISD::AND &&
47933 N0.getOperand(1).getOpcode() == ISD::Constant) {
47934 SDValue N00 = N0.getOperand(0);
47935 APInt Mask = N0.getConstantOperandAPInt(1);
47936 Mask <<= N1C->getAPIntValue();
47937 bool MaskOK = false;
47938 // We can handle cases concerning bit-widening nodes containing setcc_c if
47939 // we carefully interrogate the mask to make sure we are semantics
47940 // preserving.
47941 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47942 // of the underlying setcc_c operation if the setcc_c was zero extended.
47943 // Consider the following example:
47944 // zext(setcc_c) -> i32 0x0000FFFF
47945 // c1 -> i32 0x0000FFFF
47946 // c2 -> i32 0x00000001
47947 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47948 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47949 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47950 MaskOK = true;
47951 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47952 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47953 MaskOK = true;
47954 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47955 N00.getOpcode() == ISD::ANY_EXTEND) &&
47956 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47957 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47958 }
47959 if (MaskOK && Mask != 0) {
47960 SDLoc DL(N);
47961 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47962 }
47963 }
47964
47965 return SDValue();
47966}
47967
47968static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47969 const X86Subtarget &Subtarget) {
47970 SDValue N0 = N->getOperand(0);
47971 SDValue N1 = N->getOperand(1);
47972 EVT VT = N0.getValueType();
47973 unsigned Size = VT.getSizeInBits();
47974
47975 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47976 return V;
47977
47978 APInt ShiftAmt;
47979 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
47980 N1.getOpcode() == ISD::UMIN &&
47981 ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
47982 ShiftAmt == VT.getScalarSizeInBits() - 1) {
47983 SDValue ShrAmtVal = N1.getOperand(0);
47984 SDLoc DL(N);
47985 return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
47986 }
47987
47988 // fold (SRA (SHL X, ShlConst), SraConst)
47989 // into (SHL (sext_in_reg X), ShlConst - SraConst)
47990 // or (sext_in_reg X)
47991 // or (SRA (sext_in_reg X), SraConst - ShlConst)
47992 // depending on relation between SraConst and ShlConst.
47993 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
47994 // us to do the sext_in_reg from the corresponding bit.
47995
47996 // sexts on X86 are MOVs. The MOVs have the same code size
47997 // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
47998 // However, the MOVs have 2 advantages over a SHIFT:
47999 // 1. MOVs can write to a register that differs from the source.
48000 // 2. MOVs accept memory operands.
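// For example, for i32 (Size == 32) and ShlConst == 24:
//   (sra (shl X, 24), 24) --> (sext_in_reg X, i8)
//   (sra (shl X, 24), 26) --> (sra (sext_in_reg X, i8), 2)
//   (sra (shl X, 24), 22) --> (shl (sext_in_reg X, i8), 2)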
48001
48002 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48003 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48004 N0.getOperand(1).getOpcode() != ISD::Constant)
48005 return SDValue();
48006
48007 SDValue N00 = N0.getOperand(0);
48008 SDValue N01 = N0.getOperand(1);
48009 APInt ShlConst = N01->getAsAPIntVal();
48010 APInt SraConst = N1->getAsAPIntVal();
48011 EVT CVT = N1.getValueType();
48012
48013 if (CVT != N01.getValueType())
48014 return SDValue();
48015 if (SraConst.isNegative())
48016 return SDValue();
48017
48018 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48019 unsigned ShiftSize = SVT.getSizeInBits();
48020 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
48021 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48022 continue;
48023 SDLoc DL(N);
48024 SDValue NN =
48025 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48026 if (SraConst.eq(ShlConst))
48027 return NN;
48028 if (SraConst.ult(ShlConst))
48029 return DAG.getNode(ISD::SHL, DL, VT, NN,
48030 DAG.getConstant(ShlConst - SraConst, DL, CVT));
48031 return DAG.getNode(ISD::SRA, DL, VT, NN,
48032 DAG.getConstant(SraConst - ShlConst, DL, CVT));
48033 }
48034 return SDValue();
48035}
48036
48037static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48038 TargetLowering::DAGCombinerInfo &DCI,
48039 const X86Subtarget &Subtarget) {
48040 SDValue N0 = N->getOperand(0);
48041 SDValue N1 = N->getOperand(1);
48042 EVT VT = N0.getValueType();
48043
48044 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48045 return V;
48046
48047 // Only do this on the last DAG combine as it can interfere with other
48048 // combines.
48049 if (!DCI.isAfterLegalizeDAG())
48050 return SDValue();
48051
48052 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48053 // TODO: This is a generic DAG combine that became an x86-only combine to
48054 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48055 // and-not ('andn').
48056 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48057 return SDValue();
48058
48059 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48060 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48061 if (!ShiftC || !AndC)
48062 return SDValue();
48063
48064 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48065 // transform should reduce code size. It may also enable secondary transforms
48066 // from improved known-bits analysis or instruction selection.
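// For example, (srl (and X, 0x7F0), 4) becomes (and (srl X, 4), 0x7F); the
// narrowed mask fits in a sign-extended 8-bit immediate, shrinking the
// encoding.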
48067 APInt MaskVal = AndC->getAPIntValue();
48068
48069 // If this can be matched by a zero extend, don't optimize.
48070 if (MaskVal.isMask()) {
48071 unsigned TO = MaskVal.countr_one();
48072 if (TO >= 8 && isPowerOf2_32(TO))
48073 return SDValue();
48074 }
48075
48076 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48077 unsigned OldMaskSize = MaskVal.getSignificantBits();
48078 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48079 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48080 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48081 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48082 SDLoc DL(N);
48083 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48084 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48085 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48086 }
48087 return SDValue();
48088}
48089
48090static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48091 const X86Subtarget &Subtarget) {
48092 unsigned Opcode = N->getOpcode();
48093 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48094
48095 SDLoc DL(N);
48096 EVT VT = N->getValueType(0);
48097 SDValue N0 = N->getOperand(0);
48098 SDValue N1 = N->getOperand(1);
48099 EVT SrcVT = N0.getValueType();
48100
48101 SDValue BC0 =
48102 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48103 SDValue BC1 =
48104 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48105
48106 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48107 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48108 // truncation trees that help us avoid lane crossing shuffles.
48109 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48110 // TODO: We don't handle vXf64 shuffles yet.
48111 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48112 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48113 SmallVector<SDValue> ShuffleOps;
48114 SmallVector<int> ShuffleMask, ScaledMask;
48115 SDValue Vec = peekThroughBitcasts(BCSrc);
48116 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48118 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48119 // shuffle to a v4X64 width - we can probably relax this in the future.
48120 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48121 ShuffleOps[0].getValueType().is256BitVector() &&
48122 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48123 SDValue Lo, Hi;
48124 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48125 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48126 Lo = DAG.getBitcast(SrcVT, Lo);
48127 Hi = DAG.getBitcast(SrcVT, Hi);
48128 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48129 Res = DAG.getBitcast(ShufVT, Res);
48130 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48131 return DAG.getBitcast(VT, Res);
48132 }
48133 }
48134 }
48135 }
48136
48137 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48138 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48139 // If either/both ops are a shuffle that can scale to v2x64,
48140 // then see if we can perform this as a v4x32 post shuffle.
48141 SmallVector<SDValue> Ops0, Ops1;
48142 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48143 bool IsShuf0 =
48144 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48145 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48146 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48147 bool IsShuf1 =
48148 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48149 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48150 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48151 if (IsShuf0 || IsShuf1) {
48152 if (!IsShuf0) {
48153 Ops0.assign({BC0});
48154 ScaledMask0.assign({0, 1});
48155 }
48156 if (!IsShuf1) {
48157 Ops1.assign({BC1});
48158 ScaledMask1.assign({0, 1});
48159 }
48160
48161 SDValue LHS, RHS;
48162 int PostShuffle[4] = {-1, -1, -1, -1};
48163 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48164 if (M < 0)
48165 return true;
48166 Idx = M % 2;
48167 SDValue Src = Ops[M / 2];
48168 if (!LHS || LHS == Src) {
48169 LHS = Src;
48170 return true;
48171 }
48172 if (!RHS || RHS == Src) {
48173 Idx += 2;
48174 RHS = Src;
48175 return true;
48176 }
48177 return false;
48178 };
48179 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48180 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48181 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48182 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48183 LHS = DAG.getBitcast(SrcVT, LHS);
48184 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48185 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48186 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48187 Res = DAG.getBitcast(ShufVT, Res);
48188 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48189 return DAG.getBitcast(VT, Res);
48190 }
48191 }
48192 }
48193
48194 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48195 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48196 SmallVector<int> Mask0, Mask1;
48197 SmallVector<SDValue> Ops0, Ops1;
48198 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48199 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48200 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48201 !Ops0.empty() && !Ops1.empty() &&
48202 all_of(Ops0,
48203 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48204 all_of(Ops1,
48205 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48206 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48207 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48208 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48209 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48210 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48211 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48212 if ((Op00 == Op11) && (Op01 == Op10)) {
48213 std::swap(Op10, Op11);
48214 std::swap(ScaledMask1[0], ScaledMask1[1]);
48215 }
48216 if ((Op00 == Op10) && (Op01 == Op11)) {
48217 const int Map[4] = {0, 2, 1, 3};
48218 SmallVector<int, 4> ShuffleMask(
48219 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48220 Map[ScaledMask1[1]]});
48221 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48222 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48223 DAG.getBitcast(SrcVT, Op01));
48224 Res = DAG.getBitcast(ShufVT, Res);
48225 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48226 return DAG.getBitcast(VT, Res);
48227 }
48228 }
48229 }
48230
48231 return SDValue();
48232}
48233
48234static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48235 TargetLowering::DAGCombinerInfo &DCI,
48236 const X86Subtarget &Subtarget) {
48237 unsigned Opcode = N->getOpcode();
48238 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48239 "Unexpected pack opcode");
48240
48241 EVT VT = N->getValueType(0);
48242 SDValue N0 = N->getOperand(0);
48243 SDValue N1 = N->getOperand(1);
48244 unsigned NumDstElts = VT.getVectorNumElements();
48245 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48246 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48247 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48248 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48249 "Unexpected PACKSS/PACKUS input type");
48250
48251 bool IsSigned = (X86ISD::PACKSS == Opcode);
48252
48253 // Constant Folding.
48254 APInt UndefElts0, UndefElts1;
48255 SmallVector<APInt, 32> EltBits0, EltBits1;
48256 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48257 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48258 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
48259 /*AllowWholeUndefs*/ true,
48260 /*AllowPartialUndefs*/ true) &&
48261 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
48262 /*AllowWholeUndefs*/ true,
48263 /*AllowPartialUndefs*/ true)) {
48264 unsigned NumLanes = VT.getSizeInBits() / 128;
48265 unsigned NumSrcElts = NumDstElts / 2;
48266 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48267 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48268
48269 APInt Undefs(NumDstElts, 0);
48270 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48271 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48272 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48273 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48274 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48275 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48276
48277 if (UndefElts[SrcIdx]) {
48278 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48279 continue;
48280 }
48281
48282 APInt &Val = EltBits[SrcIdx];
48283 if (IsSigned) {
48284 // PACKSS: Truncate signed value with signed saturation.
48285 // Source values less than dst minint are saturated to minint.
48286 // Source values greater than dst maxint are saturated to maxint.
48287 Val = Val.truncSSat(DstBitsPerElt);
48288 } else {
48289 // PACKUS: Truncate signed value with unsigned saturation.
48290 // Source values less than zero are saturated to zero.
48291 // Source values greater than dst maxuint are saturated to maxuint.
48292 // NOTE: This is different from APInt::truncUSat.
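// For example, when packing i16 -> i8 (PACKUSWB):
//   0x0123 -> 0xFF (exceeds the unsigned i8 range)
//   0xFFFF -> 0x00 (negative source value)
//   0x007F -> 0x7F (in range, plain truncation)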
48293 if (Val.isIntN(DstBitsPerElt))
48294 Val = Val.trunc(DstBitsPerElt);
48295 else if (Val.isNegative())
48296 Val = APInt::getZero(DstBitsPerElt);
48297 else
48298 Val = APInt::getAllOnes(DstBitsPerElt);
48299 }
48300 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48301 }
48302 }
48303
48304 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48305 }
48306
48307 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48308 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48309 return V;
48310
48311 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
48312 // Currently limit this to allsignbits cases only.
48313 if (IsSigned &&
48314 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
48315 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
48316 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
48317 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
48318 if (Not0 && Not1) {
48319 SDLoc DL(N);
48320 MVT SrcVT = N0.getSimpleValueType();
48321 SDValue Pack =
48322 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
48323 DAG.getBitcast(SrcVT, Not1));
48324 return DAG.getNOT(DL, Pack, VT);
48325 }
48326 }
48327
48328 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48329 // truncate to create a larger truncate.
48330 if (Subtarget.hasAVX512() &&
48331 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48332 N0.getOperand(0).getValueType() == MVT::v8i32) {
48333 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48334 (!IsSigned &&
48335 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48336 if (Subtarget.hasVLX())
48337 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48338
48339 // Widen input to v16i32 so we can truncate that.
48340 SDLoc dl(N);
48341 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48342 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48343 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48344 }
48345 }
48346
48347 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
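// For example, (packuswb (zero_extend v8i8 X to v8i16),
//                        (zero_extend v8i8 Y to v8i16))
// is simply (concat_vectors X, Y) : v16i8, since the extended values never
// saturate.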
48348 if (VT.is128BitVector()) {
48349 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48350 SDValue Src0, Src1;
48351 if (N0.getOpcode() == ExtOpc &&
48352 N0.getOperand(0).getValueType().is64BitVector() &&
48353 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48354 Src0 = N0.getOperand(0);
48355 }
48356 if (N1.getOpcode() == ExtOpc &&
48357 N1.getOperand(0).getValueType().is64BitVector() &&
48358 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48359 Src1 = N1.getOperand(0);
48360 }
48361 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48362 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48363 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48364 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48365 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48366 }
48367
48368 // Try again with pack(*_extend_vector_inreg, undef).
48369 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48370 : ISD::ZERO_EXTEND_VECTOR_INREG;
48371 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48372 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48373 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48374 DAG);
48375 }
48376
48377 // Attempt to combine as shuffle.
48378 SDValue Op(N, 0);
48379 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48380 return Res;
48381
48382 return SDValue();
48383}
48384
48385static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48386 TargetLowering::DAGCombinerInfo &DCI,
48387 const X86Subtarget &Subtarget) {
48388 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48389 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48390 "Unexpected horizontal add/sub opcode");
48391
48392 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48393 MVT VT = N->getSimpleValueType(0);
48394 SDValue LHS = N->getOperand(0);
48395 SDValue RHS = N->getOperand(1);
48396
48397 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48398 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48399 LHS.getOpcode() == RHS.getOpcode() &&
48400 LHS.getValueType() == RHS.getValueType() &&
48401 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48402 SDValue LHS0 = LHS.getOperand(0);
48403 SDValue LHS1 = LHS.getOperand(1);
48404 SDValue RHS0 = RHS.getOperand(0);
48405 SDValue RHS1 = RHS.getOperand(1);
48406 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48407 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48408 SDLoc DL(N);
48409 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48410 LHS0.isUndef() ? LHS1 : LHS0,
48411 RHS0.isUndef() ? RHS1 : RHS0);
48412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48413 Res = DAG.getBitcast(ShufVT, Res);
48414 SDValue NewLHS =
48415 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48416 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48417 SDValue NewRHS =
48418 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48419 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48420 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48421 DAG.getBitcast(VT, NewRHS));
48422 }
48423 }
48424 }
48425
48426 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48427 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48428 return V;
48429
48430 return SDValue();
48431}
48432
48433static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48434 TargetLowering::DAGCombinerInfo &DCI,
48435 const X86Subtarget &Subtarget) {
48436 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48437 X86ISD::VSRL == N->getOpcode()) &&
48438 "Unexpected shift opcode");
48439 EVT VT = N->getValueType(0);
48440 SDValue N0 = N->getOperand(0);
48441 SDValue N1 = N->getOperand(1);
48442
48443 // Shift zero -> zero.
48444 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48445 return DAG.getConstant(0, SDLoc(N), VT);
48446
48447 // Detect constant shift amounts.
48448 APInt UndefElts;
48449 SmallVector<APInt, 32> EltBits;
48450 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
48451 /*AllowWholeUndefs*/ true,
48452 /*AllowPartialUndefs*/ false)) {
48453 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48454 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48455 EltBits[0].getZExtValue(), DAG);
48456 }
48457
48458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48459 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48460 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48461 return SDValue(N, 0);
48462
48463 return SDValue();
48464}
48465
48466static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48467 TargetLowering::DAGCombinerInfo &DCI,
48468 const X86Subtarget &Subtarget) {
48469 unsigned Opcode = N->getOpcode();
48470 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48471 X86ISD::VSRLI == Opcode) &&
48472 "Unexpected shift opcode");
48473 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48474 EVT VT = N->getValueType(0);
48475 SDValue N0 = N->getOperand(0);
48476 SDValue N1 = N->getOperand(1);
48477 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48478 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48479 "Unexpected value type");
48480 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
48481
48482 // (shift undef, X) -> 0
48483 if (N0.isUndef())
48484 return DAG.getConstant(0, SDLoc(N), VT);
48485
48486 // Out of range logical bit shifts are guaranteed to be zero.
48487 // Out of range arithmetic bit shifts splat the sign bit.
48488 unsigned ShiftVal = N->getConstantOperandVal(1);
48489 if (ShiftVal >= NumBitsPerElt) {
48490 if (LogicalShift)
48491 return DAG.getConstant(0, SDLoc(N), VT);
48492 ShiftVal = NumBitsPerElt - 1;
48493 }
48494
48495 // (shift X, 0) -> X
48496 if (!ShiftVal)
48497 return N0;
48498
48499 // (shift 0, C) -> 0
48500 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48501 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48502 // result are all zeros, not undef.
48503 return DAG.getConstant(0, SDLoc(N), VT);
48504
48505 // (VSRAI -1, C) -> -1
48506 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48507 // N0 is all ones or undef. We guarantee that the bits shifted into the
48508 // result are all ones, not undef.
48509 return DAG.getConstant(-1, SDLoc(N), VT);
48510
48511 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48512 unsigned NewShiftVal = Amt0 + Amt1;
48513 if (NewShiftVal >= NumBitsPerElt) {
48514 // Out of range logical bit shifts are guaranteed to be zero.
48515 // Out of range arithmetic bit shifts splat the sign bit.
48516 if (LogicalShift)
48517 return DAG.getConstant(0, SDLoc(N), VT);
48518 NewShiftVal = NumBitsPerElt - 1;
48519 }
48520 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48521 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48522 };
48523
48524 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48525 if (Opcode == N0.getOpcode())
48526 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48527
48528 // (shl (add X, X), C) -> (shl X, (C + 1))
48529 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48530 N0.getOperand(0) == N0.getOperand(1))
48531 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48532
48533 // We can decode 'whole byte' logical bit shifts as shuffles.
48534 if (LogicalShift && (ShiftVal % 8) == 0) {
48535 SDValue Op(N, 0);
48536 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48537 return Res;
48538 }
48539
48540 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48541 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48542 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48543 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48544 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48545 N0.getOpcode() == X86ISD::PSHUFD &&
48546 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48547 N0->hasOneUse()) {
48548 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48549 if (BC.getOpcode() == X86ISD::VSHLI &&
48550 BC.getScalarValueSizeInBits() == 64 &&
48551 BC.getConstantOperandVal(1) == 63) {
48552 SDLoc DL(N);
48553 SDValue Src = BC.getOperand(0);
48554 Src = DAG.getBitcast(VT, Src);
48555 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48556 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48557 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48558 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48559 return Src;
48560 }
48561 }
48562
48563 auto TryConstantFold = [&](SDValue V) {
48564 APInt UndefElts;
48565 SmallVector<APInt, 32> EltBits;
48566 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48567 /*AllowWholeUndefs*/ true,
48568 /*AllowPartialUndefs*/ true))
48569 return SDValue();
48570 assert(EltBits.size() == VT.getVectorNumElements() &&
48571 "Unexpected shift value type");
48572 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48573 // created an undef input due to no input bits being demanded, but the user
48574 // still expects 0 in the other bits.
48575 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48576 APInt &Elt = EltBits[i];
48577 if (UndefElts[i])
48578 Elt = 0;
48579 else if (X86ISD::VSHLI == Opcode)
48580 Elt <<= ShiftVal;
48581 else if (X86ISD::VSRAI == Opcode)
48582 Elt.ashrInPlace(ShiftVal);
48583 else
48584 Elt.lshrInPlace(ShiftVal);
48585 }
48586 // Reset undef elements since they were zeroed above.
48587 UndefElts = 0;
48588 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48589 };
48590
48591 // Constant Folding.
48592 if (N->isOnlyUserOf(N0.getNode())) {
48593 if (SDValue C = TryConstantFold(N0))
48594 return C;
48595
48596 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48597 // Don't break NOT patterns.
48598 SDValue BC = peekThroughOneUseBitcasts(N0);
48599 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48600 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48601 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48602 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48603 SDLoc DL(N);
48604 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48605 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48606 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48607 }
48608 }
48609 }
48610
48611 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48612 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48613 DCI))
48614 return SDValue(N, 0);
48615
48616 return SDValue();
48617}
48618
48619static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48620 TargetLowering::DAGCombinerInfo &DCI,
48621 const X86Subtarget &Subtarget) {
48622 EVT VT = N->getValueType(0);
48623 unsigned Opcode = N->getOpcode();
48624 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48625 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48626 Opcode == ISD::INSERT_VECTOR_ELT) &&
48627 "Unexpected vector insertion");
48628
48629 SDValue Vec = N->getOperand(0);
48630 SDValue Scl = N->getOperand(1);
48631 SDValue Idx = N->getOperand(2);
48632
48633 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48634 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48635 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48636
48637 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48638 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48640 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48641 APInt::getAllOnes(NumBitsPerElt), DCI))
48642 return SDValue(N, 0);
48643 }
48644
48645 // Attempt to combine insertion patterns to a shuffle.
48646 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48647 SDValue Op(N, 0);
48648 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48649 return Res;
48650 }
48651
48652 return SDValue();
48653}
48654
48655/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48656/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48657/// OR -> CMPNEQSS.
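/// For example, (and (setcc COND_E, (fcmp X, Y)), (setcc COND_NP, (fcmp X, Y)))
/// tests for an ordered-equal compare and can be rewritten as a single
/// CMPEQSS-style FSETCC mask from which one bit of truth is extracted.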
48658static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48659 TargetLowering::DAGCombinerInfo &DCI,
48660 const X86Subtarget &Subtarget) {
48661 unsigned opcode;
48662
48663 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48664 // we're requiring SSE2 for both.
48665 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48666 SDValue N0 = N->getOperand(0);
48667 SDValue N1 = N->getOperand(1);
48668 SDValue CMP0 = N0.getOperand(1);
48669 SDValue CMP1 = N1.getOperand(1);
48670 SDLoc DL(N);
48671
48672 // The SETCCs should both refer to the same CMP.
48673 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48674 return SDValue();
48675
48676 SDValue CMP00 = CMP0->getOperand(0);
48677 SDValue CMP01 = CMP0->getOperand(1);
48678 EVT VT = CMP00.getValueType();
48679
48680 if (VT == MVT::f32 || VT == MVT::f64 ||
48681 (VT == MVT::f16 && Subtarget.hasFP16())) {
48682 bool ExpectingFlags = false;
48683 // Check for any users that want flags:
48684 for (const SDNode *U : N->uses()) {
48685 if (ExpectingFlags)
48686 break;
48687
48688 switch (U->getOpcode()) {
48689 default:
48690 case ISD::BR_CC:
48691 case ISD::BRCOND:
48692 case ISD::SELECT:
48693 ExpectingFlags = true;
48694 break;
48695 case ISD::CopyToReg:
48696 case ISD::SIGN_EXTEND:
48697 case ISD::ZERO_EXTEND:
48698 case ISD::ANY_EXTEND:
48699 break;
48700 }
48701 }
48702
48703 if (!ExpectingFlags) {
48704 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48705 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48706
48707 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48708 X86::CondCode tmp = cc0;
48709 cc0 = cc1;
48710 cc1 = tmp;
48711 }
48712
48713 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48714 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48715 // FIXME: need symbolic constants for these magic numbers.
48716 // See X86ATTInstPrinter.cpp:printSSECC().
48717 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48718 if (Subtarget.hasAVX512()) {
48719 SDValue FSetCC =
48720 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48721 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48722 // Need to fill with zeros to ensure the bitcast will produce zeroes
48723 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48724 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48725 DAG.getConstant(0, DL, MVT::v16i1),
48726 FSetCC, DAG.getIntPtrConstant(0, DL));
48727 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48728 N->getSimpleValueType(0));
48729 }
48730 SDValue OnesOrZeroesF =
48731 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48732 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48733
48734 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48735 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48736
48737 if (is64BitFP && !Subtarget.is64Bit()) {
48738 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48739 // 64-bit integer, since that's not a legal type. Since
48740 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48741 // bits, but can do this little dance to extract the lowest 32 bits
48742 // and work with those going forward.
48743 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48744 OnesOrZeroesF);
48745 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48746 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48747 Vector32, DAG.getIntPtrConstant(0, DL));
48748 IntVT = MVT::i32;
48749 }
48750
48751 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48752 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48753 DAG.getConstant(1, DL, IntVT));
48754 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48755 ANDed);
48756 return OneBitOfTruth;
48757 }
48758 }
48759 }
48760 }
48761 return SDValue();
48762}
48763
48764/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48765static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48766 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48767
48768 MVT VT = N->getSimpleValueType(0);
48769 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48770 return SDValue();
48771
48772 SDValue X, Y;
48773 SDValue N0 = N->getOperand(0);
48774 SDValue N1 = N->getOperand(1);
48775
48776 if (SDValue Not = IsNOT(N0, DAG)) {
48777 X = Not;
48778 Y = N1;
48779 } else if (SDValue Not = IsNOT(N1, DAG)) {
48780 X = Not;
48781 Y = N0;
48782 } else
48783 return SDValue();
48784
48785 X = DAG.getBitcast(VT, X);
48786 Y = DAG.getBitcast(VT, Y);
48787 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48788}
48789
48790/// Try to fold:
48791/// and (vector_shuffle<Z,...,Z>
48792/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48793/// ->
48794/// andnp (vector_shuffle<Z,...,Z>
48795/// (insert_vector_elt undef, X, Z), undef), Y
48796static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48797 const X86Subtarget &Subtarget) {
48798 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48799
48800 EVT VT = N->getValueType(0);
48801 // Do not split 256- and 512-bit vectors with SSE2, as doing so overwrites
48802 // the original value and requires extra moves.
48803 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48804 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48805 return SDValue();
48806
48807 auto GetNot = [&DAG](SDValue V) {
48808 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48809 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48810 // end-users are ISD::AND, including cases such as
48811 // (and(extract_vector_element(SVN), Y)).
48812 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48813 !SVN->getOperand(1).isUndef()) {
48814 return SDValue();
48815 }
48816 SDValue IVEN = SVN->getOperand(0);
48817 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48818 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48819 return SDValue();
48820 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48821 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48822 return SDValue();
48823 SDValue Src = IVEN.getOperand(1);
48824 if (SDValue Not = IsNOT(Src, DAG)) {
48825 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48826 SDValue NotIVEN =
48827 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48828 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48829 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48830 SVN->getOperand(1), SVN->getMask());
48831 }
48832 return SDValue();
48833 };
48834
48835 SDValue X, Y;
48836 SDValue N0 = N->getOperand(0);
48837 SDValue N1 = N->getOperand(1);
48838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48839
48840 if (SDValue Not = GetNot(N0)) {
48841 X = Not;
48842 Y = N1;
48843 } else if (SDValue Not = GetNot(N1)) {
48844 X = Not;
48845 Y = N0;
48846 } else
48847 return SDValue();
48848
48849 X = DAG.getBitcast(VT, X);
48850 Y = DAG.getBitcast(VT, Y);
48851 SDLoc DL(N);
48852
48853 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48854 // AVX2.
48855 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
48856 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
48857 SDValue LoX, HiX;
48858 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48859 SDValue LoY, HiY;
48860 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48861 EVT SplitVT = LoX.getValueType();
48862 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48863 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48864 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48865 }
48866
48867 if (TLI.isTypeLegal(VT))
48868 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48869
48870 return SDValue();
48871}
48872
48873// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48874// logical operations, like in the example below.
48875// or (and (truncate x, truncate y)),
48876// (xor (truncate z, build_vector (constants)))
48877// Given a target type \p VT, we generate
48878// or (and x, y), (xor z, zext(build_vector (constants)))
48879// given x, y and z are of type \p VT. We can do so, if operands are either
48880// truncates from VT types, the second operand is a vector of constants or can
48881// be recursively promoted.
48882static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
48883 SelectionDAG &DAG, unsigned Depth) {
48884 // Limit recursion to avoid excessive compile times.
48885 if (Depth >= SelectionDAG::MaxRecursionDepth)
48886 return SDValue();
48887
48888 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
48889 return SDValue();
48890
48891 SDValue N0 = N.getOperand(0);
48892 SDValue N1 = N.getOperand(1);
48893
48894 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48895 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
48896 return SDValue();
48897
48898 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
48899 N0 = NN0;
48900 else {
48901 // The left side has to be a trunc.
48902 if (N0.getOpcode() != ISD::TRUNCATE)
48903 return SDValue();
48904
48905 // The type of the truncated inputs.
48906 if (N0.getOperand(0).getValueType() != VT)
48907 return SDValue();
48908
48909 N0 = N0.getOperand(0);
48910 }
48911
48912 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
48913 N1 = NN1;
48914 else {
48915 // The right side has to be a 'trunc' or a (foldable) constant.
48916 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48917 N1.getOperand(0).getValueType() == VT;
48918 if (RHSTrunc)
48919 N1 = N1.getOperand(0);
48920 else if (SDValue Cst =
48921 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
48922 N1 = Cst;
48923 else
48924 return SDValue();
48925 }
48926
48927 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
48928}
48929
48930// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48931// register. In most cases we actually compare or select YMM-sized registers
48932// and mixing the two types creates horrible code. This method optimizes
48933// some of the transition sequences.
48934// Even with AVX-512 this is still useful for removing casts around logical
48935// operations on vXi1 mask types.
48936static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
48937 SelectionDAG &DAG,
48938 const X86Subtarget &Subtarget) {
48939 EVT VT = N.getValueType();
48940 assert(VT.isVector() && "Expected vector type");
48941 assert((N.getOpcode() == ISD::ANY_EXTEND ||
48942 N.getOpcode() == ISD::ZERO_EXTEND ||
48943 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48944
48945 SDValue Narrow = N.getOperand(0);
48946 EVT NarrowVT = Narrow.getValueType();
48947
48948 // Generate the wide operation.
48949 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
48950 if (!Op)
48951 return SDValue();
48952 switch (N.getOpcode()) {
48953 default: llvm_unreachable("Unexpected opcode");
48954 case ISD::ANY_EXTEND:
48955 return Op;
48956 case ISD::ZERO_EXTEND:
48957 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48958 case ISD::SIGN_EXTEND:
48959 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48960 Op, DAG.getValueType(NarrowVT));
48961 }
48962}
48963
48964static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48965 unsigned FPOpcode;
48966 switch (Opcode) {
48967 // clang-format off
48968 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48969 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48970 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48971 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48972 // clang-format on
48973 }
48974 return FPOpcode;
48975}
48976
48977/// If both input operands of a logic op are being cast from floating-point
48978/// types or FP compares, try to convert this into a floating-point logic node
48979/// to avoid unnecessary moves from SSE to integer registers.
48980static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48981 TargetLowering::DAGCombinerInfo &DCI,
48982 const X86Subtarget &Subtarget) {
48983 EVT VT = N->getValueType(0);
48984 SDValue N0 = N->getOperand(0);
48985 SDValue N1 = N->getOperand(1);
48986 SDLoc DL(N);
48987
48988 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48989 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48990 return SDValue();
48991
48992 SDValue N00 = N0.getOperand(0);
48993 SDValue N10 = N1.getOperand(0);
48994 EVT N00Type = N00.getValueType();
48995 EVT N10Type = N10.getValueType();
48996
48997 // Ensure that both types are the same and are legal scalar fp types.
48998 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48999 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49000 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49001 return SDValue();
49002
49003 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49004 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49005 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49006 return DAG.getBitcast(VT, FPLogic);
49007 }
49008
49009 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49010 !N1.hasOneUse())
49011 return SDValue();
49012
49013 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49014 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49015
49016 // The vector ISA for FP predicates is incomplete before AVX, so converting
49017 // COMIS* to CMPS* may not be a win before AVX.
49018 if (!Subtarget.hasAVX() &&
49019 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49020 return SDValue();
49021
49022 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49023 // and vector logic:
49024 // logic (setcc N00, N01), (setcc N10, N11) -->
49025 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49026 unsigned NumElts = 128 / N00Type.getSizeInBits();
49027 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49028 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49029 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49030 SDValue N01 = N0.getOperand(1);
49031 SDValue N11 = N1.getOperand(1);
49032 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49033 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49034 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49035 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49036 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49037 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49038 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49039 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49040}
49041
49042// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49043// to reduce XMM->GPR traffic.
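// For example, (and (movmsk (v4i32 X)), (movmsk (v4i32 Y))) becomes
// (movmsk (and X, Y)), keeping the logic op in the vector domain and leaving
// only one MOVMSK transfer to a GPR.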
49044static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49045 unsigned Opc = N->getOpcode();
49046 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49047 "Unexpected bit opcode");
49048
49049 SDValue N0 = N->getOperand(0);
49050 SDValue N1 = N->getOperand(1);
49051
49052 // Both operands must be single use MOVMSK.
49053 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49054 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49055 return SDValue();
49056
49057 SDValue Vec0 = N0.getOperand(0);
49058 SDValue Vec1 = N1.getOperand(0);
49059 EVT VecVT0 = Vec0.getValueType();
49060 EVT VecVT1 = Vec1.getValueType();
49061
49062 // Both MOVMSK operands must be from vectors of the same size and same element
49063 // size, but it's OK for an fp/int difference.
49064 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49065 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49066 return SDValue();
49067
49068 SDLoc DL(N);
49069 unsigned VecOpc =
49070 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49071 SDValue Result =
49072 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49073 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49074}
49075
49076// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49077// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49078// handles in InstCombine.
49079static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49080 unsigned Opc = N->getOpcode();
49081 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49082 "Unexpected bit opcode");
49083
49084 SDValue N0 = N->getOperand(0);
49085 SDValue N1 = N->getOperand(1);
49086 EVT VT = N->getValueType(0);
49087
49088 // Both operands must be single use.
49089 if (!N0.hasOneUse() || !N1.hasOneUse())
49090 return SDValue();
49091
49092 // Search for matching shifts.
49093 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49094 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49095
49096 unsigned BCOpc = BC0.getOpcode();
49097 EVT BCVT = BC0.getValueType();
49098 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49099 return SDValue();
49100
49101 switch (BCOpc) {
49102 case X86ISD::VSHLI:
49103 case X86ISD::VSRLI:
49104 case X86ISD::VSRAI: {
49105 if (BC0.getOperand(1) != BC1.getOperand(1))
49106 return SDValue();
49107
49108 SDLoc DL(N);
49109 SDValue BitOp =
49110 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49111 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49112 return DAG.getBitcast(VT, Shift);
49113 }
49114 }
49115
49116 return SDValue();
49117}
49118
49119// Attempt to fold:
49120// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
49121// TODO: Handle PACKUS as well.
49122static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
49123 unsigned Opc = N->getOpcode();
49124 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49125 "Unexpected bit opcode");
49126
49127 SDValue N0 = N->getOperand(0);
49128 SDValue N1 = N->getOperand(1);
49129 EVT VT = N->getValueType(0);
49130
49131 // Both operands must be single use.
49132 if (!N0.hasOneUse() || !N1.hasOneUse())
49133 return SDValue();
49134
49135 // Search for matching packs.
49136 N0 = peekThroughOneUseBitcasts(N0);
49137 N1 = peekThroughOneUseBitcasts(N1);
49138
49139 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
49140 return SDValue();
49141
49142 MVT DstVT = N0.getSimpleValueType();
49143 if (DstVT != N1.getSimpleValueType())
49144 return SDValue();
49145
49146 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
49147 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
49148
49149 // Limit to allsignbits packing.
49150 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
49151 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
49152 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
49153 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
49154 return SDValue();
49155
49156 SDLoc DL(N);
49157 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
49158 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
49159 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
49160}
49161
49162/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49163/// mask (Mask == 1 is the x86 lowering of a SETCC + ZEXT), replace the 'and'
49164/// with a shift-right to eliminate loading the vector constant mask value.
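/// For example, if every i32 element of X is known to be all-ones or zero,
/// (and X, splat 0x1) becomes (X86ISD::VSRLI X, 31), avoiding the load of the
/// splat-1 mask constant.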
49165static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49166 const X86Subtarget &Subtarget) {
49167 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49168 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49169 EVT VT = Op0.getValueType();
49170 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49171 return SDValue();
49172
49173 // Try to convert an "is positive" signbit masking operation into arithmetic
49174 // shift and "andn". This saves a materialization of a -1 vector constant.
49175 // The "is negative" variant should be handled more generally because it only
49176 // requires "and" rather than "andn":
49177 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49178 //
49179 // This is limited to the original type to avoid producing even more bitcasts.
49180 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49181 // will be profitable.
49182 if (N->getValueType(0) == VT &&
49183 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
49184 SDValue X, Y;
49185 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49186 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49187 X = Op1.getOperand(0);
49188 Y = Op0;
49189 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49190 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49191 X = Op0.getOperand(0);
49192 Y = Op1;
49193 }
49194 if (X && Y) {
49195 SDLoc DL(N);
49196 SDValue Sra =
49197 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49198 VT.getScalarSizeInBits() - 1, DAG);
49199 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49200 }
49201 }
49202
49203 APInt SplatVal;
49204 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
49205 return SDValue();
49206
49207 // Don't prevent creation of ANDN.
49208 if (isBitwiseNot(Op0))
49209 return SDValue();
49210
49211 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
49212 return SDValue();
49213
49214 unsigned EltBitWidth = VT.getScalarSizeInBits();
49215 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49216 return SDValue();
49217
49218 SDLoc DL(N);
49219 unsigned ShiftVal = SplatVal.countr_one();
49220 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49221 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49222 return DAG.getBitcast(N->getValueType(0), Shift);
49223}
49224
49225// Get the index node from the lowered DAG of a GEP IR instruction with one
49226// indexing dimension.
49227static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49228 if (Ld->isIndexed())
49229 return SDValue();
49230
49231 SDValue Base = Ld->getBasePtr();
49232
49233 if (Base.getOpcode() != ISD::ADD)
49234 return SDValue();
49235
49236 SDValue ShiftedIndex = Base.getOperand(0);
49237
49238 if (ShiftedIndex.getOpcode() != ISD::SHL)
49239 return SDValue();
49240
49241 return ShiftedIndex.getOperand(0);
49242
49243}
49244
49245static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49246 return Subtarget.hasBMI2() &&
49247 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
49248}
49249
49250// This function recognizes cases where the X86 BZHI instruction can replace an
49251// 'and-load' sequence.
49252// When an integer value is loaded from an array of constants defined as
49253// follows:
49254//
49255// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49256//
49257// and a bitwise AND is then applied to the loaded value and another input,
49258// this is equivalent to performing BZHI (zero high bits) on that input, using
49259// the same index as the load.
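// For a 32-bit element type this means:
//   x & array[idx] == x & (0xFFFFFFFF >> (32 - idx)) == bzhi(x, idx)
// so the and+load below is rewritten into the shift form, which instruction
// selection then matches to a single BZHI.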
49260static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49261 const X86Subtarget &Subtarget) {
49262 MVT VT = Node->getSimpleValueType(0);
49263 SDLoc dl(Node);
49264
49265 // Check if subtarget has BZHI instruction for the node's type
49266 if (!hasBZHI(Subtarget, VT))
49267 return SDValue();
49268
49269 // Try matching the pattern for both operands.
49270 for (unsigned i = 0; i < 2; i++) {
49271 SDValue N = Node->getOperand(i);
49272 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49273
49274 // Bail out if the operand is not a load instruction.
49275 if (!Ld)
49276 return SDValue();
49277
49278 const Value *MemOp = Ld->getMemOperand()->getValue();
49279
49280 if (!MemOp)
49281 return SDValue();
49282
49283 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49284 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49285 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49286
49287 Constant *Init = GV->getInitializer();
49288 Type *Ty = Init->getType();
49289 if (!isa<ConstantDataArray>(Init) ||
49290 !Ty->getArrayElementType()->isIntegerTy() ||
49291 Ty->getArrayElementType()->getScalarSizeInBits() !=
49292 VT.getSizeInBits() ||
49293 Ty->getArrayNumElements() >
49294 Ty->getArrayElementType()->getScalarSizeInBits())
49295 continue;
49296
49297 // Check if the array's constant elements are suitable to our case.
49298 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49299 bool ConstantsMatch = true;
49300 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49301 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49302 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49303 ConstantsMatch = false;
49304 break;
49305 }
49306 }
49307 if (!ConstantsMatch)
49308 continue;
49309
49310 // Do the transformation (For 32-bit type):
49311 // -> (and (load arr[idx]), inp)
49312 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
49313 // that will be replaced with one bzhi instruction.
49314 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49315 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49316
49317 // Get the Node which indexes into the array.
49318 SDValue Index = getIndexFromUnindexedLoad(Ld);
49319 if (!Index)
49320 return SDValue();
49321 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49322
49323 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49324 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49325
49326 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49327 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49328
49329 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49330 }
49331 }
49332 }
49333 }
49334 return SDValue();
49335}
49336
49337// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49338// where C is a mask containing the same number of bits as the setcc and
49339// where the setcc will freely zero the upper bits of the k-register. We can
49340// replace the undef in the concat with 0s and remove the AND. This mainly
49341// helps with v2i1/v4i1 setcc being cast to scalar.
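// For example,
//   (and (i8 (bitcast (v8i1 (concat_vectors (v2i1 setcc), undef, undef,
//                                           undef)))), 3)
// becomes the same bitcast with the undef subvectors replaced by zeroes, and
// the AND with 3 is dropped.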
49342static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49343 const X86Subtarget &Subtarget) {
49344 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49345
49346 EVT VT = N->getValueType(0);
49347
49348 // Make sure this is an AND with constant. We will check the value of the
49349 // constant later.
49350 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49351 if (!C1)
49352 return SDValue();
49353
49354 // This is implied by the ConstantSDNode.
49355 assert(!VT.isVector() && "Expected scalar VT!");
49356
49357 SDValue Src = N->getOperand(0);
49358 if (!Src.hasOneUse())
49359 return SDValue();
49360
49361 // (Optionally) peek through any_extend().
49362 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49363 if (!Src.getOperand(0).hasOneUse())
49364 return SDValue();
49365 Src = Src.getOperand(0);
49366 }
49367
49368 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49369 return SDValue();
49370
49371 Src = Src.getOperand(0);
49372 EVT SrcVT = Src.getValueType();
49373
49374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49375 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49376 !TLI.isTypeLegal(SrcVT))
49377 return SDValue();
49378
49379 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49380 return SDValue();
49381
49382 // We only care about the first subvector of the concat, we expect the
49383 // other subvectors to be ignored due to the AND if we make the change.
49384 SDValue SubVec = Src.getOperand(0);
49385 EVT SubVecVT = SubVec.getValueType();
49386
49387 // The RHS of the AND should be a mask with as many bits as SubVec.
49388 if (!TLI.isTypeLegal(SubVecVT) ||
49389 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49390 return SDValue();
49391
49392 // First subvector should be a setcc with a legal result type or a
49393 // AND containing at least one setcc with a legal result type.
49394 auto IsLegalSetCC = [&](SDValue V) {
49395 if (V.getOpcode() != ISD::SETCC)
49396 return false;
49397 EVT SetccVT = V.getOperand(0).getValueType();
49398 if (!TLI.isTypeLegal(SetccVT) ||
49399 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49400 return false;
49401 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49402 return false;
49403 return true;
49404 };
49405 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49406 (IsLegalSetCC(SubVec.getOperand(0)) ||
49407 IsLegalSetCC(SubVec.getOperand(1))))))
49408 return SDValue();
49409
49410 // We passed all the checks. Rebuild the concat_vectors with zeroes
49411 // and cast it back to VT.
49412 SDLoc dl(N);
49413 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49414 DAG.getConstant(0, dl, SubVecVT));
49415 Ops[0] = SubVec;
49416 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49417 Ops);
49418 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49419 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49420}
49421
49422static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49423 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49424 // We don't want to go crazy with the recursion here. This isn't a super
49425 // important optimization.
49426 static constexpr unsigned kMaxDepth = 2;
49427
49428 // Only do this re-ordering if op has one use.
49429 if (!Op.hasOneUse())
49430 return SDValue();
49431
49432 SDLoc DL(Op);
49433 // If we hit another associative op, recurse further.
49434 if (Op.getOpcode() == Opc) {
49435 // Done recursing.
49436 if (Depth++ >= kMaxDepth)
49437 return SDValue();
49438
49439 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49440 if (SDValue R =
49441 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49442 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49443 Op.getOperand(1 - OpIdx));
49444
49445 } else if (Op.getOpcode() == ISD::SUB) {
49446 if (Opc == ISD::AND) {
49447 // BLSI: (and x, (sub 0, x))
49448 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49449 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49450 }
49451 // Opc must be ISD::AND or ISD::XOR
49452 // BLSR: (and x, (sub x, 1))
49453 // BLSMSK: (xor x, (sub x, 1))
49454 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49455 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49456
49457 } else if (Op.getOpcode() == ISD::ADD) {
49458 // Opc must be ISD::AND or ISD::XOR
49459 // BLSR: (and x, (add x, -1))
49460 // BLSMSK: (xor x, (add x, -1))
49461 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49462 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49463 }
49464 return SDValue();
49465}
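// For example, (and x, (and y, (add x, -1))) is reassociated to
// (and (and x, (add x, -1)), y) when the intermediate values have no other
// uses, so the inner pair can be selected as BLSR.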
49466
49467static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49468 const X86Subtarget &Subtarget) {
49469 EVT VT = N->getValueType(0);
49470 // Make sure this node is a candidate for BMI instructions.
49471 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49472 (VT != MVT::i32 && VT != MVT::i64))
49473 return SDValue();
49474
49475 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49476
49477 // Try and match LHS and RHS.
49478 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49479 if (SDValue OpMatch =
49480 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49481 N->getOperand(1 - OpIdx), 0))
49482 return OpMatch;
49483 return SDValue();
49484}
49485
49486 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
49487 SelectionDAG &DAG,
49488 TargetLowering::DAGCombinerInfo &DCI,
49489 const X86Subtarget &ST) {
49490 // cmp(setcc(cc, X), 0)
49491 // brcond ne
49492 // ->
49493 // X
49494 // brcond cc
49495
49496 // sub(setcc(cc, X), 1)
49497 // brcond ne
49498 // ->
49499 // X
49500 // brcond ~cc
49501 //
49502 // if only flag has users
49503
49504 SDValue SetCC = N->getOperand(0);
49505
49506 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
49507 return SDValue();
49508
49509 // Check the only user of flag is `brcond ne`.
49510 SDNode *BrCond = *Flag->uses().begin();
49511 if (BrCond->getOpcode() != X86ISD::BRCOND)
49512 return SDValue();
49513 unsigned CondNo = 2;
49514 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
49515 X86::COND_NE)
49516 return SDValue();
49517
49518 SDValue X = SetCC.getOperand(1);
49519 // sub has two results while X only has one. DAG combine assumes the value
49520 // type matches.
49521 if (N->getOpcode() == X86ISD::SUB)
49522 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
49523
49524 SDValue CCN = SetCC.getOperand(0);
49525 X86::CondCode CC =
49526 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
49527 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
49528 // Update CC for the consumer of the flag.
49529 // The old CC is `ne`. Hence, when comparing the result with 0, we are
49530 // checking if the second condition evaluates to true. When comparing the
49531 // result with 1, we are checking if the second condition evaluates to false.
49532 SmallVector<SDValue> Ops(BrCond->op_values());
49533 if (isNullConstant(N->getOperand(1)))
49534 Ops[CondNo] = CCN;
49535 else if (isOneConstant(N->getOperand(1)))
49536 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
49537 else
49538 llvm_unreachable("expect constant 0 or 1");
49539
49540 SDValue NewBrCond =
49541 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
49542 // Avoid self-assign error b/c CC1 can be `e/ne`.
49543 if (BrCond != NewBrCond.getNode())
49544 DCI.CombineTo(BrCond, NewBrCond);
49545 return X;
49546}
49547
49548 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
49549 TargetLowering::DAGCombinerInfo &DCI,
49550 const X86Subtarget &ST) {
49551 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
49552 // ->
49553 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
49554
49555 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
49556 // ->
49557 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
49558 //
49559 // where cflags is determined by cc1.
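// Roughly, for and(setcc(cc0, cmp(a, b)), setcc(cc1, sub(c, d))): the CCMP
// performs the c/d compare only when cc0 holds; otherwise it writes cflags
// chosen so that the trailing setcc(cc1) evaluates to false, preserving the
// AND semantics.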
49560
49561 if (!ST.hasCCMP())
49562 return SDValue();
49563
49564 SDValue SetCC0 = N->getOperand(0);
49565 SDValue SetCC1 = N->getOperand(1);
49566 if (SetCC0.getOpcode() != X86ISD::SETCC ||
49567 SetCC1.getOpcode() != X86ISD::SETCC)
49568 return SDValue();
49569
49570 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
49571 SDValue Op = V.getOperand(1);
49572 unsigned Opc = Op.getOpcode();
49573 if (Opc == X86ISD::SUB)
49574 return X86ISD::CCMP;
49575 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
49576 return X86ISD::CTEST;
49577 return 0U;
49578 };
49579
49580 unsigned NewOpc = 0;
49581
49582 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
49583 // appear on the right.
49584 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
49585 std::swap(SetCC0, SetCC1);
49586 if (!(NewOpc = GetCombineToOpc(SetCC1)))
49587 return SDValue();
49588 }
49589
49590 X86::CondCode CC0 =
49591 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
49592 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
49593 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
49594 return SDValue();
49595
49596 bool IsOR = N->getOpcode() == ISD::OR;
49597
49598 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
49599 // evaluates to true. So we need to invert CC0 to form SrcCC when the logic
49600 // operator is OR. Similarly for CC1.
49601 SDValue SrcCC =
49602 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
49603 SDLoc(SetCC0.getOperand(0)), MVT::i8)
49604 : SetCC0.getOperand(0);
49605 SDValue CC1N = SetCC1.getOperand(0);
49606 X86::CondCode CC1 =
49607 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
49608 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
49609 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
49610 SDLoc DL(N);
49611 SDValue CFlags = DAG.getTargetConstant(
49612 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
49613 SDValue Sub = SetCC1.getOperand(1);
49614
49615 // Replace any uses of the old flag produced by SUB/CMP with the new one
49616 // produced by CCMP/CTEST.
49617 SDValue CCMP = (NewOpc == X86ISD::CCMP)
49618 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
49619 {Sub.getOperand(0), Sub.getOperand(1),
49620 CFlags, SrcCC, SetCC0.getOperand(1)})
49621 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
49622 {Sub.getOperand(0), Sub.getOperand(0),
49623 CFlags, SrcCC, SetCC0.getOperand(1)});
49624
49625 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
49626}
49627
49628 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49629 TargetLowering::DAGCombinerInfo &DCI,
49630 const X86Subtarget &Subtarget) {
49631 SDValue N0 = N->getOperand(0);
49632 SDValue N1 = N->getOperand(1);
49633 EVT VT = N->getValueType(0);
49634 SDLoc dl(N);
49635 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49636
49637 // If this is SSE1 only convert to FAND to avoid scalarization.
49638 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49639 return DAG.getBitcast(MVT::v4i32,
49640 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49641 DAG.getBitcast(MVT::v4f32, N0),
49642 DAG.getBitcast(MVT::v4f32, N1)));
49643 }
49644
49645 // Use a 32-bit and+zext if upper bits known zero.
49646 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49647 APInt HiMask = APInt::getHighBitsSet(64, 32);
49648 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49649 DAG.MaskedValueIsZero(N0, HiMask)) {
49650 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49651 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49652 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49653 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49654 }
49655 }
49656
49657 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49658 // TODO: Support multiple SrcOps.
49659 if (VT == MVT::i1) {
49660 SmallVector<SDValue, 2> SrcOps;
49661 SmallVector<APInt, 2> SrcPartials;
49662 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49663 SrcOps.size() == 1) {
49664 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49665 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49666 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49667 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49668 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49669 if (Mask) {
49670 assert(SrcPartials[0].getBitWidth() == NumElts &&
49671 "Unexpected partial reduction mask");
49672 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49673 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49674 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49675 }
49676 }
49677 }
49678
49679 // InstCombine converts:
49680 // `(-x << C0) & C1`
49681 // to
49682 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49683 // This saves an IR instruction but on x86 the neg/shift version is preferable
49684 // so undo the transform.
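// E.g. with C0 = 2 and C1 = 0xF0, InstCombine produces (x * 252) & 0xF0
// (252 == Pow2_Ceil(0xF0) - (1 << 2)); the code below recovers
// ((0 - x) << 2) & 0xF0 from it.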
49685
49686 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49687 // TODO: We don't actually need a splat for this, we just need the checks to
49688 // hold for each element.
49689 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49690 /*AllowTruncation*/ false);
49691 ConstantSDNode *N01C =
49692 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49693 /*AllowTruncation*/ false);
49694 if (N1C && N01C) {
49695 const APInt &MulC = N01C->getAPIntValue();
49696 const APInt &AndC = N1C->getAPIntValue();
49697 APInt MulCLowBit = MulC & (-MulC);
49698 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49699 (MulCLowBit + MulC).isPowerOf2()) {
49700 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
49701 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49702 assert(MulCLowBitLog != -1 &&
49703 "Isolated lowbit is somehow not a power of 2!");
49704 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49705 DAG.getConstant(MulCLowBitLog, dl, VT));
49706 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49707 }
49708 }
49709 }
49710
49711 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
49712 return SetCC;
49713
49714 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49715 return V;
49716
49717 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49718 return R;
49719
49720 if (SDValue R = combineBitOpWithShift(N, DAG))
49721 return R;
49722
49723 if (SDValue R = combineBitOpWithPACK(N, DAG))
49724 return R;
49725
49726 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49727 return FPLogic;
49728
49729 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49730 return R;
49731
49732 if (DCI.isBeforeLegalizeOps())
49733 return SDValue();
49734
49735 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49736 return R;
49737
49738 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49739 return R;
49740
49741 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49742 return ShiftRight;
49743
49744 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49745 return R;
49746
49747 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49748 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49749 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49750 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49751 unsigned Opc0 = N0.getOpcode();
49752 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49753 getTargetConstantFromNode(N0.getOperand(1)) &&
49754 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49755 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49756 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49757 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49758 }
49759 }
49760
49761 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant;
49762 // this avoids a slow variable shift (moving the shift amount to ECX etc.).
49763 if (isOneConstant(N1) && N0->hasOneUse()) {
49764 SDValue Src = N0;
49765 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49766 Src.getOpcode() == ISD::TRUNCATE) &&
49767 Src.getOperand(0)->hasOneUse())
49768 Src = Src.getOperand(0);
49769 bool ContainsNOT = false;
49770 X86::CondCode X86CC = X86::COND_B;
49771 // Peek through AND(NOT(SRL(X,Y)),1).
49772 if (isBitwiseNot(Src)) {
49773 Src = Src.getOperand(0);
49774 X86CC = X86::COND_AE;
49775 ContainsNOT = true;
49776 }
49777 if (Src.getOpcode() == ISD::SRL &&
49778 !isa<ConstantSDNode>(Src.getOperand(1))) {
49779 SDValue BitNo = Src.getOperand(1);
49780 Src = Src.getOperand(0);
49781 // Peek through AND(SRL(NOT(X),Y),1).
49782 if (isBitwiseNot(Src)) {
49783 Src = Src.getOperand(0);
49784 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49785 ContainsNOT = true;
49786 }
49787 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49788 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49789 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49790 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49791 }
49792 }
49793
49794 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49795 // Attempt to recursively combine a bitmask AND with shuffles.
49796 SDValue Op(N, 0);
49797 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49798 return Res;
49799
49800 // If either operand is a constant mask, then only the elements that aren't
49801 // zero are actually demanded by the other operand.
49802 auto GetDemandedMasks = [&](SDValue Op) {
49803 APInt UndefElts;
49804 SmallVector<APInt> EltBits;
49805 int NumElts = VT.getVectorNumElements();
49806 int EltSizeInBits = VT.getScalarSizeInBits();
49807 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49808 APInt DemandedElts = APInt::getAllOnes(NumElts);
49809 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49810 EltBits)) {
49811 DemandedBits.clearAllBits();
49812 DemandedElts.clearAllBits();
49813 for (int I = 0; I != NumElts; ++I) {
49814 if (UndefElts[I]) {
49815 // We can't assume an undef src element gives an undef dst - the
49816 // other src might be zero.
49817 DemandedBits.setAllBits();
49818 DemandedElts.setBit(I);
49819 } else if (!EltBits[I].isZero()) {
49820 DemandedBits |= EltBits[I];
49821 DemandedElts.setBit(I);
49822 }
49823 }
49824 }
49825 return std::make_pair(DemandedBits, DemandedElts);
49826 };
49827 APInt Bits0, Elts0;
49828 APInt Bits1, Elts1;
49829 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49830 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49831
49832 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49833 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49834 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49835 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49836 if (N->getOpcode() != ISD::DELETED_NODE)
49837 DCI.AddToWorklist(N);
49838 return SDValue(N, 0);
49839 }
49840
49841 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49842 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49843 if (NewN0 || NewN1)
49844 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49845 NewN1 ? NewN1 : N1);
49846 }
49847
49848 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49849 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49850 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49851 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
49852 SDValue BitMask = N1;
49853 SDValue SrcVec = N0.getOperand(0);
49854 EVT SrcVecVT = SrcVec.getValueType();
49855
49856 // Check that the constant bitmask masks whole bytes.
49857 APInt UndefElts;
49858 SmallVector<APInt, 64> EltBits;
49859 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49860 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49861 llvm::all_of(EltBits, [](const APInt &M) {
49862 return M.isZero() || M.isAllOnes();
49863 })) {
49864 unsigned NumElts = SrcVecVT.getVectorNumElements();
49865 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49866 unsigned Idx = N0.getConstantOperandVal(1);
49867
49868 // Create a root shuffle mask from the byte mask and the extracted index.
49869 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49870 for (unsigned i = 0; i != Scale; ++i) {
49871 if (UndefElts[i])
49872 continue;
49873 int VecIdx = Scale * Idx + i;
49874 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49875 }
49876
49877 if (SDValue Shuffle = combineX86ShufflesRecursively(
49878 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49879 X86::MaxShuffleCombineDepth,
49880 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49881 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49883 N0.getOperand(1));
49884 }
49885 }
49886
49887 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49888 return R;
49889
49890 return SDValue();
49891}
49892
49893// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49894 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49895 const X86Subtarget &Subtarget) {
49896 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49897
49898 MVT VT = N->getSimpleValueType(0);
49899 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49900 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49901 return SDValue();
49902
49903 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49904 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49905 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49906 return SDValue();
49907
49908 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49909 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49910 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49911 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49912 return SDValue();
49913
49914 // Attempt to extract constant byte masks.
49915 APInt UndefElts0, UndefElts1;
49916 SmallVector<APInt, 32> EltBits0, EltBits1;
49917 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49918 /*AllowWholeUndefs*/ false,
49919 /*AllowPartialUndefs*/ false))
49920 return SDValue();
49921 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49922 /*AllowWholeUndefs*/ false,
49923 /*AllowPartialUndefs*/ false))
49924 return SDValue();
49925
49926 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49927 // TODO - add UNDEF elts support.
49928 if (UndefElts0[i] || UndefElts1[i])
49929 return SDValue();
49930 if (EltBits0[i] != ~EltBits1[i])
49931 return SDValue();
49932 }
49933
49934 SDLoc DL(N);
49935
49936 if (useVPTERNLOG(Subtarget, VT)) {
49937 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49938 // VPTERNLOG is only available as vXi32/64-bit types.
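// (0xCA comes from the ternary truth table: with imm bit index (A<<2)|(B<<1)|C,
// A?B:C sets bits 1, 3, 6 and 7, i.e. 0b11001010 == 0xCA.)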
49939 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
49940 MVT OpVT =
49941 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49942 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49943 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49944 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49945 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49946 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49947 DAG, Subtarget);
49948 return DAG.getBitcast(VT, Res);
49949 }
49950
49951 SDValue X = N->getOperand(0);
49952 SDValue Y =
49953 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49954 DAG.getBitcast(VT, N1.getOperand(0)));
49955 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49956}
49957
49958// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49959static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49960 if (N->getOpcode() != ISD::OR)
49961 return false;
49962
49963 SDValue N0 = N->getOperand(0);
49964 SDValue N1 = N->getOperand(1);
49965
49966 // Canonicalize AND to LHS.
49967 if (N1.getOpcode() == ISD::AND)
49968 std::swap(N0, N1);
49969
49970 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49971 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49972 return false;
49973
49974 Mask = N1.getOperand(0);
49975 X = N1.getOperand(1);
49976
49977 // Check to see if the mask appeared in both the AND and ANDNP.
49978 if (N0.getOperand(0) == Mask)
49979 Y = N0.getOperand(1);
49980 else if (N0.getOperand(1) == Mask)
49981 Y = N0.getOperand(0);
49982 else
49983 return false;
49984
49985 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
49986 // ANDNP combine allows other combines to happen that prevent matching.
49987 return true;
49988}
49989
49990// Try to fold:
49991// (or (and (m, y), (pandn m, x)))
49992// into:
49993// (vselect m, x, y)
49994// As a special case, try to fold:
49995// (or (and (m, (sub 0, x)), (pandn m, x)))
49996// into:
49997// (sub (xor X, M), M)
49998 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49999 const X86Subtarget &Subtarget) {
50000 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50001
50002 EVT VT = N->getValueType(0);
50003 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50004 (VT.is256BitVector() && Subtarget.hasInt256())))
50005 return SDValue();
50006
50007 SDValue X, Y, Mask;
50008 if (!matchLogicBlend(N, X, Y, Mask))
50009 return SDValue();
50010
50011 // Validate that X, Y, and Mask are bitcasts, and see through them.
50012 Mask = peekThroughBitcasts(Mask);
50013 X = peekThroughBitcasts(X);
50014 Y = peekThroughBitcasts(Y);
50015
50016 EVT MaskVT = Mask.getValueType();
50017 unsigned EltBits = MaskVT.getScalarSizeInBits();
50018
50019 // TODO: Attempt to handle floating point cases as well?
50020 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50021 return SDValue();
50022
50023 SDLoc DL(N);
50024
50025 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50026 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50027 DAG, Subtarget))
50028 return Res;
50029
50030 // PBLENDVB is only available on SSE 4.1.
50031 if (!Subtarget.hasSSE41())
50032 return SDValue();
50033
50034 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50035 if (Subtarget.hasVLX())
50036 return SDValue();
50037
50038 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50039
50040 X = DAG.getBitcast(BlendVT, X);
50041 Y = DAG.getBitcast(BlendVT, Y);
50042 Mask = DAG.getBitcast(BlendVT, Mask);
50043 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50044 return DAG.getBitcast(VT, Mask);
50045}
50046
50047// Helper function for combineOrCmpEqZeroToCtlzSrl
50048// Transforms:
50049// seteq(cmp x, 0)
50050// into:
50051// srl(ctlz x), log2(bitsize(x))
50052// Input pattern is checked by caller.
50053 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50054 SDValue Cmp = Op.getOperand(1);
50055 EVT VT = Cmp.getOperand(0).getValueType();
50056 unsigned Log2b = Log2_32(VT.getSizeInBits());
50057 SDLoc dl(Op);
50058 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50059 // The result of the shift is true or false, and on X86, the 32-bit
50060 // encoding of shr and lzcnt is more desirable.
50061 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50062 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50063 DAG.getConstant(Log2b, dl, MVT::i8));
50064 return Scc;
50065}
50066
50067// Try to transform:
50068// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50069// into:
50070// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
50071// Will also attempt to match more generic cases, eg:
50072// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50073// Only applies if the target supports the FastLZCNT feature.
50074 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50075 TargetLowering::DAGCombinerInfo &DCI,
50076 const X86Subtarget &Subtarget) {
50077 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50078 return SDValue();
50079
50080 auto isORCandidate = [](SDValue N) {
50081 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50082 };
50083
50084 // Check the zero extend is extending to 32-bit or more. The code generated by
50085 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50086 // instructions to clear the upper bits.
50087 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50088 !isORCandidate(N->getOperand(0)))
50089 return SDValue();
50090
50091 // Check the node matches: setcc(eq, cmp 0)
50092 auto isSetCCCandidate = [](SDValue N) {
50093 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50094 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50095 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50096 isNullConstant(N->getOperand(1).getOperand(1)) &&
50097 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50098 };
50099
50100 SDNode *OR = N->getOperand(0).getNode();
50101 SDValue LHS = OR->getOperand(0);
50102 SDValue RHS = OR->getOperand(1);
50103
50104 // Save nodes matching or(or, setcc(eq, cmp 0)).
50105 SmallVector<SDNode *, 2> ORNodes;
50106 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50107 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50108 ORNodes.push_back(OR);
50109 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50110 LHS = OR->getOperand(0);
50111 RHS = OR->getOperand(1);
50112 }
50113
50114 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50115 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50116 !isORCandidate(SDValue(OR, 0)))
50117 return SDValue();
50118
50119 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
50120 // to
50121 // or(srl(ctlz),srl(ctlz)).
50122 // The dag combiner can then fold it into:
50123 // srl(or(ctlz, ctlz)).
50124 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50125 SDValue Ret, NewRHS;
50126 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50127 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50128
50129 if (!Ret)
50130 return SDValue();
50131
50132 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50133 while (!ORNodes.empty()) {
50134 OR = ORNodes.pop_back_val();
50135 LHS = OR->getOperand(0);
50136 RHS = OR->getOperand(1);
50137 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50138 if (RHS->getOpcode() == ISD::OR)
50139 std::swap(LHS, RHS);
50140 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50141 if (!NewRHS)
50142 return SDValue();
50143 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50144 }
50145
50146 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50147}
50148
50149 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50150 SDValue And1_L, SDValue And1_R,
50151 const SDLoc &DL, SelectionDAG &DAG) {
50152 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50153 return SDValue();
50154 SDValue NotOp = And0_L->getOperand(0);
50155 if (NotOp == And1_R)
50156 std::swap(And1_R, And1_L);
50157 if (NotOp != And1_L)
50158 return SDValue();
50159
50160 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50161 // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
50162 EVT VT = And1_L->getValueType(0);
50163 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50164 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50165 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50166 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50167 return Xor1;
50168}
50169
50170/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50171/// equivalent `((x ^ y) & m) ^ y)` pattern.
50172/// This is typically a better representation for targets without a fused
50173/// "and-not" operation. This function is intended to be called from a
50174/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50175 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50176 // Note that masked-merge variants using XOR or ADD expressions are
50177 // normalized to OR by InstCombine so we only check for OR.
50178 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50179 SDValue N0 = Node->getOperand(0);
50180 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50181 return SDValue();
50182 SDValue N1 = Node->getOperand(1);
50183 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50184 return SDValue();
50185
50186 SDLoc DL(Node);
50187 SDValue N00 = N0->getOperand(0);
50188 SDValue N01 = N0->getOperand(1);
50189 SDValue N10 = N1->getOperand(0);
50190 SDValue N11 = N1->getOperand(1);
50191 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50192 return Result;
50193 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50194 return Result;
50195 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50196 return Result;
50197 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50198 return Result;
50199 return SDValue();
50200}
50201
50202/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50203/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50204/// with CMP+{ADC, SBB}.
50205/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50206static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50207 SDValue X, SDValue Y,
50208 SelectionDAG &DAG,
50209 bool ZeroSecondOpOnly = false) {
50210 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50211 return SDValue();
50212
50213 // Look through a one-use zext.
50214 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50215 Y = Y.getOperand(0);
50216
50218 SDValue EFLAGS;
50219 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50220 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50221 EFLAGS = Y.getOperand(1);
50222 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50223 Y.hasOneUse()) {
50224 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50225 }
50226
50227 if (!EFLAGS)
50228 return SDValue();
50229
50230 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50231 // the general case below.
50232 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50233 if (ConstantX && !ZeroSecondOpOnly) {
50234 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50235 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50236 // This is a complicated way to get -1 or 0 from the carry flag:
50237 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50238 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50239 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50240 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50241 EFLAGS);
50242 }
50243
50244 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50245 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50246 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50247 EFLAGS.getValueType().isInteger() &&
50248 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50249 // Swap the operands of a SUB, and we have the same pattern as above.
50250 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50251 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50252 SDValue NewSub = DAG.getNode(
50253 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50254 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50255 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50256 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50257 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50258 NewEFLAGS);
50259 }
50260 }
50261 }
50262
50263 if (CC == X86::COND_B) {
50264 // X + SETB Z --> adc X, 0
50265 // X - SETB Z --> sbb X, 0
50266 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50267 DAG.getVTList(VT, MVT::i32), X,
50268 DAG.getConstant(0, DL, VT), EFLAGS);
50269 }
50270
50271 if (ZeroSecondOpOnly)
50272 return SDValue();
50273
50274 if (CC == X86::COND_A) {
50275 // Try to convert COND_A into COND_B in an attempt to facilitate
50276 // materializing "setb reg".
50277 //
50278 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
50279 // cannot take an immediate as its first operand.
50280 //
50281 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50282 EFLAGS.getValueType().isInteger() &&
50283 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50284 SDValue NewSub =
50285 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50286 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50287 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50288 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50289 DAG.getVTList(VT, MVT::i32), X,
50290 DAG.getConstant(0, DL, VT), NewEFLAGS);
50291 }
50292 }
50293
50294 if (CC == X86::COND_AE) {
50295 // X + SETAE --> sbb X, -1
50296 // X - SETAE --> adc X, -1
50297 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50298 DAG.getVTList(VT, MVT::i32), X,
50299 DAG.getConstant(-1, DL, VT), EFLAGS);
50300 }
50301
50302 if (CC == X86::COND_BE) {
50303 // X + SETBE --> sbb X, -1
50304 // X - SETBE --> adc X, -1
50305 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50306 // materializing "setae reg".
50307 //
50308 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
50309 // cannot take an immediate as its first operand.
50310 //
50311 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50312 EFLAGS.getValueType().isInteger() &&
50313 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50314 SDValue NewSub =
50315 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50316 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50317 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50318 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50319 DAG.getVTList(VT, MVT::i32), X,
50320 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50321 }
50322 }
50323
50324 if (CC != X86::COND_E && CC != X86::COND_NE)
50325 return SDValue();
50326
50327 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50328 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50329 !EFLAGS.getOperand(0).getValueType().isInteger())
50330 return SDValue();
50331
50332 SDValue Z = EFLAGS.getOperand(0);
50333 EVT ZVT = Z.getValueType();
50334
50335 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50336 // the general case below.
50337 if (ConstantX) {
50338 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50339 // fake operands:
50340 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50341 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50342 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50343 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50344 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50345 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50346 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50347 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50348 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50349 SDValue(Neg.getNode(), 1));
50350 }
50351
50352 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50353 // with fake operands:
50354 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50355 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50356 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50357 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50358 SDValue One = DAG.getConstant(1, DL, ZVT);
50359 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50360 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50361 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50362 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50363 Cmp1.getValue(1));
50364 }
50365 }
50366
50367 // (cmp Z, 1) sets the carry flag if Z is 0.
50368 SDValue One = DAG.getConstant(1, DL, ZVT);
50369 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50370 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50371
50372 // Add the flags type for ADC/SBB nodes.
50373 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50374
50375 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50376 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50377 if (CC == X86::COND_NE)
50378 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50379 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50380
50381 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50382 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50383 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50384 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50385}
50386
50387/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50388/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50389/// with CMP+{ADC, SBB}.
50390 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
50391 SelectionDAG &DAG) {
50392 bool IsSub = N->getOpcode() == ISD::SUB;
50393 SDValue X = N->getOperand(0);
50394 SDValue Y = N->getOperand(1);
50395 EVT VT = N->getValueType(0);
50396
50397 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50398 return ADCOrSBB;
50399
50400 // Commute and try again (negate the result for subtracts).
50401 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50402 if (IsSub)
50403 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
50404 return ADCOrSBB;
50405 }
50406
50407 return SDValue();
50408}
50409
50410 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50411 SelectionDAG &DAG) {
50412 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50413 "Unexpected opcode");
50414
50415 // Delegate to combineAddOrSubToADCOrSBB if we have:
50416 //
50417 // (xor/or (zero_extend (setcc)) imm)
50418 //
50419 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50420 // equivalent to a SUB/ADD, respectively.
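// (zero_extend (setcc)) is 0 or 1, so with an odd imm the xor flips bit 0 and
// computes imm - setcc, while with an even imm the or merges setcc into bit 0
// and computes imm + setcc.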
50421 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50422 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50423 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50424 bool IsSub = N->getOpcode() == ISD::XOR;
50425 bool N1COdd = N1C->getZExtValue() & 1;
50426 if (IsSub ? N1COdd : !N1COdd) {
50427 SDLoc DL(N);
50428 EVT VT = N->getValueType(0);
50429 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50430 return R;
50431 }
50432 }
50433 }
50434
50435 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
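// (and(X,CstPow2) is either 0 or CstPow2 per element, so "not equal to 0" is
// the same as "equal to CstPow2".)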
50436 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
50437 N0.getOperand(0).getOpcode() == ISD::AND &&
50438 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
50439 ISD::isBuildVectorAllOnes(N1.getNode())) {
50440 MVT VT = N->getSimpleValueType(0);
50441 APInt UndefElts;
50442 SmallVector<APInt> EltBits;
50443 if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1),
50444 VT.getScalarSizeInBits(), UndefElts,
50445 EltBits)) {
50446 bool IsPow2OrUndef = true;
50447 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
50448 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
50449
50450 if (IsPow2OrUndef)
50451 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
50452 N0.getOperand(0).getOperand(1));
50453 }
50454 }
50455
50456 return SDValue();
50457}
50458
50459 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50460 TargetLowering::DAGCombinerInfo &DCI,
50461 const X86Subtarget &Subtarget) {
50462 SDValue N0 = N->getOperand(0);
50463 SDValue N1 = N->getOperand(1);
50464 EVT VT = N->getValueType(0);
50465 SDLoc dl(N);
50466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50467
50468 // If this is SSE1 only convert to FOR to avoid scalarization.
50469 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50470 return DAG.getBitcast(MVT::v4i32,
50471 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50472 DAG.getBitcast(MVT::v4f32, N0),
50473 DAG.getBitcast(MVT::v4f32, N1)));
50474 }
50475
50476 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50477 // TODO: Support multiple SrcOps.
50478 if (VT == MVT::i1) {
50479 SmallVector<SDValue, 2> SrcOps;
50480 SmallVector<APInt, 2> SrcPartials;
50481 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50482 SrcOps.size() == 1) {
50483 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50484 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50485 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50486 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50487 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50488 if (Mask) {
50489 assert(SrcPartials[0].getBitWidth() == NumElts &&
50490 "Unexpected partial reduction mask");
50491 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50492 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50493 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50494 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50495 }
50496 }
50497 }
50498
50499 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
50500 return SetCC;
50501
50502 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50503 return R;
50504
50505 if (SDValue R = combineBitOpWithShift(N, DAG))
50506 return R;
50507
50508 if (SDValue R = combineBitOpWithPACK(N, DAG))
50509 return R;
50510
50511 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50512 return FPLogic;
50513
50514 if (DCI.isBeforeLegalizeOps())
50515 return SDValue();
50516
50517 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50518 return R;
50519
50520 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50521 return R;
50522
50523 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50524 return R;
50525
50526 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
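// (Check: SetCC == 1 gives -1 | C == -1 and 0 * (C + 1) - 1 == -1, while
// SetCC == 0 gives 0 | C == C and 1 * (C + 1) - 1 == C.)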
50527 if ((VT == MVT::i32 || VT == MVT::i64) &&
50528 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50529 isNullConstant(N0.getOperand(0))) {
50530 SDValue Cond = N0.getOperand(1);
50531 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50532 Cond = Cond.getOperand(0);
50533
50534 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50535 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50536 uint64_t Val = CN->getZExtValue();
50537 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50538 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50539 CCode = X86::GetOppositeBranchCondition(CCode);
50540 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50541
50542 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50543 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50544 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50545 return R;
50546 }
50547 }
50548 }
50549 }
50550
50551 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50552 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50553 // iff the upper elements of the non-shifted arg are zero.
50554 // KUNPCK requires 16+ bool vector elements.
50555 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50556 unsigned NumElts = VT.getVectorNumElements();
50557 unsigned HalfElts = NumElts / 2;
50558 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50559 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50560 N1.getConstantOperandAPInt(1) == HalfElts &&
50561 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50562 return DAG.getNode(
50563 ISD::CONCAT_VECTORS, dl, VT,
50564 extractSubVector(N0, 0, DAG, dl, HalfElts),
50565 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50566 }
50567 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50568 N0.getConstantOperandAPInt(1) == HalfElts &&
50569 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50570 return DAG.getNode(
50571 ISD::CONCAT_VECTORS, dl, VT,
50572 extractSubVector(N1, 0, DAG, dl, HalfElts),
50573 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50574 }
50575 }
50576
50577 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50578 // Attempt to recursively combine an OR of shuffles.
50579 SDValue Op(N, 0);
50580 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50581 return Res;
50582
50583 // If either operand is a constant mask, then only the elements that aren't
50584 // allones are actually demanded by the other operand.
50585 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50586 APInt UndefElts;
50587 SmallVector<APInt> EltBits;
50588 int NumElts = VT.getVectorNumElements();
50589 int EltSizeInBits = VT.getScalarSizeInBits();
50590 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50591 return false;
50592
50593 APInt DemandedElts = APInt::getZero(NumElts);
50594 for (int I = 0; I != NumElts; ++I)
50595 if (!EltBits[I].isAllOnes())
50596 DemandedElts.setBit(I);
50597
50598 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50599 };
50600 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50601 if (N->getOpcode() != ISD::DELETED_NODE)
50602 DCI.AddToWorklist(N);
50603 return SDValue(N, 0);
50604 }
50605 }
50606
50607 // We should fold "masked merge" patterns when `andn` is not available.
50608 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50609 if (SDValue R = foldMaskedMerge(N, DAG))
50610 return R;
50611
50612 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50613 return R;
50614
50615 return SDValue();
50616}
50617
50618/// Try to turn tests against the signbit in the form of:
50619/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50620/// into:
50621/// SETGT(X, -1)
50622 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50623 // This is only worth doing if the output type is i8 or i1.
50624 EVT ResultType = N->getValueType(0);
50625 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50626 return SDValue();
50627
50628 SDValue N0 = N->getOperand(0);
50629 SDValue N1 = N->getOperand(1);
50630
50631 // We should be performing an xor against a truncated shift.
50632 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50633 return SDValue();
50634
50635 // Make sure we are performing an xor against one.
50636 if (!isOneConstant(N1))
50637 return SDValue();
50638
50639 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50640 SDValue Shift = N0.getOperand(0);
50641 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50642 return SDValue();
50643
50644 // Make sure we are truncating from one of i16, i32 or i64.
50645 EVT ShiftTy = Shift.getValueType();
50646 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50647 return SDValue();
50648
50649 // Make sure the shift amount extracts the sign bit.
50650 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50651 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50652 return SDValue();
50653
50654 // Create a greater-than comparison against -1.
50655 // N.B. Using SETGE against 0 works but we want a canonical-looking
50656 // comparison; using SETGT matches up with what TranslateX86CC expects.
50657 SDLoc DL(N);
50658 SDValue ShiftOp = Shift.getOperand(0);
50659 EVT ShiftOpTy = ShiftOp.getValueType();
50660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50661 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50662 *DAG.getContext(), ResultType);
50663 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50664 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50665 if (SetCCResultType != ResultType)
50666 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50667 return Cond;
50668}
50669
50670/// Turn vector tests of the signbit in the form of:
50671/// xor (sra X, elt_size(X)-1), -1
50672/// into:
50673/// pcmpgt X, -1
50674///
50675/// This should be called before type legalization because the pattern may not
50676/// persist after that.
50677 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50678 const X86Subtarget &Subtarget) {
50679 EVT VT = N->getValueType(0);
50680 if (!VT.isSimple())
50681 return SDValue();
50682
50683 switch (VT.getSimpleVT().SimpleTy) {
50684 // clang-format off
50685 default: return SDValue();
50686 case MVT::v16i8:
50687 case MVT::v8i16:
50688 case MVT::v4i32:
50689 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50690 case MVT::v32i8:
50691 case MVT::v16i16:
50692 case MVT::v8i32:
50693 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50694 // clang-format on
50695 }
50696
50697 // There must be a shift right algebraic before the xor, and the xor must be a
50698 // 'not' operation.
50699 SDValue Shift = N->getOperand(0);
50700 SDValue Ones = N->getOperand(1);
50701 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50702 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50703 return SDValue();
50704
50705 // The shift should be smearing the sign bit across each vector element.
50706 auto *ShiftAmt =
50707 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50708 if (!ShiftAmt ||
50709 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50710 return SDValue();
50711
50712 // Create a greater-than comparison against -1. We don't use the more obvious
50713 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50714 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50715}
50716
50717/// Detect patterns of truncation with unsigned saturation:
50718///
50719/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50720/// Return the source value x to be truncated or SDValue() if the pattern was
50721/// not matched.
50722///
50723/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50724/// where C1 >= 0 and C2 is unsigned max of destination type.
50725///
50726/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50727/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50728///
50729/// These two patterns are equivalent to:
50730/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50731/// So return the smax(x, C1) value to be truncated or SDValue() if the
50732/// pattern was not matched.
50733 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50734 const SDLoc &DL) {
50735 EVT InVT = In.getValueType();
50736
50737 // Saturation with truncation. We truncate from InVT to VT.
50739 "Unexpected types for truncate operation");
50740
50741 // Match min/max and return limit value as a parameter.
50742 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50743 if (V.getOpcode() == Opcode &&
50744 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50745 return V.getOperand(0);
50746 return SDValue();
50747 };
50748
50749 APInt C1, C2;
50750 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50751 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
50752 // the element size of the destination type.
50753 if (C2.isMask(VT.getScalarSizeInBits()))
50754 return UMin;
50755
50756 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50757 if (MatchMinMax(SMin, ISD::SMAX, C1))
50758 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50759 return SMin;
50760
50761 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50762 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50763 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50764 C2.uge(C1)) {
50765 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50766 }
50767
50768 return SDValue();
50769}
50770
50771/// Detect patterns of truncation with signed saturation:
50772/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50773/// signed_max_of_dest_type)) to dest_type)
50774/// or:
50775/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50776/// signed_min_of_dest_type)) to dest_type).
50777/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50778/// Return the source value to be truncated or SDValue() if the pattern was not
50779/// matched.
50780static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50781 unsigned NumDstBits = VT.getScalarSizeInBits();
50782 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50783 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50784
50785 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50786 const APInt &Limit) -> SDValue {
50787 APInt C;
50788 if (V.getOpcode() == Opcode &&
50789 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50790 return V.getOperand(0);
50791 return SDValue();
50792 };
50793
50794 APInt SignedMax, SignedMin;
50795 if (MatchPackUS) {
50796 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50797 SignedMin = APInt(NumSrcBits, 0);
50798 } else {
50799 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50800 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50801 }
50802
50803 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50804 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50805 return SMax;
50806
50807 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50808 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50809 return SMin;
50810
50811 return SDValue();
50812}
50813
50814 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
50815 SelectionDAG &DAG,
50816 const X86Subtarget &Subtarget) {
50817 if (!Subtarget.hasSSE2() || !VT.isVector())
50818 return SDValue();
50819
50820 EVT SVT = VT.getVectorElementType();
50821 EVT InVT = In.getValueType();
50822 EVT InSVT = InVT.getVectorElementType();
50823
50824 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50825 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50826 // and concatenate at the same time. Then we can use a final vpmovuswb to
50827 // clip to 0-255.
50828 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50829 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50830 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50831 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50832 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50833 DL, DAG, Subtarget);
50834 assert(Mid && "Failed to pack!");
50835 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50836 }
50837 }
50838
50839 // vXi32 truncate instructions are available with AVX512F.
50840 // vXi16 truncate instructions are only available with AVX512BW.
50841 // For 256-bit or smaller vectors, we require VLX.
50842 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50843 // If the result type is 256-bits or larger and we have disabled 512-bit
50844 // registers, we should go ahead and use the pack instructions if possible.
50845 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50846 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50847 (InVT.getSizeInBits() > 128) &&
50848 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50849 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50850
50851 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
50852 isPowerOf2_32(VT.getVectorNumElements()) &&
50853 (SVT == MVT::i8 || SVT == MVT::i16) &&
50854 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50855 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50856 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50857 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50858 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50859 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50860 DAG, Subtarget);
50861 assert(Mid && "Failed to pack!");
50862 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50863 Subtarget);
50864 assert(V && "Failed to pack!");
50865 return V;
50866 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50867 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50868 Subtarget);
50869 }
50870 if (SDValue SSatVal = detectSSatPattern(In, VT))
50871 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50872 Subtarget);
50873 }
50874
50875 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50876 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50877 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50878 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50879 unsigned TruncOpc = 0;
50880 SDValue SatVal;
50881 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50882 SatVal = SSatVal;
50883 TruncOpc = X86ISD::VTRUNCS;
50884 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50885 SatVal = USatVal;
50886 TruncOpc = X86ISD::VTRUNCUS;
50887 }
50888 if (SatVal) {
50889 unsigned ResElts = VT.getVectorNumElements();
50890 // If the input type is less than 512 bits and we don't have VLX, we need
50891 // to widen to 512 bits.
50892 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50893 unsigned NumConcats = 512 / InVT.getSizeInBits();
50894 ResElts *= NumConcats;
50895 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50896 ConcatOps[0] = SatVal;
50897 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50898 NumConcats * InVT.getVectorNumElements());
50899 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50900 }
50901 // Widen the result if its narrower than 128 bits.
50902 if (ResElts * SVT.getSizeInBits() < 128)
50903 ResElts = 128 / SVT.getSizeInBits();
50904 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50905 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50906 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50907 DAG.getIntPtrConstant(0, DL));
50908 }
50909 }
50910
50911 return SDValue();
50912}
50913
50914 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
50915 SelectionDAG &DAG,
50916 TargetLowering::DAGCombinerInfo &DCI,
50917 const X86Subtarget &Subtarget) {
50918 auto *Ld = cast<LoadSDNode>(N);
50919 EVT RegVT = Ld->getValueType(0);
50920 SDValue Ptr = Ld->getBasePtr();
50921 SDValue Chain = Ld->getChain();
50922 ISD::LoadExtType Ext = Ld->getExtensionType();
50923
50924 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
50925 return SDValue();
50926
50927 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
50928 return SDValue();
50929
50931 if (!LdC)
50932 return SDValue();
50933
50934 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
50935 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
50936 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
50937 if (Undefs[I])
50938 continue;
50939 if (UserUndefs[I] || Bits[I] != UserBits[I])
50940 return false;
50941 }
50942 return true;
50943 };
50944
50945 // Look through all other loads/broadcasts in the chain for another constant
50946 // pool entry.
50947 for (SDNode *User : Chain->uses()) {
50948 auto *UserLd = dyn_cast<MemSDNode>(User);
50949 if (User != N && UserLd &&
50950 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
50951 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50952 ISD::isNormalLoad(User)) &&
50953 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
50954 User->getValueSizeInBits(0).getFixedValue() >
50955 RegVT.getFixedSizeInBits()) {
50956 EVT UserVT = User->getValueType(0);
50957 SDValue UserPtr = UserLd->getBasePtr();
50958 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
50959
50960 // See if we are loading a constant that matches in the lower
50961 // bits of a longer constant (but from a different constant pool ptr).
50962 if (UserC && UserPtr != Ptr) {
50963 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
50964 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
50965 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
50966 APInt Undefs, UserUndefs;
50967 SmallVector<APInt> Bits, UserBits;
50968 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
50969 UserVT.getScalarSizeInBits());
50970 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
50971 Bits) &&
50972 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
50973 UserUndefs, UserBits)) {
50974 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
50975 SDValue Extract = extractSubVector(
50976 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
50977 Extract = DAG.getBitcast(RegVT, Extract);
50978 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50979 }
50980 }
50981 }
50982 }
50983 }
50984 }
50985
50986 return SDValue();
50987}
50988
50989 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50990 TargetLowering::DAGCombinerInfo &DCI,
50991 const X86Subtarget &Subtarget) {
50992 auto *Ld = cast<LoadSDNode>(N);
50993 EVT RegVT = Ld->getValueType(0);
50994 EVT MemVT = Ld->getMemoryVT();
50995 SDLoc dl(Ld);
50996 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50997
50998 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50999 // into two 16-byte operations. Also split non-temporal aligned loads on
51000 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51001 ISD::LoadExtType Ext = Ld->getExtensionType();
51002 unsigned Fast;
51003 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51004 Ext == ISD::NON_EXTLOAD &&
51005 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51006 Ld->getAlign() >= Align(16)) ||
51007 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51008 *Ld->getMemOperand(), &Fast) &&
51009 !Fast))) {
51010 unsigned NumElems = RegVT.getVectorNumElements();
51011 if (NumElems < 2)
51012 return SDValue();
51013
51014 unsigned HalfOffset = 16;
51015 SDValue Ptr1 = Ld->getBasePtr();
51016 SDValue Ptr2 =
51017 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
51018 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51019 NumElems / 2);
51020 SDValue Load1 =
51021 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51022 Ld->getOriginalAlign(),
51023 Ld->getMemOperand()->getFlags());
51024 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51025 Ld->getPointerInfo().getWithOffset(HalfOffset),
51026 Ld->getOriginalAlign(),
51027 Ld->getMemOperand()->getFlags());
51028 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51029 Load1.getValue(1), Load2.getValue(1));
51030
51031 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51032 return DCI.CombineTo(N, NewVec, TF, true);
51033 }
51034
51035 // Bool vector load - attempt to cast to an integer, as we have good
51036 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51037 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51038 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51039 unsigned NumElts = RegVT.getVectorNumElements();
51040 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51041 if (TLI.isTypeLegal(IntVT)) {
51042 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51043 Ld->getPointerInfo(),
51044 Ld->getOriginalAlign(),
51045 Ld->getMemOperand()->getFlags());
51046 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51047 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51048 }
51049 }
51050
51051 // If we also broadcast this vector to a wider type, then just extract the
51052 // lowest subvector.
51053 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51054 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51055 SDValue Ptr = Ld->getBasePtr();
51056 SDValue Chain = Ld->getChain();
51057 for (SDNode *User : Chain->uses()) {
51058 auto *UserLd = dyn_cast<MemSDNode>(User);
51059 if (User != N && UserLd &&
51060 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51061 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
51062 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
51063 !User->hasAnyUseOfValue(1) &&
51064 User->getValueSizeInBits(0).getFixedValue() >
51065 RegVT.getFixedSizeInBits()) {
51066 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
51067 RegVT.getSizeInBits());
51068 Extract = DAG.getBitcast(RegVT, Extract);
51069 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51070 }
51071 }
51072 }
51073
51074 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
51075 return V;
51076
51077 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51078 unsigned AddrSpace = Ld->getAddressSpace();
51079 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51080 AddrSpace == X86AS::PTR32_UPTR) {
51081 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51082 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51083 SDValue Cast =
51084 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51085 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
51086 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
51087 Ld->getMemOperand()->getFlags());
51088 }
51089 }
51090
51091 return SDValue();
51092}
51093
51094/// If V is a build vector of boolean constants and exactly one of those
51095/// constants is true, return the operand index of that true element.
51096/// Otherwise, return -1.
51097static int getOneTrueElt(SDValue V) {
51098 // This needs to be a build vector of booleans.
51099 // TODO: Checking for the i1 type matches the IR definition for the mask,
51100 // but the mask check could be loosened to i8 or other types. That might
51101 // also require checking more than 'allOnesValue'; e.g., the x86 HW
51102 // instructions only require that the MSB is set for each mask element.
51103 // The ISD::MSTORE comments/definition do not specify how the mask operand
51104 // is formatted.
51105 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51106 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51107 return -1;
51108
51109 int TrueIndex = -1;
51110 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51111 for (unsigned i = 0; i < NumElts; ++i) {
51112 const SDValue &Op = BV->getOperand(i);
51113 if (Op.isUndef())
51114 continue;
51115 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51116 if (!ConstNode)
51117 return -1;
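// Treat the element as 'true' if its lowest bit is set.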
51118 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51119 // If we already found a one, this is too many.
51120 if (TrueIndex >= 0)
51121 return -1;
51122 TrueIndex = i;
51123 }
51124 }
51125 return TrueIndex;
51126}
51127
51128/// Given a masked memory load/store operation, return true if it has one mask
51129/// bit set. If it has one mask bit set, then also return the memory address of
51130/// the scalar element to load/store, the vector index to insert/extract that
51131/// scalar element, and the alignment for the scalar memory access.
51132 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51133 SelectionDAG &DAG, SDValue &Addr,
51134 SDValue &Index, Align &Alignment,
51135 unsigned &Offset) {
51136 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51137 if (TrueMaskElt < 0)
51138 return false;
51139
51140 // Get the address of the one scalar element that is specified by the mask
51141 // using the appropriate offset from the base pointer.
51142 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51143 Offset = 0;
51144 Addr = MaskedOp->getBasePtr();
51145 if (TrueMaskElt != 0) {
51146 Offset = TrueMaskElt * EltVT.getStoreSize();
51147 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
51148 SDLoc(MaskedOp));
51149 }
51150
51151 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51152 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51153 EltVT.getStoreSize());
51154 return true;
51155}
51156
51157/// If exactly one element of the mask is set for a non-extending masked load,
51158 /// it can be replaced by a scalar load and a vector insert.
51159/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51160/// mask have already been optimized in IR, so we don't bother with those here.
51161static SDValue
51162 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51163 TargetLowering::DAGCombinerInfo &DCI,
51164 const X86Subtarget &Subtarget) {
51165 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51166 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51167 // However, some target hooks may need to be added to know when the transform
51168 // is profitable. Endianness would also have to be considered.
51169
51170 SDValue Addr, VecIndex;
51171 Align Alignment;
51172 unsigned Offset;
51173 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51174 return SDValue();
51175
51176 // Load the one scalar element that is specified by the mask using the
51177 // appropriate offset from the base pointer.
51178 SDLoc DL(ML);
51179 EVT VT = ML->getValueType(0);
51180 EVT EltVT = VT.getVectorElementType();
51181
51182 EVT CastVT = VT;
51183 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51184 EltVT = MVT::f64;
51185 CastVT = VT.changeVectorElementType(EltVT);
51186 }
51187
51188 SDValue Load =
51189 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51190 ML->getPointerInfo().getWithOffset(Offset),
51191 Alignment, ML->getMemOperand()->getFlags());
51192
51193 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51194
51195 // Insert the loaded element into the appropriate place in the vector.
51196 SDValue Insert =
51197 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51198 Insert = DAG.getBitcast(VT, Insert);
51199 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51200}
51201
51202static SDValue
51203 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51204 TargetLowering::DAGCombinerInfo &DCI) {
51205 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51206 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51207 return SDValue();
51208
51209 SDLoc DL(ML);
51210 EVT VT = ML->getValueType(0);
51211
51212 // If we are loading the first and last elements of a vector, it is safe and
51213 // always faster to load the whole vector. Replace the masked load with a
51214 // vector load and select.
51215 unsigned NumElts = VT.getVectorNumElements();
51216 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51217 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51218 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51219 if (LoadFirstElt && LoadLastElt) {
51220 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51221 ML->getMemOperand());
51222 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51223 ML->getPassThru());
51224 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51225 }
51226
51227 // Convert a masked load with a constant mask into a masked load and a select.
51228 // This allows the select operation to use a faster kind of select instruction
51229 // (for example, vblendvps -> vblendps).
51230
51231 // Don't try this if the pass-through operand is already undefined. That would
51232 // cause an infinite loop because that's what we're about to create.
51233 if (ML->getPassThru().isUndef())
51234 return SDValue();
51235
51236 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51237 return SDValue();
51238
51239 // The new masked load has an undef pass-through operand. The select uses the
51240 // original pass-through operand.
51241 SDValue NewML = DAG.getMaskedLoad(
51242 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51243 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51244 ML->getAddressingMode(), ML->getExtensionType());
51245 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51246 ML->getPassThru());
51247
51248 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51249}
51250
51251 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51252 TargetLowering::DAGCombinerInfo &DCI,
51253 const X86Subtarget &Subtarget) {
51254 auto *Mld = cast<MaskedLoadSDNode>(N);
51255
51256 // TODO: Expanding load with constant mask may be optimized as well.
51257 if (Mld->isExpandingLoad())
51258 return SDValue();
51259
51260 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51261 if (SDValue ScalarLoad =
51262 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51263 return ScalarLoad;
51264
51265 // TODO: Do some AVX512 subsets benefit from this transform?
51266 if (!Subtarget.hasAVX512())
51267 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51268 return Blend;
51269 }
51270
51271 // If the mask value has been legalized to a non-boolean vector, try to
51272 // simplify ops leading up to it. We only demand the MSB of each lane.
51273 SDValue Mask = Mld->getMask();
51274 if (Mask.getScalarValueSizeInBits() != 1) {
51275 EVT VT = Mld->getValueType(0);
51276 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51277 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51278 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51279 if (N->getOpcode() != ISD::DELETED_NODE)
51280 DCI.AddToWorklist(N);
51281 return SDValue(N, 0);
51282 }
51283 if (SDValue NewMask =
51284 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51285 return DAG.getMaskedLoad(
51286 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51287 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51288 Mld->getAddressingMode(), Mld->getExtensionType());
51289 }
51290
51291 return SDValue();
51292}
51293
51294/// If exactly one element of the mask is set for a non-truncating masked store,
51295 /// it can be replaced by a vector extract and a scalar store.
51296/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51297/// mask have already been optimized in IR, so we don't bother with those here.
51298 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51299 SelectionDAG &DAG,
51300 const X86Subtarget &Subtarget) {
51301 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51302 // However, some target hooks may need to be added to know when the transform
51303 // is profitable. Endianness would also have to be considered.
51304
51305 SDValue Addr, VecIndex;
51306 Align Alignment;
51307 unsigned Offset;
51308 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51309 return SDValue();
51310
51311 // Extract the one scalar element that is actually being stored.
51312 SDLoc DL(MS);
51313 SDValue Value = MS->getValue();
51314 EVT VT = Value.getValueType();
51315 EVT EltVT = VT.getVectorElementType();
51316 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51317 EltVT = MVT::f64;
51318 EVT CastVT = VT.changeVectorElementType(EltVT);
51319 Value = DAG.getBitcast(CastVT, Value);
51320 }
51321 SDValue Extract =
51322 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51323
51324 // Store that element at the appropriate offset from the base pointer.
51325 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51326 MS->getPointerInfo().getWithOffset(Offset),
51327 Alignment, MS->getMemOperand()->getFlags());
51328}
51329
51330 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51331 TargetLowering::DAGCombinerInfo &DCI,
51332 const X86Subtarget &Subtarget) {
51333 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51334 if (Mst->isCompressingStore())
51335 return SDValue();
51336
51337 EVT VT = Mst->getValue().getValueType();
51338 SDLoc dl(Mst);
51339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51340
51341 if (Mst->isTruncatingStore())
51342 return SDValue();
51343
51344 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51345 return ScalarStore;
51346
51347 // If the mask value has been legalized to a non-boolean vector, try to
51348 // simplify ops leading up to it. We only demand the MSB of each lane.
51349 SDValue Mask = Mst->getMask();
51350 if (Mask.getScalarValueSizeInBits() != 1) {
51351 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51352 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51353 if (N->getOpcode() != ISD::DELETED_NODE)
51354 DCI.AddToWorklist(N);
51355 return SDValue(N, 0);
51356 }
51357 if (SDValue NewMask =
51358 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51359 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51360 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51361 Mst->getMemoryVT(), Mst->getMemOperand(),
51362 Mst->getAddressingMode());
51363 }
51364
51365 SDValue Value = Mst->getValue();
51366 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51367 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51368 Mst->getMemoryVT())) {
51369 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51370 Mst->getBasePtr(), Mst->getOffset(), Mask,
51371 Mst->getMemoryVT(), Mst->getMemOperand(),
51372 Mst->getAddressingMode(), true);
51373 }
51374
51375 return SDValue();
51376}
51377
51378 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51379 TargetLowering::DAGCombinerInfo &DCI,
51380 const X86Subtarget &Subtarget) {
51381 StoreSDNode *St = cast<StoreSDNode>(N);
51382 EVT StVT = St->getMemoryVT();
51383 SDLoc dl(St);
51384 SDValue StoredVal = St->getValue();
51385 EVT VT = StoredVal.getValueType();
51386 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51387
51388 // Convert a store of vXi1 into a store of iX and a bitcast.
51389 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51390 VT.getVectorElementType() == MVT::i1) {
51391
51392 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51393 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51394
51395 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51396 St->getPointerInfo(), St->getOriginalAlign(),
51397 St->getMemOperand()->getFlags());
51398 }
51399
51400 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51401 // This will avoid a copy to k-register.
51402 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51403 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51404 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51405 SDValue Val = StoredVal.getOperand(0);
51406 // We must store zeros to the unused bits.
51407 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51408 return DAG.getStore(St->getChain(), dl, Val,
51409 St->getBasePtr(), St->getPointerInfo(),
51410 St->getOriginalAlign(),
51411 St->getMemOperand()->getFlags());
51412 }
51413
51414 // Widen v2i1/v4i1 stores to v8i1.
51415 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51416 Subtarget.hasAVX512()) {
51417 unsigned NumConcats = 8 / VT.getVectorNumElements();
51418 // We must store zeros to the unused bits.
51419 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51420 Ops[0] = StoredVal;
51421 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51422 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51423 St->getPointerInfo(), St->getOriginalAlign(),
51424 St->getMemOperand()->getFlags());
51425 }
51426
51427 // Turn vXi1 stores of constants into a scalar store.
51428 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51429 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51430 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51431 // If it's a v64i1 store without 64-bit support, we need two stores.
51432 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51433 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51434 StoredVal->ops().slice(0, 32));
51435 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51436 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51437 StoredVal->ops().slice(32, 32));
51438 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51439
51440 SDValue Ptr0 = St->getBasePtr();
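// Each v32i1 half becomes an i32 scalar; the high half is stored 4 bytes
// past the low half.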
51441 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
51442
51443 SDValue Ch0 =
51444 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51445 St->getOriginalAlign(),
51446 St->getMemOperand()->getFlags());
51447 SDValue Ch1 =
51448 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51449 St->getPointerInfo().getWithOffset(4),
51450 St->getOriginalAlign(),
51451 St->getMemOperand()->getFlags());
51452 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51453 }
51454
51455 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51456 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51457 St->getPointerInfo(), St->getOriginalAlign(),
51458 St->getMemOperand()->getFlags());
51459 }
51460
51461 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51462 // Sandy Bridge, perform two 16-byte stores.
51463 unsigned Fast;
51464 if (VT.is256BitVector() && StVT == VT &&
51465 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51466 *St->getMemOperand(), &Fast) &&
51467 !Fast) {
51468 unsigned NumElems = VT.getVectorNumElements();
51469 if (NumElems < 2)
51470 return SDValue();
51471
51472 return splitVectorStore(St, DAG);
51473 }
51474
51475 // Split under-aligned vector non-temporal stores.
51476 if (St->isNonTemporal() && StVT == VT &&
51477 St->getAlign().value() < VT.getStoreSize()) {
51478 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51479 // vectors or the legalizer can scalarize it to use MOVNTI.
51480 if (VT.is256BitVector() || VT.is512BitVector()) {
51481 unsigned NumElems = VT.getVectorNumElements();
51482 if (NumElems < 2)
51483 return SDValue();
51484 return splitVectorStore(St, DAG);
51485 }
51486
51487 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51488 // to use MOVNTI.
51489 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51490 MVT NTVT = Subtarget.hasSSE4A()
51491 ? MVT::v2f64
51492 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51493 return scalarizeVectorStore(St, NTVT, DAG);
51494 }
51495 }
51496
51497 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51498 // supported but AVX512F is, by extending to v16i32 and then truncating.
51499 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51500 St->getValue().getOpcode() == ISD::TRUNCATE &&
51501 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51502 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51503 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51504 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51505 St->getValue().getOperand(0));
51506 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51507 MVT::v16i8, St->getMemOperand());
51508 }
51509
51510 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51511 if (!St->isTruncatingStore() &&
51512 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51513 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51514 StoredVal.hasOneUse() &&
51515 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51516 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51517 return EmitTruncSStore(IsSigned, St->getChain(),
51518 dl, StoredVal.getOperand(0), St->getBasePtr(),
51519 VT, St->getMemOperand(), DAG);
51520 }
51521
51522 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51523 if (!St->isTruncatingStore()) {
51524 auto IsExtractedElement = [](SDValue V) {
51525 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51526 V = V.getOperand(0);
51527 unsigned Opc = V.getOpcode();
51528 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51529 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51530 V.getOperand(0).hasOneUse())
51531 return V.getOperand(0);
51532 return SDValue();
51533 };
51534 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51535 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51536 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51537 SDValue Src = Trunc.getOperand(0);
51538 MVT DstVT = Trunc.getSimpleValueType();
51539 MVT SrcVT = Src.getSimpleValueType();
51540 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51541 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51542 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51543 if (NumTruncBits == VT.getSizeInBits() &&
51544 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51545 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51546 TruncVT, St->getMemOperand());
51547 }
51548 }
51549 }
51550 }
51551
51552 // Optimize trunc store (of multiple scalars) to shuffle and store.
51553 // First, pack all of the elements in one place. Next, store to memory
51554 // in fewer chunks.
51555 if (St->isTruncatingStore() && VT.isVector()) {
51556 if (TLI.isTruncStoreLegal(VT, StVT)) {
51557 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51558 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51559 dl, Val, St->getBasePtr(),
51560 St->getMemoryVT(), St->getMemOperand(), DAG);
51561 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51562 DAG, dl))
51563 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51564 dl, Val, St->getBasePtr(),
51565 St->getMemoryVT(), St->getMemOperand(), DAG);
51566 }
51567
51568 return SDValue();
51569 }
51570
51571 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51572 unsigned AddrSpace = St->getAddressSpace();
51573 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51574 AddrSpace == X86AS::PTR32_UPTR) {
51575 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51576 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51577 SDValue Cast =
51578 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51579 return DAG.getTruncStore(
51580 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51581 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51582 St->getAAInfo());
51583 }
51584 }
51585
51586 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51587 // the FP state in cases where an emms may be missing.
51588 // A preferable solution to the general problem is to figure out the right
51589 // places to insert EMMS. This qualifies as a quick hack.
51590
51591 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51592 if (VT.getSizeInBits() != 64)
51593 return SDValue();
51594
51595 const Function &F = DAG.getMachineFunction().getFunction();
51596 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51597 bool F64IsLegal =
51598 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51599
51600 if (!F64IsLegal || Subtarget.is64Bit())
51601 return SDValue();
51602
51603 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51604 cast<LoadSDNode>(St->getValue())->isSimple() &&
51605 St->getChain().hasOneUse() && St->isSimple()) {
51606 auto *Ld = cast<LoadSDNode>(St->getValue());
51607
51608 if (!ISD::isNormalLoad(Ld))
51609 return SDValue();
51610
51611 // Avoid the transformation if there are multiple uses of the loaded value.
51612 if (!Ld->hasNUsesOfValue(1, 0))
51613 return SDValue();
51614
51615 SDLoc LdDL(Ld);
51616 SDLoc StDL(N);
51617 // Lower to a single movq load/store pair.
51618 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51619 Ld->getBasePtr(), Ld->getMemOperand());
51620
51621 // Make sure new load is placed in same chain order.
51622 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51623 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51624 St->getMemOperand());
51625 }
51626
51627 // This is similar to the above case, but here we handle a scalar 64-bit
51628 // integer store that is extracted from a vector on a 32-bit target.
51629 // If we have SSE2, then we can treat it like a floating-point double
51630 // to get past legalization. The execution dependencies fixup pass will
51631 // choose the optimal machine instruction for the store if this really is
51632 // an integer or v2f32 rather than an f64.
51633 if (VT == MVT::i64 &&
51634 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51635 SDValue OldExtract = St->getOperand(1);
51636 SDValue ExtOp0 = OldExtract.getOperand(0);
51637 unsigned VecSize = ExtOp0.getValueSizeInBits();
51638 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51639 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51640 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51641 BitCast, OldExtract.getOperand(1));
51642 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51643 St->getPointerInfo(), St->getOriginalAlign(),
51644 St->getMemOperand()->getFlags());
51645 }
51646
51647 return SDValue();
51648}
51649
51650 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51651 TargetLowering::DAGCombinerInfo &DCI,
51652 const X86Subtarget &Subtarget) {
51653 auto *St = cast<MemIntrinsicSDNode>(N);
51654
51655 SDValue StoredVal = N->getOperand(1);
51656 MVT VT = StoredVal.getSimpleValueType();
51657 EVT MemVT = St->getMemoryVT();
51658
51659 // Figure out which elements we demand.
51660 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51661 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51662
51663 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51664 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51665 if (N->getOpcode() != ISD::DELETED_NODE)
51666 DCI.AddToWorklist(N);
51667 return SDValue(N, 0);
51668 }
51669
51670 return SDValue();
51671}
51672
51673/// Return 'true' if this vector operation is "horizontal"
51674/// and return the operands for the horizontal operation in LHS and RHS. A
51675/// horizontal operation performs the binary operation on successive elements
51676/// of its first operand, then on successive elements of its second operand,
51677/// returning the resulting values in a vector. For example, if
51678/// A = < float a0, float a1, float a2, float a3 >
51679/// and
51680/// B = < float b0, float b1, float b2, float b3 >
51681/// then the result of doing a horizontal operation on A and B is
51682/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51683/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51684/// A horizontal-op B, for some already available A and B, and if so then LHS is
51685/// set to A, RHS to B, and the routine returns 'true'.
51686static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51687 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51688 bool IsCommutative,
51689 SmallVectorImpl<int> &PostShuffleMask,
51690 bool ForceHorizOp) {
51691 // If either operand is undef, bail out. The binop should be simplified.
51692 if (LHS.isUndef() || RHS.isUndef())
51693 return false;
51694
51695 // Look for the following pattern:
51696 // A = < float a0, float a1, float a2, float a3 >
51697 // B = < float b0, float b1, float b2, float b3 >
51698 // and
51699 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51700 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51701 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51702 // which is A horizontal-op B.
51703
51704 MVT VT = LHS.getSimpleValueType();
51705 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51706 "Unsupported vector type for horizontal add/sub");
51707 unsigned NumElts = VT.getVectorNumElements();
51708
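// Decode Op as a target shuffle: on success, return its (up to two) source
// vectors in N0/N1 and a shuffle mask scaled to NumElts elements.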
51709 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51710 SmallVectorImpl<int> &ShuffleMask) {
51711 bool UseSubVector = false;
51712 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51713 Op.getOperand(0).getValueType().is256BitVector() &&
51714 llvm::isNullConstant(Op.getOperand(1))) {
51715 Op = Op.getOperand(0);
51716 UseSubVector = true;
51717 }
51718 SDValue BC = peekThroughBitcasts(Op);
51719 SmallVector<int, 16> SrcMask, ScaledMask;
51720 SmallVector<SDValue, 2> SrcOps;
51721 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51722 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51723 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51724 })) {
51725 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51726 if (!UseSubVector && SrcOps.size() <= 2 &&
51727 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51728 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51729 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51730 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51731 }
51732 if (UseSubVector && SrcOps.size() == 1 &&
51733 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51734 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51735 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51736 ShuffleMask.assign(Mask.begin(), Mask.end());
51737 }
51738 }
51739 };
51740
51741 // View LHS in the form
51742 // LHS = VECTOR_SHUFFLE A, B, LMask
51743 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51744 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51745 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51746 SDValue A, B;
51747 SmallVector<int, 16> LMask;
51748 GetShuffle(LHS, A, B, LMask);
51749
51750 // Likewise, view RHS in the form
51751 // RHS = VECTOR_SHUFFLE C, D, RMask
51752 SDValue C, D;
51753 SmallVector<int, 16> RMask;
51754 GetShuffle(RHS, C, D, RMask);
51755
51756 // At least one of the operands should be a vector shuffle.
51757 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51758 if (NumShuffles == 0)
51759 return false;
51760
51761 if (LMask.empty()) {
51762 A = LHS;
51763 for (unsigned i = 0; i != NumElts; ++i)
51764 LMask.push_back(i);
51765 }
51766
51767 if (RMask.empty()) {
51768 C = RHS;
51769 for (unsigned i = 0; i != NumElts; ++i)
51770 RMask.push_back(i);
51771 }
51772
51773 // If we have a unary mask, ensure the other op is set to null.
51774 if (isUndefOrInRange(LMask, 0, NumElts))
51775 B = SDValue();
51776 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51777 A = SDValue();
51778
51779 if (isUndefOrInRange(RMask, 0, NumElts))
51780 D = SDValue();
51781 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51782 C = SDValue();
51783
51784 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51785 // RHS operands and shuffle mask.
51786 if (A != C) {
51787 std::swap(C, D);
51788 ShuffleVectorSDNode::commuteMask(RMask);
51789 }
51790 // Check that the shuffles are both shuffling the same vectors.
51791 if (!(A == C && B == D))
51792 return false;
51793
51794 PostShuffleMask.clear();
51795 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51796
51797 // LHS and RHS are now:
51798 // LHS = shuffle A, B, LMask
51799 // RHS = shuffle A, B, RMask
51800 // Check that the masks correspond to performing a horizontal operation.
51801 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51802 // so we just repeat the inner loop if this is a 256-bit op.
51803 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51804 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51805 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51806 assert((NumEltsPer128BitChunk % 2 == 0) &&
51807 "Vector type should have an even number of elements in each lane");
51808 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51809 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51810 // Ignore undefined components.
51811 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51812 if (LIdx < 0 || RIdx < 0 ||
51813 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51814 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51815 continue;
51816
51817 // Check that successive odd/even elements are being operated on. If not,
51818 // this is not a horizontal operation.
51819 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51820 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51821 return false;
51822
51823 // Compute the post-shuffle mask index based on where the element
51824 // is stored in the HOP result, and where it needs to be moved to.
51825 int Base = LIdx & ~1u;
51826 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51827 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51828
51829 // The low half of the 128-bit result must choose from A.
51830 // The high half of the 128-bit result must choose from B,
51831 // unless B is undef. In that case, we are always choosing from A.
51832 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51833 Index += NumEltsPer64BitChunk;
51834 PostShuffleMask[i + j] = Index;
51835 }
51836 }
51837
51838 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51839 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51840
51841 bool IsIdentityPostShuffle =
51842 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51843 if (IsIdentityPostShuffle)
51844 PostShuffleMask.clear();
51845
51846 // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
51847 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51848 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51849 return false;
51850
51851 // If the source nodes are already used in HorizOps then always accept this.
51852 // Shuffle folding should merge these back together.
51853 auto FoundHorizUser = [&](SDNode *User) {
51854 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51855 };
51856 ForceHorizOp =
51857 ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) &&
51858 llvm::any_of(NewRHS->uses(), FoundHorizUser));
51859
51860 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51861 // shuffle the result.
51862 if (!ForceHorizOp &&
51863 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51864 (NumShuffles < 2 || !IsIdentityPostShuffle),
51865 DAG, Subtarget))
51866 return false;
51867
51868 LHS = DAG.getBitcast(VT, NewLHS);
51869 RHS = DAG.getBitcast(VT, NewRHS);
51870 return true;
51871}
51872
51873// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51874 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51875 const X86Subtarget &Subtarget) {
51876 EVT VT = N->getValueType(0);
51877 unsigned Opcode = N->getOpcode();
51878 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51879 SmallVector<int, 8> PostShuffleMask;
51880
51881 auto MergableHorizOp = [N](unsigned HorizOpcode) {
51882 return N->hasOneUse() &&
51883 N->use_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
51884 (N->use_begin()->getOperand(0).getOpcode() == HorizOpcode ||
51885 N->use_begin()->getOperand(1).getOpcode() == HorizOpcode);
51886 };
51887
51888 switch (Opcode) {
51889 case ISD::FADD:
51890 case ISD::FSUB:
51891 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51892 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51893 SDValue LHS = N->getOperand(0);
51894 SDValue RHS = N->getOperand(1);
51895 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51896 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51897 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
51898 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51899 if (!PostShuffleMask.empty())
51900 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51901 DAG.getUNDEF(VT), PostShuffleMask);
51902 return HorizBinOp;
51903 }
51904 }
51905 break;
51906 case ISD::ADD:
51907 case ISD::SUB:
51908 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51909 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51910 SDValue LHS = N->getOperand(0);
51911 SDValue RHS = N->getOperand(1);
51912 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51913 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51914 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
51915 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51916 ArrayRef<SDValue> Ops) {
51917 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51918 };
51919 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51920 {LHS, RHS}, HOpBuilder);
51921 if (!PostShuffleMask.empty())
51922 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51923 DAG.getUNDEF(VT), PostShuffleMask);
51924 return HorizBinOp;
51925 }
51926 }
51927 break;
51928 }
51929
51930 return SDValue();
51931}
51932
51933// Try to combine the following nodes
51934// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51935// <i32 -2147483648[float -0.000000e+00]> 0
51936// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51937// <(load 4 from constant-pool)> t0, t29
51938// [t30: v16i32 = bitcast t27]
51939// t6: v16i32 = xor t7, t27[t30]
51940// t11: v16f32 = bitcast t6
51941// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51942// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51943// t22: v16f32 = bitcast t7
51944// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51945// t24: v32f16 = bitcast t23
51946 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51947 const X86Subtarget &Subtarget) {
51948 EVT VT = N->getValueType(0);
51949 SDValue LHS = N->getOperand(0);
51950 SDValue RHS = N->getOperand(1);
51951 int CombineOpcode =
51952 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51953 auto combineConjugation = [&](SDValue &r) {
51954 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51955 SDValue XOR = LHS.getOperand(0);
51956 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51957 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
51958 if (XORRHS.isConstant()) {
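// Conjugating a complex f16 value flips the sign of its imaginary half,
// which shows up here as an XOR with 0x80000000 per 32-bit complex element
// (or the equivalent 64-bit splat).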
51959 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51960 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51961 if ((XORRHS.getBitWidth() == 32 &&
51962 XORRHS.getConstant() == ConjugationInt32) ||
51963 (XORRHS.getBitWidth() == 64 &&
51964 XORRHS.getConstant() == ConjugationInt64)) {
51965 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51966 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51967 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51968 r = DAG.getBitcast(VT, FCMulC);
51969 return true;
51970 }
51971 }
51972 }
51973 }
51974 return false;
51975 };
51976 SDValue Res;
51977 if (combineConjugation(Res))
51978 return Res;
51979 std::swap(LHS, RHS);
51980 if (combineConjugation(Res))
51981 return Res;
51982 return Res;
51983}
51984
51985// Try to combine the following nodes:
51986// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51987 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51988 const X86Subtarget &Subtarget) {
51989 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51990 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51991 Flags.hasAllowContract();
51992 };
51993
51994 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51995 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51996 Flags.hasNoSignedZeros();
51997 };
51998 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
51999 APInt AI = APInt(32, 0x80008000, true);
52000 KnownBits Bits = DAG.computeKnownBits(Op);
52001 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
52002 Bits.getConstant() == AI;
52003 };
52004
52005 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52006 !AllowContract(N->getFlags()))
52007 return SDValue();
52008
52009 EVT VT = N->getValueType(0);
52010 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52011 return SDValue();
52012
52013 SDValue LHS = N->getOperand(0);
52014 SDValue RHS = N->getOperand(1);
52015 bool IsConj;
52016 SDValue FAddOp1, MulOp0, MulOp1;
52017 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52018 &IsVectorAllNegativeZero,
52019 &HasNoSignedZero](SDValue N) -> bool {
52020 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52021 return false;
52022 SDValue Op0 = N.getOperand(0);
52023 unsigned Opcode = Op0.getOpcode();
52024 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52025 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52026 MulOp0 = Op0.getOperand(0);
52027 MulOp1 = Op0.getOperand(1);
52028 IsConj = Opcode == X86ISD::VFCMULC;
52029 return true;
52030 }
52031 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52032 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52033 HasNoSignedZero(Op0->getFlags())) ||
52034 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
52035 MulOp0 = Op0.getOperand(0);
52036 MulOp1 = Op0.getOperand(1);
52037 IsConj = Opcode == X86ISD::VFCMADDC;
52038 return true;
52039 }
52040 }
52041 return false;
52042 };
52043
52044 if (GetCFmulFrom(LHS))
52045 FAddOp1 = RHS;
52046 else if (GetCFmulFrom(RHS))
52047 FAddOp1 = LHS;
52048 else
52049 return SDValue();
52050
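// The complex FP16 nodes treat each real/imaginary f16 pair as one 32-bit
// element, so reinterpret the vXf16 operand with half as many f32 elements.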
52051 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52052 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52053 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52054 // FIXME: How do we handle the case where the fast-math flags of the FADD
52055 // differ from the CFMUL's?
52056 SDValue CFmul =
52057 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52058 return DAG.getBitcast(VT, CFmul);
52059}
52060
52061/// Do target-specific dag combines on floating-point adds/subs.
52062 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52063 const X86Subtarget &Subtarget) {
52064 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52065 return HOp;
52066
52067 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52068 return COp;
52069
52070 return SDValue();
52071}
52072
52073 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
52074 const X86Subtarget &Subtarget) {
52075 EVT VT = N->getValueType(0);
52076 SDValue Src = N->getOperand(0);
52077 EVT SrcVT = Src.getValueType();
52078 SDLoc DL(N);
52079
52080 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
52081 SrcVT != MVT::v2f32)
52082 return SDValue();
52083
52084 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
52085 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
52086 DAG.getUNDEF(SrcVT)));
52087}
52088
52089/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52090/// the codegen.
52091/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52092/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52093/// anything that is guaranteed to be transformed by DAGCombiner.
52094 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52095 const X86Subtarget &Subtarget,
52096 const SDLoc &DL) {
52097 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52098 SDValue Src = N->getOperand(0);
52099 unsigned SrcOpcode = Src.getOpcode();
52100 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52101
52102 EVT VT = N->getValueType(0);
52103 EVT SrcVT = Src.getValueType();
52104
52105 auto IsFreeTruncation = [VT](SDValue Op) {
52106 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52107
52108 // See if this has been extended from a smaller/equal size to
52109 // the truncation size, allowing a truncation to combine with the extend.
52110 unsigned Opcode = Op.getOpcode();
52111 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52112 Opcode == ISD::ZERO_EXTEND) &&
52113 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52114 return true;
52115
52116 // See if this is a single use constant which can be constant folded.
52117 // NOTE: We don't peek through bitcasts here because there is currently
52118 // no support for constant folding truncate+bitcast+vector_of_constants. So
52119 // we'll just end up with a truncate on both operands which will
52120 // get turned back into (truncate (binop)) causing an infinite loop.
52121 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52122 };
52123
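// Truncate both operands and rebuild the arithmetic op at the narrower type.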
52124 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52125 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52126 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52127 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52128 };
52129
52130 // Don't combine if the operation has other uses.
52131 if (!Src.hasOneUse())
52132 return SDValue();
52133
52134 // Only support vector truncation for now.
52135 // TODO: i64 scalar math would benefit as well.
52136 if (!VT.isVector())
52137 return SDValue();
52138
52139 // In most cases it's only worth pre-truncating if we're only facing the cost
52140 // of one truncation.
52141 // i.e. if one of the inputs will constant fold or the input is repeated.
52142 switch (SrcOpcode) {
52143 case ISD::MUL:
52144 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52145 // better to truncate if we have the chance.
52146 if (SrcVT.getScalarType() == MVT::i64 &&
52147 TLI.isOperationLegal(SrcOpcode, VT) &&
52148 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52149 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52150 [[fallthrough]];
52151 case ISD::AND:
52152 case ISD::XOR:
52153 case ISD::OR:
52154 case ISD::ADD:
52155 case ISD::SUB: {
52156 SDValue Op0 = Src.getOperand(0);
52157 SDValue Op1 = Src.getOperand(1);
52158 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52159 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52160 return TruncateArithmetic(Op0, Op1);
52161 break;
52162 }
52163 }
52164
52165 return SDValue();
52166}
52167
52168// Try to form a MULHU or MULHS node by looking for
52169// (trunc (srl (mul ext, ext), 16))
52170// TODO: This is X86 specific because we want to be able to handle wide types
52171// before type legalization. But we can only do it if the vector will be
52172// legalized via widening/splitting. Type legalization can't handle promotion
52173// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52174// combiner.
52175static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52176 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52177 // First instruction should be a right shift of a multiply.
52178 if (Src.getOpcode() != ISD::SRL ||
52179 Src.getOperand(0).getOpcode() != ISD::MUL)
52180 return SDValue();
52181
52182 if (!Subtarget.hasSSE2())
52183 return SDValue();
52184
52185 // Only handle vXi16 types that are at least 128 bits wide unless they will be
52186 // widened.
52187 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52188 return SDValue();
52189
52190 // Input type should be at least vXi32.
52191 EVT InVT = Src.getValueType();
52192 if (InVT.getVectorElementType().getSizeInBits() < 32)
52193 return SDValue();
52194
52195 // Need a shift by 16.
52196 APInt ShiftAmt;
52197 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52198 ShiftAmt != 16)
52199 return SDValue();
52200
52201 SDValue LHS = Src.getOperand(0).getOperand(0);
52202 SDValue RHS = Src.getOperand(0).getOperand(1);
52203
52204 // Count leading sign/zero bits on both inputs - if there are enough then
52205 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52206 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52207 // truncations may actually be free by peeking through to the ext source.
52208 auto IsSext = [&DAG](SDValue V) {
52209 return DAG.ComputeMaxSignificantBits(V) <= 16;
52210 };
52211 auto IsZext = [&DAG](SDValue V) {
52212 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52213 };
52214
52215 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52216 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52217 if (!IsSigned && !IsUnsigned)
52218 return SDValue();
52219
52220 // Check if both inputs are extensions, which will be removed by truncation.
52221 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52222 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52223 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52224 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52225 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52226 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52227
52228 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52229 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52230 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52231 // will have to split anyway.
52232 unsigned InSizeInBits = InVT.getSizeInBits();
52233 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52234 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52235 (InSizeInBits % 16) == 0) {
52236 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52237 InVT.getSizeInBits() / 16);
52238 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52239 DAG.getBitcast(BCVT, RHS));
52240 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52241 }
52242
52243 // Truncate back to source type.
52244 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52245 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52246
52247 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52248 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52249}
52250
52251// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52252// from one vector with signed bytes from another vector, adds together
52253// adjacent pairs of 16-bit products, and saturates the result before
52254 // truncating to 16 bits.
52255//
52256// Which looks something like this:
52257// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52258// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52259 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52260 const X86Subtarget &Subtarget,
52261 const SDLoc &DL) {
52262 if (!VT.isVector() || !Subtarget.hasSSSE3())
52263 return SDValue();
52264
52265 unsigned NumElems = VT.getVectorNumElements();
52266 EVT ScalarVT = VT.getVectorElementType();
52267 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52268 return SDValue();
52269
52270 SDValue SSatVal = detectSSatPattern(In, VT);
52271 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52272 return SDValue();
52273
52274 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52275 // of multiplies from even/odd elements.
52276 SDValue N0 = SSatVal.getOperand(0);
52277 SDValue N1 = SSatVal.getOperand(1);
52278
52279 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52280 return SDValue();
52281
52282 SDValue N00 = N0.getOperand(0);
52283 SDValue N01 = N0.getOperand(1);
52284 SDValue N10 = N1.getOperand(0);
52285 SDValue N11 = N1.getOperand(1);
52286
52287 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52288 // Canonicalize zero_extend to LHS.
52289 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52290 std::swap(N00, N01);
52291 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52292 std::swap(N10, N11);
52293
52294 // Ensure we have a zero_extend and a sign_extend.
52295 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52296 N01.getOpcode() != ISD::SIGN_EXTEND ||
52297 N10.getOpcode() != ISD::ZERO_EXTEND ||
52298 N11.getOpcode() != ISD::SIGN_EXTEND)
52299 return SDValue();
52300
52301 // Peek through the extends.
52302 N00 = N00.getOperand(0);
52303 N01 = N01.getOperand(0);
52304 N10 = N10.getOperand(0);
52305 N11 = N11.getOperand(0);
52306
52307 // Ensure the extend is from vXi8.
52308 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52309 N01.getValueType().getVectorElementType() != MVT::i8 ||
52310 N10.getValueType().getVectorElementType() != MVT::i8 ||
52311 N11.getValueType().getVectorElementType() != MVT::i8)
52312 return SDValue();
52313
52314 // All inputs should be build_vectors.
52315 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52316 N01.getOpcode() != ISD::BUILD_VECTOR ||
52317 N10.getOpcode() != ISD::BUILD_VECTOR ||
52318 N11.getOpcode() != ISD::BUILD_VECTOR)
52319 return SDValue();
52320
52321 // N00/N10 are zero extended. N01/N11 are sign extended.
52322
52323 // For each element, we need to ensure we have an odd element from one vector
52324 // multiplied by the odd element of another vector and the even element from
52325 // one of the same vectors being multiplied by the even element from the
52326 // other vector. So we need to make sure for each element i, this operator
52327 // is being performed:
52328 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52329 SDValue ZExtIn, SExtIn;
52330 for (unsigned i = 0; i != NumElems; ++i) {
52331 SDValue N00Elt = N00.getOperand(i);
52332 SDValue N01Elt = N01.getOperand(i);
52333 SDValue N10Elt = N10.getOperand(i);
52334 SDValue N11Elt = N11.getOperand(i);
52335 // TODO: Be more tolerant to undefs.
52336 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52337 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52338 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52339 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52340 return SDValue();
52341 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52342 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52343 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52344 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52345 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52346 return SDValue();
52347 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52348 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52349 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52350 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52351 // Add is commutative so indices can be reordered.
52352 if (IdxN00 > IdxN10) {
52353 std::swap(IdxN00, IdxN10);
52354 std::swap(IdxN01, IdxN11);
52355 }
52356 // N0 indices must be the even element. N1 indices must be the next odd element.
52357 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52358 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52359 return SDValue();
52360 SDValue N00In = N00Elt.getOperand(0);
52361 SDValue N01In = N01Elt.getOperand(0);
52362 SDValue N10In = N10Elt.getOperand(0);
52363 SDValue N11In = N11Elt.getOperand(0);
52364 // First time we find an input capture it.
52365 if (!ZExtIn) {
52366 ZExtIn = N00In;
52367 SExtIn = N01In;
52368 }
52369 if (ZExtIn != N00In || SExtIn != N01In ||
52370 ZExtIn != N10In || SExtIn != N11In)
52371 return SDValue();
52372 }
52373
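// PMADDUBSW wants vXi8 inputs with twice as many elements as the vXi16
// result; if a captured input is wider than that, keep only its low subvector.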
52374 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
52375 EVT ExtVT = Ext.getValueType();
52376 if (ExtVT.getVectorNumElements() != NumElems * 2) {
52377 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
52378 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
52379 DAG.getIntPtrConstant(0, DL));
52380 }
52381 };
52382 ExtractVec(ZExtIn);
52383 ExtractVec(SExtIn);
52384
52385 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52386 ArrayRef<SDValue> Ops) {
52387 // Shrink by adding truncate nodes and let DAGCombine fold with the
52388 // sources.
52389 EVT InVT = Ops[0].getValueType();
52390 assert(InVT.getScalarType() == MVT::i8 &&
52391 "Unexpected scalar element type");
52392 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52393 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52394 InVT.getVectorNumElements() / 2);
52395 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52396 };
52397 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52398 PMADDBuilder);
52399}
52400
52401 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52402 const X86Subtarget &Subtarget) {
52403 EVT VT = N->getValueType(0);
52404 SDValue Src = N->getOperand(0);
52405 SDLoc DL(N);
52406
52407 // Attempt to pre-truncate inputs to arithmetic ops instead.
52408 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52409 return V;
52410
52411 // Try to detect PMADD
52412 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52413 return PMAdd;
52414
52415 // Try to combine truncation with signed/unsigned saturation.
52416 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52417 return Val;
52418
52419 // Try to combine PMULHUW/PMULHW for vXi16.
52420 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52421 return V;
52422
52423 // The bitcast source is a direct mmx result.
52424 // Detect bitcasts between i32 to x86mmx
52425 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52426 SDValue BCSrc = Src.getOperand(0);
52427 if (BCSrc.getValueType() == MVT::x86mmx)
52428 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52429 }
52430
52431 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
52432 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
52433 Src.hasOneUse())
52434 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
52435
52436 return SDValue();
52437}
52438
52439 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52440 TargetLowering::DAGCombinerInfo &DCI) {
52441 EVT VT = N->getValueType(0);
52442 SDValue In = N->getOperand(0);
52443 SDLoc DL(N);
52444
52445 if (SDValue SSatVal = detectSSatPattern(In, VT))
52446 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52447 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52448 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52449
52450 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52451 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52452 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52453 return SDValue(N, 0);
52454
52455 return SDValue();
52456}
52457
52458/// Returns the negated value if the node \p N flips sign of FP value.
52459///
52460/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52461/// or FSUB(0, x)
52462/// AVX512F does not have FXOR, so FNEG is lowered as
52463/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52464 /// In this case we go through all bitcasts.
52465/// This also recognizes splat of a negated value and returns the splat of that
52466/// value.
52467static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52468 if (N->getOpcode() == ISD::FNEG)
52469 return N->getOperand(0);
52470
52471 // Don't recurse exponentially.
52472 if (Depth > SelectionDAG::MaxRecursionDepth)
52473 return SDValue();
52474
52475 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52476
52477 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52478 EVT VT = Op->getValueType(0);
52479
52480 // Make sure the element size doesn't change.
52481 if (VT.getScalarSizeInBits() != ScalarSize)
52482 return SDValue();
52483
52484 unsigned Opc = Op.getOpcode();
52485 switch (Opc) {
52486 case ISD::VECTOR_SHUFFLE: {
52487 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52488 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52489 if (!Op.getOperand(1).isUndef())
52490 return SDValue();
52491 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52492 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52493 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52494 cast<ShuffleVectorSDNode>(Op)->getMask());
52495 break;
52496 }
52497 case ISD::INSERT_VECTOR_ELT: {
52498 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52499 // -V, INDEX).
52500 SDValue InsVector = Op.getOperand(0);
52501 SDValue InsVal = Op.getOperand(1);
52502 if (!InsVector.isUndef())
52503 return SDValue();
52504 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52505 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52506 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52507 NegInsVal, Op.getOperand(2));
52508 break;
52509 }
52510 case ISD::FSUB:
52511 case ISD::XOR:
52512 case X86ISD::FXOR: {
52513 SDValue Op1 = Op.getOperand(1);
52514 SDValue Op0 = Op.getOperand(0);
52515
52516 // For XOR and FXOR, we want to check if constant
52517 // bits of Op1 are sign bit masks. For FSUB, we
52518 // have to check if constant bits of Op0 are sign
52519 // bit masks and hence we swap the operands.
52520 if (Opc == ISD::FSUB)
52521 std::swap(Op0, Op1);
52522
52523 APInt UndefElts;
52524 SmallVector<APInt, 16> EltBits;
52525 // Extract constant bits and see if they are all
52526 // sign bit masks. Ignore the undef elements.
52527 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52528 /* AllowWholeUndefs */ true,
52529 /* AllowPartialUndefs */ false)) {
52530 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52531 if (!UndefElts[I] && !EltBits[I].isSignMask())
52532 return SDValue();
52533
52534 // Only allow bitcast from correctly-sized constant.
52535 Op0 = peekThroughBitcasts(Op0);
52536 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52537 return Op0;
52538 }
52539 break;
52540 } // case
52541 } // switch
52542
52543 return SDValue();
52544}
52545
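// Map an FMA-family opcode to its negated form. NegMul negates the product
// (a*b), NegAcc negates the accumulator (c), and NegRes negates the whole
// result. E.g. negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/true,
// /*NegRes=*/false) walks FMA -> FNMADD -> FNMSUB, i.e. -(a*b) - c.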
52546static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52547 bool NegRes) {
52548 if (NegMul) {
52549 switch (Opcode) {
52550 // clang-format off
52551 default: llvm_unreachable("Unexpected opcode");
52552 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52553 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52554 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52555 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52556 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52557 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52558 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52559 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52560 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52561 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52562 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52563 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52564 // clang-format on
52565 }
52566 }
52567
52568 if (NegAcc) {
52569 switch (Opcode) {
52570 // clang-format off
52571 default: llvm_unreachable("Unexpected opcode");
52572 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52573 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52574 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52575 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52576 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52577 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52578 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52579 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52580 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52581 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52582 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52583 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52584 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52585 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52586 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52587 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52588 // clang-format on
52589 }
52590 }
52591
52592 if (NegRes) {
52593 switch (Opcode) {
52594 // For accuracy reason, we never combine fneg and fma under strict FP.
52595 // clang-format off
52596 default: llvm_unreachable("Unexpected opcode");
52597 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52598 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52599 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52600 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52601 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52602 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52603 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52604 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52605 // clang-format on
52606 }
52607 }
52608
52609 return Opcode;
52610}
52611
52612/// Do target-specific dag combines on floating point negations.
52613 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52614 TargetLowering::DAGCombinerInfo &DCI,
52615 const X86Subtarget &Subtarget) {
52616 EVT OrigVT = N->getValueType(0);
52617 SDValue Arg = isFNEG(DAG, N);
52618 if (!Arg)
52619 return SDValue();
52620
52621 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52622 EVT VT = Arg.getValueType();
52623 EVT SVT = VT.getScalarType();
52624 SDLoc DL(N);
52625
52626 // Let legalize expand this if it isn't a legal type yet.
52627 if (!TLI.isTypeLegal(VT))
52628 return SDValue();
52629
52630 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52631 // use of a constant by performing (-0 - A*B) instead.
52632 // FIXME: Check rounding control flags as well once it becomes available.
52633 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52634 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52635 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52636 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52637 Arg.getOperand(1), Zero);
52638 return DAG.getBitcast(OrigVT, NewNode);
52639 }
52640
52641 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52642 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52643 if (SDValue NegArg =
52644 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52645 return DAG.getBitcast(OrigVT, NegArg);
52646
52647 return SDValue();
52648}
52649
52650 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52651 bool LegalOperations,
52652 bool ForCodeSize,
52653 NegatibleCost &Cost,
52654 unsigned Depth) const {
52655 // fneg patterns are removable even if they have multiple uses.
52656 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52657 Cost = NegatibleCost::Cheaper;
52658 return DAG.getBitcast(Op.getValueType(), Arg);
52659 }
52660
52661 EVT VT = Op.getValueType();
52662 EVT SVT = VT.getScalarType();
52663 unsigned Opc = Op.getOpcode();
52664 SDNodeFlags Flags = Op.getNode()->getFlags();
52665 switch (Opc) {
52666 case ISD::FMA:
52667 case X86ISD::FMSUB:
52668 case X86ISD::FNMADD:
52669 case X86ISD::FNMSUB:
52670 case X86ISD::FMADD_RND:
52671 case X86ISD::FMSUB_RND:
52672 case X86ISD::FNMADD_RND:
52673 case X86ISD::FNMSUB_RND: {
52674 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52675 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52676 !isOperationLegal(ISD::FMA, VT))
52677 break;
52678
52679 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52680 // if it may have signed zeros.
52681 if (!Flags.hasNoSignedZeros())
52682 break;
52683
52684 // This is always negatible for free but we might be able to remove some
52685 // extra operand negations as well.
52686 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52687 for (int i = 0; i != 3; ++i)
52688 NewOps[i] = getCheaperNegatedExpression(
52689 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52690
52691 bool NegA = !!NewOps[0];
52692 bool NegB = !!NewOps[1];
52693 bool NegC = !!NewOps[2];
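// Negating exactly one of the two multiplicands flips the sign of the
// product, so the multiply is considered negated iff NegA != NegB. NegC
// negates the accumulator, and the trailing 'true' accounts for the fneg of
// the whole node that is being absorbed here.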
52694 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52695
52696 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52697 : NegatibleCost::Neutral;
52698
52699 // Fill in the non-negated ops with the original values.
52700 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52701 if (!NewOps[i])
52702 NewOps[i] = Op.getOperand(i);
52703 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52704 }
52705 case X86ISD::FRCP:
52706 if (SDValue NegOp0 =
52707 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52708 ForCodeSize, Cost, Depth + 1))
52709 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52710 break;
52711 }
52712
52713 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52714 ForCodeSize, Cost, Depth);
52715}
52716
52717 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52718 const X86Subtarget &Subtarget) {
52719 MVT VT = N->getSimpleValueType(0);
52720 // If we have integer vector types available, use the integer opcodes.
52721 if (!VT.isVector() || !Subtarget.hasSSE2())
52722 return SDValue();
52723
52724 SDLoc dl(N);
52725
52726 unsigned IntBits = VT.getScalarSizeInBits();
52727 MVT IntSVT = MVT::getIntegerVT(IntBits);
52728 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52729
52730 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52731 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52732 unsigned IntOpcode;
52733 switch (N->getOpcode()) {
52734 // clang-format off
52735 default: llvm_unreachable("Unexpected FP logic op");
52736 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52737 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52738 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52739 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52740 // clang-format on
52741 }
52742 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52743 return DAG.getBitcast(VT, IntOp);
52744}
52745
52746
52747/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52748 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52749 if (N->getOpcode() != ISD::XOR)
52750 return SDValue();
52751
52752 SDValue LHS = N->getOperand(0);
52753 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52754 return SDValue();
52755
52757 X86::CondCode(LHS->getConstantOperandVal(0)));
52758 SDLoc DL(N);
52759 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52760}
52761
52762 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
52763 const X86Subtarget &Subtarget) {
52764 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52765 "Invalid opcode for combing with CTLZ");
52766 if (Subtarget.hasFastLZCNT())
52767 return SDValue();
52768
52769 EVT VT = N->getValueType(0);
52770 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52771 (VT != MVT::i64 || !Subtarget.is64Bit()))
52772 return SDValue();
52773
52774 SDValue N0 = N->getOperand(0);
52775 SDValue N1 = N->getOperand(1);
52776
52777 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52778 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52779 return SDValue();
52780
52781 SDValue OpCTLZ;
52782 SDValue OpSizeTM1;
52783
52784 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52785 OpCTLZ = N1;
52786 OpSizeTM1 = N0;
52787 } else if (N->getOpcode() == ISD::SUB) {
52788 return SDValue();
52789 } else {
52790 OpCTLZ = N0;
52791 OpSizeTM1 = N1;
52792 }
52793
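// For the fold to hold, OpSizeTM1 must be the constant BitWidth-1:
// (BitWidth-1) ^ ctlz_zero_undef(x) == (BitWidth-1) - ctlz_zero_undef(x),
// which is the index of the most significant set bit, i.e. exactly what BSR
// returns.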
52794 if (!OpCTLZ.hasOneUse())
52795 return SDValue();
52796 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52797 if (!C)
52798 return SDValue();
52799
52800 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52801 return SDValue();
52802 EVT OpVT = VT;
52803 SDValue Op = OpCTLZ.getOperand(0);
52804 if (VT == MVT::i8) {
52805 // Zero extend to i32 since there is not an i8 bsr.
52806 OpVT = MVT::i32;
52807 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52808 }
52809
52810 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52811 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52812 if (VT == MVT::i8)
52813 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52814
52815 return Op;
52816}
52817
52818 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52819 TargetLowering::DAGCombinerInfo &DCI,
52820 const X86Subtarget &Subtarget) {
52821 SDValue N0 = N->getOperand(0);
52822 SDValue N1 = N->getOperand(1);
52823 EVT VT = N->getValueType(0);
52824 SDLoc DL(N);
52825
52826 // If this is SSE1 only convert to FXOR to avoid scalarization.
52827 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52828 return DAG.getBitcast(MVT::v4i32,
52829 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
52830 DAG.getBitcast(MVT::v4f32, N0),
52831 DAG.getBitcast(MVT::v4f32, N1)));
52832 }
52833
52834 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52835 return Cmp;
52836
52837 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52838 return R;
52839
52840 if (SDValue R = combineBitOpWithShift(N, DAG))
52841 return R;
52842
52843 if (SDValue R = combineBitOpWithPACK(N, DAG))
52844 return R;
52845
52846 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52847 return FPLogic;
52848
52849 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
52850 return R;
52851
52852 if (DCI.isBeforeLegalizeOps())
52853 return SDValue();
52854
52855 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52856 return SetCC;
52857
52858 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52859 return R;
52860
52861 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52862 return RV;
52863
52864 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52865 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52866 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52867 N0.getOperand(0).getValueType().isVector() &&
52868 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52869 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52870 return DAG.getBitcast(
52871 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
52872 }
52873
52874 // Handle AVX512 mask widening.
52875 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52876 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52877 VT.getVectorElementType() == MVT::i1 &&
52878 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52879 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52880 return DAG.getNode(
52881 ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
52882 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
52883 N0.getOperand(2));
52884 }
52885
52886 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52887 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52888 // TODO: Under what circumstances could this be performed in DAGCombine?
52889 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52890 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52891 SDValue TruncExtSrc = N0.getOperand(0);
52892 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52893 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52894 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52895 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52896 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52897 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52898 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52899 }
52900 }
52901
52902 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52903 return R;
52904
52905 return combineFneg(N, DAG, DCI, Subtarget);
52906}
52907
52908 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
52909 TargetLowering::DAGCombinerInfo &DCI,
52910 const X86Subtarget &Subtarget) {
52911 SDValue N0 = N->getOperand(0);
52912 EVT VT = N->getValueType(0);
52913
52914 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
52915 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
52916 SDValue Src = N0.getOperand(0);
52917 EVT SrcVT = Src.getValueType();
52918 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
52919 (DCI.isBeforeLegalize() ||
52920 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
52921 Subtarget.hasSSSE3()) {
52922 unsigned NumElts = SrcVT.getVectorNumElements();
52923 SmallVector<int, 32> ReverseMask(NumElts);
52924 for (unsigned I = 0; I != NumElts; ++I)
52925 ReverseMask[I] = (NumElts - 1) - I;
52926 SDValue Rev =
52927 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
52928 return DAG.getBitcast(VT, Rev);
52929 }
52930 }
52931
52932 return SDValue();
52933}
52934
52935// Various combines to try to convert to avgceilu.
52936 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
52937 TargetLowering::DAGCombinerInfo &DCI,
52938 const X86Subtarget &Subtarget) {
52939 unsigned Opcode = N->getOpcode();
52940 SDValue N0 = N->getOperand(0);
52941 SDValue N1 = N->getOperand(1);
52942 EVT VT = N->getValueType(0);
52943 EVT SVT = VT.getScalarType();
52944 SDLoc DL(N);
52945
52946 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
52947 // Only useful on vXi8 which doesn't have good SRA handling.
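// The sign-bit flip maps signed i8 onto unsigned i8 monotonically
// (x ^ 0x80 == x + 128 mod 256), and since
// ((x+128) + (y+128) + 1) >> 1 == ((x + y + 1) >> 1) + 128,
// flipping the sign bits before and after AVGCEILU yields AVGCEILS.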
52948 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
52949 APInt SignBit = APInt::getSignMask(8);
52950 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
52951 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
52952 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
52953 return DAG.getNode(ISD::XOR, DL, VT,
52954 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
52955 }
52956
52957 return SDValue();
52958}
52959
52960 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
52961 TargetLowering::DAGCombinerInfo &DCI,
52962 const X86Subtarget &Subtarget) {
52963 EVT VT = N->getValueType(0);
52964 unsigned NumBits = VT.getSizeInBits();
52965
52966 // TODO - Constant Folding.
52967
52968 // Simplify the inputs.
52969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52970 APInt DemandedMask(APInt::getAllOnes(NumBits));
52971 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52972 return SDValue(N, 0);
52973
52974 return SDValue();
52975}
52976
52977 static bool isNullFPScalarOrVectorConst(SDValue V) {
52978 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52979}
52980
52981/// If a value is a scalar FP zero or a vector FP zero (potentially including
52982/// undefined elements), return a zero constant that may be used to fold away
52983/// that value. In the case of a vector, the returned constant will not contain
52984/// undefined elements even if the input parameter does. This makes it suitable
52985/// to be used as a replacement operand with operations (eg, bitwise-and) where
52986/// an undef should not propagate.
52987 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52988 const X86Subtarget &Subtarget) {
52989 if (!isNullFPScalarOrVectorConst(V))
52990 return SDValue();
52991
52992 if (V.getValueType().isVector())
52993 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52994
52995 return V;
52996}
52997
52998 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52999 const X86Subtarget &Subtarget) {
53000 SDValue N0 = N->getOperand(0);
53001 SDValue N1 = N->getOperand(1);
53002 EVT VT = N->getValueType(0);
53003 SDLoc DL(N);
53004
53005 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53006 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53007 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53008 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53009 return SDValue();
53010
53011 auto isAllOnesConstantFP = [](SDValue V) {
53012 if (V.getSimpleValueType().isVector())
53013 return ISD::isBuildVectorAllOnes(V.getNode());
53014 auto *C = dyn_cast<ConstantFPSDNode>(V);
53015 return C && C->getConstantFPValue()->isAllOnesValue();
53016 };
53017
53018 // fand (fxor X, -1), Y --> fandn X, Y
53019 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53020 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53021
53022 // fand X, (fxor Y, -1) --> fandn Y, X
53023 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53024 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53025
53026 return SDValue();
53027}
53028
53029/// Do target-specific dag combines on X86ISD::FAND nodes.
53030 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53031 const X86Subtarget &Subtarget) {
53032 // FAND(0.0, x) -> 0.0
53033 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53034 return V;
53035
53036 // FAND(x, 0.0) -> 0.0
53037 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53038 return V;
53039
53040 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53041 return V;
53042
53043 return lowerX86FPLogicOp(N, DAG, Subtarget);
53044}
53045
53046/// Do target-specific dag combines on X86ISD::FANDN nodes.
53047 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53048 const X86Subtarget &Subtarget) {
53049 // FANDN(0.0, x) -> x
53050 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53051 return N->getOperand(1);
53052
53053 // FANDN(x, 0.0) -> 0.0
53054 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53055 return V;
53056
53057 return lowerX86FPLogicOp(N, DAG, Subtarget);
53058}
53059
53060/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53061 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53062 TargetLowering::DAGCombinerInfo &DCI,
53063 const X86Subtarget &Subtarget) {
53064 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53065
53066 // F[X]OR(0.0, x) -> x
53067 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53068 return N->getOperand(1);
53069
53070 // F[X]OR(x, 0.0) -> x
53071 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53072 return N->getOperand(0);
53073
53074 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53075 return NewVal;
53076
53077 return lowerX86FPLogicOp(N, DAG, Subtarget);
53078}
53079
53080/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53081 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53082 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53083
53084 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53085 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53086 !DAG.getTarget().Options.NoSignedZerosFPMath)
53087 return SDValue();
53088
53089 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53090 // into FMINC and FMAXC, which are Commutative operations.
53091 unsigned NewOp = 0;
53092 switch (N->getOpcode()) {
53093 default: llvm_unreachable("unknown opcode");
53094 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53095 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53096 }
53097
53098 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53099 N->getOperand(0), N->getOperand(1));
53100}
53101
53103 const X86Subtarget &Subtarget) {
53104 EVT VT = N->getValueType(0);
53105 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
53106 return SDValue();
53107
53108 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53109
53110 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53111 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53112 (Subtarget.hasFP16() && VT == MVT::f16) ||
53113 (VT.isVector() && TLI.isTypeLegal(VT))))
53114 return SDValue();
53115
53116 SDValue Op0 = N->getOperand(0);
53117 SDValue Op1 = N->getOperand(1);
53118 SDLoc DL(N);
53119 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53120
53121 // If we don't have to respect NaN inputs, this is a direct translation to x86
53122 // min/max instructions.
53123 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53124 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53125
53126 // If one of the operands is known non-NaN, use the native min/max instructions
53127 // with the non-NaN input as second operand.
53128 if (DAG.isKnownNeverNaN(Op1))
53129 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53130 if (DAG.isKnownNeverNaN(Op0))
53131 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53132
53133 // If we have to respect NaN inputs, this takes at least 3 instructions.
53134 // Favor a library call when operating on a scalar and minimizing code size.
53135 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53136 return SDValue();
53137
53138 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53139 VT);
53140
53141 // There are 4 possibilities involving NaN inputs, and these are the required
53142 // outputs:
53143 // Op1
53144 // Num NaN
53145 // ----------------
53146 // Num | Max | Op0 |
53147 // Op0 ----------------
53148 // NaN | Op1 | NaN |
53149 // ----------------
53150 //
53151 // The SSE FP max/min instructions were not designed for this case, but rather
53152 // to implement:
53153 // Min = Op1 < Op0 ? Op1 : Op0
53154 // Max = Op1 > Op0 ? Op1 : Op0
53155 //
53156 // So they always return Op0 if either input is a NaN. However, we can still
53157 // use those instructions for fmaxnum by selecting away a NaN input.
53158
53159 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53160 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53161 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53162
53163 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53164 // are NaN, the NaN value of Op1 is the result.
53165 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53166}
53167
53168 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53169 TargetLowering::DAGCombinerInfo &DCI) {
53170 EVT VT = N->getValueType(0);
53171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53172
53173 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53174 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53175 return SDValue(N, 0);
53176
53177 // Convert a full vector load into vzload when not all bits are needed.
53178 SDValue In = N->getOperand(0);
53179 MVT InVT = In.getSimpleValueType();
53180 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53181 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
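// Only the low VT.getVectorNumElements() elements of the 128-bit source are
// converted, so the full load can be replaced by a narrower 'vzload' that
// loads just those low bits and zero-fills the rest of the vector.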
53182 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53183 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53184 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53185 MVT MemVT = MVT::getIntegerVT(NumBits);
53186 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53187 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53188 SDLoc dl(N);
53189 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53190 DAG.getBitcast(InVT, VZLoad));
53191 DCI.CombineTo(N, Convert);
53192 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53193 DCI.recursivelyDeleteUnusedNodes(LN);
53194 return SDValue(N, 0);
53195 }
53196 }
53197
53198 return SDValue();
53199}
53200
53201 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53202 TargetLowering::DAGCombinerInfo &DCI) {
53203 bool IsStrict = N->isTargetStrictFPOpcode();
53204 EVT VT = N->getValueType(0);
53205
53206 // Convert a full vector load into vzload when not all bits are needed.
53207 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53208 MVT InVT = In.getSimpleValueType();
53209 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53210 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53211 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53212 LoadSDNode *LN = cast<LoadSDNode>(In);
53213 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53214 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53215 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53216 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53217 SDLoc dl(N);
53218 if (IsStrict) {
53219 SDValue Convert =
53220 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53221 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53222 DCI.CombineTo(N, Convert, Convert.getValue(1));
53223 } else {
53224 SDValue Convert =
53225 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53226 DCI.CombineTo(N, Convert);
53227 }
53228 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53229 DCI.recursivelyDeleteUnusedNodes(LN);
53230 return SDValue(N, 0);
53231 }
53232 }
53233
53234 return SDValue();
53235}
53236
53237/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53238 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53239 TargetLowering::DAGCombinerInfo &DCI,
53240 const X86Subtarget &Subtarget) {
53241 SDValue N0 = N->getOperand(0);
53242 SDValue N1 = N->getOperand(1);
53243 MVT VT = N->getSimpleValueType(0);
53244 int NumElts = VT.getVectorNumElements();
53245 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53246 SDLoc DL(N);
53247
53248 // ANDNP(undef, x) -> 0
53249 // ANDNP(x, undef) -> 0
53250 if (N0.isUndef() || N1.isUndef())
53251 return DAG.getConstant(0, DL, VT);
53252
53253 // ANDNP(0, x) -> x
53254 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53255 return N1;
53256
53257 // ANDNP(x, 0) -> 0
53258 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53259 return DAG.getConstant(0, DL, VT);
53260
53261 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
53262 if (ISD::isBuildVectorAllOnes(N1.getNode()))
53263 return DAG.getNOT(DL, N0, VT);
53264
53265 // Turn ANDNP back to AND if input is inverted.
53266 if (SDValue Not = IsNOT(N0, DAG))
53267 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
53268
53269 // Fold for better commutativity:
53270 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
53271 if (N1->hasOneUse())
53272 if (SDValue Not = IsNOT(N1, DAG))
53273 return DAG.getNOT(
53274 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
53275
53276 // Constant Folding
53277 APInt Undefs0, Undefs1;
53278 SmallVector<APInt> EltBits0, EltBits1;
53279 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
53280 /*AllowWholeUndefs*/ true,
53281 /*AllowPartialUndefs*/ true)) {
53282 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
53283 /*AllowWholeUndefs*/ true,
53284 /*AllowPartialUndefs*/ true)) {
53285 SmallVector<APInt> ResultBits;
53286 for (int I = 0; I != NumElts; ++I)
53287 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53288 return getConstVector(ResultBits, VT, DAG, DL);
53289 }
53290
53291 // Constant fold NOT(N0) to allow us to use AND.
53292 // Ensure this is only performed if we can confirm that the bitcasted source
53293 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
53294 if (N0->hasOneUse()) {
53295 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53296 if (BC0.getOpcode() != ISD::BITCAST) {
53297 for (APInt &Elt : EltBits0)
53298 Elt = ~Elt;
53299 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
53300 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53301 }
53302 }
53303 }
53304
53305 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53306 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53307 SDValue Op(N, 0);
53308 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53309 return Res;
53310
53311 // If either operand is a constant mask, then only the elements that aren't
53312 // zero are actually demanded by the other operand.
53313 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53314 APInt UndefElts;
53315 SmallVector<APInt> EltBits;
53316 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53317 APInt DemandedElts = APInt::getAllOnes(NumElts);
53318 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53319 EltBits)) {
53320 DemandedBits.clearAllBits();
53321 DemandedElts.clearAllBits();
53322 for (int I = 0; I != NumElts; ++I) {
53323 if (UndefElts[I]) {
53324 // We can't assume an undef src element gives an undef dst - the
53325 // other src might be zero.
53326 DemandedBits.setAllBits();
53327 DemandedElts.setBit(I);
53328 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53329 (!Invert && !EltBits[I].isZero())) {
53330 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53331 DemandedElts.setBit(I);
53332 }
53333 }
53334 }
53335 return std::make_pair(DemandedBits, DemandedElts);
53336 };
53337 APInt Bits0, Elts0;
53338 APInt Bits1, Elts1;
53339 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53340 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53341
53342 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53343 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53344 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53345 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53346 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53347 if (N->getOpcode() != ISD::DELETED_NODE)
53348 DCI.AddToWorklist(N);
53349 return SDValue(N, 0);
53350 }
53351 }
53352
53353 return SDValue();
53354}
53355
53356 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53357 TargetLowering::DAGCombinerInfo &DCI) {
53358 SDValue N1 = N->getOperand(1);
53359
53360 // BT ignores high bits in the bit index operand.
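// Only the low log2(BitWidth) bits of the index select a bit within the
// operand, so bits above that in N1 are not demanded.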
53361 unsigned BitWidth = N1.getValueSizeInBits();
53362 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
53363 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53364 if (N->getOpcode() != ISD::DELETED_NODE)
53365 DCI.AddToWorklist(N);
53366 return SDValue(N, 0);
53367 }
53368
53369 return SDValue();
53370}
53371
53372 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53373 TargetLowering::DAGCombinerInfo &DCI) {
53374 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53375 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53376
53377 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53379 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53380 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53381 if (N->getOpcode() != ISD::DELETED_NODE)
53382 DCI.AddToWorklist(N);
53383 return SDValue(N, 0);
53384 }
53385
53386 // Convert a full vector load into vzload when not all bits are needed.
53387 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53388 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53389 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53390 SDLoc dl(N);
53391 if (IsStrict) {
53392 SDValue Convert = DAG.getNode(
53393 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53394 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53395 DCI.CombineTo(N, Convert, Convert.getValue(1));
53396 } else {
53397 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53398 DAG.getBitcast(MVT::v8i16, VZLoad));
53399 DCI.CombineTo(N, Convert);
53400 }
53401
53402 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53403 DCI.recursivelyDeleteUnusedNodes(LN);
53404 return SDValue(N, 0);
53405 }
53406 }
53407 }
53408
53409 return SDValue();
53410}
53411
53412// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53413 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53414 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
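// e.g. (sext_in_reg (cmov C1, C2, cc), i8) can be rewritten as
// (cmov sext_in_reg(C1, i8), sext_in_reg(C2, i8), cc): the extensions fold
// into the constants and only the CMOV remains.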
53415
53416 EVT DstVT = N->getValueType(0);
53417
53418 SDValue N0 = N->getOperand(0);
53419 SDValue N1 = N->getOperand(1);
53420 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53421
53422 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53423 return SDValue();
53424
53425 // Look through single use any_extends / truncs.
53426 SDValue IntermediateBitwidthOp;
53427 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53428 N0.hasOneUse()) {
53429 IntermediateBitwidthOp = N0;
53430 N0 = N0.getOperand(0);
53431 }
53432
53433 // See if we have a single use cmov.
53434 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53435 return SDValue();
53436
53437 SDValue CMovOp0 = N0.getOperand(0);
53438 SDValue CMovOp1 = N0.getOperand(1);
53439
53440 // Make sure both operands are constants.
53441 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53442 !isa<ConstantSDNode>(CMovOp1.getNode()))
53443 return SDValue();
53444
53445 SDLoc DL(N);
53446
53447 // If we looked through an any_extend/trunc above, add one to the constants.
53448 if (IntermediateBitwidthOp) {
53449 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53450 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53451 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53452 }
53453
53454 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53455 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53456
53457 EVT CMovVT = DstVT;
53458 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53459 if (DstVT == MVT::i16) {
53460 CMovVT = MVT::i32;
53461 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53462 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53463 }
53464
53465 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53466 N0.getOperand(2), N0.getOperand(3));
53467
53468 if (CMovVT != DstVT)
53469 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53470
53471 return CMov;
53472}
53473
53474 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53475 const X86Subtarget &Subtarget) {
53476 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53477
53478 if (SDValue V = combineSextInRegCmov(N, DAG))
53479 return V;
53480
53481 EVT VT = N->getValueType(0);
53482 SDValue N0 = N->getOperand(0);
53483 SDValue N1 = N->getOperand(1);
53484 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53485 SDLoc dl(N);
53486
53487 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
53488 // both SSE and AVX2 since there is no sign-extended shift right
53489 // operation on a vector with 64-bit elements.
53490 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53491 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53492 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53493 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53494 SDValue N00 = N0.getOperand(0);
53495
53496 // EXTLOAD has a better solution on AVX2,
53497 // it may be replaced with X86ISD::VSEXT node.
53498 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53499 if (!ISD::isNormalLoad(N00.getNode()))
53500 return SDValue();
53501
53502 // Attempt to promote any comparison mask ops before moving the
53503 // SIGN_EXTEND_INREG in the way.
53504 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
53505 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53506
53507 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53508 SDValue Tmp =
53509 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53510 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53511 }
53512 }
53513 return SDValue();
53514}
53515
53516/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53517/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53518/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53519/// opportunities to combine math ops, use an LEA, or use a complex addressing
53520/// mode. This can eliminate extend, add, and shift instructions.
53521 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53522 const X86Subtarget &Subtarget) {
53523 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53524 Ext->getOpcode() != ISD::ZERO_EXTEND)
53525 return SDValue();
53526
53527 // TODO: This should be valid for other integer types.
53528 EVT VT = Ext->getValueType(0);
53529 if (VT != MVT::i64)
53530 return SDValue();
53531
53532 SDValue Add = Ext->getOperand(0);
53533 if (Add.getOpcode() != ISD::ADD)
53534 return SDValue();
53535
53536 SDValue AddOp0 = Add.getOperand(0);
53537 SDValue AddOp1 = Add.getOperand(1);
53538 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53539 bool NSW = Add->getFlags().hasNoSignedWrap();
53540 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53541 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
53542 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
53543
53544 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53545 // into the 'zext'
53546 if ((Sext && !NSW) || (!Sext && !NUW))
53547 return SDValue();
53548
53549 // Having a constant operand to the 'add' ensures that we are not increasing
53550 // the instruction count because the constant is extended for free below.
53551 // A constant operand can also become the displacement field of an LEA.
53552 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
53553 if (!AddOp1C)
53554 return SDValue();
53555
53556 // Don't make the 'add' bigger if there's no hope of combining it with some
53557 // other 'add' or 'shl' instruction.
53558 // TODO: It may be profitable to generate simpler LEA instructions in place
53559 // of single 'add' instructions, but the cost model for selecting an LEA
53560 // currently has a high threshold.
53561 bool HasLEAPotential = false;
53562 for (auto *User : Ext->uses()) {
53563 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53564 HasLEAPotential = true;
53565 break;
53566 }
53567 }
53568 if (!HasLEAPotential)
53569 return SDValue();
53570
53571 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53572 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53573 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53574 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53575
53576 // The wider add is guaranteed to not wrap because both operands are
53577 // sign-extended.
53578 SDNodeFlags Flags;
53579 Flags.setNoSignedWrap(NSW);
53580 Flags.setNoUnsignedWrap(NUW);
53581 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53582}
53583
53584// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53585// operands and the result of CMOV is not used anywhere else - promote CMOV
53586// itself instead of promoting its result. This could be beneficial, because:
53587// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53588// (or more) pseudo-CMOVs only when they go one-after-another and
53589// getting rid of result extension code after CMOV will help that.
53590// 2) Promotion of constant CMOV arguments is free, hence the
53591// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53592// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
53593// promotion is also good in terms of code-size.
53594// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
53595// promotion).
53596 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53597 SDValue CMovN = Extend->getOperand(0);
53598 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53599 return SDValue();
53600
53601 EVT TargetVT = Extend->getValueType(0);
53602 unsigned ExtendOpcode = Extend->getOpcode();
53603 SDLoc DL(Extend);
53604
53605 EVT VT = CMovN.getValueType();
53606 SDValue CMovOp0 = CMovN.getOperand(0);
53607 SDValue CMovOp1 = CMovN.getOperand(1);
53608
53609 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53610 !isa<ConstantSDNode>(CMovOp1.getNode()))
53611 return SDValue();
53612
53613 // Only extend to i32 or i64.
53614 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53615 return SDValue();
53616
53617 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53618 // are free.
53619 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53620 return SDValue();
53621
53622 // If this a zero extend to i64, we should only extend to i32 and use a free
53623 // zero extend to finish.
53624 EVT ExtendVT = TargetVT;
53625 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53626 ExtendVT = MVT::i32;
53627
53628 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53629 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53630
53631 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53632 CMovN.getOperand(2), CMovN.getOperand(3));
53633
53634 // Finish extending if needed.
53635 if (ExtendVT != TargetVT)
53636 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53637
53638 return Res;
53639}
53640
53641// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53642// result type.
53643 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53644 const X86Subtarget &Subtarget) {
53645 SDValue N0 = N->getOperand(0);
53646 EVT VT = N->getValueType(0);
53647 SDLoc dl(N);
53648
53649 // Only do this combine with AVX512 for vector extends.
53650 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53651 return SDValue();
53652
53653 // Only combine legal element types.
53654 EVT SVT = VT.getVectorElementType();
53655 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53656 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53657 return SDValue();
53658
53659 // We don't have a CMPP instruction for vXf16.
53660 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53661 return SDValue();
53662 // We can only do this if the vector size is 256 bits or less.
53663 unsigned Size = VT.getSizeInBits();
53664 if (Size > 256 && Subtarget.useAVX512Regs())
53665 return SDValue();
53666
53667 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53668 // those are the only integer compares we have.
53669 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53670 if (ISD::isUnsignedIntSetCC(CC))
53671 return SDValue();
53672
53673 // Only do this combine if the extension will be fully consumed by the setcc.
53674 EVT N00VT = N0.getOperand(0).getValueType();
53675 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53676 if (Size != MatchingVecType.getSizeInBits())
53677 return SDValue();
53678
53679 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53680
53681 if (N->getOpcode() == ISD::ZERO_EXTEND)
53682 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53683
53684 return Res;
53685}
53686
53687 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53688 TargetLowering::DAGCombinerInfo &DCI,
53689 const X86Subtarget &Subtarget) {
53690 SDValue N0 = N->getOperand(0);
53691 EVT VT = N->getValueType(0);
53692 SDLoc DL(N);
53693
53694 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53695 if (!DCI.isBeforeLegalizeOps() &&
53696 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53697 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53698 N0->getOperand(1));
53699 bool ReplaceOtherUses = !N0.hasOneUse();
53700 DCI.CombineTo(N, Setcc);
53701 // Replace other uses with a truncate of the widened setcc_carry.
53702 if (ReplaceOtherUses) {
53703 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53704 N0.getValueType(), Setcc);
53705 DCI.CombineTo(N0.getNode(), Trunc);
53706 }
53707
53708 return SDValue(N, 0);
53709 }
53710
53711 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53712 return NewCMov;
53713
53714 if (!DCI.isBeforeLegalizeOps())
53715 return SDValue();
53716
53717 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53718 return V;
53719
53720 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53721 DAG, DCI, Subtarget))
53722 return V;
53723
53724 if (VT.isVector()) {
53725 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53726 return R;
53727
53728 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
53729 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53730 }
53731
53732 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53733 return NewAdd;
53734
53735 return SDValue();
53736}
53737
53738// Inverting a constant vector is profitable if it can be eliminated and the
53739// inverted vector is already present in DAG. Otherwise, it will be loaded
53740// anyway.
53741//
53742// We determine which of the values can be completely eliminated and invert it.
53743// If both are eliminable, select a vector with the first negative element.
53744 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
53745 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
53746 "ConstantFP build vector expected");
53747 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
53748 // can eliminate it, since this function is invoked for each FMA with this
53749 // vector.
53750 auto IsNotFMA = [](SDNode *Use) {
53751 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53752 };
53753 if (llvm::any_of(V->uses(), IsNotFMA))
53754 return SDValue();
53755
53756 SmallVector<SDValue, 8> Ops;
53757 EVT VT = V.getValueType();
53758 EVT EltVT = VT.getVectorElementType();
53759 for (const SDValue &Op : V->op_values()) {
53760 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53761 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53762 } else {
53763 assert(Op.isUndef());
53764 Ops.push_back(DAG.getUNDEF(EltVT));
53765 }
53766 }
53767
53768 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53769 if (!NV)
53770 return SDValue();
53771
53772 // If an inverted version cannot be eliminated, choose it instead of the
53773 // original version.
53774 if (llvm::any_of(NV->uses(), IsNotFMA))
53775 return SDValue(NV, 0);
53776
53777 // If the inverted version also can be eliminated, we have to consistently
53778 // prefer one of the values. We prefer the constant whose first non-undef
53779 // element is negative.
53780 // N.B. We need to skip undefs that may precede a value.
53781 for (const SDValue &Op : V->op_values()) {
53782 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53783 if (Cst->isNegative())
53784 return SDValue();
53785 break;
53786 }
53787 }
53788 return SDValue(NV, 0);
53789}
53790
53791 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53792 TargetLowering::DAGCombinerInfo &DCI,
53793 const X86Subtarget &Subtarget) {
53794 SDLoc dl(N);
53795 EVT VT = N->getValueType(0);
53796 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53797
53798 // Let legalize expand this if it isn't a legal type yet.
53799 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53800 if (!TLI.isTypeLegal(VT))
53801 return SDValue();
53802
53803 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53804 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53805 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53806
53807 // If the operation allows fast-math and the target does not support FMA,
53808 // split this into mul+add to avoid libcall(s).
53809 SDNodeFlags Flags = N->getFlags();
53810 if (!IsStrict && Flags.hasAllowReassociation() &&
53811 TLI.isOperationExpand(ISD::FMA, VT)) {
53812 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53813 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53814 }
53815
53816 EVT ScalarVT = VT.getScalarType();
53817 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53818 !Subtarget.hasAnyFMA()) &&
53819 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53820 return SDValue();
53821
53822 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53823 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53824 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53825 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53826 CodeSize)) {
53827 V = NegV;
53828 return true;
53829 }
53830 // Look through extract_vector_elts. If it comes from an FNEG, create a
53831 // new extract from the FNEG input.
53832 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53833 isNullConstant(V.getOperand(1))) {
53834 SDValue Vec = V.getOperand(0);
53835 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53836 Vec, DAG, LegalOperations, CodeSize)) {
53837 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53838 NegV, V.getOperand(1));
53839 return true;
53840 }
53841 }
53842 // Lookup if there is an inverted version of constant vector V in DAG.
53843 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
53844 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
53845 V = NegV;
53846 return true;
53847 }
53848 }
53849 return false;
53850 };
53851
53852 // Do not convert the passthru input of scalar intrinsics.
53853 // FIXME: We could allow negations of the lower element only.
53854 bool NegA = invertIfNegative(A);
53855 bool NegB = invertIfNegative(B);
53856 bool NegC = invertIfNegative(C);
53857
53858 if (!NegA && !NegB && !NegC)
53859 return SDValue();
53860
53861 unsigned NewOpcode =
53862 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53863
53864 // Propagate fast-math-flags to new FMA node.
53865 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53866 if (IsStrict) {
53867 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53868 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53869 {N->getOperand(0), A, B, C});
53870 } else {
53871 if (N->getNumOperands() == 4)
53872 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53873 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53874 }
53875}
53876
53877// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53878// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53879 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53880 TargetLowering::DAGCombinerInfo &DCI) {
53881 SDLoc dl(N);
53882 EVT VT = N->getValueType(0);
53883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53884 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53885 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53886
53887 SDValue N2 = N->getOperand(2);
53888
53889 SDValue NegN2 =
53890 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53891 if (!NegN2)
53892 return SDValue();
53893 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53894
53895 if (N->getNumOperands() == 4)
53896 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53897 NegN2, N->getOperand(3));
53898 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53899 NegN2);
53900}
53901
53902 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53903 TargetLowering::DAGCombinerInfo &DCI,
53904 const X86Subtarget &Subtarget) {
53905 SDLoc dl(N);
53906 SDValue N0 = N->getOperand(0);
53907 EVT VT = N->getValueType(0);
53908
53909 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53910 // FIXME: Is this needed? We don't seem to have any tests for it.
53911 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53912 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53913 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53914 N0->getOperand(1));
53915 bool ReplaceOtherUses = !N0.hasOneUse();
53916 DCI.CombineTo(N, Setcc);
53917 // Replace other uses with a truncate of the widened setcc_carry.
53918 if (ReplaceOtherUses) {
53919 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53920 N0.getValueType(), Setcc);
53921 DCI.CombineTo(N0.getNode(), Trunc);
53922 }
53923
53924 return SDValue(N, 0);
53925 }
53926
53927 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53928 return NewCMov;
53929
53930 if (DCI.isBeforeLegalizeOps())
53931 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53932 return V;
53933
53934 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53935 DAG, DCI, Subtarget))
53936 return V;
53937
53938 if (VT.isVector())
53939 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
53940 return R;
53941
53942 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53943 return NewAdd;
53944
53945 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53946 return R;
53947
53948 // TODO: Combine with any target/faux shuffle.
53949 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53950 VT.getScalarSizeInBits() == 2 * N0.getScalarValueSizeInBits()) {
53951 SDValue N00 = N0.getOperand(0);
53952 SDValue N01 = N0.getOperand(1);
53953 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53954 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53955 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53956 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53957 return concatSubVectors(N00, N01, DAG, dl);
53958 }
53959 }
53960
53961 return SDValue();
53962}
53963
53964/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53965/// pre-promote its result type since vXi1 vectors don't get promoted
53966/// during type legalization.
53967 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53968 SDValue RHS, ISD::CondCode CC,
53969 const SDLoc &DL, SelectionDAG &DAG,
53970 const X86Subtarget &Subtarget) {
53971 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53972 VT.getVectorElementType() == MVT::i1 &&
53973 (OpVT.getVectorElementType() == MVT::i8 ||
53974 OpVT.getVectorElementType() == MVT::i16)) {
53975 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53976 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53977 }
53978 return SDValue();
53979}
53980
53981static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53982 TargetLowering::DAGCombinerInfo &DCI,
53983 const X86Subtarget &Subtarget) {
53984 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53985 const SDValue LHS = N->getOperand(0);
53986 const SDValue RHS = N->getOperand(1);
53987 EVT VT = N->getValueType(0);
53988 EVT OpVT = LHS.getValueType();
53989 SDLoc DL(N);
53990
53991 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53992 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
53993 Subtarget))
53994 return V;
53995
53996 if (VT == MVT::i1) {
53997 X86::CondCode X86CC;
53998 if (SDValue V =
53999 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54000 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54001 }
54002
54003 if (OpVT.isScalarInteger()) {
54004 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54005 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54006 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54007 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54008 if (N0.getOperand(0) == N1)
54009 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54010 N0.getOperand(1));
54011 if (N0.getOperand(1) == N1)
54012 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54013 N0.getOperand(0));
54014 }
54015 return SDValue();
54016 };
54017 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54018 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54019 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54020 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54021
54022 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54023 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54024 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54025 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54026 if (N0.getOperand(0) == N1)
54027 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54028 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54029 if (N0.getOperand(1) == N1)
54030 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54031 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54032 }
54033 return SDValue();
54034 };
54035 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54036 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54037 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54038 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54039
54040 // cmpeq(trunc(x),C) --> cmpeq(x,C)
54041 // cmpne(trunc(x),C) --> cmpne(x,C)
54042 // iff x upper bits are zero.
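// e.g. (i16 (trunc i32 X)) == 42 --> (i32 X) == 42 when the upper 16 bits of
// X are known zero, avoiding a partial-register comparison.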
54043 if (LHS.getOpcode() == ISD::TRUNCATE &&
54044 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54045 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
54046 EVT SrcVT = LHS.getOperand(0).getValueType();
54047 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54048 OpVT.getScalarSizeInBits());
54049 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54050 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54051 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54052 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54053 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
54054 }
54055
54056 // With C a power of 2, C != 0 and C != INT_MIN:
54057 // icmp eq Abs(X), C ->
54058 // (icmp eq X, C) | (icmp eq X, -C)
54059 // icmp ne Abs(X), C ->
54060 // (icmp ne X, C) & (icmp ne X, -C)
54061 // Both of these patterns can be better optimized in
54062 // DAGCombiner::foldAndOrOfSETCC. Note this only applies to scalar
54063 // integers, which is checked above.
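// e.g. (icmp eq (abs X), 8) --> (icmp eq X, 8) | (icmp eq X, -8).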
54064 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54065 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54066 const APInt &CInt = C->getAPIntValue();
54067 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54068 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54069 SDValue BaseOp = LHS.getOperand(0);
54070 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54071 SDValue SETCC1 = DAG.getSetCC(
54072 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54073 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54074 SETCC0, SETCC1);
54075 }
54076 }
54077 }
54078 }
54079 }
54080
54081 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54082 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54083 // Using temporaries to avoid messing up operand ordering for later
54084 // transformations if this doesn't work.
54085 SDValue Op0 = LHS;
54086 SDValue Op1 = RHS;
54087 ISD::CondCode TmpCC = CC;
54088 // Put build_vector on the right.
54089 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54090 std::swap(Op0, Op1);
54091 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54092 }
54093
54094 bool IsSEXT0 =
54095 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54096 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54097 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54098
54099 if (IsSEXT0 && IsVZero1) {
54100 assert(VT == Op0.getOperand(0).getValueType() &&
54101 "Unexpected operand type");
54102 if (TmpCC == ISD::SETGT)
54103 return DAG.getConstant(0, DL, VT);
54104 if (TmpCC == ISD::SETLE)
54105 return DAG.getConstant(1, DL, VT);
54106 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54107 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54108
54109 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54110 "Unexpected condition code!");
54111 return Op0.getOperand(0);
54112 }
54113 }
54114
54115 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
54116 // there are only signed comparisons (`PCMPGT`), and even on AVX512 it is
54117 // often better to use `PCMPGT` if the result is meant to stay in a vector
54118 // (when it is going to a mask, AVX512 provides unsigned comparisons directly).
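// For example, once the sign bits of both sides are known to agree,
// (setuge X, C) with a constant splat C is rewritten as (setugt X, C-1) and
// then emitted as the signed (setgt X, C-1).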
54119 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54120 bool CanMakeSigned = false;
54121 if (ISD::isUnsignedIntSetCC(CC)) {
54122 KnownBits CmpKnown =
54123 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
54124 // If we know LHS/RHS share the same sign bit at each element we can
54125 // make this signed.
54126 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54127 // across all lanes. So a pattern where the sign varies from lane to
54128 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54129 // missed. We could get around this by demanding each lane
54130 // independently, but this isn't the most important optimization and
54131 // that may eat into compile time.
54132 CanMakeSigned =
54133 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54134 }
54135 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54136 SDValue LHSOut = LHS;
54137 SDValue RHSOut = RHS;
54138 ISD::CondCode NewCC = CC;
54139 switch (CC) {
54140 case ISD::SETGE:
54141 case ISD::SETUGE:
54142 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54143 /*NSW*/ true))
54144 LHSOut = NewLHS;
54145 else if (SDValue NewRHS = incDecVectorConstant(
54146 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54147 RHSOut = NewRHS;
54148 else
54149 break;
54150
54151 [[fallthrough]];
54152 case ISD::SETUGT:
54153 NewCC = ISD::SETGT;
54154 break;
54155
54156 case ISD::SETLE:
54157 case ISD::SETULE:
54158 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54159 /*NSW*/ true))
54160 LHSOut = NewLHS;
54161 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54162 /*NSW*/ true))
54163 RHSOut = NewRHS;
54164 else
54165 break;
54166
54167 [[fallthrough]];
54168 case ISD::SETULT:
54169 // Will be swapped to SETGT in LowerVSETCC*.
54170 NewCC = ISD::SETLT;
54171 break;
54172 default:
54173 break;
54174 }
54175 if (NewCC != CC) {
54176 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54177 NewCC, DL, DAG, Subtarget))
54178 return R;
54179 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54180 }
54181 }
54182 }
54183
54184 if (SDValue R =
54185 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54186 return R;
54187
54188 // In the middle end transforms:
54189 // `(or (icmp eq X, C), (icmp eq X, C+1))`
54190 // -> `(icmp ult (add x, -C), 2)`
54191 // Likewise inverted cases with `ugt`.
54192 //
54193 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
54194 // in worse codegen. So, undo the middle-end transform and go back to `(or
54195 // (icmp eq), (icmp eq))` form.
54196 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
54197 // the xmm approach.
54198 //
54199 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
54200 // ne))` as it doesn't end up saving any instructions.
54201 // TODO: We might want to do this for avx512 as well if we `sext` the result.
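// e.g. (icmp ult (add X, -5), 2) is turned back into
// (or (icmp eq X, 5), (icmp eq X, 6)), which lowers to PCMPEQ/POR instead of
// an unsigned vector compare.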
54202 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
54203 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
54204 !Subtarget.hasAVX512() &&
54205 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
54206 Subtarget.hasAVX2()) &&
54207 LHS.hasOneUse()) {
54208
54209 APInt CmpC;
54210 SDValue AddC = LHS.getOperand(1);
54211 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
54213 // See which form we have depending on the constant/condition.
54214 SDValue C0 = SDValue();
54215 SDValue C1 = SDValue();
54216
54217 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
54218 // we will end up generating an additional constant. Keeping it in the
54219 // current form has a slight latency cost, but it is probably worth saving
54220 // a constant.
54223 // Pass
54224 }
54225 // Normal Cases
54226 else if ((CC == ISD::SETULT && CmpC == 2) ||
54227 (CC == ISD::SETULE && CmpC == 1)) {
54228 // These will constant fold.
54229 C0 = DAG.getNegative(AddC, DL, OpVT);
54230 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
54231 DAG.getAllOnesConstant(DL, OpVT));
54232 }
54233 // Inverted Cases
54234 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
54235 (CC == ISD::SETUGE && (-CmpC) == 2)) {
54236 // These will constant fold.
54237 C0 = DAG.getNOT(DL, AddC, OpVT);
54238 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
54239 DAG.getAllOnesConstant(DL, OpVT));
54240 }
54241 if (C0 && C1) {
54242 SDValue NewLHS =
54243 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
54244 SDValue NewRHS =
54245 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
54246 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
54247 }
54248 }
54249 }
54250
54251 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54252 // to avoid scalarization via legalization because v4i32 is not a legal type.
54253 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54254 LHS.getValueType() == MVT::v4f32)
54255 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54256
54257 // X pred 0.0 --> X pred -X
54258 // If the negation of X already exists, use it in the comparison. This removes
54259 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54260 // instructions in patterns with a 'select' node.
54262 SDVTList FNegVT = DAG.getVTList(OpVT);
54263 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54264 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54265 }
54266
54267 return SDValue();
54268}
54269
54270static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54271 TargetLowering::DAGCombinerInfo &DCI,
54272 const X86Subtarget &Subtarget) {
54273 SDValue Src = N->getOperand(0);
54274 MVT SrcVT = Src.getSimpleValueType();
54275 MVT VT = N->getSimpleValueType(0);
54276 unsigned NumBits = VT.getScalarSizeInBits();
54277 unsigned NumElts = SrcVT.getVectorNumElements();
54278 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54279 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54280
54281 // Perform constant folding.
54282 APInt UndefElts;
54283 SmallVector<APInt, 32> EltBits;
54284 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
54285 /*AllowWholeUndefs*/ true,
54286 /*AllowPartialUndefs*/ true)) {
54287 APInt Imm(32, 0);
54288 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54289 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54290 Imm.setBit(Idx);
54291
54292 return DAG.getConstant(Imm, SDLoc(N), VT);
54293 }
54294
54295 // Look through int->fp bitcasts that don't change the element width.
54296 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54297 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54298 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54299 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54300
54301 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54302 // with scalar comparisons.
54303 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54304 SDLoc DL(N);
54305 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54306 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54307 return DAG.getNode(ISD::XOR, DL, VT,
54308 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54309 DAG.getConstant(NotMask, DL, VT));
54310 }
54311
54312 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54313 // results with scalar comparisons.
54314 if (Src.getOpcode() == X86ISD::PCMPGT &&
54315 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54316 SDLoc DL(N);
54317 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54318 return DAG.getNode(ISD::XOR, DL, VT,
54319 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54320 DAG.getConstant(NotMask, DL, VT));
54321 }
54322
54323 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54324 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54325 // iff pow2splat(c1).
54326 // Use KnownBits to determine if only a single bit is non-zero
54327 // in each element (pow2 or zero), and shift that bit to the msb.
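// e.g. for vXi16 with c1 = splat(0x0010), only bit 4 of each element can be
// set on either side, so both sides are shifted left by 11 to move that bit
// into the sign bit before the compare is replaced by XOR+NOT and one MOVMSK.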
54328 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54329 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54330 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54331 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54332 if (KnownLHS.countMaxPopulation() == 1 &&
54333 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54334 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54335 SDLoc DL(N);
54336 MVT ShiftVT = SrcVT;
54337 SDValue ShiftLHS = Src.getOperand(0);
54338 SDValue ShiftRHS = Src.getOperand(1);
54339 if (ShiftVT.getScalarType() == MVT::i8) {
54340 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54341 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54342 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54343 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54344 }
54345 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54346 ShiftLHS, ShiftAmt, DAG);
54347 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54348 ShiftRHS, ShiftAmt, DAG);
54349 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54350 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54351 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54352 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54353 }
54354 }
54355
54356 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54357 if (N->isOnlyUserOf(Src.getNode())) {
54358 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54359 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54360 APInt UndefElts;
54361 SmallVector<APInt, 32> EltBits;
54362 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54363 UndefElts, EltBits)) {
54364 APInt Mask = APInt::getZero(NumBits);
54365 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54366 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54367 Mask.setBit(Idx);
54368 }
54369 SDLoc DL(N);
54370 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54371 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54372 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54373 DAG.getConstant(Mask, DL, VT));
54374 }
54375 }
54376 }
54377
54378 // Simplify the inputs.
54379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54380 APInt DemandedMask(APInt::getAllOnes(NumBits));
54381 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54382 return SDValue(N, 0);
54383
54384 return SDValue();
54385}
54386
54387static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54388 TargetLowering::DAGCombinerInfo &DCI,
54389 const X86Subtarget &Subtarget) {
54390 MVT VT = N->getSimpleValueType(0);
54391 unsigned NumBits = VT.getScalarSizeInBits();
54392
54393 // Simplify the inputs.
54394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54395 APInt DemandedMask(APInt::getAllOnes(NumBits));
54396 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54397 return SDValue(N, 0);
54398
54399 return SDValue();
54400}
54401
54402static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54403 TargetLowering::DAGCombinerInfo &DCI) {
54404 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54405 SDValue Mask = MemOp->getMask();
54406
54407 // With vector masks we only demand the upper bit of the mask.
54408 if (Mask.getScalarValueSizeInBits() != 1) {
54409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54410 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54411 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54412 if (N->getOpcode() != ISD::DELETED_NODE)
54413 DCI.AddToWorklist(N);
54414 return SDValue(N, 0);
54415 }
54416 }
54417
54418 return SDValue();
54419}
54420
54421static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54422 SDValue Index, SDValue Base, SDValue Scale,
54423 SelectionDAG &DAG) {
54424 SDLoc DL(GorS);
54425
54426 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54427 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54428 Gather->getMask(), Base, Index, Scale } ;
54429 return DAG.getMaskedGather(Gather->getVTList(),
54430 Gather->getMemoryVT(), DL, Ops,
54431 Gather->getMemOperand(),
54432 Gather->getIndexType(),
54433 Gather->getExtensionType());
54434 }
54435 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54436 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54437 Scatter->getMask(), Base, Index, Scale };
54438 return DAG.getMaskedScatter(Scatter->getVTList(),
54439 Scatter->getMemoryVT(), DL,
54440 Ops, Scatter->getMemOperand(),
54441 Scatter->getIndexType(),
54442 Scatter->isTruncatingStore());
54443}
54444
54445static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54446 TargetLowering::DAGCombinerInfo &DCI) {
54447 SDLoc DL(N);
54448 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54449 SDValue Index = GorS->getIndex();
54450 SDValue Base = GorS->getBasePtr();
54451 SDValue Scale = GorS->getScale();
54452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54453
54454 if (DCI.isBeforeLegalize()) {
54455 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54456
54457 // Shrink constant indices if they are larger than 32-bits.
54458 // Only do this before legalize types since v2i64 could become v2i32.
54459 // FIXME: We could check that the type is legal if we're after legalize
54460 // types, but then we would need to construct test cases where that happens.
54461 // FIXME: We could support more than just constant vectors, but we need to be
54462 // careful with costing. A truncate that can be optimized out would be fine.
54463 // Otherwise we might only want to create a truncate if it avoids a split.
54464 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54465 if (BV->isConstant() && IndexWidth > 32 &&
54466 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54467 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54468 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54469 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54470 }
54471 }
54472
54473 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54474 // there are sufficient sign bits. Only do this before legalize types to
54475 // avoid creating illegal types in truncate.
54476 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54477 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54478 IndexWidth > 32 &&
54479 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54480 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54481 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54482 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54483 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54484 }
54485 }
54486
54487 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54488 // Try to move splat constant adders from the index operand to the base
54489 // pointer operand, taking care to multiply by the scale. We can only do
54490 // this when the index element type is the same as the pointer type.
54491 // Otherwise we need to be sure the math doesn't wrap before the scale.
54492 if (Index.getOpcode() == ISD::ADD &&
54493 Index.getValueType().getVectorElementType() == PtrVT &&
54494 isa<ConstantSDNode>(Scale)) {
54495 uint64_t ScaleAmt = Scale->getAsZExtVal();
54496 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54497 BitVector UndefElts;
54498 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54499 // FIXME: Allow non-constant?
54500 if (UndefElts.none()) {
54501 // Apply the scale.
54502 APInt Adder = C->getAPIntValue() * ScaleAmt;
54503 // Add it to the existing base.
54504 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54505 DAG.getConstant(Adder, DL, PtrVT));
54506 Index = Index.getOperand(0);
54507 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54508 }
54509 }
54510
54511 // It's also possible base is just a constant. In that case, just
54512 // replace it with 0 and move the displacement into the index.
54513 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54514 isOneConstant(Scale)) {
54515 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54516 // Combine the constant build_vector and the constant base.
54517 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54518 Index.getOperand(1), Splat);
54519 // Add to the LHS of the original Index add.
54520 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54521 Index.getOperand(0), Splat);
54522 Base = DAG.getConstant(0, DL, Base.getValueType());
54523 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54524 }
54525 }
54526 }
54527
54528 if (DCI.isBeforeLegalizeOps()) {
54529 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54530
54531 // Make sure the index is either i32 or i64
54532 if (IndexWidth != 32 && IndexWidth != 64) {
54533 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54534 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54535 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54536 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54537 }
54538 }
54539
54540 // With vector masks we only demand the upper bit of the mask.
54541 SDValue Mask = GorS->getMask();
54542 if (Mask.getScalarValueSizeInBits() != 1) {
54543 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54544 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54545 if (N->getOpcode() != ISD::DELETED_NODE)
54546 DCI.AddToWorklist(N);
54547 return SDValue(N, 0);
54548 }
54549 }
54550
54551 return SDValue();
54552}
54553
54554// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54555static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54556 const X86Subtarget &Subtarget) {
54557 SDLoc DL(N);
54558 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54559 SDValue EFLAGS = N->getOperand(1);
54560
54561 // Try to simplify the EFLAGS and condition code operands.
54562 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54563 return getSETCC(CC, Flags, DL, DAG);
54564
54565 return SDValue();
54566}
54567
54568/// Optimize branch condition evaluation.
54569static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54570 const X86Subtarget &Subtarget) {
54571 SDLoc DL(N);
54572 SDValue EFLAGS = N->getOperand(3);
54573 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54574
54575 // Try to simplify the EFLAGS and condition code operands.
54576 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54577 // RAUW them under us.
54578 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54579 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54580 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54581 N->getOperand(1), Cond, Flags);
54582 }
54583
54584 return SDValue();
54585}
54586
54587// TODO: Could we move this to DAGCombine?
54588static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54589 SelectionDAG &DAG) {
54590 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54591 // to optimize away operation when it's from a constant.
54592 //
54593 // The general transformation is:
54594 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54595 // AND(VECTOR_CMP(x,y), constant2)
54596 // constant2 = UNARYOP(constant)
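// For example, (v4f32 sint_to_fp (and (setcc X, Y), splat(5))) becomes
// (and (setcc X, Y), bitcast(splat(5.0))) through bitcasts: each compare lane
// is 0 or -1, and sint_to_fp(0) has an all-zero bit pattern.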
54597
54598 // Early exit if this isn't a vector operation, the operand of the
54599 // unary operation isn't a bitwise AND, or if the sizes of the operations
54600 // aren't the same.
54601 EVT VT = N->getValueType(0);
54602 bool IsStrict = N->isStrictFPOpcode();
54603 unsigned NumEltBits = VT.getScalarSizeInBits();
54604 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54605 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54606 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54607 VT.getSizeInBits() != Op0.getValueSizeInBits())
54608 return SDValue();
54609
54610 // Now check that the other operand of the AND is a constant. We could
54611 // make the transformation for non-constant splats as well, but it's unclear
54612 // that would be a benefit as it would not eliminate any operations, just
54613 // perform one more step in scalar code before moving to the vector unit.
54614 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54615 // Bail out if the vector isn't a constant.
54616 if (!BV->isConstant())
54617 return SDValue();
54618
54619 // Everything checks out. Build up the new and improved node.
54620 SDLoc DL(N);
54621 EVT IntVT = BV->getValueType(0);
54622 // Create a new constant of the appropriate type for the transformed
54623 // DAG.
54624 SDValue SourceConst;
54625 if (IsStrict)
54626 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54627 {N->getOperand(0), SDValue(BV, 0)});
54628 else
54629 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54630 // The AND node needs bitcasts to/from an integer vector type around it.
54631 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54632 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54633 MaskConst);
54634 SDValue Res = DAG.getBitcast(VT, NewAnd);
54635 if (IsStrict)
54636 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54637 return Res;
54638 }
54639
54640 return SDValue();
54641}
54642
54643/// If we are converting a value to floating-point, try to replace scalar
54644/// truncate of an extracted vector element with a bitcast. This tries to keep
54645/// the sequence on XMM registers rather than moving between vector and GPRs.
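/// For example, (f64 sint_to_fp (i32 trunc (i64 extractelt v2i64:X, 0))) can
/// instead convert (i32 extractelt (v4i32 bitcast X), 0), keeping the value
/// in an XMM register.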
54646static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54647 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54648 // to allow being called by any similar cast opcode.
54649 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54650 SDValue Trunc = N->getOperand(0);
54651 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54652 return SDValue();
54653
54654 SDValue ExtElt = Trunc.getOperand(0);
54655 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54656 !isNullConstant(ExtElt.getOperand(1)))
54657 return SDValue();
54658
54659 EVT TruncVT = Trunc.getValueType();
54660 EVT SrcVT = ExtElt.getValueType();
54661 unsigned DestWidth = TruncVT.getSizeInBits();
54662 unsigned SrcWidth = SrcVT.getSizeInBits();
54663 if (SrcWidth % DestWidth != 0)
54664 return SDValue();
54665
54666 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54667 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54668 unsigned VecWidth = SrcVecVT.getSizeInBits();
54669 unsigned NumElts = VecWidth / DestWidth;
54670 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54671 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54672 SDLoc DL(N);
54673 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54674 BitcastVec, ExtElt.getOperand(1));
54675 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54676}
54677
54678static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54679 const X86Subtarget &Subtarget) {
54680 bool IsStrict = N->isStrictFPOpcode();
54681 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54682 EVT VT = N->getValueType(0);
54683 EVT InVT = Op0.getValueType();
54684
54685 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54686 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54687 // if hasFP16 support:
54688 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54689 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54690 // else
54691 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54692 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54693 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54694 unsigned ScalarSize = InVT.getScalarSizeInBits();
54695 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54696 ScalarSize >= 64)
54697 return SDValue();
54698 SDLoc dl(N);
54699 EVT DstVT =
54700 EVT::getVectorVT(*DAG.getContext(),
54701 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54702 : ScalarSize < 32 ? MVT::i32
54703 : MVT::i64,
54704 InVT.getVectorNumElements());
54705 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54706 if (IsStrict)
54707 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54708 {N->getOperand(0), P});
54709 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54710 }
54711
54712 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54713 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54714 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54715 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54716 VT.getScalarType() != MVT::f16) {
54717 SDLoc dl(N);
54718 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54719 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54720
54721 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54722 if (IsStrict)
54723 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54724 {N->getOperand(0), P});
54725 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54726 }
54727
54728 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54729 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54730 // the optimization here.
54731 SDNodeFlags Flags = N->getFlags();
54732 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54733 if (IsStrict)
54734 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54735 {N->getOperand(0), Op0});
54736 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54737 }
54738
54739 return SDValue();
54740}
54741
54742static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54743 TargetLowering::DAGCombinerInfo &DCI,
54744 const X86Subtarget &Subtarget) {
54745 // First try to optimize away the conversion entirely when it's
54746 // conditionally from a constant. Vectors only.
54747 bool IsStrict = N->isStrictFPOpcode();
54748 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54749 return Res;
54750
54751 // Now move on to more general possibilities.
54752 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54753 EVT VT = N->getValueType(0);
54754 EVT InVT = Op0.getValueType();
54755
54756 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54757 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54758 // if hasFP16 support:
54759 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54760 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54761 // else
54762 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54763 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54764 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54765 unsigned ScalarSize = InVT.getScalarSizeInBits();
54766 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54767 ScalarSize >= 64)
54768 return SDValue();
54769 SDLoc dl(N);
54770 EVT DstVT =
54771 EVT::getVectorVT(*DAG.getContext(),
54772 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54773 : ScalarSize < 32 ? MVT::i32
54774 : MVT::i64,
54775 InVT.getVectorNumElements());
54776 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54777 if (IsStrict)
54778 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54779 {N->getOperand(0), P});
54780 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54781 }
54782
54783 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54784 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54785 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54786 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54787 VT.getScalarType() != MVT::f16) {
54788 SDLoc dl(N);
54789 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54790 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54791 if (IsStrict)
54792 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54793 {N->getOperand(0), P});
54794 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54795 }
54796
54797 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54798 // vectors and scalars, see if we know that the upper bits are all the sign
54799 // bit, in which case we can truncate the input to i32 and convert from that.
54800 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54801 unsigned BitWidth = InVT.getScalarSizeInBits();
54802 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54803 if (NumSignBits >= (BitWidth - 31)) {
54804 EVT TruncVT = MVT::i32;
54805 if (InVT.isVector())
54806 TruncVT = InVT.changeVectorElementType(TruncVT);
54807 SDLoc dl(N);
54808 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54809 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54810 if (IsStrict)
54811 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54812 {N->getOperand(0), Trunc});
54813 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54814 }
54815 // If we're after legalize and the type is v2i32 we need to shuffle and
54816 // use CVTSI2P.
54817 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54818 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54819 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54820 { 0, 2, -1, -1 });
54821 if (IsStrict)
54822 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54823 {N->getOperand(0), Shuf});
54824 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54825 }
54826 }
54827
54828 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54829 // a 32-bit target where SSE doesn't support i64->FP operations.
54830 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54831 Op0.getOpcode() == ISD::LOAD) {
54832 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54833
54834 // This transformation is not supported if the result type is f16 or f128.
54835 if (VT == MVT::f16 || VT == MVT::f128)
54836 return SDValue();
54837
54838 // If we have AVX512DQ we can use packed conversion instructions unless
54839 // the VT is f80.
54840 if (Subtarget.hasDQI() && VT != MVT::f80)
54841 return SDValue();
54842
54843 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54844 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54845 std::pair<SDValue, SDValue> Tmp =
54846 Subtarget.getTargetLowering()->BuildFILD(
54847 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54848 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54849 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54850 return Tmp.first;
54851 }
54852 }
54853
54854 if (IsStrict)
54855 return SDValue();
54856
54857 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54858 return V;
54859
54860 return SDValue();
54861}
54862
54863static bool needCarryOrOverflowFlag(SDValue Flags) {
54864 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54865
54866 for (const SDNode *User : Flags->uses()) {
54867 X86::CondCode CC;
54868 switch (User->getOpcode()) {
54869 default:
54870 // Be conservative.
54871 return true;
54872 case X86ISD::SETCC:
54873 case X86ISD::SETCC_CARRY:
54874 CC = (X86::CondCode)User->getConstantOperandVal(0);
54875 break;
54876 case X86ISD::BRCOND:
54877 case X86ISD::CMOV:
54878 CC = (X86::CondCode)User->getConstantOperandVal(2);
54879 break;
54880 }
54881
54882 switch (CC) {
54883 // clang-format off
54884 default: break;
54885 case X86::COND_A: case X86::COND_AE:
54886 case X86::COND_B: case X86::COND_BE:
54887 case X86::COND_O: case X86::COND_NO:
54888 case X86::COND_G: case X86::COND_GE:
54889 case X86::COND_L: case X86::COND_LE:
54890 return true;
54891 // clang-format on
54892 }
54893 }
54894
54895 return false;
54896}
54897
54898static bool onlyZeroFlagUsed(SDValue Flags) {
54899 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54900
54901 for (const SDNode *User : Flags->uses()) {
54902 unsigned CCOpNo;
54903 switch (User->getOpcode()) {
54904 default:
54905 // Be conservative.
54906 return false;
54907 case X86ISD::SETCC:
54908 case X86ISD::SETCC_CARRY:
54909 CCOpNo = 0;
54910 break;
54911 case X86ISD::BRCOND:
54912 case X86ISD::CMOV:
54913 CCOpNo = 2;
54914 break;
54915 }
54916
54917 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54918 if (CC != X86::COND_E && CC != X86::COND_NE)
54919 return false;
54920 }
54921
54922 return true;
54923}
54924
54925static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
54926 TargetLowering::DAGCombinerInfo &DCI,
54927 const X86Subtarget &Subtarget) {
54928 // Only handle test patterns.
54929 if (!isNullConstant(N->getOperand(1)))
54930 return SDValue();
54931
54932 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54933 // and use its flags directly.
54934 // TODO: Maybe we should try promoting compares that only use the zero flag
54935 // first if we can prove the upper bits with computeKnownBits?
54936 SDLoc dl(N);
54937 SDValue Op = N->getOperand(0);
54938 EVT VT = Op.getValueType();
54939 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54940
54941 if (SDValue CMP =
54942 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
54943 return CMP;
54944
54945 // If we have a constant logical shift that's only used in a comparison
54946 // against zero turn it into an equivalent AND. This allows turning it into
54947 // a TEST instruction later.
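// e.g. for i32 X, (cmp (srl X, 8), 0) becomes (cmp (and X, 0xFFFFFF00), 0),
// which isel can match as a TEST.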
54948 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54949 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54950 onlyZeroFlagUsed(SDValue(N, 0))) {
54951 unsigned BitWidth = VT.getSizeInBits();
54952 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54953 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54954 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54955 APInt Mask = Op.getOpcode() == ISD::SRL
54956 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54957 : APInt::getLowBitsSet(BitWidth, MaskBits);
54958 if (Mask.isSignedIntN(32)) {
54959 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54960 DAG.getConstant(Mask, dl, VT));
54961 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54962 DAG.getConstant(0, dl, VT));
54963 }
54964 }
54965 }
54966
54967 // If we're extracting from a avx512 bool vector and comparing against zero,
54968 // then try to just bitcast the vector to an integer to use TEST/BT directly.
54969 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
54970 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
54971 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
54972 SDValue Src = Op.getOperand(0);
54973 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54974 isNullConstant(Src.getOperand(1)) &&
54975 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
54976 SDValue BoolVec = Src.getOperand(0);
54977 unsigned ShAmt = 0;
54978 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
54979 ShAmt = BoolVec.getConstantOperandVal(1);
54980 BoolVec = BoolVec.getOperand(0);
54981 }
54982 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
54983 EVT VecVT = BoolVec.getValueType();
54984 unsigned BitWidth = VecVT.getVectorNumElements();
54985 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
54986 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
54987 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
54988 Op = DAG.getBitcast(BCVT, BoolVec);
54989 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
54990 DAG.getConstant(Mask, dl, BCVT));
54991 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54992 DAG.getConstant(0, dl, BCVT));
54993 }
54994 }
54995 }
54996
54997 // Peek through any zero-extend if we're only testing for a zero result.
54998 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54999 SDValue Src = Op.getOperand(0);
55000 EVT SrcVT = Src.getValueType();
55001 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
55002 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55003 DAG.getConstant(0, dl, SrcVT));
55004 }
55005
55006 // Look for a truncate.
55007 if (Op.getOpcode() != ISD::TRUNCATE)
55008 return SDValue();
55009
55010 SDValue Trunc = Op;
55011 Op = Op.getOperand(0);
55012
55013 // See if we can compare with zero against the truncation source,
55014 // which should help using the Z flag from many ops. Only do this for
55015 // i32 truncated op to prevent partial-reg compares of promoted ops.
55016 EVT OpVT = Op.getValueType();
55017 APInt UpperBits =
55018 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55019 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55020 onlyZeroFlagUsed(SDValue(N, 0))) {
55021 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55022 DAG.getConstant(0, dl, OpVT));
55023 }
55024
55025 // After this the truncate and arithmetic op must have a single use.
55026 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55027 return SDValue();
55028
55029 unsigned NewOpc;
55030 switch (Op.getOpcode()) {
55031 default: return SDValue();
55032 case ISD::AND:
55033 // Skip AND with a constant. We have special handling for AND with an
55034 // immediate during isel to generate TEST instructions.
55035 if (isa<ConstantSDNode>(Op.getOperand(1)))
55036 return SDValue();
55037 NewOpc = X86ISD::AND;
55038 break;
55039 case ISD::OR: NewOpc = X86ISD::OR; break;
55040 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55041 case ISD::ADD:
55042 // If the carry or overflow flag is used, we can't truncate.
55043 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55044 return SDValue();
55045 NewOpc = X86ISD::ADD;
55046 break;
55047 case ISD::SUB:
55048 // If the carry or overflow flag is used, we can't truncate.
55049 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55050 return SDValue();
55051 NewOpc = X86ISD::SUB;
55052 break;
55053 }
55054
55055 // We found an op we can narrow. Truncate its inputs.
55056 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55057 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55058
55059 // Use an X86-specific opcode to avoid DAG combine messing with it.
55060 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55061 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55062
55063 // For AND, keep a CMP so that we can match the test pattern.
55064 if (NewOpc == X86ISD::AND)
55065 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55066 DAG.getConstant(0, dl, VT));
55067
55068 // Return the flags.
55069 return Op.getValue(1);
55070}
55071
55072static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55073 TargetLowering::DAGCombinerInfo &DCI,
55074 const X86Subtarget &ST) {
55075 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55076 "Expected X86ISD::ADD or X86ISD::SUB");
55077
55078 SDLoc DL(N);
55079 SDValue LHS = N->getOperand(0);
55080 SDValue RHS = N->getOperand(1);
55081 MVT VT = LHS.getSimpleValueType();
55082 bool IsSub = X86ISD::SUB == N->getOpcode();
55083 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55084
55085 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
55086 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
55087 return CMP;
55088
55089 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55090 if (!N->hasAnyUseOfValue(1)) {
55091 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55092 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55093 }
55094
55095 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55096 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55097 SDValue Ops[] = {N0, N1};
55098 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55099 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55100 SDValue Op(N, 0);
55101 if (Negate)
55102 Op = DAG.getNegative(Op, DL, VT);
55103 DCI.CombineTo(GenericAddSub, Op);
55104 }
55105 };
55106 MatchGeneric(LHS, RHS, false);
55107 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55108
55109 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55110 // EFLAGS result doesn't change.
55111 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55112 /*ZeroSecondOpOnly*/ true);
55113}
55114
55115static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55116 SDValue LHS = N->getOperand(0);
55117 SDValue RHS = N->getOperand(1);
55118 SDValue BorrowIn = N->getOperand(2);
55119
55120 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55121 MVT VT = N->getSimpleValueType(0);
55122 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55123 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55124 }
55125
55126 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55127 // iff the flag result is dead.
55128 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55129 !N->hasAnyUseOfValue(1))
55130 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55131 LHS.getOperand(1), BorrowIn);
55132
55133 return SDValue();
55134}
55135
55136// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55137static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55138 TargetLowering::DAGCombinerInfo &DCI) {
55139 SDValue LHS = N->getOperand(0);
55140 SDValue RHS = N->getOperand(1);
55141 SDValue CarryIn = N->getOperand(2);
55142 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55143 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55144
55145 // Canonicalize constant to RHS.
55146 if (LHSC && !RHSC)
55147 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55148 CarryIn);
55149
55150 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55151 // the result is either zero or one (depending on the input carry bit).
55152 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55153 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55154 // We don't have a good way to replace an EFLAGS use, so only do this when
55155 // dead right now.
55156 SDValue(N, 1).use_empty()) {
55157 SDLoc DL(N);
55158 EVT VT = N->getValueType(0);
55159 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55160 SDValue Res1 = DAG.getNode(
55161 ISD::AND, DL, VT,
55162 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55163 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55164 DAG.getConstant(1, DL, VT));
55165 return DCI.CombineTo(N, Res1, CarryOut);
55166 }
55167
55168 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55169 // iff the flag result is dead.
55170 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55171 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55172 SDLoc DL(N);
55173 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55174 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55175 DAG.getConstant(0, DL, LHS.getValueType()),
55176 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55177 }
55178
55179 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55180 MVT VT = N->getSimpleValueType(0);
55181 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55182 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55183 }
55184
55185 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55186 // iff the flag result is dead.
55187 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55188 !N->hasAnyUseOfValue(1))
55189 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55190 LHS.getOperand(1), CarryIn);
55191
55192 return SDValue();
55193}
55194
55195static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55196 const SDLoc &DL, EVT VT,
55197 const X86Subtarget &Subtarget) {
55198 // Example of pattern we try to detect:
55199 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55200 //(add (build_vector (extract_elt t, 0),
55201 // (extract_elt t, 2),
55202 // (extract_elt t, 4),
55203 // (extract_elt t, 6)),
55204 // (build_vector (extract_elt t, 1),
55205 // (extract_elt t, 3),
55206 // (extract_elt t, 5),
55207 // (extract_elt t, 7)))
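// i.e. each result element is x0[2*i]*x1[2*i] + x0[2*i+1]*x1[2*i+1], which is
// exactly what VPMADDWD computes for the vXi16 inputs.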
55208
55209 if (!Subtarget.hasSSE2())
55210 return SDValue();
55211
55212 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55213 Op1.getOpcode() != ISD::BUILD_VECTOR)
55214 return SDValue();
55215
55216 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55217 VT.getVectorNumElements() < 4 ||
55218 !isPowerOf2_32(VT.getVectorNumElements()))
55219 return SDValue();
55220
55221 // Check if one of Op0,Op1 is of the form:
55222 // (build_vector (extract_elt Mul, 0),
55223 // (extract_elt Mul, 2),
55224 // (extract_elt Mul, 4),
55225 // ...
55226 // the other is of the form:
55227 // (build_vector (extract_elt Mul, 1),
55228 // (extract_elt Mul, 3),
55229 // (extract_elt Mul, 5),
55230 // ...
55231 // and identify Mul.
55232 SDValue Mul;
55233 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55234 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55235 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55236 // TODO: Be more tolerant to undefs.
55237 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55238 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55239 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55240 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55241 return SDValue();
55242 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55243 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55244 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55245 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55246 if (!Const0L || !Const1L || !Const0H || !Const1H)
55247 return SDValue();
55248 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55249 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55250 // Commutativity of mul allows factors of a product to reorder.
55251 if (Idx0L > Idx1L)
55252 std::swap(Idx0L, Idx1L);
55253 if (Idx0H > Idx1H)
55254 std::swap(Idx0H, Idx1H);
55255 // Commutativity of add allows pairs of factors to reorder.
55256 if (Idx0L > Idx0H) {
55257 std::swap(Idx0L, Idx0H);
55258 std::swap(Idx1L, Idx1H);
55259 }
55260 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55261 Idx1H != 2 * i + 3)
55262 return SDValue();
55263 if (!Mul) {
55264 // First time an extract_elt's source vector is visited. Must be a MUL
55265 // with 2X number of vector elements than the BUILD_VECTOR.
55266 // Both extracts must be from same MUL.
55267 Mul = Op0L->getOperand(0);
55268 if (Mul->getOpcode() != ISD::MUL ||
55269 Mul.getValueType().getVectorNumElements() != 2 * e)
55270 return SDValue();
55271 }
55272 // Check that the extract is from the same MUL previously seen.
55273 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55274 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55275 return SDValue();
55276 }
55277
55278 // Check if the Mul source can be safely shrunk.
55279 ShrinkMode Mode;
55280 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55281 Mode == ShrinkMode::MULU16)
55282 return SDValue();
55283
55284 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55285 VT.getVectorNumElements() * 2);
55286 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55287 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55288
55289 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55290 ArrayRef<SDValue> Ops) {
55291 EVT InVT = Ops[0].getValueType();
55292 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55293 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55294 InVT.getVectorNumElements() / 2);
55295 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55296 };
55297 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55298}
55299
55300// Attempt to turn this pattern into PMADDWD.
55301// (add (mul (sext (build_vector)), (sext (build_vector))),
55302// (mul (sext (build_vector)), (sext (build_vector)))
55303static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55304 const SDLoc &DL, EVT VT,
55305 const X86Subtarget &Subtarget) {
55306 if (!Subtarget.hasSSE2())
55307 return SDValue();
55308
55309 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55310 return SDValue();
55311
55312 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55313 VT.getVectorNumElements() < 4 ||
55314 !isPowerOf2_32(VT.getVectorNumElements()))
55315 return SDValue();
55316
55317 SDValue N00 = N0.getOperand(0);
55318 SDValue N01 = N0.getOperand(1);
55319 SDValue N10 = N1.getOperand(0);
55320 SDValue N11 = N1.getOperand(1);
55321
55322 // All inputs need to be sign extends.
55323 // TODO: Support ZERO_EXTEND from known positive?
55324 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55325 N01.getOpcode() != ISD::SIGN_EXTEND ||
55326 N10.getOpcode() != ISD::SIGN_EXTEND ||
55327 N11.getOpcode() != ISD::SIGN_EXTEND)
55328 return SDValue();
55329
55330 // Peek through the extends.
55331 N00 = N00.getOperand(0);
55332 N01 = N01.getOperand(0);
55333 N10 = N10.getOperand(0);
55334 N11 = N11.getOperand(0);
55335
55336 // Must be extending from vXi16.
55337 EVT InVT = N00.getValueType();
55338 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55339 N10.getValueType() != InVT || N11.getValueType() != InVT)
55340 return SDValue();
55341
55342 // All inputs should be build_vectors.
55343 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55344 N01.getOpcode() != ISD::BUILD_VECTOR ||
55345 N10.getOpcode() != ISD::BUILD_VECTOR ||
55346 N11.getOpcode() != ISD::BUILD_VECTOR)
55347 return SDValue();
55348
55349 // For each element, we need to ensure we have an odd element from one vector
55350 // multiplied by the odd element of another vector and the even element from
55351 // one of the same vectors being multiplied by the even element from the
55352 // other vector. So we need to make sure for each element i, this operator
55353 // other vector. So we need to make sure that for each element i, this operation
55354 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55355 SDValue In0, In1;
55356 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55357 SDValue N00Elt = N00.getOperand(i);
55358 SDValue N01Elt = N01.getOperand(i);
55359 SDValue N10Elt = N10.getOperand(i);
55360 SDValue N11Elt = N11.getOperand(i);
55361 // TODO: Be more tolerant to undefs.
55362 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55363 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55364 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55365 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55366 return SDValue();
55367 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55368 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55369 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55370 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55371 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55372 return SDValue();
55373 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55374 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55375 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55376 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55377 // Add is commutative so indices can be reordered.
55378 if (IdxN00 > IdxN10) {
55379 std::swap(IdxN00, IdxN10);
55380 std::swap(IdxN01, IdxN11);
55381 }
55382 // N0 indices must be the even elements, N1 indices the next odd elements.
55383 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55384 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55385 return SDValue();
55386 SDValue N00In = N00Elt.getOperand(0);
55387 SDValue N01In = N01Elt.getOperand(0);
55388 SDValue N10In = N10Elt.getOperand(0);
55389 SDValue N11In = N11Elt.getOperand(0);
55390
55391 // First time we find an input capture it.
55392 if (!In0) {
55393 In0 = N00In;
55394 In1 = N01In;
55395
55396 // The input vectors must be at least as wide as the output.
55397 // If they are larger than the output, we extract a subvector below.
55398 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55399 In1.getValueSizeInBits() < VT.getSizeInBits())
55400 return SDValue();
55401 }
55402 // Mul is commutative so the input vectors can be in any order.
55403 // Canonicalize to make the compares easier.
55404 if (In0 != N00In)
55405 std::swap(N00In, N01In);
55406 if (In0 != N10In)
55407 std::swap(N10In, N11In);
55408 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55409 return SDValue();
55410 }
55411
55412 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55413 ArrayRef<SDValue> Ops) {
55414 EVT OpVT = Ops[0].getValueType();
55415 assert(OpVT.getScalarType() == MVT::i16 &&
55416 "Unexpected scalar element type");
55417 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55418 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55419 OpVT.getVectorNumElements() / 2);
55420 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55421 };
55422
55423 // If the output is narrower than an input, extract the low part of the input
55424 // vector.
55425 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55426 VT.getVectorNumElements() * 2);
55427 if (OutVT16.bitsLT(In0.getValueType())) {
55428 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55429 DAG.getIntPtrConstant(0, DL));
55430 }
55431 if (OutVT16.bitsLT(In1.getValueType())) {
55432 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55433 DAG.getIntPtrConstant(0, DL));
55434 }
55435 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55436 PMADDBuilder);
55437}
55438
55439// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55440 // If the upper element in each pair of both VPMADDWD operands is zero then we
55441 // can merge the operand elements and use the implicit add of VPMADDWD.
55442// TODO: Add support for VPMADDUBSW (which isn't commutable).
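// Rough sketch of why this works (illustrative, element ordering simplified):
// each i32 lane of VPMADDWD(X,Y) is X[2i]*Y[2i] + X[2i+1]*Y[2i+1]. When the
// odd (upper) elements are known zero this collapses to X[2i]*Y[2i], so
//   ADD(VPMADDWD(X,Y), VPMADDWD(Z,W))[i] == X[2i]*Y[2i] + Z[2i]*W[2i],
// which a single VPMADDWD over the interleaved operands
//   SHUFFLE(X,Z) = <X[0],Z[0],X[2],Z[2],...>, SHUFFLE(Y,W) = <Y[0],W[0],...>
// computes directly via its implicit pairwise add.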
55443 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55444 const SDLoc &DL, EVT VT) {
55445 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55446 return SDValue();
55447
55448 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55449 if (VT.getSizeInBits() > 128)
55450 return SDValue();
55451
55452 unsigned NumElts = VT.getVectorNumElements();
55453 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55454 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55455 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55456
55457 bool Op0HiZero =
55458 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55459 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55460 bool Op1HiZero =
55461 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55462 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55463
55464 // TODO: Check for zero lower elements once we have actual codegen that
55465 // creates them.
55466 if (!Op0HiZero || !Op1HiZero)
55467 return SDValue();
55468
55469 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55470 SmallVector<int> Mask;
55471 for (int i = 0; i != (int)NumElts; ++i) {
55472 Mask.push_back(2 * i);
55473 Mask.push_back(2 * (i + NumElts));
55474 }
55475
55476 SDValue LHS =
55477 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55478 SDValue RHS =
55479 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55480 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55481}
55482
55483/// CMOV of constants requires materializing constant operands in registers.
55484/// Try to fold those constants into an 'add' instruction to reduce instruction
55485/// count. We do this with CMOV rather the generic 'select' because there are
55486/// earlier folds that may be used to turn select-of-constants into logic hacks.
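// Illustrative example (constants chosen arbitrarily):
//   add (cmov 7, 12), %x  -->  cmov (add %x, 7), (add %x, 12)
// Each add can then be emitted as a 2-operand LEA, so neither constant has to
// be materialized into its own register first.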
55487 static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
55488 SelectionDAG &DAG,
55489 const X86Subtarget &Subtarget) {
55490 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55491 // better because we eliminate 1-2 instructions. This transform is still
55492 // an improvement without zero operands because we trade 2 move constants and
55493 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55494 // immediate asm operands (fit in 32-bits).
55495 auto isSuitableCmov = [](SDValue V) {
55496 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55497 return false;
55498 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55499 !isa<ConstantSDNode>(V.getOperand(1)))
55500 return false;
55501 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55502 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55503 V.getConstantOperandAPInt(1).isSignedIntN(32));
55504 };
55505
55506 // Match an appropriate CMOV as the first operand of the add.
55507 SDValue Cmov = N->getOperand(0);
55508 SDValue OtherOp = N->getOperand(1);
55509 if (!isSuitableCmov(Cmov))
55510 std::swap(Cmov, OtherOp);
55511 if (!isSuitableCmov(Cmov))
55512 return SDValue();
55513
55514 // Don't remove a load folding opportunity for the add. That would neutralize
55515 // any improvements from removing constant materializations.
55516 if (X86::mayFoldLoad(OtherOp, Subtarget))
55517 return SDValue();
55518
55519 EVT VT = N->getValueType(0);
55520 SDValue FalseOp = Cmov.getOperand(0);
55521 SDValue TrueOp = Cmov.getOperand(1);
55522
55523 // We will push the add through the select, but we can potentially do better
55524 // if we know there is another add in the sequence and this is pointer math.
55525 // In that case, we can absorb an add into the trailing memory op and avoid
55526 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55527 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55528 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55529 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55530 all_of(N->uses(), [&](SDNode *Use) {
55531 auto *MemNode = dyn_cast<MemSDNode>(Use);
55532 return MemNode && MemNode->getBasePtr().getNode() == N;
55533 })) {
55534 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55535 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55536 // it is possible that choosing op1 might be better.
55537 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55538 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55539 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55540 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55541 Cmov.getOperand(2), Cmov.getOperand(3));
55542 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55543 }
55544
55545 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55546 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55547 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55548 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55549 Cmov.getOperand(3));
55550}
55551
55552 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55553 TargetLowering::DAGCombinerInfo &DCI,
55554 const X86Subtarget &Subtarget) {
55555 EVT VT = N->getValueType(0);
55556 SDValue Op0 = N->getOperand(0);
55557 SDValue Op1 = N->getOperand(1);
55558 SDLoc DL(N);
55559
55560 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
55561 return Select;
55562
55563 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55564 return MAdd;
55565 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55566 return MAdd;
55567 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55568 return MAdd;
55569
55570 // Try to synthesize horizontal adds from adds of shuffles.
55571 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55572 return V;
55573
55574 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55575 // iff X and Y won't overflow.
55576 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55577 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
55578 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
55579 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55580 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55581 SDValue Sum =
55582 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55583 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55584 getZeroVector(OpVT, Subtarget, DAG, DL));
55585 }
55586 }
55587
55588 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55589 // (sub Y, (sext (vXi1 X))).
55590 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55591 // generic DAG combine without a legal type check, but adding this there
55592 // caused regressions.
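// For a vXi1 element m, zext gives 0 or 1 and sext gives 0 or -1, so
// Y + zext(m) == Y - sext(m) elementwise; on x86 the sign-extended (all-ones)
// mask form is typically cheaper to produce than the zero-extended one.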
55593 if (VT.isVector()) {
55594 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55595 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55596 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55597 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55598 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55599 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55600 }
55601
55602 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55603 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55604 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55605 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55606 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55607 }
55608 }
55609
55610 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55611 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55612 X86::isZeroNode(Op0.getOperand(1))) {
55613 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55614 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55615 Op0.getOperand(0), Op0.getOperand(2));
55616 }
55617
55618 return combineAddOrSubToADCOrSBB(N, DL, DAG);
55619}
55620
55621// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55622// condition comes from the subtract node that produced -X. This matches the
55623// cmov expansion for absolute value. By swapping the operands we convert abs
55624// to nabs.
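// Illustrative values: with X = -3 and Y = 10, abs(X) = 3 and Y - abs(X) = 7;
// nabs(X) = -abs(X) = -3 and Y + nabs(X) = 7 as well, so swapping the CMOV
// operands and turning the SUB into an ADD preserves the result.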
55625 static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55626 SDValue N0 = N->getOperand(0);
55627 SDValue N1 = N->getOperand(1);
55628
55629 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55630 return SDValue();
55631
55631
55632 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55633 if (CC != X86::COND_S && CC != X86::COND_NS)
55634 return SDValue();
55635
55636 // Condition should come from a negate operation.
55637 SDValue Cond = N1.getOperand(3);
55638 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55639 return SDValue();
55640 assert(Cond.getResNo() == 1 && "Unexpected result number");
55641
55642 // Get the X and -X from the negate.
55643 SDValue NegX = Cond.getValue(0);
55644 SDValue X = Cond.getOperand(1);
55645
55646 SDValue FalseOp = N1.getOperand(0);
55647 SDValue TrueOp = N1.getOperand(1);
55648
55649 // Cmov operands should be X and NegX. Order doesn't matter.
55650 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55651 return SDValue();
55652
55653 // Build a new CMOV with the operands swapped.
55654 SDLoc DL(N);
55655 MVT VT = N->getSimpleValueType(0);
55656 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55657 N1.getOperand(2), Cond);
55658 // Convert sub to add.
55659 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55660}
55661
55662 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55663 SDValue Op0 = N->getOperand(0);
55664 SDValue Op1 = N->getOperand(1);
55665
55666 // (sub C (zero_extend (setcc)))
55667 // =>
55668 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
55669 // Don't disturb (sub 0 setcc), which is easily done with neg.
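// Example with C = 5: sub 5, (zext (setcc cc)) and add (zext (setcc !cc)), 4
// both give 4 when the condition holds and 5 when it does not, so the two
// forms agree for either state of the condition.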
55670 EVT VT = N->getValueType(0);
55671 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55672 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55673 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55674 Op1.getOperand(0).hasOneUse()) {
55675 SDValue SetCC = Op1.getOperand(0);
55676 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55677 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55678 APInt NewImm = Op0C->getAPIntValue() - 1;
55679 SDLoc DL(Op1);
55680 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55681 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55682 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55683 DAG.getConstant(NewImm, DL, VT));
55684 }
55685
55686 return SDValue();
55687}
55688
55689 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
55690 // res, flags2 = sub 0, (setcc cc, flag)
55691 // cload/cstore ..., cond_ne, flag2
55692 // ->
55693 // cload/cstore cc, flag
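// Why this is sound: the SUB computes 0 - (0 or 1), so ZF in flags2 is set
// exactly when the SETCC produced 0; testing COND_NE on flags2 is therefore
// equivalent to testing the original condition on the original flags.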
55694 if (N->getConstantOperandVal(3) != X86::COND_NE)
55695 return SDValue();
55696
55697 SDValue Sub = N->getOperand(4);
55698 if (Sub.getOpcode() != X86ISD::SUB)
55699 return SDValue();
55700
55701 SDValue SetCC = Sub.getOperand(1);
55702
55703 if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
55704 return SDValue();
55705
55706 SmallVector<SDValue, 5> Ops(N->op_values());
55707 Ops[3] = SetCC.getOperand(0);
55708 Ops[4] = SetCC.getOperand(1);
55709
55710 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
55711 cast<MemSDNode>(N)->getMemoryVT(),
55712 cast<MemSDNode>(N)->getMemOperand());
55713}
55714
55715 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55716 TargetLowering::DAGCombinerInfo &DCI,
55717 const X86Subtarget &Subtarget) {
55718 SDValue Op0 = N->getOperand(0);
55719 SDValue Op1 = N->getOperand(1);
55720 SDLoc DL(N);
55721
55722 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55723 auto IsNonOpaqueConstant = [&](SDValue Op) {
55724 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55725 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55726 return !Cst->isOpaque();
55727 return true;
55728 }
55729 return false;
55730 };
55731
55732 // X86 can't encode an immediate LHS of a sub. See if we can push the
55733 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55734 // one use and a constant, invert the immediate, saving one register.
55735 // However, ignore cases where C1 is 0, as those will become a NEG.
55736 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
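// Worked example (constants for illustration): sub(10, xor(X, 5)) becomes
// add(xor(X, ~5), 11), since xor(X, ~C2) == ~(X ^ C2) == -(X ^ C2) - 1 and
// therefore xor(X, ~5) + 11 == 10 - (X ^ 5).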
55737 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55738 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55739 Op1->hasOneUse()) {
55740 EVT VT = Op0.getValueType();
55741 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55742 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55743 SDValue NewAdd =
55744 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55745 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55746 }
55747
55748 if (SDValue V = combineSubABS(N, DAG))
55749 return V;
55750
55751 // Try to synthesize horizontal subs from subs of shuffles.
55752 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55753 return V;
55754
55755 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55756 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55757 X86::isZeroNode(Op1.getOperand(1))) {
55758 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55759 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55760 Op1.getOperand(0), Op1.getOperand(2));
55761 }
55762
55763 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55764 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55765 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55766 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55767 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55768 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55769 Op1.getOperand(1), Op1.getOperand(2));
55770 return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0),
55771 Op1.getOperand(0));
55772 }
55773
55774 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55775 return V;
55776
55777 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
55778 return V;
55779
55780 return combineSubSetcc(N, DAG);
55781}
55782
55783 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55784 const X86Subtarget &Subtarget) {
55785 unsigned Opcode = N->getOpcode();
55786 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
55787 "Unknown PCMP opcode");
55788
55789 SDValue LHS = N->getOperand(0);
55790 SDValue RHS = N->getOperand(1);
55791 MVT VT = N->getSimpleValueType(0);
55792 unsigned EltBits = VT.getScalarSizeInBits();
55793 unsigned NumElts = VT.getVectorNumElements();
55794 SDLoc DL(N);
55795
55796 if (LHS == RHS)
55797 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
55798 : DAG.getConstant(0, DL, VT);
55799
55800 // Constant Folding.
55801 // PCMPEQ(X,UNDEF) -> UNDEF
55802 // PCMPGT(X,UNDEF) -> 0
55803 // PCMPGT(UNDEF,X) -> 0
55804 APInt LHSUndefs, RHSUndefs;
55805 SmallVector<APInt> LHSBits, RHSBits;
55806 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
55807 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
55808 APInt Ones = APInt::getAllOnes(EltBits);
55809 APInt Zero = APInt::getZero(EltBits);
55810 SmallVector<APInt> Results(NumElts);
55811 for (unsigned I = 0; I != NumElts; ++I) {
55812 if (Opcode == X86ISD::PCMPEQ) {
55813 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
55814 } else {
55815 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
55816 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
55817 }
55818 }
55819 if (Opcode == X86ISD::PCMPEQ)
55820 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
55821 return getConstVector(Results, VT, DAG, DL);
55822 }
55823
55824 return SDValue();
55825}
55826
55827// Helper to determine if we can convert an integer comparison to a float
55828 // comparison by casting the operands.
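// In essence: f32 carries 24 bits of significand precision, so integer inputs
// known to have at most 24 significant bits convert exactly via SINT_TO_FP,
// and an ordered FP compare then matches the integer compare result.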
55829static std::optional<unsigned>
55830CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
55831 unsigned NumSignificantBitsRHS) {
55832 MVT SVT = VT.getScalarType();
55833 assert(SVT == MVT::f32 && "Only tested for float so far");
55834 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(SVT);
55835 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
55836 "Only PCMPEQ/PCMPGT currently supported");
55837
55838 // TODO: Handle bitcastable integers.
55839
55840 // For cvt + signed compare we need lhs and rhs to be exactly representable as
55841 // a fp value.
55842 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
55843 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
55844 return ISD::SINT_TO_FP;
55845
55846 return std::nullopt;
55847}
55848
55849/// Helper that combines an array of subvector ops as if they were the operands
55850/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55851/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
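// For example, concat_vectors(vbroadcast(x), vbroadcast(x)) for a 256-bit
// result can be rewritten as a single wider vbroadcast(x); the splat handling
// below covers this and several similar repeated-subvector cases.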
55852 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55853 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55854 TargetLowering::DAGCombinerInfo &DCI,
55855 const X86Subtarget &Subtarget) {
55856 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55857 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55858
55859 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55860 return DAG.getUNDEF(VT);
55861
55862 if (llvm::all_of(Ops, [](SDValue Op) {
55863 return ISD::isBuildVectorAllZeros(Op.getNode());
55864 }))
55865 return getZeroVector(VT, Subtarget, DAG, DL);
55866
55867 SDValue Op0 = Ops[0];
55868 bool IsSplat = llvm::all_equal(Ops);
55869 unsigned NumOps = Ops.size();
55870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55871 LLVMContext &Ctx = *DAG.getContext();
55872
55873 // Repeated subvectors.
55874 if (IsSplat &&
55875 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55876 // If this broadcast is inserted into both halves, use a larger broadcast.
55877 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55878 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55879
55880 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55881 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55882 (Subtarget.hasAVX2() ||
55883 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55884 VT.getScalarType(), Subtarget)))
55885 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55886 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55887 Op0.getOperand(0),
55888 DAG.getIntPtrConstant(0, DL)));
55889
55890 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55891 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55892 (Subtarget.hasAVX2() ||
55893 (EltSizeInBits >= 32 &&
55894 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55895 Op0.getOperand(0).getValueType() == VT.getScalarType())
55896 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55897
55898 // concat_vectors(extract_subvector(broadcast(x)),
55899 // extract_subvector(broadcast(x))) -> broadcast(x)
55900 // concat_vectors(extract_subvector(subv_broadcast(x)),
55901 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
55902 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55903 Op0.getOperand(0).getValueType() == VT) {
55904 SDValue SrcVec = Op0.getOperand(0);
55905 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
55906 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
55907 return Op0.getOperand(0);
55908 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55909 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
55910 return Op0.getOperand(0);
55911 }
55912
55913 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
55914 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
55915 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
55916 return DAG.getNode(Op0.getOpcode(), DL, VT,
55917 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
55918 Op0.getOperand(0), Op0.getOperand(0)),
55919 Op0.getOperand(1));
55920 }
55921
55922 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55923 // Only concat of subvector high halves which vperm2x128 is best at.
55924 // TODO: This should go in combineX86ShufflesRecursively eventually.
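// For reference, immediate 0x31 selects the upper 128-bit lane of the first
// source for the low result lane and the upper lane of the second source for
// the high result lane, i.e. exactly the concat of the two high halves.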
55925 if (VT.is256BitVector() && NumOps == 2) {
55926 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55927 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55928 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55929 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55930 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55931 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55932 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55933 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55934 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55935 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55936 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55937 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55938 DAG.getBitcast(VT, Src0.getOperand(0)),
55939 DAG.getBitcast(VT, Src1.getOperand(0)),
55940 DAG.getTargetConstant(0x31, DL, MVT::i8));
55941 }
55942 }
55943 }
55944
55945 // Repeated opcode.
55946 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55947 // but it currently struggles with different vector widths.
55948 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55949 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
55950 })) {
55951 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55952 SmallVector<SDValue> Subs;
55953 for (SDValue SubOp : SubOps)
55954 Subs.push_back(SubOp.getOperand(I));
55955 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55956 };
55957 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55958 bool AllConstants = true;
55959 bool AllSubVectors = true;
55960 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55961 SDValue Sub = SubOps[I].getOperand(Op);
55962 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55963 SDValue BC = peekThroughBitcasts(Sub);
55964 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
55965 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
55966 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55967 Sub.getOperand(0).getValueType() == VT &&
55968 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
55969 }
55970 return AllConstants || AllSubVectors;
55971 };
55972
55973 switch (Op0.getOpcode()) {
55974 case X86ISD::VBROADCAST: {
55975 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55976 return Op.getOperand(0).getValueType().is128BitVector();
55977 })) {
55978 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55979 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55980 ConcatSubOperand(VT, Ops, 0),
55981 ConcatSubOperand(VT, Ops, 0));
55982 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55983 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55984 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55985 : X86ISD::PSHUFD,
55986 DL, VT, ConcatSubOperand(VT, Ops, 0),
55987 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55988 }
55989 break;
55990 }
55991 case X86ISD::MOVDDUP:
55992 case X86ISD::MOVSHDUP:
55993 case X86ISD::MOVSLDUP: {
55994 if (!IsSplat)
55995 return DAG.getNode(Op0.getOpcode(), DL, VT,
55996 ConcatSubOperand(VT, Ops, 0));
55997 break;
55998 }
55999 case X86ISD::SHUFP: {
56000 // Add SHUFPD support if/when necessary.
56001 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56002 llvm::all_of(Ops, [Op0](SDValue Op) {
56003 return Op.getOperand(2) == Op0.getOperand(2);
56004 })) {
56005 return DAG.getNode(Op0.getOpcode(), DL, VT,
56006 ConcatSubOperand(VT, Ops, 0),
56007 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56008 }
56009 break;
56010 }
56011 case X86ISD::UNPCKH:
56012 case X86ISD::UNPCKL: {
56013 // Don't concatenate build_vector patterns.
56014 if (!IsSplat && EltSizeInBits >= 32 &&
56015 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56016 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56017 none_of(Ops, [](SDValue Op) {
56018 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
56019 ISD::SCALAR_TO_VECTOR ||
56020 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
56021 ISD::SCALAR_TO_VECTOR;
56022 })) {
56023 return DAG.getNode(Op0.getOpcode(), DL, VT,
56024 ConcatSubOperand(VT, Ops, 0),
56025 ConcatSubOperand(VT, Ops, 1));
56026 }
56027 break;
56028 }
56029 case X86ISD::PSHUFHW:
56030 case X86ISD::PSHUFLW:
56031 case X86ISD::PSHUFD:
56032 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56033 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56034 return DAG.getNode(Op0.getOpcode(), DL, VT,
56035 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56036 }
56037 [[fallthrough]];
56038 case X86ISD::VPERMILPI:
56039 if (!IsSplat && EltSizeInBits == 32 &&
56040 (VT.is256BitVector() ||
56041 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56042 all_of(Ops, [&Op0](SDValue Op) {
56043 return Op0.getOperand(1) == Op.getOperand(1);
56044 })) {
56045 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56046 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56047 Res =
56048 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56049 return DAG.getBitcast(VT, Res);
56050 }
56051 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56052 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56053 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56054 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56055 return DAG.getNode(Op0.getOpcode(), DL, VT,
56056 ConcatSubOperand(VT, Ops, 0),
56057 DAG.getTargetConstant(Idx, DL, MVT::i8));
56058 }
56059 break;
56060 case X86ISD::PSHUFB:
56061 case X86ISD::PSADBW:
56062 case X86ISD::VPMADDUBSW:
56063 case X86ISD::VPMADDWD:
56064 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56065 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56066 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56067 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56068 NumOps * SrcVT.getVectorNumElements());
56069 return DAG.getNode(Op0.getOpcode(), DL, VT,
56070 ConcatSubOperand(SrcVT, Ops, 0),
56071 ConcatSubOperand(SrcVT, Ops, 1));
56072 }
56073 break;
56074 case X86ISD::VPERMV:
56075 if (!IsSplat && NumOps == 2 &&
56076 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56077 MVT OpVT = Op0.getSimpleValueType();
56078 int NumSrcElts = OpVT.getVectorNumElements();
56079 SmallVector<int, 64> ConcatMask;
56080 for (unsigned i = 0; i != NumOps; ++i) {
56081 SmallVector<int, 64> SubMask;
56082 SmallVector<SDValue, 2> SubOps;
56083 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
56084 break;
56085 for (int M : SubMask) {
56086 if (0 <= M)
56087 M += i * NumSrcElts;
56088 ConcatMask.push_back(M);
56089 }
56090 }
56091 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56092 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56093 Ops[1].getOperand(1), DAG, DL);
56094 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56095 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56096 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56097 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56098 }
56099 }
56100 break;
56101 case X86ISD::VPERMV3:
56102 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56103 MVT OpVT = Op0.getSimpleValueType();
56104 int NumSrcElts = OpVT.getVectorNumElements();
56105 SmallVector<int, 64> ConcatMask;
56106 for (unsigned i = 0; i != NumOps; ++i) {
56107 SmallVector<int, 64> SubMask;
56108 SmallVector<SDValue, 2> SubOps;
56109 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
56110 break;
56111 for (int M : SubMask) {
56112 if (0 <= M) {
56113 M += M < NumSrcElts ? 0 : NumSrcElts;
56114 M += i * NumSrcElts;
56115 }
56116 ConcatMask.push_back(M);
56117 }
56118 }
56119 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56120 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56121 Ops[1].getOperand(0), DAG, DL);
56122 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56123 Ops[1].getOperand(2), DAG, DL);
56124 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56125 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56126 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56127 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56128 }
56129 }
56130 break;
56131 case X86ISD::VPERM2X128: {
56132 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
56133 assert(NumOps == 2 && "Bad concat_vectors operands");
56134 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
56135 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
56136 // TODO: Handle zero'd subvectors.
56137 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
56138 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
56139 (int)((Imm1 >> 4) & 0x3)};
56140 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
56141 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
56142 Ops[0].getOperand(1), DAG, DL);
56143 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
56144 Ops[1].getOperand(1), DAG, DL);
56145 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
56146 DAG.getBitcast(ShuffleVT, LHS),
56147 DAG.getBitcast(ShuffleVT, RHS),
56148 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
56149 return DAG.getBitcast(VT, Res);
56150 }
56151 }
56152 break;
56153 }
56154 case X86ISD::SHUF128: {
56155 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56156 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
56157 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
56158 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
56159 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
56160 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
56161 Ops[0].getOperand(1), DAG, DL);
56162 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
56163 Ops[1].getOperand(1), DAG, DL);
56164 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
56165 DAG.getTargetConstant(Imm, DL, MVT::i8));
56166 }
56167 break;
56168 }
56169 case ISD::TRUNCATE:
56170 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56171 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56172 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56173 SrcVT == Ops[1].getOperand(0).getValueType() &&
56174 Subtarget.useAVX512Regs() &&
56175 Subtarget.getPreferVectorWidth() >= 512 &&
56176 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56177 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
56178 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56179 ConcatSubOperand(NewSrcVT, Ops, 0));
56180 }
56181 }
56182 break;
56183 case X86ISD::VSHLI:
56184 case X86ISD::VSRLI:
56185 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56186 // TODO: Move this to LowerShiftByScalarImmediate?
56187 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56188 llvm::all_of(Ops, [](SDValue Op) {
56189 return Op.getConstantOperandAPInt(1) == 32;
56190 })) {
56191 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56192 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56193 if (Op0.getOpcode() == X86ISD::VSHLI) {
56194 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56195 {8, 0, 8, 2, 8, 4, 8, 6});
56196 } else {
56197 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56198 {1, 8, 3, 8, 5, 8, 7, 8});
56199 }
56200 return DAG.getBitcast(VT, Res);
56201 }
56202 [[fallthrough]];
56203 case X86ISD::VSRAI:
56204 case X86ISD::VSHL:
56205 case X86ISD::VSRL:
56206 case X86ISD::VSRA:
56207 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56208 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56209 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56210 llvm::all_of(Ops, [Op0](SDValue Op) {
56211 return Op0.getOperand(1) == Op.getOperand(1);
56212 })) {
56213 return DAG.getNode(Op0.getOpcode(), DL, VT,
56214 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56215 }
56216 break;
56217 case X86ISD::VPERMI:
56218 case X86ISD::VROTLI:
56219 case X86ISD::VROTRI:
56220 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56221 llvm::all_of(Ops, [Op0](SDValue Op) {
56222 return Op0.getOperand(1) == Op.getOperand(1);
56223 })) {
56224 return DAG.getNode(Op0.getOpcode(), DL, VT,
56225 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56226 }
56227 break;
56228 case ISD::AND:
56229 case ISD::OR:
56230 case ISD::XOR:
56231 case X86ISD::ANDNP:
56232 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56233 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56234 return DAG.getNode(Op0.getOpcode(), DL, VT,
56235 ConcatSubOperand(VT, Ops, 0),
56236 ConcatSubOperand(VT, Ops, 1));
56237 }
56238 break;
56239 case X86ISD::PCMPEQ:
56240 case X86ISD::PCMPGT:
56241 if (!IsSplat && VT.is256BitVector() &&
56242 (Subtarget.hasInt256() || VT == MVT::v8i32) &&
56243 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
56244 if (Subtarget.hasInt256())
56245 return DAG.getNode(Op0.getOpcode(), DL, VT,
56246 ConcatSubOperand(VT, Ops, 0),
56247 ConcatSubOperand(VT, Ops, 1));
56248
56249 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
56250 // TODO: Handle v4f64 as well?
56251 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
56252 for (unsigned I = 0; I != NumOps; ++I) {
56253 MaxSigBitsLHS =
56254 std::max(MaxSigBitsLHS,
56255 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
56256 MaxSigBitsRHS =
56257 std::max(MaxSigBitsRHS,
56258 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
56259 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
56260 break;
56261 }
56262
56263 ISD::CondCode ICC =
56264 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
56265 ISD::CondCode FCC =
56266 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
56267
56268 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
56269 MVT FpVT = VT.changeVectorElementType(FpSVT);
56270
56271 if (std::optional<unsigned> CastOpc =
56272 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
56273 SDValue LHS = ConcatSubOperand(VT, Ops, 0);
56274 SDValue RHS = ConcatSubOperand(VT, Ops, 1);
56275 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
56276 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
56277
56278 bool IsAlwaysSignaling;
56279 unsigned FSETCC =
56280 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
56281 return DAG.getBitcast(
56282 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
56283 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
56284 }
56285 }
56286 break;
56287 case ISD::CTPOP:
56288 case ISD::CTTZ:
56289 case ISD::CTLZ:
56290 case ISD::CTTZ_ZERO_UNDEF:
56291 case ISD::CTLZ_ZERO_UNDEF:
56292 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56293 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56294 return DAG.getNode(Op0.getOpcode(), DL, VT,
56295 ConcatSubOperand(VT, Ops, 0));
56296 }
56297 break;
56298 case X86ISD::GF2P8AFFINEQB:
56299 if (!IsSplat &&
56300 (VT.is256BitVector() ||
56301 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56302 llvm::all_of(Ops, [Op0](SDValue Op) {
56303 return Op0.getOperand(2) == Op.getOperand(2);
56304 })) {
56305 return DAG.getNode(Op0.getOpcode(), DL, VT,
56306 ConcatSubOperand(VT, Ops, 0),
56307 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56308 }
56309 break;
56310 case ISD::ADD:
56311 case ISD::SUB:
56312 case ISD::MUL:
56313 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56314 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56315 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56316 return DAG.getNode(Op0.getOpcode(), DL, VT,
56317 ConcatSubOperand(VT, Ops, 0),
56318 ConcatSubOperand(VT, Ops, 1));
56319 }
56320 break;
56321 // VADD, VSUB and VMUL can execute on more ports than VINSERT and their
56322 // latencies are short, so we only replace them when doing so does not
56323 // introduce extra VINSERTs.
56324 case ISD::FADD:
56325 case ISD::FSUB:
56326 case ISD::FMUL:
56327 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
56328 (VT.is256BitVector() ||
56329 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56330 return DAG.getNode(Op0.getOpcode(), DL, VT,
56331 ConcatSubOperand(VT, Ops, 0),
56332 ConcatSubOperand(VT, Ops, 1));
56333 }
56334 break;
56335 case ISD::FDIV:
56336 if (!IsSplat && (VT.is256BitVector() ||
56337 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56338 return DAG.getNode(Op0.getOpcode(), DL, VT,
56339 ConcatSubOperand(VT, Ops, 0),
56340 ConcatSubOperand(VT, Ops, 1));
56341 }
56342 break;
56343 case X86ISD::HADD:
56344 case X86ISD::HSUB:
56345 case X86ISD::FHADD:
56346 case X86ISD::FHSUB:
56347 if (!IsSplat && VT.is256BitVector() &&
56348 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56349 return DAG.getNode(Op0.getOpcode(), DL, VT,
56350 ConcatSubOperand(VT, Ops, 0),
56351 ConcatSubOperand(VT, Ops, 1));
56352 }
56353 break;
56354 case X86ISD::PACKSS:
56355 case X86ISD::PACKUS:
56356 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56357 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56358 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56359 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56360 NumOps * SrcVT.getVectorNumElements());
56361 return DAG.getNode(Op0.getOpcode(), DL, VT,
56362 ConcatSubOperand(SrcVT, Ops, 0),
56363 ConcatSubOperand(SrcVT, Ops, 1));
56364 }
56365 break;
56366 case X86ISD::PALIGNR:
56367 if (!IsSplat &&
56368 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56369 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56370 llvm::all_of(Ops, [Op0](SDValue Op) {
56371 return Op0.getOperand(2) == Op.getOperand(2);
56372 })) {
56373 return DAG.getNode(Op0.getOpcode(), DL, VT,
56374 ConcatSubOperand(VT, Ops, 0),
56375 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56376 }
56377 break;
56378 case X86ISD::BLENDI:
56379 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
56380 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
56381 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
56382 // MVT::v16i16 has repeated blend mask.
56383 if (Op0.getSimpleValueType() == MVT::v16i16) {
56384 Mask0 = (Mask0 << 8) | Mask0;
56385 Mask1 = (Mask1 << 8) | Mask1;
56386 }
56387 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
56388 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
56389 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
56390 SDValue Sel =
56391 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
56392 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
56393 ConcatSubOperand(VT, Ops, 0));
56394 }
56395 break;
56396 case ISD::VSELECT:
56397 if (!IsSplat && Subtarget.hasAVX512() &&
56398 (VT.is256BitVector() ||
56399 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56400 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56401 EVT SelVT = Ops[0].getOperand(0).getValueType();
56402 if (SelVT.getVectorElementType() == MVT::i1) {
56403 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
56404 NumOps * SelVT.getVectorNumElements());
56405 if (TLI.isTypeLegal(SelVT))
56406 return DAG.getNode(Op0.getOpcode(), DL, VT,
56407 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56408 ConcatSubOperand(VT, Ops, 1),
56409 ConcatSubOperand(VT, Ops, 2));
56410 }
56411 }
56412 [[fallthrough]];
56413 case X86ISD::BLENDV:
56414 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
56415 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56416 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56417 EVT SelVT = Ops[0].getOperand(0).getValueType();
56418 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
56419 if (TLI.isTypeLegal(SelVT))
56420 return DAG.getNode(Op0.getOpcode(), DL, VT,
56421 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56422 ConcatSubOperand(VT, Ops, 1),
56423 ConcatSubOperand(VT, Ops, 2));
56424 }
56425 break;
56426 }
56427 }
56428
56429 // Fold subvector loads into one.
56430 // If needed, look through bitcasts to get to the load.
56431 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56432 unsigned Fast;
56433 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56434 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
56435 *FirstLd->getMemOperand(), &Fast) &&
56436 Fast) {
56437 if (SDValue Ld =
56438 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56439 return Ld;
56440 }
56441 }
56442
56443 // Attempt to fold target constant loads.
56444 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56445 SmallVector<APInt> EltBits;
56446 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56447 for (unsigned I = 0; I != NumOps; ++I) {
56448 APInt OpUndefElts;
56449 SmallVector<APInt> OpEltBits;
56450 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56451 OpEltBits, /*AllowWholeUndefs*/ true,
56452 /*AllowPartialUndefs*/ false))
56453 break;
56454 EltBits.append(OpEltBits);
56455 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56456 }
56457 if (EltBits.size() == VT.getVectorNumElements()) {
56458 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
56459 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
56460 SDValue CV = DAG.getConstantPool(C, PVT);
56461 MachineFunction &MF = DAG.getMachineFunction();
56462 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
56463 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
56464 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
56465 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
56466 return Ld;
56467 }
56468 }
56469
56470 // If this simple subvector or scalar/subvector broadcast_load is inserted
56471 // into both halves, use a larger broadcast_load. Update other uses to use
56472 // an extracted subvector.
56473 if (IsSplat &&
56474 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56475 if (ISD::isNormalLoad(Op0.getNode()) ||
56476 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56477 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56478 auto *Mem = cast<MemSDNode>(Op0);
56479 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56480 ? X86ISD::VBROADCAST_LOAD
56481 : X86ISD::SUBV_BROADCAST_LOAD;
56482 if (SDValue BcastLd =
56483 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56484 SDValue BcastSrc =
56485 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56486 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56487 return BcastLd;
56488 }
56489 }
56490 }
56491
56492 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
56493 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
56494 Subtarget.useAVX512Regs()) {
56495 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
56496 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
56497 Res = DAG.getBitcast(ShuffleVT, Res);
56498 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
56499 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56500 return DAG.getBitcast(VT, Res);
56501 }
56502
56503 return SDValue();
56504}
56505
56506 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56507 TargetLowering::DAGCombinerInfo &DCI,
56508 const X86Subtarget &Subtarget) {
56509 EVT VT = N->getValueType(0);
56510 EVT SrcVT = N->getOperand(0).getValueType();
56511 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56512 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56513
56514 if (VT.getVectorElementType() == MVT::i1) {
56515 // Attempt to constant fold.
56516 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56517 APInt Constant = APInt::getZero(VT.getSizeInBits());
56518 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56519 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56520 if (!C) break;
56521 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56522 if (I == (E - 1)) {
56523 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56524 if (TLI.isTypeLegal(IntVT))
56525 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56526 }
56527 }
56528
56529 // Don't do anything else for i1 vectors.
56530 return SDValue();
56531 }
56532
56533 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56534 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56535 DCI, Subtarget))
56536 return R;
56537 }
56538
56539 return SDValue();
56540}
56541
56542 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56543 TargetLowering::DAGCombinerInfo &DCI,
56544 const X86Subtarget &Subtarget) {
56545 if (DCI.isBeforeLegalizeOps())
56546 return SDValue();
56547
56548 MVT OpVT = N->getSimpleValueType(0);
56549
56550 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56551
56552 SDLoc dl(N);
56553 SDValue Vec = N->getOperand(0);
56554 SDValue SubVec = N->getOperand(1);
56555
56556 uint64_t IdxVal = N->getConstantOperandVal(2);
56557 MVT SubVecVT = SubVec.getSimpleValueType();
56558
56559 if (Vec.isUndef() && SubVec.isUndef())
56560 return DAG.getUNDEF(OpVT);
56561
56562 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56563 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56564 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56565 return getZeroVector(OpVT, Subtarget, DAG, dl);
56566
56567 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56568 // If we're inserting into a zero vector and then into a larger zero vector,
56569 // just insert into the larger zero vector directly.
56570 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56571 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56572 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56573 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56574 getZeroVector(OpVT, Subtarget, DAG, dl),
56575 SubVec.getOperand(1),
56576 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56577 }
56578
56579 // If we're inserting into a zero vector and our input was extracted from an
56580 // insert into a zero vector of the same type and the extraction was at
56581 // least as large as the original insertion. Just insert the original
56582 // subvector into a zero vector.
56583 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56584 isNullConstant(SubVec.getOperand(1)) &&
56585 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56586 SDValue Ins = SubVec.getOperand(0);
56587 if (isNullConstant(Ins.getOperand(2)) &&
56588 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56589 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56590 SubVecVT.getFixedSizeInBits())
56591 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56592 getZeroVector(OpVT, Subtarget, DAG, dl),
56593 Ins.getOperand(1), N->getOperand(2));
56594 }
56595 }
56596
56597 // Stop here if this is an i1 vector.
56598 if (IsI1Vector)
56599 return SDValue();
56600
56601 // Eliminate an intermediate vector widening:
56602 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56603 // insert_subvector X, Y, Idx
56604 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56605 // there?
56606 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56607 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56608 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56609 SubVec.getOperand(1), N->getOperand(2));
56610
56611 // If this is an insert of an extract, combine to a shuffle. Don't do this
56612 // if the insert or extract can be represented with a subregister operation.
56613 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56614 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56615 (IdxVal != 0 ||
56616 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56617 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56618 if (ExtIdxVal != 0) {
56619 int VecNumElts = OpVT.getVectorNumElements();
56620 int SubVecNumElts = SubVecVT.getVectorNumElements();
56621 SmallVector<int, 64> Mask(VecNumElts);
56622 // First create an identity shuffle mask.
56623 for (int i = 0; i != VecNumElts; ++i)
56624 Mask[i] = i;
56625 // Now insert the extracted portion.
56626 for (int i = 0; i != SubVecNumElts; ++i)
56627 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56628
56629 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56630 }
56631 }
56632
56633 // Match concat_vector style patterns.
56634 SmallVector<SDValue, 2> SubVectorOps;
56635 if (collectConcatOps(N, SubVectorOps, DAG)) {
56636 if (SDValue Fold =
56637 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56638 return Fold;
56639
56640 // If we're inserting all zeros into the upper half, change this to
56641 // a concat with zero. We will match this to a move
56642 // with implicit upper bit zeroing during isel.
56643 // We do this here because we don't want combineConcatVectorOps to
56644 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56645 if (SubVectorOps.size() == 2 &&
56646 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56647 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56648 getZeroVector(OpVT, Subtarget, DAG, dl),
56649 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56650
56651 // Attempt to recursively combine to a shuffle.
56652 if (all_of(SubVectorOps, [](SDValue SubOp) {
56653 return isTargetShuffle(SubOp.getOpcode());
56654 })) {
56655 SDValue Op(N, 0);
56656 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56657 return Res;
56658 }
56659 }
56660
56661 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56662 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56663 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56664
56665 // If this is a broadcast load inserted into an upper undef, use a larger
56666 // broadcast load.
56667 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56668 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56669 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56670 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56671 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56672 SDValue BcastLd =
56673 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56674 MemIntr->getMemoryVT(),
56675 MemIntr->getMemOperand());
56676 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56677 return BcastLd;
56678 }
56679
56680 // If we're splatting the lower half subvector of a full vector load into the
56681 // upper half, attempt to create a subvector broadcast.
56682 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56683 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56684 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56685 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56686 if (VecLd && SubLd &&
56687 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56688 SubVec.getValueSizeInBits() / 8, 0))
56689 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56690 SubLd, 0, DAG);
56691 }
56692
56693 return SDValue();
56694}
56695
56696/// If we are extracting a subvector of a vector select and the select condition
56697/// is composed of concatenated vectors, try to narrow the select width. This
56698/// is a common pattern for AVX1 integer code because 256-bit selects may be
56699/// legal, but there is almost no integer math/logic available for 256-bit.
56700/// This function should only be called with legal types (otherwise, the calls
56701/// to get simple value types will assert).
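// Sketch of the narrowing (low 128-bit half shown for illustration):
//   extract_subvector(vselect(concat(C0,C1), T, F), 0)
//     --> vselect(C0, extract_subvector(T, 0), extract_subvector(F, 0))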
56702 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56703 SelectionDAG &DAG) {
56704 SDValue Sel = Ext->getOperand(0);
56705 if (Sel.getOpcode() != ISD::VSELECT ||
56706 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56707 return SDValue();
56708
56709 // Note: We assume simple value types because this should only be called with
56710 // legal operations/types.
56711 // TODO: This can be extended to handle extraction to 256-bits.
56712 MVT VT = Ext->getSimpleValueType(0);
56713 if (!VT.is128BitVector())
56714 return SDValue();
56715
56716 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56717 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56718 return SDValue();
56719
56720 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56721 MVT SelVT = Sel.getSimpleValueType();
56722 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56723 "Unexpected vector type with legal operations");
56724
56725 unsigned SelElts = SelVT.getVectorNumElements();
56726 unsigned CastedElts = WideVT.getVectorNumElements();
56727 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56728 if (SelElts % CastedElts == 0) {
56729 // The select has the same or more (narrower) elements than the extract
56730 // operand. The extraction index gets scaled by that factor.
56731 ExtIdx *= (SelElts / CastedElts);
56732 } else if (CastedElts % SelElts == 0) {
56733 // The select has fewer (wider) elements than the extract operand. Make sure
56734 // that the extraction index can be divided evenly.
56735 unsigned IndexDivisor = CastedElts / SelElts;
56736 if (ExtIdx % IndexDivisor != 0)
56737 return SDValue();
56738 ExtIdx /= IndexDivisor;
56739 } else {
56740 llvm_unreachable("Element count of simple vector types are not divisible?");
56741 }
56742
56743 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56744 unsigned NarrowElts = SelElts / NarrowingFactor;
56745 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56746 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56747 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56748 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56749 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56750 return DAG.getBitcast(VT, NarrowSel);
56751}
56752
56753 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56754 TargetLowering::DAGCombinerInfo &DCI,
56755 const X86Subtarget &Subtarget) {
56756 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56757 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56758 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56759 // We let generic combining take over from there to simplify the
56760 // insert/extract and 'not'.
56761 // This pattern emerges during AVX1 legalization. We handle it before lowering
56762 // to avoid complications like splitting constant vector loads.
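// On AVX1 the 256-bit integer 'and' will be split into 128-bit halves anyway,
// so extracting half N early lets the concatenated 'not' operand collapse to
// its matching 128-bit piece instead of being materialized at full width.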
56763
56764 // Capture the original wide type in the likely case that we need to bitcast
56765 // back to this type.
56766 if (!N->getValueType(0).isSimple())
56767 return SDValue();
56768
56769 MVT VT = N->getSimpleValueType(0);
56770 SDValue InVec = N->getOperand(0);
56771 unsigned IdxVal = N->getConstantOperandVal(1);
56772 SDValue InVecBC = peekThroughBitcasts(InVec);
56773 EVT InVecVT = InVec.getValueType();
56774 unsigned SizeInBits = VT.getSizeInBits();
56775 unsigned InSizeInBits = InVecVT.getSizeInBits();
56776 unsigned NumSubElts = VT.getVectorNumElements();
56777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56778 SDLoc DL(N);
56779
56780 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56781 TLI.isTypeLegal(InVecVT) &&
56782 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56783 auto isConcatenatedNot = [](SDValue V) {
56784 V = peekThroughBitcasts(V);
56785 if (!isBitwiseNot(V))
56786 return false;
56787 SDValue NotOp = V->getOperand(0);
56788 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56789 };
56790 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56791 isConcatenatedNot(InVecBC.getOperand(1))) {
56792 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56793 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
56794 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56795 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56796 }
56797 }
56798
56799 if (DCI.isBeforeLegalizeOps())
56800 return SDValue();
56801
56802 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
56803 return V;
56804
56805 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56806 return getZeroVector(VT, Subtarget, DAG, DL);
56807
56808 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56809 if (VT.getScalarType() == MVT::i1)
56810 return DAG.getConstant(1, DL, VT);
56811 return getOnesVector(VT, DAG, DL);
56812 }
56813
56814 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56815 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
56816
56817 // If we are extracting from an insert into a larger vector, replace with a
56818 // smaller insert if we don't access less than the original subvector. Don't
56819 // do this for i1 vectors.
56820 // TODO: Relax the matching indices requirement?
56821 if (VT.getVectorElementType() != MVT::i1 &&
56822 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56823 IdxVal == InVec.getConstantOperandVal(2) &&
56824 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56825 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56826 InVec.getOperand(0), N->getOperand(1));
56827 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56828 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56829 InVec.getOperand(1),
56830 DAG.getVectorIdxConstant(NewIdxVal, DL));
56831 }
56832
56833 // If we're extracting an upper subvector from a broadcast we should just
56834 // extract the lowest subvector instead which should allow
56835 // SimplifyDemandedVectorElts do more simplifications.
56836 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56837 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56838 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56839 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56840
56841 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56842 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56843 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56844 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56845
56846 // Attempt to extract from the source of a shuffle vector.
56847 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56848 SmallVector<int, 32> ShuffleMask;
56849 SmallVector<int, 32> ScaledMask;
56850 SmallVector<SDValue, 2> ShuffleInputs;
56851 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56852 // Decode the shuffle mask and scale it so its shuffling subvectors.
56853 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56854 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56855 unsigned SubVecIdx = IdxVal / NumSubElts;
56856 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56857 return DAG.getUNDEF(VT);
56858 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56859 return getZeroVector(VT, Subtarget, DAG, DL);
56860 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56861 if (Src.getValueSizeInBits() == InSizeInBits) {
56862 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56863 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56864 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56865 DL, SizeInBits);
56866 }
56867 }
56868 }
56869
56870 auto IsExtractFree = [](SDValue V) {
56871 V = peekThroughBitcasts(V);
56872 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
56873 return true;
56875 return true;
56876 return V.isUndef();
56877 };
56878
56879 // If we're extracting the lowest subvector and we're the only user,
56880 // we may be able to perform this with a smaller vector width.
56881 unsigned InOpcode = InVec.getOpcode();
56882 if (InVec.hasOneUse()) {
56883 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56884 // v2f64 CVTDQ2PD(v4i32).
56885 if (InOpcode == ISD::SINT_TO_FP &&
56886 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56887 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
56888 }
56889 // v2f64 CVTUDQ2PD(v4i32).
56890 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56891 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56892 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
56893 }
56894 // v2f64 CVTPS2PD(v4f32).
56895 if (InOpcode == ISD::FP_EXTEND &&
56896 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56897 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
56898 }
56899 }
56900 // v4i32 CVTPS2DQ(v4f32).
56901 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
56902 SDValue Src = InVec.getOperand(0);
56903 if (Src.getValueType().getScalarType() == MVT::f32)
56904 return DAG.getNode(InOpcode, DL, VT,
56905 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
56906 }
56907 if (IdxVal == 0 &&
56908 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
56909 (SizeInBits == 128 || SizeInBits == 256) &&
56910 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56911 SDValue Ext = InVec.getOperand(0);
56912 if (Ext.getValueSizeInBits() > SizeInBits)
56913 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56914 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56915 return DAG.getNode(ExtOp, DL, VT, Ext);
56916 }
56917 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56918 InVec.getOperand(0).getValueType().is256BitVector() &&
56919 InVec.getOperand(1).getValueType().is256BitVector() &&
56920 InVec.getOperand(2).getValueType().is256BitVector()) {
56921 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56922 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56923 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56924 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56925 }
56926 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56927 (SizeInBits == 128 || SizeInBits == 256)) {
56928 SDValue InVecSrc = InVec.getOperand(0);
56929 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56930 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56931 return DAG.getNode(InOpcode, DL, VT, Ext);
56932 }
56933 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
56934 InOpcode == X86ISD::PCMPGT) &&
56935 (IsExtractFree(InVec.getOperand(0)) ||
56936 IsExtractFree(InVec.getOperand(1))) &&
56937 SizeInBits == 128) {
56938 SDValue Ext0 =
56939 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56940 SDValue Ext1 =
56941 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
56942 if (InOpcode == X86ISD::CMPP)
56943 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
56944 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
56945 }
56946 if (InOpcode == X86ISD::MOVDDUP &&
56947 (SizeInBits == 128 || SizeInBits == 256)) {
56948 SDValue Ext0 =
56949 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56950 return DAG.getNode(InOpcode, DL, VT, Ext0);
56951 }
56952 }
56953
56954 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
56955 // as this is very likely to fold into a shuffle/truncation.
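// Illustrative example: (v2i64 extract_subvector (v4i64 vsrli X, 32), 2)
//   --> (v2i64 vsrli (v2i64 extract_subvector X, 2), 32)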
56956 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56957 InVecVT.getScalarSizeInBits() == 64 &&
56958 InVec.getConstantOperandAPInt(1) == 32) {
56959 SDValue Ext =
56960 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56961 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56962 }
56963
56964 return SDValue();
56965}
56966
56967 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56968 EVT VT = N->getValueType(0);
56969 SDValue Src = N->getOperand(0);
56970 SDLoc DL(N);
56971
56972 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56973 // This occurs frequently in our masked scalar intrinsic code and our
56974 // floating point select lowering with AVX512.
56975 // TODO: SimplifyDemandedBits instead?
56976 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56977 isOneConstant(Src.getOperand(1)))
56978 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56979
56980 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56981 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56982 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56983 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56984 isNullConstant(Src.getOperand(1)))
56985 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56986 Src.getOperand(1));
56987
56988 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56989 // TODO: Move to DAGCombine/SimplifyDemandedBits?
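// Illustrative example of the zero-extend path below:
//   (v2i64 scalar_to_vector (i64 zero_extend (i32 X)))
//     --> bitcast (v4i32 vzext_movl (v4i32 scalar_to_vector X))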
56990 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56991 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56992 if (Op.getValueType() != MVT::i64)
56993 return SDValue();
56994 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56995 if (Op.getOpcode() == Opc &&
56996 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56997 return Op.getOperand(0);
56998 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56999 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57000 if (Ld->getExtensionType() == Ext &&
57001 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57002 return Op;
57003 if (IsZeroExt) {
57004 KnownBits Known = DAG.computeKnownBits(Op);
57005 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57006 return Op;
57007 }
57008 return SDValue();
57009 };
57010
57011 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57012 return DAG.getBitcast(
57013 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57014 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57015
57016 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57017 return DAG.getBitcast(
57018 VT,
57019 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57020 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57021 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57022 }
57023
57024 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57025 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57026 Src.getOperand(0).getValueType() == MVT::x86mmx)
57027 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57028
57029 // See if we're broadcasting the scalar value, in which case just reuse that.
57030 // Ensure the broadcast uses the same SDValue (result number) as Src.
57031 if (VT.getScalarType() == Src.getValueType())
57032 for (SDNode *User : Src->uses())
57033 if (User->getOpcode() == X86ISD::VBROADCAST &&
57034 Src == User->getOperand(0)) {
57035 unsigned SizeInBits = VT.getFixedSizeInBits();
57036 unsigned BroadcastSizeInBits =
57037 User->getValueSizeInBits(0).getFixedValue();
57038 if (BroadcastSizeInBits == SizeInBits)
57039 return SDValue(User, 0);
57040 if (BroadcastSizeInBits > SizeInBits)
57041 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57042 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57043 // coverage.
57044 }
57045
57046 return SDValue();
57047}
57048
57049// Simplify PMULDQ and PMULUDQ operations.
57050 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57051 TargetLowering::DAGCombinerInfo &DCI,
57052 const X86Subtarget &Subtarget) {
57053 SDValue LHS = N->getOperand(0);
57054 SDValue RHS = N->getOperand(1);
57055
57056 // Canonicalize constant to RHS.
57057 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57058 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57059 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57060
57061 // Multiply by zero.
57062 // Don't return RHS as it may contain UNDEFs.
57063 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57064 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57065
57066 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57068 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57069 return SDValue(N, 0);
57070
57071 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57072 // convert it to any_extend_invec, due to the LegalOperations check, do the
57073 // conversion directly to a vector shuffle manually. This exposes combine
57074 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57075 // combineX86ShufflesRecursively on SSE4.1 targets.
57076 // FIXME: This is basically a hack around several other issues related to
57077 // ANY_EXTEND_VECTOR_INREG.
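// Note: the {0,-1,1,-1} shuffle mask below interleaves the two low i32
// elements with undefs; bitcast to v2i64 this behaves like an
// any_extend_vector_inreg, and PMULDQ/PMULUDQ only read the low 32 bits
// of each i64 lane anyway.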
57078 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57079 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57080 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57081 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57082 SDLoc dl(N);
57083 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57084 LHS.getOperand(0), { 0, -1, 1, -1 });
57085 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57086 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57087 }
57088 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57089 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57090 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57091 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57092 SDLoc dl(N);
57093 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57094 RHS.getOperand(0), { 0, -1, 1, -1 });
57095 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57096 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57097 }
57098
57099 return SDValue();
57100}
57101
57102// Simplify VPMADDUBSW/VPMADDWD operations.
57103 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57104 TargetLowering::DAGCombinerInfo &DCI) {
57105 MVT VT = N->getSimpleValueType(0);
57106 SDValue LHS = N->getOperand(0);
57107 SDValue RHS = N->getOperand(1);
57108 unsigned Opc = N->getOpcode();
57109 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
57110 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
57111 "Unexpected PMADD opcode");
57112
57113 // Multiply by zero.
57114 // Don't return LHS/RHS as it may contain UNDEFs.
57115 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57116 ISD::isBuildVectorAllZeros(RHS.getNode()))
57117 return DAG.getConstant(0, SDLoc(N), VT);
57118
57119 // Constant folding.
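// For reference, each result element folds as follows (extension chosen
// per opcode, matching the loop below):
//   VPMADDWD:   res[i] = sext(l[2i])*sext(r[2i]) + sext(l[2i+1])*sext(r[2i+1])
//   VPMADDUBSW: res[i] = sadd_sat(zext(l[2i])*sext(r[2i]),
//                                 zext(l[2i+1])*sext(r[2i+1]))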
57120 APInt LHSUndefs, RHSUndefs;
57121 SmallVector<APInt> LHSBits, RHSBits;
57122 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
57123 unsigned DstEltBits = VT.getScalarSizeInBits();
57124 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
57125 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
57126 SmallVector<APInt> Result;
57127 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
57128 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
57129 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
57130 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
57131 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
57132 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
57133 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
57134 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
57135 Result.push_back(Res);
57136 }
57137 return getConstVector(Result, VT, DAG, SDLoc(N));
57138 }
57139
57140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57141 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57142 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57143 return SDValue(N, 0);
57144
57145 return SDValue();
57146}
57147
57148 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57149 TargetLowering::DAGCombinerInfo &DCI,
57150 const X86Subtarget &Subtarget) {
57151 EVT VT = N->getValueType(0);
57152 SDValue In = N->getOperand(0);
57153 unsigned Opcode = N->getOpcode();
57154 unsigned InOpcode = In.getOpcode();
57155 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57156 SDLoc DL(N);
57157
57158 // Try to merge vector loads and extend_inreg to an extload.
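// Illustrative example: (v4i32 zero_extend_vector_inreg (v8i16 load p))
//   --> (v4i32 zextload p, memvt=v4i16), when that extload is legal.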
57159 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57160 In.hasOneUse()) {
57161 auto *Ld = cast<LoadSDNode>(In);
57162 if (Ld->isSimple()) {
57163 MVT SVT = In.getSimpleValueType().getVectorElementType();
57164 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57165 ? ISD::SEXTLOAD
57166 : ISD::ZEXTLOAD;
57167 EVT MemVT = VT.changeVectorElementType(SVT);
57168 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57169 SDValue Load = DAG.getExtLoad(
57170 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57171 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57172 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57173 return Load;
57174 }
57175 }
57176 }
57177
57178 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57179 if (Opcode == InOpcode)
57180 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57181
57182 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57183 // -> EXTEND_VECTOR_INREG(X).
57184 // TODO: Handle non-zero subvector indices.
57185 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57186 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57187 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57188 In.getValueSizeInBits())
57189 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57190
57191 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57192 // TODO: Move to DAGCombine?
57193 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57194 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57195 In.getValueSizeInBits() == VT.getSizeInBits()) {
57196 unsigned NumElts = VT.getVectorNumElements();
57197 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57198 EVT EltVT = In.getOperand(0).getValueType();
57199 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57200 for (unsigned I = 0; I != NumElts; ++I)
57201 Elts[I * Scale] = In.getOperand(I);
57202 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57203 }
57204
57205 // Attempt to combine as a shuffle on SSE41+ targets.
57206 if (Subtarget.hasSSE41()) {
57207 SDValue Op(N, 0);
57208 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57209 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57210 return Res;
57211 }
57212
57213 return SDValue();
57214}
57215
57216 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57217 TargetLowering::DAGCombinerInfo &DCI) {
57218 EVT VT = N->getValueType(0);
57219
57220 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57221 return DAG.getConstant(0, SDLoc(N), VT);
57222
57223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57224 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57225 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57226 return SDValue(N, 0);
57227
57228 return SDValue();
57229}
57230
57231// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57232 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
57233 // extra instructions between the conversions due to going to scalar and back.
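// Informational note: the CVTPS2PH immediate 4 used below sets imm[2],
// i.e. round using the current MXCSR rounding mode.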
57234 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57235 const X86Subtarget &Subtarget) {
57236 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57237 return SDValue();
57238
57239 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57240 return SDValue();
57241
57242 if (N->getValueType(0) != MVT::f32 ||
57243 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57244 return SDValue();
57245
57246 SDLoc dl(N);
57247 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57248 N->getOperand(0).getOperand(0));
57249 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57250 DAG.getTargetConstant(4, dl, MVT::i32));
57251 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57252 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57253 DAG.getIntPtrConstant(0, dl));
57254}
57255
57256 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57257 TargetLowering::DAGCombinerInfo &DCI,
57258 const X86Subtarget &Subtarget) {
57259 EVT VT = N->getValueType(0);
57260 bool IsStrict = N->isStrictFPOpcode();
57261 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57262 EVT SrcVT = Src.getValueType();
57263
57264 SDLoc dl(N);
57265 if (SrcVT.getScalarType() == MVT::bf16) {
57266 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
57267 !IsStrict && Src.getOperand(0).getValueType() == VT)
57268 return Src.getOperand(0);
57269
57270 if (!SrcVT.isVector())
57271 return SDValue();
57272
57273 assert(!IsStrict && "Strict FP doesn't support BF16");
57274 if (VT.getVectorElementType() == MVT::f64) {
57275 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
57276 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
57277 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
57278 }
57279 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
57280 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
57281 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
57282 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
57283 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
57284 return DAG.getBitcast(VT, Src);
57285 }
57286
57287 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57288 return SDValue();
57289
57290 if (Subtarget.hasFP16())
57291 return SDValue();
57292
57293 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57294 return SDValue();
57295
57296 if (VT.getVectorElementType() != MVT::f32 &&
57297 VT.getVectorElementType() != MVT::f64)
57298 return SDValue();
57299
57300 unsigned NumElts = VT.getVectorNumElements();
57301 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57302 return SDValue();
57303
57304 // Convert the input to vXi16.
57305 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57306 Src = DAG.getBitcast(IntVT, Src);
57307
57308 // Widen to at least 8 input elements.
57309 if (NumElts < 8) {
57310 unsigned NumConcats = 8 / NumElts;
57311 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57312 : DAG.getConstant(0, dl, IntVT);
57313 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57314 Ops[0] = Src;
57315 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57316 }
57317
57318 // Destination is vXf32 with at least 4 elements.
57319 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57320 std::max(4U, NumElts));
57321 SDValue Cvt, Chain;
57322 if (IsStrict) {
57323 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57324 {N->getOperand(0), Src});
57325 Chain = Cvt.getValue(1);
57326 } else {
57327 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57328 }
57329
57330 if (NumElts < 4) {
57331 assert(NumElts == 2 && "Unexpected size");
57332 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57333 DAG.getIntPtrConstant(0, dl));
57334 }
57335
57336 if (IsStrict) {
57337 // Extend to the original VT if necessary.
57338 if (Cvt.getValueType() != VT) {
57339 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57340 {Chain, Cvt});
57341 Chain = Cvt.getValue(1);
57342 }
57343 return DAG.getMergeValues({Cvt, Chain}, dl);
57344 }
57345
57346 // Extend to the original VT if necessary.
57347 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57348}
57349
57350// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57351// from. Limit this to cases where the loads have the same input chain and the
57352// output chains are unused. This avoids any memory ordering issues.
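// Illustrative example: if (v4f32 vbroadcast_load p) and a wider
// (v8f32 vbroadcast_load p) share the same chain, the narrow node is
// rewritten as an extract_subvector of the wider load.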
57353 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57354 TargetLowering::DAGCombinerInfo &DCI) {
57355 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57356 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57357 "Unknown broadcast load type");
57358
57359 // Only do this if the chain result is unused.
57360 if (N->hasAnyUseOfValue(1))
57361 return SDValue();
57362
57363 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57364
57365 SDValue Ptr = MemIntrin->getBasePtr();
57366 SDValue Chain = MemIntrin->getChain();
57367 EVT VT = N->getSimpleValueType(0);
57368 EVT MemVT = MemIntrin->getMemoryVT();
57369
57370 // Look at other users of our base pointer and try to find a wider broadcast.
57371 // The input chain and the size of the memory VT must match.
57372 for (SDNode *User : Ptr->uses())
57373 if (User != N && User->getOpcode() == N->getOpcode() &&
57374 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57375 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57376 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57377 MemVT.getSizeInBits() &&
57378 !User->hasAnyUseOfValue(1) &&
57379 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57380 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57381 VT.getSizeInBits());
57382 Extract = DAG.getBitcast(VT, Extract);
57383 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57384 }
57385
57386 return SDValue();
57387}
57388
57389 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57390 const X86Subtarget &Subtarget) {
57391 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57392 return SDValue();
57393
57394 bool IsStrict = N->isStrictFPOpcode();
57395 EVT VT = N->getValueType(0);
57396 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57397 EVT SrcVT = Src.getValueType();
57398
57399 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57400 SrcVT.getVectorElementType() != MVT::f32)
57401 return SDValue();
57402
57403 SDLoc dl(N);
57404
57405 SDValue Cvt, Chain;
57406 unsigned NumElts = VT.getVectorNumElements();
57407 if (Subtarget.hasFP16()) {
57408 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
57409 // v4f32 (xint_to_fp v4i64))))
57410 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
57411 // v8f16 (CVTXI2P v4i64)))
57412 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
57413 Src.getNumOperands() == 2) {
57414 SDValue Cvt0, Cvt1;
57415 SDValue Op0 = Src.getOperand(0);
57416 SDValue Op1 = Src.getOperand(1);
57417 bool IsOp0Strict = Op0->isStrictFPOpcode();
57418 if (Op0.getOpcode() != Op1.getOpcode() ||
57419 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57420 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57421 return SDValue();
57422 }
57423 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57424 if (IsStrict) {
57425 assert(IsOp0Strict && "Op0 must be strict node");
57426 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57427 ? X86ISD::STRICT_CVTSI2P
57428 : X86ISD::STRICT_CVTUI2P;
57429 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57430 {Op0.getOperand(0), Op0.getOperand(1)});
57431 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57432 {Op1.getOperand(0), Op1.getOperand(1)});
57433 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57434 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57435 }
57436 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57437 : X86ISD::CVTUI2P;
57438 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57439 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57440 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57441 }
57442 return SDValue();
57443 }
57444
57445 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57446 return SDValue();
57447
57448 // Widen to at least 4 input elements.
57449 if (NumElts < 4)
57450 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57451 DAG.getConstantFP(0.0, dl, SrcVT));
57452
57453 // Destination is v8i16 with at least 8 elements.
57454 EVT CvtVT =
57455 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57456 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57457 if (IsStrict) {
57458 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57459 {N->getOperand(0), Src, Rnd});
57460 Chain = Cvt.getValue(1);
57461 } else {
57462 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57463 }
57464
57465 // Extract down to real number of elements.
57466 if (NumElts < 8) {
57467 EVT IntVT = VT.changeVectorElementTypeToInteger();
57468 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57469 DAG.getIntPtrConstant(0, dl));
57470 }
57471
57472 Cvt = DAG.getBitcast(VT, Cvt);
57473
57474 if (IsStrict)
57475 return DAG.getMergeValues({Cvt, Chain}, dl);
57476
57477 return Cvt;
57478}
57479
57480 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57481 SDValue Src = N->getOperand(0);
57482
57483 // Turn MOVDQ2Q+simple_load into an mmx load.
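// Illustrative example: (MOVDQ2Q (v2i64 load p)) --> (x86mmx load p),
// reusing the original load's chain.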
57484 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57485 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57486
57487 if (LN->isSimple()) {
57488 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57489 LN->getBasePtr(),
57490 LN->getPointerInfo(),
57491 LN->getOriginalAlign(),
57492 LN->getMemOperand()->getFlags());
57493 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57494 return NewLd;
57495 }
57496 }
57497
57498 return SDValue();
57499}
57500
57501 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57502 TargetLowering::DAGCombinerInfo &DCI) {
57503 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57505 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57506 return SDValue(N, 0);
57507
57508 return SDValue();
57509}
57510
57511 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57512 DAGCombinerInfo &DCI) const {
57513 SelectionDAG &DAG = DCI.DAG;
57514 switch (N->getOpcode()) {
57515 // clang-format off
57516 default: break;
57517 case ISD::SCALAR_TO_VECTOR:
57518 return combineScalarToVector(N, DAG);
57519 case ISD::EXTRACT_VECTOR_ELT:
57520 case X86ISD::PEXTRW:
57521 case X86ISD::PEXTRB:
57522 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57523 case ISD::CONCAT_VECTORS:
57524 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57525 case ISD::INSERT_SUBVECTOR:
57526 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57527 case ISD::EXTRACT_SUBVECTOR:
57528 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57529 case ISD::VSELECT:
57530 case ISD::SELECT:
57531 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57532 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57533 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57534 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
57535 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57536 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57537 case X86ISD::ADD:
57538 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
57539 case X86ISD::CLOAD:
57540 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
57541 case X86ISD::SBB: return combineSBB(N, DAG);
57542 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57543 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57544 case ISD::SHL: return combineShiftLeft(N, DAG);
57545 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57546 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57547 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57548 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57549 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57550 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
57551 case ISD::AVGCEILS:
57552 case ISD::AVGCEILU:
57553 case ISD::AVGFLOORS:
57554 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
57555 case X86ISD::BEXTR:
57556 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57557 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57558 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57559 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57560 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57561 case X86ISD::VEXTRACT_STORE:
57562 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57563 case ISD::SINT_TO_FP:
57564 case ISD::STRICT_SINT_TO_FP:
57565 return combineSIntToFP(N, DAG, DCI, Subtarget);
57566 case ISD::UINT_TO_FP:
57567 case ISD::STRICT_UINT_TO_FP:
57568 return combineUIntToFP(N, DAG, Subtarget);
57569 case ISD::LRINT:
57570 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
57571 case ISD::FADD:
57572 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57573 case X86ISD::VFCMULC:
57574 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57575 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57576 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57577 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57578 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57579 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57580 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57581 case X86ISD::FXOR:
57582 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57583 case X86ISD::FMIN:
57584 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57585 case ISD::FMINNUM:
57586 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57587 case X86ISD::CVTSI2P:
57588 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57589 case X86ISD::CVTP2SI:
57590 case X86ISD::CVTP2UI:
57591 case X86ISD::STRICT_CVTTP2SI:
57592 case X86ISD::CVTTP2SI:
57593 case X86ISD::STRICT_CVTTP2UI:
57594 case X86ISD::CVTTP2UI:
57595 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57596 case X86ISD::STRICT_CVTPH2PS:
57597 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57598 case X86ISD::BT: return combineBT(N, DAG, DCI);
57599 case ISD::ANY_EXTEND:
57600 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57601 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57602 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57603 case ISD::ANY_EXTEND_VECTOR_INREG:
57604 case ISD::SIGN_EXTEND_VECTOR_INREG:
57605 case ISD::ZERO_EXTEND_VECTOR_INREG:
57606 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57607 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57608 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57609 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57610 case X86ISD::PACKSS:
57611 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57612 case X86ISD::HADD:
57613 case X86ISD::HSUB:
57614 case X86ISD::FHADD:
57615 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57616 case X86ISD::VSHL:
57617 case X86ISD::VSRA:
57618 case X86ISD::VSRL:
57619 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57620 case X86ISD::VSHLI:
57621 case X86ISD::VSRAI:
57622 case X86ISD::VSRLI:
57623 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57624 case ISD::INSERT_VECTOR_ELT:
57625 case X86ISD::PINSRB:
57626 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57627 case X86ISD::SHUFP: // Handle all target specific shuffles
57628 case X86ISD::INSERTPS:
57629 case X86ISD::EXTRQI:
57630 case X86ISD::INSERTQI:
57631 case X86ISD::VALIGN:
57632 case X86ISD::PALIGNR:
57633 case X86ISD::VSHLDQ:
57634 case X86ISD::VSRLDQ:
57635 case X86ISD::BLENDI:
57636 case X86ISD::UNPCKH:
57637 case X86ISD::UNPCKL:
57638 case X86ISD::MOVHLPS:
57639 case X86ISD::MOVLHPS:
57640 case X86ISD::PSHUFB:
57641 case X86ISD::PSHUFD:
57642 case X86ISD::PSHUFHW:
57643 case X86ISD::PSHUFLW:
57644 case X86ISD::MOVSHDUP:
57645 case X86ISD::MOVSLDUP:
57646 case X86ISD::MOVDDUP:
57647 case X86ISD::MOVSS:
57648 case X86ISD::MOVSD:
57649 case X86ISD::MOVSH:
57650 case X86ISD::VBROADCAST:
57651 case X86ISD::VPPERM:
57652 case X86ISD::VPERMI:
57653 case X86ISD::VPERMV:
57654 case X86ISD::VPERMV3:
57655 case X86ISD::VPERMIL2:
57656 case X86ISD::VPERMILPI:
57657 case X86ISD::VPERMILPV:
57658 case X86ISD::VPERM2X128:
57659 case X86ISD::SHUF128:
57660 case X86ISD::VZEXT_MOVL:
57661 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
57662 case X86ISD::FMADD_RND:
57663 case X86ISD::FMSUB:
57664 case X86ISD::STRICT_FMSUB:
57665 case X86ISD::FMSUB_RND:
57666 case X86ISD::FNMADD:
57667 case X86ISD::STRICT_FNMADD:
57668 case X86ISD::FNMADD_RND:
57669 case X86ISD::FNMSUB:
57670 case X86ISD::STRICT_FNMSUB:
57671 case X86ISD::FNMSUB_RND:
57672 case ISD::FMA:
57673 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57674 case X86ISD::FMADDSUB_RND:
57675 case X86ISD::FMSUBADD_RND:
57676 case X86ISD::FMADDSUB:
57677 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57678 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57679 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57680 case X86ISD::MGATHER:
57681 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
57682 case ISD::MGATHER:
57683 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57684 case X86ISD::PCMPEQ:
57685 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57686 case X86ISD::PMULDQ:
57687 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57688 case X86ISD::VPMADDUBSW:
57689 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57690 case X86ISD::KSHIFTL:
57691 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57692 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57693 case ISD::STRICT_FP_EXTEND:
57694 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
57695 case ISD::STRICT_FP_ROUND:
57696 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57697 case X86ISD::VBROADCAST_LOAD:
57698 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57699 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57700 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57701 // clang-format on
57702 }
57703
57704 return SDValue();
57705}
57706
57707 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57708 return false;
57709}
57710
57711// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57712 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
57713 EVT ExtVT) const {
57714 return Subtarget.hasAVX512() || !VT.isVector();
57715}
57716
57717bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57718 if (!isTypeLegal(VT))
57719 return false;
57720
57721 // There are no vXi8 shifts.
57722 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57723 return false;
57724
57725 // TODO: Almost no 8-bit ops are desirable because they have no actual
57726 // size/speed advantages vs. 32-bit ops, but they do have a major
57727 // potential disadvantage by causing partial register stalls.
57728 //
57729 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57730 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57731 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57732 // check for a constant operand to the multiply.
57733 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57734 return false;
57735
57736 // i16 instruction encodings are longer and some i16 instructions are slow,
57737 // so those are not desirable.
57738 if (VT == MVT::i16) {
57739 switch (Opc) {
57740 default:
57741 break;
57742 case ISD::LOAD:
57743 case ISD::SIGN_EXTEND:
57744 case ISD::ZERO_EXTEND:
57745 case ISD::ANY_EXTEND:
57746 case ISD::MUL:
57747 return false;
57748 case ISD::SHL:
57749 case ISD::SRA:
57750 case ISD::SRL:
57751 case ISD::SUB:
57752 case ISD::ADD:
57753 case ISD::AND:
57754 case ISD::OR:
57755 case ISD::XOR:
57756 // NDD instructions never have the "partial register write" issue because the
57757 // destination register's upper bits [63:OSIZE] are zeroed even when
57758 // OSIZE=8/16.
57759 return Subtarget.hasNDD();
57760 }
57761 }
57762
57763 // Any legal type not explicitly accounted for above here is desirable.
57764 return true;
57765}
57766
57767 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57768 SDValue Value, SDValue Addr,
57769 int JTI,
57770 SelectionDAG &DAG) const {
57771 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57772 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57773 if (IsCFProtectionSupported) {
57774 // In case control-flow branch protection is enabled, we need to add a
57775 // notrack prefix to the indirect branch.
57776 // In order to do that we create an NT_BRIND SDNode.
57777 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
57778 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57779 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57780 }
57781
57782 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57783}
57784
57785 TargetLowering::AndOrSETCCFoldKind
57786 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57787 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57789 EVT VT = LogicOp->getValueType(0);
57790 EVT OpVT = SETCC0->getOperand(0).getValueType();
57791 if (!VT.isInteger())
57793
57794 if (VT.isVector())
57799
57800 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57801 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57802 // `NotAnd` applies, `AddAnd` does as well.
57803 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57804 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57806}
57807
57808 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57809 EVT VT = Op.getValueType();
57810 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57811 isa<ConstantSDNode>(Op.getOperand(1));
57812
57813 // i16 is legal, but undesirable since i16 instruction encodings are longer
57814 // and some i16 instructions are slow.
57815 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57816 // using LEA and/or other ALU ops.
57817 if (VT != MVT::i16 && !Is8BitMulByConstant)
57818 return false;
57819
57820 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57821 if (!Op.hasOneUse())
57822 return false;
57823 SDNode *User = *Op->use_begin();
57824 if (User->getOpcode() != ISD::STORE)
57825 return false;
57826 auto *Ld = cast<LoadSDNode>(Load);
57827 auto *St = cast<StoreSDNode>(User);
57828 return Ld->getBasePtr() == St->getBasePtr();
57829 };
57830
57831 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57832 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57833 return false;
57834 if (!Op.hasOneUse())
57835 return false;
57836 SDNode *User = *Op->use_begin();
57837 if (User->getOpcode() != ISD::ATOMIC_STORE)
57838 return false;
57839 auto *Ld = cast<AtomicSDNode>(Load);
57840 auto *St = cast<AtomicSDNode>(User);
57841 return Ld->getBasePtr() == St->getBasePtr();
57842 };
57843
57844 bool Commute = false;
57845 switch (Op.getOpcode()) {
57846 default: return false;
57847 case ISD::SIGN_EXTEND:
57848 case ISD::ZERO_EXTEND:
57849 case ISD::ANY_EXTEND:
57850 break;
57851 case ISD::SHL:
57852 case ISD::SRA:
57853 case ISD::SRL: {
57854 SDValue N0 = Op.getOperand(0);
57855 // Look out for (store (shl (load), x)).
57856 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57857 return false;
57858 break;
57859 }
57860 case ISD::ADD:
57861 case ISD::MUL:
57862 case ISD::AND:
57863 case ISD::OR:
57864 case ISD::XOR:
57865 Commute = true;
57866 [[fallthrough]];
57867 case ISD::SUB: {
57868 SDValue N0 = Op.getOperand(0);
57869 SDValue N1 = Op.getOperand(1);
57870 // Avoid disabling potential load folding opportunities.
57871 if (X86::mayFoldLoad(N1, Subtarget) &&
57872 (!Commute || !isa<ConstantSDNode>(N0) ||
57873 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57874 return false;
57875 if (X86::mayFoldLoad(N0, Subtarget) &&
57876 ((Commute && !isa<ConstantSDNode>(N1)) ||
57877 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57878 return false;
57879 if (IsFoldableAtomicRMW(N0, Op) ||
57880 (Commute && IsFoldableAtomicRMW(N1, Op)))
57881 return false;
57882 }
57883 }
57884
57885 PVT = MVT::i32;
57886 return true;
57887}
57888
57889//===----------------------------------------------------------------------===//
57890// X86 Inline Assembly Support
57891//===----------------------------------------------------------------------===//
57892
57893// Helper to match a string separated by whitespace.
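// For illustration: matchAsm("  bswap $0", {"bswap", "$0"}) succeeds, while a
// piece that only matches a prefix of a token (no whitespace after it) fails.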
57894 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57895 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57896
57897 for (StringRef Piece : Pieces) {
57898 if (!S.starts_with(Piece)) // Check if the piece matches.
57899 return false;
57900
57901 S = S.substr(Piece.size());
57902 StringRef::size_type Pos = S.find_first_not_of(" \t");
57903 if (Pos == 0) // We matched a prefix.
57904 return false;
57905
57906 S = S.substr(Pos);
57907 }
57908
57909 return S.empty();
57910}
57911
57912 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57913
57914 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57915 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57916 llvm::is_contained(AsmPieces, "~{flags}") &&
57917 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57918
57919 if (AsmPieces.size() == 3)
57920 return true;
57921 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57922 return true;
57923 }
57924 }
57925 return false;
57926}
57927
57928 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57929 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57930
57931 const std::string &AsmStr = IA->getAsmString();
57932
57933 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57934 if (!Ty || Ty->getBitWidth() % 16 != 0)
57935 return false;
57936
57937 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57938 SmallVector<StringRef, 4> AsmPieces;
57939 SplitString(AsmStr, AsmPieces, ";\n");
57940
57941 switch (AsmPieces.size()) {
57942 default: return false;
57943 case 1:
57944 // FIXME: this should verify that we are targeting a 486 or better. If not,
57945 // we will turn this bswap into something that will be lowered to logical
57946 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57947 // lower so don't worry about this.
57948 // bswap $0
57949 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57950 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57951 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57952 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57953 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57954 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57955 // No need to check constraints, nothing other than the equivalent of
57956 // "=r,0" would be valid here.
57957 return IntrinsicLowering::LowerToByteSwap(CI);
57958 }
57959
57960 // rorw $$8, ${0:w} --> llvm.bswap.i16
57961 if (CI->getType()->isIntegerTy(16) &&
57962 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57963 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57964 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57965 AsmPieces.clear();
57966 StringRef ConstraintsStr = IA->getConstraintString();
57967 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57968 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57969 if (clobbersFlagRegisters(AsmPieces))
57970 return IntrinsicLowering::LowerToByteSwap(CI);
57971 }
57972 break;
57973 case 3:
57974 if (CI->getType()->isIntegerTy(32) &&
57975 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57976 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57977 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57978 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57979 AsmPieces.clear();
57980 StringRef ConstraintsStr = IA->getConstraintString();
57981 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57982 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57983 if (clobbersFlagRegisters(AsmPieces))
57984 return IntrinsicLowering::LowerToByteSwap(CI);
57985 }
57986
57987 if (CI->getType()->isIntegerTy(64)) {
57988 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57989 if (Constraints.size() >= 2 &&
57990 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57991 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57992 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57993 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57994 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57995 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57996 return IntrinsicLowering::LowerToByteSwap(CI);
57997 }
57998 }
57999 break;
58000 }
58001 return false;
58002}
58003
58006 .Case("{@cca}", X86::COND_A)
58007 .Case("{@ccae}", X86::COND_AE)
58008 .Case("{@ccb}", X86::COND_B)
58009 .Case("{@ccbe}", X86::COND_BE)
58010 .Case("{@ccc}", X86::COND_B)
58011 .Case("{@cce}", X86::COND_E)
58012 .Case("{@ccz}", X86::COND_E)
58013 .Case("{@ccg}", X86::COND_G)
58014 .Case("{@ccge}", X86::COND_GE)
58015 .Case("{@ccl}", X86::COND_L)
58016 .Case("{@ccle}", X86::COND_LE)
58017 .Case("{@ccna}", X86::COND_BE)
58018 .Case("{@ccnae}", X86::COND_B)
58019 .Case("{@ccnb}", X86::COND_AE)
58020 .Case("{@ccnbe}", X86::COND_A)
58021 .Case("{@ccnc}", X86::COND_AE)
58022 .Case("{@ccne}", X86::COND_NE)
58023 .Case("{@ccnz}", X86::COND_NE)
58024 .Case("{@ccng}", X86::COND_LE)
58025 .Case("{@ccnge}", X86::COND_L)
58026 .Case("{@ccnl}", X86::COND_GE)
58027 .Case("{@ccnle}", X86::COND_G)
58028 .Case("{@ccno}", X86::COND_NO)
58029 .Case("{@ccnp}", X86::COND_NP)
58030 .Case("{@ccns}", X86::COND_NS)
58031 .Case("{@cco}", X86::COND_O)
58032 .Case("{@ccp}", X86::COND_P)
58033 .Case("{@ccs}", X86::COND_S)
58034 .Default(X86::COND_INVALID);
58035 return Cond;
58036}
58037
58038/// Given a constraint letter, return the type of constraint for this target.
58039 X86TargetLowering::ConstraintType
58040 X86TargetLowering::getConstraintType(StringRef Constraint) const {
58041 if (Constraint.size() == 1) {
58042 switch (Constraint[0]) {
58043 case 'R':
58044 case 'q':
58045 case 'Q':
58046 case 'f':
58047 case 't':
58048 case 'u':
58049 case 'y':
58050 case 'x':
58051 case 'v':
58052 case 'l':
58053 case 'k': // AVX512 masking registers.
58054 return C_RegisterClass;
58055 case 'a':
58056 case 'b':
58057 case 'c':
58058 case 'd':
58059 case 'S':
58060 case 'D':
58061 case 'A':
58062 return C_Register;
58063 case 'I':
58064 case 'J':
58065 case 'K':
58066 case 'N':
58067 case 'G':
58068 case 'L':
58069 case 'M':
58070 return C_Immediate;
58071 case 'C':
58072 case 'e':
58073 case 'Z':
58074 return C_Other;
58075 default:
58076 break;
58077 }
58078 }
58079 else if (Constraint.size() == 2) {
58080 switch (Constraint[0]) {
58081 default:
58082 break;
58083 case 'W':
58084 if (Constraint[1] != 's')
58085 break;
58086 return C_Other;
58087 case 'Y':
58088 switch (Constraint[1]) {
58089 default:
58090 break;
58091 case 'z':
58092 return C_Register;
58093 case 'i':
58094 case 'm':
58095 case 'k':
58096 case 't':
58097 case '2':
58098 return C_RegisterClass;
58099 }
58100 break;
58101 case 'j':
58102 switch (Constraint[1]) {
58103 default:
58104 break;
58105 case 'r':
58106 case 'R':
58107 return C_RegisterClass;
58108 }
58109 }
58110 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58111 return C_Other;
58112 return TargetLowering::getConstraintType(Constraint);
58113}
58114
58115/// Examine constraint type and operand type and determine a weight value.
58116/// This object must already have been set up with the operand type
58117/// and the current alternative constraint selected.
58118 TargetLowering::ConstraintWeight
58119 X86TargetLowering::getSingleConstraintMatchWeight(
58120 AsmOperandInfo &Info, const char *Constraint) const {
58121 ConstraintWeight Wt = CW_Invalid;
58122 Value *CallOperandVal = Info.CallOperandVal;
58123 // If we don't have a value, we can't do a match,
58124 // but allow it at the lowest weight.
58125 if (!CallOperandVal)
58126 return CW_Default;
58127 Type *Ty = CallOperandVal->getType();
58128 // Look at the constraint type.
58129 switch (*Constraint) {
58130 default:
58131 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
58132 [[fallthrough]];
58133 case 'R':
58134 case 'q':
58135 case 'Q':
58136 case 'a':
58137 case 'b':
58138 case 'c':
58139 case 'd':
58140 case 'S':
58141 case 'D':
58142 case 'A':
58143 if (CallOperandVal->getType()->isIntegerTy())
58144 Wt = CW_SpecificReg;
58145 break;
58146 case 'f':
58147 case 't':
58148 case 'u':
58149 if (Ty->isFloatingPointTy())
58150 Wt = CW_SpecificReg;
58151 break;
58152 case 'y':
58153 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58154 Wt = CW_SpecificReg;
58155 break;
58156 case 'Y':
58157 if (StringRef(Constraint).size() != 2)
58158 break;
58159 switch (Constraint[1]) {
58160 default:
58161 return CW_Invalid;
58162 // XMM0
58163 case 'z':
58164 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58165 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58166 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58167 return CW_SpecificReg;
58168 return CW_Invalid;
58169 // Conditional OpMask regs (AVX512)
58170 case 'k':
58171 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58172 return CW_Register;
58173 return CW_Invalid;
58174 // Any MMX reg
58175 case 'm':
58176 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
58177 return Wt;
58178 return CW_Invalid;
58179 // Any SSE reg when ISA >= SSE2, same as 'x'
58180 case 'i':
58181 case 't':
58182 case '2':
58183 if (!Subtarget.hasSSE2())
58184 return CW_Invalid;
58185 break;
58186 }
58187 break;
58188 case 'j':
58189 if (StringRef(Constraint).size() != 2)
58190 break;
58191 switch (Constraint[1]) {
58192 default:
58193 return CW_Invalid;
58194 case 'r':
58195 case 'R':
58196 if (CallOperandVal->getType()->isIntegerTy())
58197 Wt = CW_SpecificReg;
58198 break;
58199 }
58200 break;
58201 case 'v':
58202 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58203 Wt = CW_Register;
58204 [[fallthrough]];
58205 case 'x':
58206 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58207 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58208 Wt = CW_Register;
58209 break;
58210 case 'k':
58211 // Enable conditional vector operations using %k<#> registers.
58212 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58213 Wt = CW_Register;
58214 break;
58215 case 'I':
58216 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
58217 if (C->getZExtValue() <= 31)
58218 Wt = CW_Constant;
58219 break;
58220 case 'J':
58221 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58222 if (C->getZExtValue() <= 63)
58223 Wt = CW_Constant;
58224 break;
58225 case 'K':
58226 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58227 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58228 Wt = CW_Constant;
58229 break;
58230 case 'L':
58231 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58232 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58233 Wt = CW_Constant;
58234 break;
58235 case 'M':
58236 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58237 if (C->getZExtValue() <= 3)
58238 Wt = CW_Constant;
58239 break;
58240 case 'N':
58241 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58242 if (C->getZExtValue() <= 0xff)
58243 Wt = CW_Constant;
58244 break;
58245 case 'G':
58246 case 'C':
58247 if (isa<ConstantFP>(CallOperandVal))
58248 Wt = CW_Constant;
58249 break;
58250 case 'e':
58251 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58252 if ((C->getSExtValue() >= -0x80000000LL) &&
58253 (C->getSExtValue() <= 0x7fffffffLL))
58254 Wt = CW_Constant;
58255 break;
58256 case 'Z':
58257 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
58258 if (C->getZExtValue() <= 0xffffffff)
58259 Wt = CW_Constant;
58260 break;
58261 }
58262 return Wt;
58263}
58264
58265/// Try to replace an X constraint, which matches anything, with another that
58266/// has more specific requirements based on the type of the corresponding
58267/// operand.
58268 const char *X86TargetLowering::
58269 LowerXConstraint(EVT ConstraintVT) const {
58270 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58271 // 'f' like normal targets.
58272 if (ConstraintVT.isFloatingPoint()) {
58273 if (Subtarget.hasSSE1())
58274 return "x";
58275 }
58276
58277 return TargetLowering::LowerXConstraint(ConstraintVT);
58278}
58279
58280// Lower @cc targets via setcc.
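// For illustration: an output constraint such as "=@ccz" parses to
// X86::COND_E; the code below copies EFLAGS, materializes the flag with
// SETCC and zero-extends it to the constraint's integer type.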
58281 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58282 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58283 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58284 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58285 if (Cond == X86::COND_INVALID)
58286 return SDValue();
58287 // Check that return type is valid.
58288 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58289 OpInfo.ConstraintVT.getSizeInBits() < 8)
58290 report_fatal_error("Glue output operand is of invalid type");
58291
58292 // Get EFLAGS register. Only update chain when copyfrom is glued.
58293 if (Glue.getNode()) {
58294 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58295 Chain = Glue.getValue(1);
58296 } else
58297 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58298 // Extract CC code.
58299 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58300 // Extend to 32-bits
58301 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58302
58303 return Result;
58304}
58305
58306/// Lower the specified operand into the Ops vector.
58307/// If it is invalid, don't add anything to Ops.
58308 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58309 StringRef Constraint,
58310 std::vector<SDValue> &Ops,
58311 SelectionDAG &DAG) const {
58312 SDValue Result;
58313 char ConstraintLetter = Constraint[0];
58314 switch (ConstraintLetter) {
58315 default: break;
58316 case 'I':
58317 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58318 if (C->getZExtValue() <= 31) {
58319 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58320 Op.getValueType());
58321 break;
58322 }
58323 }
58324 return;
58325 case 'J':
58326 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58327 if (C->getZExtValue() <= 63) {
58328 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58329 Op.getValueType());
58330 break;
58331 }
58332 }
58333 return;
58334 case 'K':
58335 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58336 if (isInt<8>(C->getSExtValue())) {
58337 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58338 Op.getValueType());
58339 break;
58340 }
58341 }
58342 return;
58343 case 'L':
58344 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58345 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58346 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58347 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58348 Op.getValueType());
58349 break;
58350 }
58351 }
58352 return;
58353 case 'M':
58354 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58355 if (C->getZExtValue() <= 3) {
58356 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58357 Op.getValueType());
58358 break;
58359 }
58360 }
58361 return;
58362 case 'N':
58363 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58364 if (C->getZExtValue() <= 255) {
58365 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58366 Op.getValueType());
58367 break;
58368 }
58369 }
58370 return;
58371 case 'O':
58372 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58373 if (C->getZExtValue() <= 127) {
58374 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58375 Op.getValueType());
58376 break;
58377 }
58378 }
58379 return;
58380 case 'e': {
58381 // 32-bit signed value
58382 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58383 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58384 C->getSExtValue())) {
58385 // Widen to 64 bits here to get it sign extended.
58386 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58387 break;
58388 }
58389 // FIXME gcc accepts some relocatable values here too, but only in certain
58390 // memory models; it's complicated.
58391 }
58392 return;
58393 }
58394 case 'W': {
58395 assert(Constraint[1] == 's');
58396 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
58397 // offset.
58398 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
58399 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
58400 BA->getValueType(0)));
58401 } else {
58402 int64_t Offset = 0;
58403 if (Op->getOpcode() == ISD::ADD &&
58404 isa<ConstantSDNode>(Op->getOperand(1))) {
58405 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
58406 Op = Op->getOperand(0);
58407 }
58408 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58409 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
58410 GA->getValueType(0), Offset));
58411 }
58412 return;
58413 }
58414 case 'Z': {
58415 // 32-bit unsigned value
58416 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58417 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58418 C->getZExtValue())) {
58419 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58420 Op.getValueType());
58421 break;
58422 }
58423 }
58424 // FIXME gcc accepts some relocatable values here too, but only in certain
58425 // memory models; it's complicated.
58426 return;
58427 }
58428 case 'i': {
58429 // Literal immediates are always ok.
58430 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58431 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58432 BooleanContent BCont = getBooleanContents(MVT::i64);
58433 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58434 : ISD::SIGN_EXTEND;
58435 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58436 : CST->getSExtValue();
58437 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58438 break;
58439 }
58440
58441 // In any sort of PIC mode addresses need to be computed at runtime by
58442 // adding in a register or some sort of table lookup. These can't
58443 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58444 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58445 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58446 return;
58447
58448 // If we are in non-pic codegen mode, we allow the address of a global (with
58449 // an optional displacement) to be used with 'i'.
58450 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58451 // If we require an extra load to get this address, as in PIC mode, we
58452 // can't accept it.
58453 if (isGlobalStubReference(
58454 Subtarget.classifyGlobalReference(GA->getGlobal())))
58455 return;
58456 break;
58457 }
58458 }
58459
58460 if (Result.getNode()) {
58461 Ops.push_back(Result);
58462 return;
58463 }
58464 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58465}
58466
58467/// Check if \p RC is a general purpose register class.
58468/// I.e., GR* or one of their variant.
58469static bool isGRClass(const TargetRegisterClass &RC) {
58470 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58471 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58472 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58473 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58474 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58475}
58476
58477/// Check if \p RC is a vector register class.
58478/// I.e., FR* / VR* or one of their variant.
58479static bool isFRClass(const TargetRegisterClass &RC) {
58480 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58481 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58482 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58483 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58484 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58485 RC.hasSuperClassEq(&X86::VR512RegClass);
58486}
58487
58488/// Check if \p RC is a mask register class.
58489/// I.e., VK* or one of their variant.
58490static bool isVKClass(const TargetRegisterClass &RC) {
58491 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58492 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58493 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58494 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58495 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58496 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58497 RC.hasSuperClassEq(&X86::VK64RegClass);
58498}
58499
58500static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
58501 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
58502}
58503
58504std::pair<unsigned, const TargetRegisterClass *>
58505 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58506 StringRef Constraint,
58507 MVT VT) const {
58508 // First, see if this is a constraint that directly corresponds to an LLVM
58509 // register class.
58510 if (Constraint.size() == 1) {
58511 // GCC Constraint Letters
58512 switch (Constraint[0]) {
58513 default: break;
58514 // 'A' means [ER]AX + [ER]DX.
58515 case 'A':
58516 if (Subtarget.is64Bit())
58517 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58518 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58519 "Expecting 64, 32 or 16 bit subtarget");
58520 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58521
58522 // TODO: Slight differences here in allocation order and leaving
58523 // RIP in the class. Do they matter any more here than they do
58524 // in the normal allocation?
58525 case 'k':
58526 if (Subtarget.hasAVX512()) {
58527 if (VT == MVT::v1i1 || VT == MVT::i1)
58528 return std::make_pair(0U, &X86::VK1RegClass);
58529 if (VT == MVT::v8i1 || VT == MVT::i8)
58530 return std::make_pair(0U, &X86::VK8RegClass);
58531 if (VT == MVT::v16i1 || VT == MVT::i16)
58532 return std::make_pair(0U, &X86::VK16RegClass);
58533 }
58534 if (Subtarget.hasBWI()) {
58535 if (VT == MVT::v32i1 || VT == MVT::i32)
58536 return std::make_pair(0U, &X86::VK32RegClass);
58537 if (VT == MVT::v64i1 || VT == MVT::i64)
58538 return std::make_pair(0U, &X86::VK64RegClass);
58539 }
58540 break;
58541 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58542 if (Subtarget.is64Bit()) {
58543 if (VT == MVT::i8 || VT == MVT::i1)
58544 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58545 ? &X86::GR8RegClass
58546 : &X86::GR8_NOREX2RegClass);
58547 if (VT == MVT::i16)
58548 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58549 ? &X86::GR16RegClass
58550 : &X86::GR16_NOREX2RegClass);
58551 if (VT == MVT::i32 || VT == MVT::f32)
58552 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58553 ? &X86::GR32RegClass
58554 : &X86::GR32_NOREX2RegClass);
58555 if (VT != MVT::f80 && !VT.isVector())
58556 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58557 ? &X86::GR64RegClass
58558 : &X86::GR64_NOREX2RegClass);
58559 break;
58560 }
58561 [[fallthrough]];
58562 // 32-bit fallthrough
58563 case 'Q': // Q_REGS
58564 if (VT == MVT::i8 || VT == MVT::i1)
58565 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58566 if (VT == MVT::i16)
58567 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58568 if (VT == MVT::i32 || VT == MVT::f32 ||
58569 (!VT.isVector() && !Subtarget.is64Bit()))
58570 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58571 if (VT != MVT::f80 && !VT.isVector())
58572 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58573 break;
58574 case 'r': // GENERAL_REGS
58575 case 'l': // INDEX_REGS
58576 if (VT == MVT::i8 || VT == MVT::i1)
58577 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58578 ? &X86::GR8RegClass
58579 : &X86::GR8_NOREX2RegClass);
58580 if (VT == MVT::i16)
58581 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58582 ? &X86::GR16RegClass
58583 : &X86::GR16_NOREX2RegClass);
58584 if (VT == MVT::i32 || VT == MVT::f32 ||
58585 (!VT.isVector() && !Subtarget.is64Bit()))
58586 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58587 ? &X86::GR32RegClass
58588 : &X86::GR32_NOREX2RegClass);
58589 if (VT != MVT::f80 && !VT.isVector())
58590 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
58591 ? &X86::GR64RegClass
58592 : &X86::GR64_NOREX2RegClass);
58593 break;
58594 case 'R': // LEGACY_REGS
58595 if (VT == MVT::i8 || VT == MVT::i1)
58596 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58597 if (VT == MVT::i16)
58598 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58599 if (VT == MVT::i32 || VT == MVT::f32 ||
58600 (!VT.isVector() && !Subtarget.is64Bit()))
58601 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58602 if (VT != MVT::f80 && !VT.isVector())
58603 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58604 break;
58605 case 'f': // FP Stack registers.
58606 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58607 // value to the correct fpstack register class.
58608 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58609 return std::make_pair(0U, &X86::RFP32RegClass);
58610 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58611 return std::make_pair(0U, &X86::RFP64RegClass);
58612 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58613 return std::make_pair(0U, &X86::RFP80RegClass);
58614 break;
58615 case 'y': // MMX_REGS if MMX allowed.
58616 if (!Subtarget.hasMMX()) break;
58617 return std::make_pair(0U, &X86::VR64RegClass);
58618 case 'v':
58619 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58620 if (!Subtarget.hasSSE1()) break;
58621 bool VConstraint = (Constraint[0] == 'v');
58622
58623 switch (VT.SimpleTy) {
58624 default: break;
58625 // Scalar SSE types.
58626 case MVT::f16:
58627 if (VConstraint && Subtarget.hasFP16())
58628 return std::make_pair(0U, &X86::FR16XRegClass);
58629 break;
58630 case MVT::f32:
58631 case MVT::i32:
58632 if (VConstraint && Subtarget.hasVLX())
58633 return std::make_pair(0U, &X86::FR32XRegClass);
58634 return std::make_pair(0U, &X86::FR32RegClass);
58635 case MVT::f64:
58636 case MVT::i64:
58637 if (VConstraint && Subtarget.hasVLX())
58638 return std::make_pair(0U, &X86::FR64XRegClass);
58639 return std::make_pair(0U, &X86::FR64RegClass);
58640 case MVT::i128:
58641 if (Subtarget.is64Bit()) {
58642 if (VConstraint && Subtarget.hasVLX())
58643 return std::make_pair(0U, &X86::VR128XRegClass);
58644 return std::make_pair(0U, &X86::VR128RegClass);
58645 }
58646 break;
58647 // Vector types and fp128.
58648 case MVT::v8f16:
58649 if (!Subtarget.hasFP16())
58650 break;
58651 if (VConstraint)
58652 return std::make_pair(0U, &X86::VR128XRegClass);
58653 return std::make_pair(0U, &X86::VR128RegClass);
58654 case MVT::v8bf16:
58655 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58656 break;
58657 if (VConstraint)
58658 return std::make_pair(0U, &X86::VR128XRegClass);
58659 return std::make_pair(0U, &X86::VR128RegClass);
58660 case MVT::f128:
58661 case MVT::v16i8:
58662 case MVT::v8i16:
58663 case MVT::v4i32:
58664 case MVT::v2i64:
58665 case MVT::v4f32:
58666 case MVT::v2f64:
58667 if (VConstraint && Subtarget.hasVLX())
58668 return std::make_pair(0U, &X86::VR128XRegClass);
58669 return std::make_pair(0U, &X86::VR128RegClass);
58670 // AVX types.
58671 case MVT::v16f16:
58672 if (!Subtarget.hasFP16())
58673 break;
58674 if (VConstraint)
58675 return std::make_pair(0U, &X86::VR256XRegClass);
58676 return std::make_pair(0U, &X86::VR256RegClass);
58677 case MVT::v16bf16:
58678 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58679 break;
58680 if (VConstraint)
58681 return std::make_pair(0U, &X86::VR256XRegClass);
58682 return std::make_pair(0U, &X86::VR256RegClass);
58683 case MVT::v32i8:
58684 case MVT::v16i16:
58685 case MVT::v8i32:
58686 case MVT::v4i64:
58687 case MVT::v8f32:
58688 case MVT::v4f64:
58689 if (VConstraint && Subtarget.hasVLX())
58690 return std::make_pair(0U, &X86::VR256XRegClass);
58691 if (Subtarget.hasAVX())
58692 return std::make_pair(0U, &X86::VR256RegClass);
58693 break;
58694 case MVT::v32f16:
58695 if (!Subtarget.hasFP16())
58696 break;
58697 if (VConstraint)
58698 return std::make_pair(0U, &X86::VR512RegClass);
58699 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58700 case MVT::v32bf16:
58701 if (!Subtarget.hasBF16())
58702 break;
58703 if (VConstraint)
58704 return std::make_pair(0U, &X86::VR512RegClass);
58705 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58706 case MVT::v64i8:
58707 case MVT::v32i16:
58708 case MVT::v8f64:
58709 case MVT::v16f32:
58710 case MVT::v16i32:
58711 case MVT::v8i64:
58712 if (!Subtarget.hasAVX512()) break;
58713 if (VConstraint)
58714 return std::make_pair(0U, &X86::VR512RegClass);
58715 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58716 }
58717 break;
58718 }
58719 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58720 switch (Constraint[1]) {
58721 default:
58722 break;
58723 case 'i':
58724 case 't':
58725 case '2':
58726 return getRegForInlineAsmConstraint(TRI, "x", VT);
58727 case 'm':
58728 if (!Subtarget.hasMMX()) break;
58729 return std::make_pair(0U, &X86::VR64RegClass);
58730 case 'z':
58731 if (!Subtarget.hasSSE1()) break;
58732 switch (VT.SimpleTy) {
58733 default: break;
58734 // Scalar SSE types.
58735 case MVT::f16:
58736 if (!Subtarget.hasFP16())
58737 break;
58738 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58739 case MVT::f32:
58740 case MVT::i32:
58741 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58742 case MVT::f64:
58743 case MVT::i64:
58744 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58745 case MVT::v8f16:
58746 if (!Subtarget.hasFP16())
58747 break;
58748 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58749 case MVT::v8bf16:
58750 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58751 break;
58752 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58753 case MVT::f128:
58754 case MVT::v16i8:
58755 case MVT::v8i16:
58756 case MVT::v4i32:
58757 case MVT::v2i64:
58758 case MVT::v4f32:
58759 case MVT::v2f64:
58760 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58761 // AVX types.
58762 case MVT::v16f16:
58763 if (!Subtarget.hasFP16())
58764 break;
58765 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58766 case MVT::v16bf16:
58767 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58768 break;
58769 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58770 case MVT::v32i8:
58771 case MVT::v16i16:
58772 case MVT::v8i32:
58773 case MVT::v4i64:
58774 case MVT::v8f32:
58775 case MVT::v4f64:
58776 if (Subtarget.hasAVX())
58777 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58778 break;
58779 case MVT::v32f16:
58780 if (!Subtarget.hasFP16())
58781 break;
58782 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58783 case MVT::v32bf16:
58784 if (!Subtarget.hasBF16())
58785 break;
58786 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58787 case MVT::v64i8:
58788 case MVT::v32i16:
58789 case MVT::v8f64:
58790 case MVT::v16f32:
58791 case MVT::v16i32:
58792 case MVT::v8i64:
58793 if (Subtarget.hasAVX512())
58794 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58795 break;
58796 }
58797 break;
58798 case 'k':
58799 // This register class doesn't allocate k0 for masked vector operation.
58800 if (Subtarget.hasAVX512()) {
58801 if (VT == MVT::v1i1 || VT == MVT::i1)
58802 return std::make_pair(0U, &X86::VK1WMRegClass);
58803 if (VT == MVT::v8i1 || VT == MVT::i8)
58804 return std::make_pair(0U, &X86::VK8WMRegClass);
58805 if (VT == MVT::v16i1 || VT == MVT::i16)
58806 return std::make_pair(0U, &X86::VK16WMRegClass);
58807 }
58808 if (Subtarget.hasBWI()) {
58809 if (VT == MVT::v32i1 || VT == MVT::i32)
58810 return std::make_pair(0U, &X86::VK32WMRegClass);
58811 if (VT == MVT::v64i1 || VT == MVT::i64)
58812 return std::make_pair(0U, &X86::VK64WMRegClass);
58813 }
58814 break;
58815 }
58816 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
58817 switch (Constraint[1]) {
58818 default:
58819 break;
58820 case 'r':
58821 if (VT == MVT::i8 || VT == MVT::i1)
58822 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
58823 if (VT == MVT::i16)
58824 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
58825 if (VT == MVT::i32 || VT == MVT::f32)
58826 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
58827 if (VT != MVT::f80 && !VT.isVector())
58828 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
58829 break;
58830 case 'R':
58831 if (VT == MVT::i8 || VT == MVT::i1)
58832 return std::make_pair(0U, &X86::GR8RegClass);
58833 if (VT == MVT::i16)
58834 return std::make_pair(0U, &X86::GR16RegClass);
58835 if (VT == MVT::i32 || VT == MVT::f32)
58836 return std::make_pair(0U, &X86::GR32RegClass);
58837 if (VT != MVT::f80 && !VT.isVector())
58838 return std::make_pair(0U, &X86::GR64RegClass);
58839 break;
58840 }
58841 }
58842
58843 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58844 return std::make_pair(0U, &X86::GR32RegClass);
58845
58846 // Use the default implementation in TargetLowering to convert the register
58847 // constraint into a member of a register class.
58848 std::pair<Register, const TargetRegisterClass*> Res;
58849 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58850
58851 // Not found as a standard register?
58852 if (!Res.second) {
58853 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58854 // to/from f80.
58855 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58856 // Map st(0) -> st(7) -> ST0
58857 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58858 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58859 Constraint[3] == '(' &&
58860 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58861 Constraint[5] == ')' && Constraint[6] == '}') {
58862 // st(7) is not allocatable and thus not a member of RFP80. Return
58863 // singleton class in cases where we have a reference to it.
58864 if (Constraint[4] == '7')
58865 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58866 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58867 &X86::RFP80RegClass);
58868 }
58869
58870 // GCC allows "st(0)" to be called just plain "st".
58871 if (StringRef("{st}").equals_insensitive(Constraint))
58872 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58873 }
58874
58875 // flags -> EFLAGS
58876 if (StringRef("{flags}").equals_insensitive(Constraint))
58877 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58878
58879 // dirflag -> DF
58880 // Only allow for clobber.
58881 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58882 VT == MVT::Other)
58883 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58884
58885 // fpsr -> FPSW
58886 // Only allow for clobber.
58887 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
58888 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58889
58890 return Res;
58891 }
58892
58893 // Make sure it isn't a register that requires 64-bit mode.
58894 if (!Subtarget.is64Bit() &&
58895 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58896 TRI->getEncodingValue(Res.first) >= 8) {
58897 // Register requires REX prefix, but we're in 32-bit mode.
58898 return std::make_pair(0, nullptr);
58899 }
58900
58901 // Make sure it isn't a register that requires AVX512.
58902 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58903 TRI->getEncodingValue(Res.first) & 0x10) {
58904 // Register requires EVEX prefix.
58905 return std::make_pair(0, nullptr);
58906 }
58907
58908 // Otherwise, check to see if this is a register class of the wrong value
58909 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58910 // turn into {ax},{dx}.
58911 // MVT::Other is used to specify clobber names.
58912 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58913 return Res; // Correct type already, nothing to do.
58914
58915 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58916 // return "eax". This should even work for things like getting 64bit integer
58917 // registers when given an f64 type.
58918 const TargetRegisterClass *Class = Res.second;
58919 // The generic code will match the first register class that contains the
58920 // given register. Thus, based on the ordering of the tablegened file,
58921 // the "plain" GR classes might not come first.
58922 // Therefore, use a helper method.
58923 if (isGRClass(*Class)) {
58924 unsigned Size = VT.getSizeInBits();
58925 if (Size == 1) Size = 8;
58926 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58927 return std::make_pair(0, nullptr);
58928 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58929 if (DestReg.isValid()) {
58930 bool is64Bit = Subtarget.is64Bit();
58931 const TargetRegisterClass *RC =
58932 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58933 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58934 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58935 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58936 if (Size == 64 && !is64Bit) {
58937 // Model GCC's behavior here and select a fixed pair of 32-bit
58938 // registers.
58939 switch (DestReg) {
58940 case X86::RAX:
58941 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58942 case X86::RDX:
58943 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58944 case X86::RCX:
58945 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58946 case X86::RBX:
58947 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58948 case X86::RSI:
58949 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58950 case X86::RDI:
58951 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58952 case X86::RBP:
58953 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58954 default:
58955 return std::make_pair(0, nullptr);
58956 }
58957 }
58958 if (RC && RC->contains(DestReg))
58959 return std::make_pair(DestReg, RC);
58960 return Res;
58961 }
58962 // No register found/type mismatch.
58963 return std::make_pair(0, nullptr);
58964 } else if (isFRClass(*Class)) {
58965 // Handle references to XMM physical registers that got mapped into the
58966 // wrong class. This can happen with constraints like {xmm0} where the
58967 // target independent register mapper will just pick the first match it can
58968 // find, ignoring the required type.
58969
58970 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58971 if (VT == MVT::f16)
58972 Res.second = &X86::FR16XRegClass;
58973 else if (VT == MVT::f32 || VT == MVT::i32)
58974 Res.second = &X86::FR32XRegClass;
58975 else if (VT == MVT::f64 || VT == MVT::i64)
58976 Res.second = &X86::FR64XRegClass;
58977 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58978 Res.second = &X86::VR128XRegClass;
58979 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58980 Res.second = &X86::VR256XRegClass;
58981 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58982 Res.second = &X86::VR512RegClass;
58983 else {
58984 // Type mismatch and not a clobber: Return an error;
58985 Res.first = 0;
58986 Res.second = nullptr;
58987 }
58988 } else if (isVKClass(*Class)) {
58989 if (VT == MVT::v1i1 || VT == MVT::i1)
58990 Res.second = &X86::VK1RegClass;
58991 else if (VT == MVT::v8i1 || VT == MVT::i8)
58992 Res.second = &X86::VK8RegClass;
58993 else if (VT == MVT::v16i1 || VT == MVT::i16)
58994 Res.second = &X86::VK16RegClass;
58995 else if (VT == MVT::v32i1 || VT == MVT::i32)
58996 Res.second = &X86::VK32RegClass;
58997 else if (VT == MVT::v64i1 || VT == MVT::i64)
58998 Res.second = &X86::VK64RegClass;
58999 else {
59000 // Type mismatch and not a clobber: Return an error;
59001 Res.first = 0;
59002 Res.second = nullptr;
59003 }
59004 }
59005
59006 return Res;
59007}
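
A hedged illustration of how the mapping above is exercised (hypothetical source, assuming AVX-512 is available, e.g. -mavx512f): operands constrained with "k" on 16-bit values are assigned the VK16 register class selected in the 'k' case.

// Hypothetical example, not from the LLVM sources: "k" operands are resolved
// to VK* mask register classes by getRegForInlineAsmConstraint.
unsigned short mask_and(unsigned short a, unsigned short b) {
  unsigned short r;
  asm("kandw %2, %1, %0" : "=k"(r) : "k"(a), "k"(b));
  return r;
}
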
59008
59009 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59010 // Integer division on x86 is expensive. However, when aggressively optimizing
59011 // for code size, we prefer to use a div instruction, as it is usually smaller
59012 // than the alternative sequence.
59013 // The exception to this is vector division. Since x86 doesn't have vector
59014 // integer division, leaving the division as-is is a loss even in terms of
59015 // size, because it will have to be scalarized, while the alternative code
59016 // sequence can be performed in vector form.
59017 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59018 return OptSize && !VT.isVector();
59019}
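
A minimal sketch of the effect (hypothetical source, assuming Clang's minsize attribute): with MinSize set, the scalar remainder below is kept as a div instruction rather than being expanded into a multiply/shift sequence, while vector division is still rejected by the check above.

// Hypothetical example, not from the LLVM sources.
__attribute__((minsize)) int mod10(int x) {
  return x % 10; // stays a div when optimizing for minimum size
}
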
59020
59021void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59022 if (!Subtarget.is64Bit())
59023 return;
59024
59025 // Update IsSplitCSR in X86MachineFunctionInfo.
59026 X86MachineFunctionInfo *AFI =
59027 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59028 AFI->setIsSplitCSR(true);
59029}
59030
59031void X86TargetLowering::insertCopiesSplitCSR(
59032 MachineBasicBlock *Entry,
59033 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59034 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59035 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59036 if (!IStart)
59037 return;
59038
59039 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59040 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59041 MachineBasicBlock::iterator MBBI = Entry->begin();
59042 for (const MCPhysReg *I = IStart; *I; ++I) {
59043 const TargetRegisterClass *RC = nullptr;
59044 if (X86::GR64RegClass.contains(*I))
59045 RC = &X86::GR64RegClass;
59046 else
59047 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59048
59049 Register NewVR = MRI->createVirtualRegister(RC);
59050 // Create copy from CSR to a virtual register.
59051 // FIXME: this currently does not emit CFI pseudo-instructions, it works
59052 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59053 // nounwind. If we want to generalize this later, we may need to emit
59054 // CFI pseudo-instructions.
59055 assert(
59056 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59057 "Function should be nounwind in insertCopiesSplitCSR!");
59058 Entry->addLiveIn(*I);
59059 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
59060 .addReg(*I);
59061
59062 // Insert the copy-back instructions right before the terminator.
59063 for (auto *Exit : Exits)
59064 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
59065 TII->get(TargetOpcode::COPY), *I)
59066 .addReg(NewVR);
59067 }
59068}
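
A hedged sketch of when the split-CSR path above is taken (hypothetical source, assuming a Darwin-style target where Clang gives the C++ TLS access function the CXX_FAST_TLS calling convention): accesses to a thread_local variable go through such a function, and the COPY instructions inserted above preserve the callee-saved GPRs around it.

// Hypothetical example, not from the LLVM sources.
thread_local int tls_counter = 0;
int bump() { return ++tls_counter; }
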
59069
59070 bool X86TargetLowering::supportSwiftError() const {
59071 return Subtarget.is64Bit();
59072}
59073
59074 MachineInstr *
59075 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59076 MachineBasicBlock::iterator &MBBI,
59077 const TargetInstrInfo *TII) const {
59078 assert(MBBI->isCall() && MBBI->getCFIType() &&
59079 "Invalid call instruction for a KCFI check");
59080
59081 MachineFunction &MF = *MBB.getParent();
59082 // If the call target is a memory operand, unfold it and use R11 for the
59083 // call, so KCFI_CHECK won't have to recompute the address.
59084 switch (MBBI->getOpcode()) {
59085 case X86::CALL64m:
59086 case X86::CALL64m_NT:
59087 case X86::TAILJMPm64:
59088 case X86::TAILJMPm64_REX: {
59089 MachineBasicBlock::instr_iterator OrigCall = MBBI->getIterator();
59090 SmallVector<MachineInstr *, 2> NewMIs;
59091 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59092 /*UnfoldStore=*/false, NewMIs))
59093 report_fatal_error("Failed to unfold memory operand for a KCFI check");
59094 for (auto *NewMI : NewMIs)
59095 MBBI = MBB.insert(OrigCall, NewMI);
59096 assert(MBBI->isCall() &&
59097 "Unexpected instruction after memory operand unfolding");
59098 if (OrigCall->shouldUpdateCallSiteInfo())
59099 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59100 MBBI->setCFIType(MF, OrigCall->getCFIType());
59101 OrigCall->eraseFromParent();
59102 break;
59103 }
59104 default:
59105 break;
59106 }
59107
59108 MachineOperand &Target = MBBI->getOperand(0);
59109 Register TargetReg;
59110 switch (MBBI->getOpcode()) {
59111 case X86::CALL64r:
59112 case X86::CALL64r_NT:
59113 case X86::TAILJMPr64:
59114 case X86::TAILJMPr64_REX:
59115 assert(Target.isReg() && "Unexpected target operand for an indirect call");
59116 Target.setIsRenamable(false);
59117 TargetReg = Target.getReg();
59118 break;
59119 case X86::CALL64pcrel32:
59120 case X86::TAILJMPd64:
59121 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59122 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59123 // 64-bit indirect thunk calls.
59124 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
59125 "Unexpected register for an indirect thunk call");
59126 TargetReg = X86::R11;
59127 break;
59128 default:
59129 llvm_unreachable("Unexpected CFI call opcode");
59130 break;
59131 }
59132
59133 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
59134 .addReg(TargetReg)
59135 .addImm(MBBI->getCFIType())
59136 .getInstr();
59137}
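
A hedged illustration of the call sites EmitKCFICheck instruments (hypothetical source, assuming the code is built with clang -fsanitize=kcfi): the indirect call below is preceded by a KCFI_CHECK that compares the expected type hash against the callee's preamble, with memory-operand calls unfolded through R11 as done above.

// Hypothetical example, not from the LLVM sources.
typedef int (*fn_t)(int);
int call_indirect(fn_t f, int v) { return f(v); } // indirect call gets a KCFI check
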
59138
59139/// Returns true if stack probing through a function call is requested.
59140 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59141 return !getStackProbeSymbolName(MF).empty();
59142}
59143
59144/// Returns true if stack probing through inline assembly is requested.
59145 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59146
59147 // No inline stack probe for Windows, they have their own mechanism.
59148 if (Subtarget.isOSWindows() ||
59149 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59150 return false;
59151
59152 // If the function specifically requests inline stack probes, emit them.
59153 if (MF.getFunction().hasFnAttribute("probe-stack"))
59154 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59155 "inline-asm";
59156
59157 return false;
59158}
59159
59160/// Returns the name of the symbol used to emit stack probes or the empty
59161/// string if not applicable.
59164 // Inline Stack probes disable stack probe call
59165 if (hasInlineStackProbe(MF))
59166 return "";
59167
59168 // If the function specifically requests stack probes, emit them.
59169 if (MF.getFunction().hasFnAttribute("probe-stack"))
59170 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59171
59172 // Generally, if we aren't on Windows, the platform ABI does not include
59173 // support for stack probes, so don't emit them.
59174 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59175 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59176 return "";
59177
59178 // We need a stack probe to conform to the Windows ABI. Choose the right
59179 // symbol.
59180 if (Subtarget.is64Bit())
59181 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59182 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59183}
59184
59185unsigned
59186 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59187 // The default stack probe size is 4096 if the function has no stackprobesize
59188 // attribute.
59189 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59190 4096);
59191}
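
A minimal sketch of what triggers the probes configured above (hypothetical source, assuming a Windows x86-64 target): a frame larger than the probe size (4096 bytes by default) makes the prologue call the probe symbol chosen above, e.g. __chkstk.

// Hypothetical example, not from the LLVM sources.
void big_frame(void) {
  volatile char buf[16384]; // larger than the default 4096-byte probe size
  buf[0] = 0;
}
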
59192
59193 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59194 if (ML && ML->isInnermost() &&
59195 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59196 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59197 return TargetLowering::getPrefLoopAlignment(ML);
59198 }
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
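The idea behind this lowering, sketched as plain scalar C++ (illustrative only, not the DAG code): a shuffle whose mask interleaves source lanes with known-zero lanes produces exactly the byte pattern of a zero extension on a little-endian target.

    // Mask <0, Z, 1, Z, 2, Z, 3, Z> over v8i16, where Z selects a known-zero
    // lane, has the same bytes as zero-extending the low four i16 lanes to i32.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint16_t Src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint16_t Shuffled[8] = {Src[0], 0, Src[1], 0, Src[2], 0, Src[3], 0};
      uint32_t Extended[4] = {Src[0], Src[1], Src[2], Src[3]};
      assert(std::memcmp(Shuffled, Extended, sizeof(Extended)) == 0);
      return 0;
    }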
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
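A scalar reference for what "horizontal sum of bytes per element" means, shown for 64-bit elements; this is the quantity PSADBW against a zero vector yields per qword lane. The helper name is hypothetical, for illustration only.

    #include <cassert>
    #include <cstdint>

    static uint64_t horizontalByteSum(uint64_t Elt) {
      uint64_t Sum = 0;
      for (int I = 0; I < 8; ++I)
        Sum += (Elt >> (8 * I)) & 0xFF;
      return Sum;
    }

    int main() {
      assert(horizontalByteSum(0x0102030405060708ULL) == 36); // 1+2+...+8
      return 0;
    }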
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.

static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
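The combine relies on truncation distributing over wrapping add/sub/mul; a minimal check of that identity (not the combine itself):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t A = 0x12345678u, B = 0x9ABCDEF0u;
      // trunc(a op b) == trunc(a) op trunc(b) in modular arithmetic.
      assert((uint16_t)(A + B) == (uint16_t)((uint16_t)A + (uint16_t)B));
      assert((uint16_t)(A - B) == (uint16_t)((uint16_t)A - (uint16_t)B));
      assert((uint16_t)(A * B) == (uint16_t)((uint16_t)A * (uint16_t)B));
      return 0;
    }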
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG)
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
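A conceptual scalar model of the split-and-concatenate shape, with hypothetical helpers standing in for EXTRACT_SUBVECTOR / CONCAT_VECTORS; the real helper operates on SDValues.

    // For lane-wise ops: op(V1, V2) == concat(op(lo V1, lo V2), op(hi V1, hi V2)).
    #include <array>
    #include <cassert>
    #include <cstdint>

    using V8 = std::array<uint32_t, 8>;
    using V4 = std::array<uint32_t, 4>;

    static V4 lo(const V8 &V) { return {V[0], V[1], V[2], V[3]}; }
    static V4 hi(const V8 &V) { return {V[4], V[5], V[6], V[7]}; }
    static V4 add4(const V4 &A, const V4 &B) {
      return {A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
    }
    static V8 concat(const V4 &L, const V4 &H) {
      return {L[0], L[1], L[2], L[3], H[0], H[1], H[2], H[3]};
    }

    int main() {
      V8 A{1, 2, 3, 4, 5, 6, 7, 8}, B{8, 7, 6, 5, 4, 3, 2, 1};
      V8 R = concat(add4(lo(A), lo(B)), add4(hi(A), hi(B)));
      for (uint32_t Lane : R)
        assert(Lane == 9);
      return 0;
    }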
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
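For reference, the PSHUFD/SHUFPS-style immediate packs four 2-bit lane selectors with lane 0 in the low bits; a sketch under that assumption (the real helper also tolerates undef sentinels, this illustrative version does not).

    #include <cassert>

    static unsigned shuffleImm8(const int Mask[4]) {
      unsigned Imm = 0;
      for (int I = 0; I < 4; ++I) {
        assert(Mask[I] >= 0 && Mask[I] < 4 && "expected a 2-bit selector");
        Imm |= (unsigned)Mask[I] << (2 * I);
      }
      return Imm;
    }

    int main() {
      const int Identity[4] = {0, 1, 2, 3};
      assert(shuffleImm8(Identity) == 0xE4); // the well-known identity immediate
      return 0;
    }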
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
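The matched pattern is simply an "is non-negative" test, which a single compare against zero can answer; an i32 scalar illustration (the width is assumed for the example):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t X = -3; X <= 3; ++X) {
        uint32_t U = (uint32_t)X;
        int Pattern = (int)((U >> 31) & 1) ^ 1; // XOR(TRUNCATE(SRL(X, 31)), 1)
        assert(Pattern == (X >= 0));            // same as a sign test vs. zero
      }
      return 0;
    }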
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
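A no-op mask leaves every lane in place or marks it undef (negative sentinel); a minimal sketch of that check, with an illustrative name:

    #include <cassert>
    #include <vector>

    static bool isNoopMask(const std::vector<int> &Mask) {
      for (int I = 0, E = (int)Mask.size(); I != E; ++I)
        if (Mask[I] >= 0 && Mask[I] != I)
          return false;
      return true;
    }

    int main() {
      assert(isNoopMask({0, -1, 2, 3}));
      assert(!isNoopMask({1, 0, 2, 3}));
      return 0;
    }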
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Handle the case where a zero/all-bits result is bitwise-anded with a low-bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesConstants(ArrayRef< SDValue > Ops, ArrayRef< int > Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
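The rewrite is sound because a no-wrap narrow add extended afterwards equals the add of the extended operands; a small check of that fact (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      int8_t X = 100, C = 20;    // 100 + 20 fits in i8, i.e. add_nsw
      assert((int32_t)(int8_t)(X + C) == (int32_t)X + (int32_t)C);

      uint8_t UX = 200, UC = 50; // 200 + 50 fits in u8, i.e. add_nuw
      assert((uint32_t)(uint8_t)(UX + UC) == (uint32_t)UX + (uint32_t)UC);
      return 0;
    }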
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
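This is the same tie-breaking behavior as C's round(); a few worked cases:

    #include <cassert>
    #include <cmath>

    int main() {
      assert(std::round(0.5) == 1.0);   // tie rounds away from zero
      assert(std::round(-0.5) == -1.0);
      assert(std::round(2.5) == 3.0);   // not the 2.0 of round-to-nearest-even
      return 0;
    }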
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG)
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then concatenate the result back.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
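A scalar bit-level view of that observation for f32: FABS clears the sign bit with an AND mask, FNEG flips it with an XOR mask (a sketch, not the lowering itself):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t bits(float F) { uint32_t B; std::memcpy(&B, &F, 4); return B; }
    static float flt(uint32_t B) { float F; std::memcpy(&F, &B, 4); return F; }

    int main() {
      float X = -3.5f;
      float Abs = flt(bits(X) & 0x7FFFFFFFu); // FABS: clear the sign bit
      float Neg = flt(bits(X) ^ 0x80000000u); // FNEG: flip the sign bit
      assert(Abs == 3.5f && Neg == 3.5f);
      return 0;
    }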
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
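Scalar analogue of the fold: xor-ing an i1 compare result with 1 yields the inverted compare, e.g. (a < b) ^ 1 becomes a >= b.

    #include <cassert>

    int main() {
      for (int A = -2; A <= 2; ++A)
        for (int B = -2; B <= 2; ++B)
          assert(((A < B) ^ 1) == (A >= B)); // xor with 1 inverts the condition
      return 0;
    }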
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
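A minimal sketch (using the SelectionDAG helpers documented further below, e.g. GetSplitDestVTs and SplitVector) of the split-and-concatenate shape such a lowering takes for a unary vector operation; splitUnaryVectorOp is a hypothetical name used for illustration, not the LLVM helper itself:

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;
    // Sketch only: split the operand, apply the same opcode to each half,
    // then glue the halves back together with CONCAT_VECTORS.
    static SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG,
                                      const SDLoc &dl) {
      EVT VT = Op.getValueType();
      auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
      auto [Lo, Hi] = DAG.SplitVector(Op.getOperand(0), dl, LoVT, HiVT);
      SDValue LoRes = DAG.getNode(Op.getOpcode(), dl, LoVT, Lo);
      SDValue HiRes = DAG.getNode(Op.getOpcode(), dl, HiVT, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoRes, HiRes);
    }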
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
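A rough sketch of the scan such a predicate performs, assuming the X86 shuffle mask convention in which undef elements use the sentinel -1 and zeroable elements the sentinel -2 (SM_SentinelZero); the function name is illustrative only:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/STLExtras.h"
    // Sketch: true if any mask element is the "zeroable" sentinel.
    static bool isAnyZeroSketch(llvm::ArrayRef<int> Mask) {
      const int SentinelZero = -2; // assumed value of SM_SentinelZero
      return llvm::any_of(Mask, [=](int M) { return M == SentinelZero; });
    }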
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
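A scalar illustration in plain C++ (not DAG code) of what the AND mask plus PACKUS step computes for a single 16-bit lane when truncating to 8 bits; the helper name is hypothetical:

    #include <cstdint>
    // Each wide lane is masked to its low 8 bits first, so the unsigned
    // saturation performed by the PACKUS step never actually clamps.
    static uint8_t truncLaneViaPackus(uint16_t Lane) {
      uint16_t Masked = Lane & 0xFF;                 // in-reg zero extension
      return Masked > 255 ? 255 : uint8_t(Masked);   // PACKUS-style saturate
    }
    // truncLaneViaPackus(0x1234) == 0x34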
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
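A sketch of the range check such a helper performs, assuming -1 is the undef sentinel; the function name is illustrative, not the LLVM implementation:

    #include "llvm/ADT/ArrayRef.h"
    // Sketch: elements [Pos, Pos+Size) must be undef (-1) or follow the
    // arithmetic sequence Low, Low+Step, Low+2*Step, ...
    static bool isSequentialOrUndefSketch(llvm::ArrayRef<int> Mask, unsigned Pos,
                                          unsigned Size, int Low, int Step = 1) {
      for (unsigned I = 0; I != Size; ++I) {
        int M = Mask[Pos + I];
        if (M != -1 && M != Low + int(I) * Step)
          return false;
      }
      return true;
    }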
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
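In scalar terms the rewrite is roughly the following (illustration only): pushing the add into both arms may let each constant fold into its arm (e.g. as an add-with-immediate or LEA) instead of being materialized separately before the CMOV:

    #include <cstdint>
    // Before: add (cmov C1, C2), X     After: cmov (add X, C1), (add X, C2)
    static int64_t beforeFold(bool Cond, int64_t X) { return (Cond ? 3 : 5) + X; }
    static int64_t afterFold(bool Cond, int64_t X)  { return Cond ? (X + 3) : (X + 5); }
    // beforeFold(c, x) == afterFold(c, x) for all inputs.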
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
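A scalar view of the pattern (illustration only): XOR with -1 is bitwise NOT, so the matched expression is exactly the and-not that ANDNP/PANDN compute:

    #include <cstdint>
    #include <cassert>
    static void checkAndNot(uint32_t X, uint32_t Y) {
      uint32_t Original = (X ^ 0xFFFFFFFFu) & Y; // (and (xor X, -1), Y)
      uint32_t AndNot   = ~X & Y;                // andnp X, Y
      assert(Original == AndNot);
    }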
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5282
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:5307
void clearSign()
Definition: APFloat.h:1203
opStatus next(bool nextDown)
Definition: APFloat.h:1159
void changeSign()
Definition: APFloat.h:1202
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:982
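A small usage sketch of these APFloat sign helpers, assuming IEEE single-precision semantics; the demo function name is illustrative:

    #include "llvm/ADT/APFloat.h"
    using llvm::APFloat;
    void apFloatSignDemo() {
      APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
      NegZero.clearSign();   // now +0.0
      NegZero.changeSign();  // back to -0.0
      APFloat AllOnes = APFloat::getAllOnesValue(APFloat::IEEEsingle());
      (void)AllOnes;         // the all-ones integer bit pattern viewed as a float
    }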
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1386
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:428
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:208
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:402
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1628
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1365
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1471
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:185
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1309
APInt abs() const
Get the absolute value.
Definition: APInt.h:1752
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:350
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:237
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:359
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:445
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1319
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1090
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:188
bool isMinValue() const
Determine if this is the smallest unsigned value.
Definition: APInt.h:396
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:195
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:308
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1228
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1058
int32_t exactLogBase2() const
Definition: APInt.h:1740
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1376
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:813
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1597
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:414
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1586
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1556
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
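As a usage sketch, splatting an 8-bit pattern over 32 bits should repeat it in every byte:

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;
    // APInt::getSplat(32, APInt(8, 0xAB)) yields 0xABABABAB.
    bool splatDemo() {
      APInt Splat = APInt::getSplat(32, APInt(8, 0xAB));
      return Splat == APInt(32, 0xABABABABu);
    }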
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:198
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1490
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1413
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1573
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1396
unsigned logBase2() const
Definition: APInt.h:1718
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1298
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:450
bool isMask(unsigned numBits) const
Definition: APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:384
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:313
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1129
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1346
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:852
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1236
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:419
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:320
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:179
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1368
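A short usage sketch tying several of these bit-manipulation helpers together (the demo function name is illustrative):

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using llvm::APInt;
    void apIntBitsDemo() {
      APInt Lo = APInt::getLowBitsSet(32, 8);   // 0x000000FF
      APInt Hi = APInt::getHighBitsSet(32, 8);  // 0xFF000000
      assert(Lo.isMask(8) && Hi.isSignBitSet());
      APInt V = APInt::getZero(32);
      V.setLowBits(4);                          // 0x0000000F
      assert(V.getZExtValue() == 0xF);
    }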
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:411
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:368
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:265
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:218
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:837
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:830
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1614
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1200
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:378
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:643
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:808
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:707
@ Add
*p = old + v
Definition: Instructions.h:711
@ FAdd
*p = old + v
Definition: Instructions.h:732
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ Sub
*p = old - v
Definition: Instructions.h:713
@ And
*p = old & v
Definition: Instructions.h:715
@ Xor
*p = old ^ v
Definition: Instructions.h:721
@ FSub
*p = old - v
Definition: Instructions.h:735
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:747
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:743
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:739
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:751
@ Nand
*p = ~(old & v)
Definition: Instructions.h:717
Value * getPointerOperand()
Definition: Instructions.h:851
BinOp getOperation() const
Definition: Instructions.h:786
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:842
Value * getValOperand()
Definition: Instructions.h:855
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:828
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:391
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:890
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
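A hedged sketch of how these helpers are typically used to query a constant splat; getSplatBits is an illustrative wrapper, not an LLVM API:

    #include "llvm/ADT/APInt.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;
    // Sketch: returns true and fills SplatBits when BV is a constant splat
    // with no undef elements.
    static bool getSplatBits(const BuildVectorSDNode *BV, APInt &SplatBits) {
      APInt SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      return BV->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                 HasAnyUndefs) &&
             !HasAnyUndefs;
    }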
Value * getCalledOperand() const
Definition: InstrTypes.h:1458
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2916
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Tagged union holding either a T or a Error.
Definition: Error.h:481
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:716
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:728
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:274
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:868
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1934
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:690
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:914
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:567
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:399
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:271
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:173
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:238
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:241
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:246
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to \New call site info.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
const MCContext & getContext() const
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:333
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this is an UNDEF node.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:364
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:938
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:968
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
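A sketch of the equivalence the helper encodes: getNOT builds the same XOR-against-all-ones pattern one could spell out by hand (the demo function name is illustrative):

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;
    void buildNotBothWays(SelectionDAG &DAG, const SDLoc &DL, SDValue V, EVT VT) {
      SDValue ViaHelper = DAG.getNOT(DL, V, VT);
      SDValue ViaXor = DAG.getNode(ISD::XOR, DL, VT, V,
                                   DAG.getAllOnesConstant(DL, VT));
      (void)ViaHelper; (void)ViaXor; // both describe (xor V, -1)
    }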
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:451
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:746
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:673
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
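For illustration, a hedged sketch of getNode/getConstant usage; the helper and its operands are assumptions, not part of this file.
#include "llvm/CodeGen/SelectionDAG.h"
// Illustrative only: build (add (and X, Y), 1). getNode() uniques nodes, so
// requesting the same opcode/operands again returns the existing node rather
// than creating a duplicate.
static llvm::SDValue buildMaskedIncrement(llvm::SelectionDAG &DAG,
                                          const llvm::SDLoc &DL,
                                          llvm::SDValue X, llvm::SDValue Y) {
  llvm::EVT VT = X.getValueType();
  llvm::SDValue Masked = DAG.getNode(llvm::ISD::AND, DL, VT, X, Y);
  return DAG.getNode(llvm::ISD::ADD, DL, VT, Masked,
                     DAG.getConstant(1, DL, VT));
}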
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:782
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:859
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
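The *ExtOrTrunc helpers choose between an extension and a truncation by comparing type widths. A rough restatement of the zero-extending flavour, as a sketch only and not the library's implementation:
#include "llvm/CodeGen/SelectionDAG.h"
// Sketch: extend when the destination is wider, truncate when narrower, and
// return the value unchanged when the types already match.
static llvm::SDValue zextOrTrunc(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                 llvm::SDValue Op, llvm::EVT VT) {
  llvm::EVT SrcVT = Op.getValueType();
  if (SrcVT == VT)
    return Op;
  unsigned Opc =
      SrcVT.bitsLT(VT) ? llvm::ISD::ZERO_EXTEND : llvm::ISD::TRUNCATE;
  return DAG.getNode(Opc, DL, VT, Op);
}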
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
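A hedged sketch relating MaskedValueIsZero to computeKnownBits; the helper name is hypothetical and assumes it runs where a SelectionDAG and an integer-typed SDValue are in scope.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
// Sketch only: decide whether the low 8 bits of Op are provably zero.
static bool lowByteKnownZero(llvm::SelectionDAG &DAG, llvm::SDValue Op) {
  unsigned Bits = Op.getValueType().getScalarSizeInBits();
  llvm::APInt LowByte = llvm::APInt::getLowBitsSet(Bits, 8);
  // Two equivalent formulations: consult the full KnownBits, or ask the
  // convenience wrapper directly.
  llvm::KnownBits Known = DAG.computeKnownBits(Op);
  bool ViaKnown = (Known.Zero & LowByte) == LowByte;
  bool ViaMask = DAG.MaskedValueIsZero(Op, LowByte);
  return ViaKnown && ViaMask; // both should agree
}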
std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:753
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:922
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
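A minimal, self-contained usage sketch of SmallSet's insert/size behaviour; the values are illustrative only.
#include "llvm/ADT/SmallSet.h"
#include <cassert>
int main() {
  llvm::SmallSet<int, 4> Seen;   // stays allocation-free while holding <= 4 elements
  auto R1 = Seen.insert(7);      // first insert succeeds
  auto R2 = Seen.insert(7);      // duplicate is rejected
  assert(R1.second && !R2.second);
  assert(Seen.size() == 1);
  return 0;
}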
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
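A minimal, self-contained SmallVector sketch covering push_back/append/resize/erase; the values are illustrative only.
#include "llvm/ADT/SmallVector.h"
#include <cassert>
int main() {
  llvm::SmallVector<int, 8> V;   // inline storage for 8 elements, heap only beyond that
  V.push_back(1);
  V.append({2, 3, 4});
  V.resize(6);                   // new elements are value-initialized to 0
  V.erase(V.begin() + 1);        // drops the element with value 2
  assert(V.size() == 5 && V[0] == 1 && V[1] == 3);
  return 0;
}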
An instruction for storing to memory.
Definition: Instructions.h:289
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:564
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:258
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:270
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:171
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:251
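A small, self-contained sketch of the StringRef queries listed above; the sample strings are arbitrary.
#include "llvm/ADT/StringRef.h"
#include <cassert>
int main() {
  llvm::StringRef Triple("x86_64-pc-linux-gnu");
  assert(Triple.starts_with("x86_64"));
  assert(Triple.ends_with("gnu"));
  assert(Triple.substr(0, 6) == "x86_64");
  assert(llvm::StringRef("AVX512").equals_insensitive("avx512"));
  assert(llvm::StringRef("aaab").find_first_not_of('a') == 3);
  return 0;
}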
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
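A self-contained StringSwitch sketch mapping a string to an enum with a Default fallback; the enum and strings are illustrative, not taken from this file.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <cassert>
enum class Level { None, SSE, AVX };
static Level parseLevel(llvm::StringRef S) {
  return llvm::StringSwitch<Level>(S)
      .Case("sse", Level::SSE)
      .Case("avx", Level::AVX)
      .Default(Level::None);
}
int main() {
  assert(parseLevel("avx") == Level::AVX);
  assert(parseLevel("mmx") == Level::None);
  return 0;
}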
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
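A minimal sketch of StructType::get building a literal struct type; the member types chosen are arbitrary.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
int main() {
  llvm::LLVMContext Ctx;
  // A literal struct { i32, float } built through the primary factory method.
  llvm::StructType *ST = llvm::StructType::get(
      Ctx, {llvm::Type::getInt32Ty(Ctx), llvm::Type::getFloatTy(Ctx)},
      /*isPacked=*/false);
  (void)ST;
  return 0;
}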
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
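The action-setting hooks above are normally invoked from a target's TargetLowering constructor to describe what the hardware supports. A generic, hedged fragment follows; the register class, types, and actions are placeholders, not X86's actual configuration.
// Inside a hypothetical target's TargetLowering constructor (sketch only):
addRegisterClass(MVT::i32, &Demo::GR32RegClass);        // hypothetical class: i32 lives in GR32
setOperationAction(ISD::SDIV, MVT::i32, Expand);        // no native divide
setOperationAction(ISD::CTPOP, MVT::i32, Custom);       // lowered by hand
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setTruncStoreAction(MVT::i32, MVT::i1, Expand);
computeRegisterProperties(Subtarget.getRegisterInfo()); // derive the rest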
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
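A small, self-contained sketch of the Type queries above; the concrete types are arbitrary.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
int main() {
  llvm::LLVMContext Ctx;
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::Type *V4F32 =
      llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 4);
  assert(I32->isIntegerTy() &&
         I32->getPrimitiveSizeInBits().getFixedValue() == 32);
  assert(V4F32->isVectorTy() && V4F32->getScalarSizeInBits() == 32);
  return 0;
}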
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:248
bool hasAnyFMA() const
Definition: X86Subtarget.h:213
bool isOSWindows() const
Definition: X86Subtarget.h:334
bool isTargetMachO() const
Definition: X86Subtarget.h:300
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:233
bool hasSSE1() const
Definition: X86Subtarget.h:200
bool hasThreeDNow() const
Definition: X86Subtarget.h:211
bool isPICStyleGOT() const
Definition: X86Subtarget.h:340
bool hasSSE42() const
Definition: X86Subtarget.h:205
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:125
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:288
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:343
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:312
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:192
bool isTargetDarwin() const
Definition: X86Subtarget.h:292
bool isTargetWin64() const
Definition: X86Subtarget.h:336
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:185
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:290
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool useAVX512Regs() const
Definition: X86Subtarget.h:265
bool hasSSE3() const
Definition: X86Subtarget.h:202
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:349
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:244
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasMMX() const
Definition: X86Subtarget.h:210
bool isTargetELF() const
Definition: X86Subtarget.h:298
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:220
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:193
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:201
bool hasSSSE3() const
Definition: X86Subtarget.h:203
bool hasInt256() const
Definition: X86Subtarget.h:209
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:341
bool isTargetCygMing() const
Definition: X86Subtarget.h:332
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:296
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:324
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:237
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:328
bool isTargetNaCl64() const
Definition: X86Subtarget.h:308
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool useBWIRegs() const
Definition: X86Subtarget.h:274
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:207
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has an add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has an icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1147
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1143
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:737
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:484
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:505
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1290
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1176
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1292
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ STRICT_FCEIL
Definition: ISDOpcodes.h:434
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1293
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:130
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1023
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:495
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1275
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:444
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1249
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1254
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:834
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:485
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:927
@ STRICT_FLOG2
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1288
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1289
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1220
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1431
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1123
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:141
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:899
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:670
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:458
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:628
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1068
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:736
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1242
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1009
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:772
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1291
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1334
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:635
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1172
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:438
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest (ties to even), 2 = Round to ...
Definition: ISDOpcodes.h:894
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:930
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:929
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1286
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:999
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:443
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:432
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1232
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:870
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:433
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1294
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1236
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:823
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ STRICT_FROUND
Definition: ISDOpcodes.h:436
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:457
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:938
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:435
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:437
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:936
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1284
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:451
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:473
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:450
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1285
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:478
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1028
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:939
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:665
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:428
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:426
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1283
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:845
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:921
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:431
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:869
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1167
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1091
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1335
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:430
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1033
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1226
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
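The ISD opcodes listed above are the target-independent node kinds that lowering code builds through SelectionDAG::getNode. As a minimal sketch (not code from this file; Op, DAG and the i8/i32 types are assumptions for illustration), a lowering hook might widen a small add and truncate the result back:

SDValue lowerSmallAddSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();                       // assume an i8 add
  SDValue LHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  SDValue RHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(1));
  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i32, LHS, RHS);
  return DAG.getNode(ISD::TRUNCATE, DL, VT, Add);   // TRUNCATE drops the high bits
}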
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1630
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1625
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1446
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1612
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1587
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1593
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
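The ISD node predicates above are typically used as guards before a fold. A minimal sketch, assuming N is an SDNode* visited by a DAG combine and CC/VT come from a SETCC node:

// Only fold when operand 0 is a plain (non-extending, unindexed) load and
// operand 1 is an all-zeros/undef BUILD_VECTOR.
if (ISD::isNormalLoad(N->getOperand(0).getNode()) &&
    ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) {
  // Invert the comparison instead of emitting an extra NOT.
  ISD::CondCode InvCC = ISD::getSetCCInverse(CC, VT);
  // ...
}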
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:972
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
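These PatternMatch helpers operate on IR-level Values rather than DAG nodes. A minimal sketch, assuming V is a llvm::Value* being inspected:

using namespace llvm::PatternMatch;
Value *X;
const APInt *C;
// Match a one-use (and X, C) with operands in either order, bind the
// constant to C, then check that the constant isolates the sign bit.
if (match(V, m_OneUse(m_c_And(m_Value(X), m_APInt(C)))) && C->isSignMask()) {
  // V computes the sign bit of X.
}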
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
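These RTLIB helpers pick a runtime-library routine for a conversion that cannot be lowered inline. A minimal sketch, assuming SrcVT and DstVT are the source and destination EVTs in a lowering routine:

RTLIB::Libcall LC = RTLIB::getFPTOSINT(SrcVT, DstVT);
if (LC == RTLIB::UNKNOWN_LIBCALL)
  report_fatal_error("no libcall available for this fp-to-int conversion");
// Otherwise emit a call to the routine named by LC, e.g. through
// TargetLowering::makeLibCall.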
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:209
@ FS
Definition: X86.h:206
@ PTR64
Definition: X86.h:210
@ PTR32_SPTR
Definition: X86.h:208
@ GS
Definition: X86.h:205
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:427
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:407
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:504
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:466
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:448
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:472
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:454
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:492
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:419
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:379
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:488
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:476
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:441
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:496
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:460
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:435
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:403
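The MO_* target flags above are attached to symbol operands to select the relocation or stub kind. A minimal sketch, assuming GV, DL, PtrVT and DAG come from a lowering routine and that a GOT-relative reference is wanted:

unsigned char OpFlag = X86II::MO_GOTPCREL;   // assumption: PIC code, GOT access
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, OpFlag);
// Wrap the target address so it is materialized RIP-relative on x86-64.
Addr = DAG.getNode(X86ISD::WrapperRIP, DL, PtrVT, Addr);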
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
@ AddrNumOperands
Definition: X86BaseInfo.h:36
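The X86ISD opcodes above are the target-specific nodes this file lowers to. A minimal sketch of the common compare-then-setcc pattern, assuming LHS, RHS, DL and DAG are in scope and equality is the condition of interest:

// Produce EFLAGS with an X86 compare, then materialize the i8 condition bit.
SDValue EFLAGS = DAG.getNode(X86ISD::CMP, DL, MVT::i32, LHS, RHS);
SDValue SetCC  = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getTargetConstant(X86::COND_E, DL, MVT::i8),
                             EFLAGS);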
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
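These X86 helpers answer folding questions during combines. A minimal sketch, assuming Src is an SDValue operand, Subtarget is the current X86Subtarget, and CC is an X86::CondCode:

// Prefer keeping the load only if it can become the memory operand of the
// new instruction; otherwise bail out of the fold.
if (!X86::mayFoldLoad(Src, Subtarget))
  return SDValue();
// Flip the condition instead of inserting an extra inversion.
X86::CondCode InvCC = X86::GetOppositeBranchCondition(CC);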
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:31
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1540
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
@ SM_SentinelUndef
@ SM_SentinelZero
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2013
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1522
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
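A small arithmetic illustration of the bit utilities above (the constant is an arbitrary example):

uint64_t Imm = 0x00ff0000;                   // bits 16..23 set
unsigned Ones  = llvm::popcount(Imm);        // 8 set bits
unsigned Shift = llvm::countr_zero(Imm);     // 16 trailing zeros
bool     Pow2  = llvm::isPowerOf2_64(Imm);   // false: more than one bit set
unsigned Floor = llvm::Log2_64(Imm);         // 23, the floor log base 2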
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
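The range helpers above replace explicit begin/end loops. A minimal sketch over a shuffle mask, assuming Mask is an ArrayRef<int> using the SM_SentinelUndef/SM_SentinelZero conventions listed earlier:

// All defined lanes must stay in place for the mask to be an identity shuffle.
bool IsIdentity = llvm::all_of(llvm::enumerate(Mask), [](const auto &E) {
  return E.value() < 0 || (unsigned)E.value() == E.index();
});
// Count lanes that must be forced to zero.
int NumZero = llvm::count_if(Mask, [](int M) { return M == SM_SentinelZero; });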
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
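The Decode*Mask helpers above turn instruction immediates or constant pools into canonical shuffle masks. A minimal sketch with made-up values for the element count and immediate:

SmallVector<int, 8> ShuffleMask;
// For a v8i16 BLEND with immediate 0xA1: lanes whose bit is set select from
// the second source, so those mask entries are offset by NumElts (8).
DecodeBLENDMask(/*NumElts=*/8, /*Imm=*/0xA1, ShuffleMask);
// ShuffleMask is now {8, 1, 2, 3, 4, 13, 6, 15}.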
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) nee...
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:271
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:246
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:250
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:288
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:273
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:317
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:272
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:269
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:270
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:262
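The APFloat semantics and rounding-mode accessors above come up when constant-folding FP operations. A minimal sketch converting a double constant down to single precision:

APFloat Val(3.75);                 // starts in IEEEdouble semantics
bool LosesInfo = false;
APFloat::opStatus St =
    Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
// St reports inexact/overflow conditions; LosesInfo says whether precision
// was lost (false here, since 3.75 is exactly representable as a float).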
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:274
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
static KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:753
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:488
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:175
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:231
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:278
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:82
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:161
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:214
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:169
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:185
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:51
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:94
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:88
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:797
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:526
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:79
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:56
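KnownBits values usually come from SelectionDAG::computeKnownBits and are then combined with the static helpers above. A minimal sketch, assuming LHSKnown and RHSKnown were computed for the two operands of an add:

KnownBits Sum = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
                                            /*NUW=*/false, LHSKnown, RHSKnown);
if (Sum.isNonNegative() && Sum.countMinTrailingZeros() >= 2) {
  // The sum is provably a non-negative multiple of 4.
}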
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
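CallLoweringInfo is the builder used when a node has to be lowered to an actual call. A minimal sketch, assuming Chain, Callee, RetTy, Args, dl, DAG and TLI come from the surrounding lowering routine:

TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
    .setChain(Chain)
    .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
// Returns {result value, output chain}.
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);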
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.