1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus being split into multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
99 "x86-br-merging-likely-bias", cl::init(0),
100 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
101 "that all conditionals will be executed. For example for merging "
102 "the conditionals (a == b && c > d), if it's known that a == b is "
103 "likely, then it is likely that if the conditionals are split "
104 "both sides will be executed, so it may be desirable to increase "
105 "the instruction cost threshold. Set to -1 to never merge likely "
106 "branches."),
107 cl::Hidden);
108
110 "x86-br-merging-unlikely-bias", cl::init(-1),
111 cl::desc(
112 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
113 "that all conditionals will be executed. For example for merging "
114 "the conditionals (a == b && c > d), if it's known that a == b is "
115 "unlikely, then it is unlikely that if the conditionals are split "
116 "both sides will be executed, so it may be desirable to decrease "
117 "the instruction cost threshold. Set to -1 to never merge unlikely "
118 "branches."),
119 cl::Hidden);
120
122 "mul-constant-optimization", cl::init(true),
123 cl::desc("Replace 'mul x, Const' with more effective instructions like "
124 "SHIFT, LEA, etc."),
125 cl::Hidden);
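// For example, with this enabled "mul x, 9" can be selected as a single
// "lea reg, [reg + reg*8]" instead of an imul, and "mul x, 40" becomes a
// shift plus an LEA.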
126
128 const X86Subtarget &STI)
129 : TargetLowering(TM), Subtarget(STI) {
130 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
131 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
132
133 // Set up the TargetLowering object.
134
135 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
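 // (Variable shifts take their amount in CL, so even a 64-bit shift only
 // consumes an i8 amount, and vector compares such as PCMPEQ/PCMPGT produce
 // all-ones or all-zero lanes.)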
139
140 // X86 instruction cache is coherent with its data cache so we can use the
141 // default expansion to a no-op.
143
144 // For 64-bit, since we have so many registers, use the ILP scheduler.
145 // For 32-bit, use the register pressure specific scheduling.
146 // For Atom, always use ILP scheduling.
147 if (Subtarget.isAtom())
149 else if (Subtarget.is64Bit())
151 else
153 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
155
156 // Bypass expensive divides and use cheaper ones.
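 // (e.g. addBypassSlowDiv(32, 8) inserts a run-time check so that a 32-bit
 // division whose operands fit in 8 bits is done with the much cheaper 8-bit
 // divide instruction.)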
157 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
158 if (Subtarget.hasSlowDivide32())
159 addBypassSlowDiv(32, 8);
160 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
161 addBypassSlowDiv(64, 32);
162 }
163
164 // Setup Windows compiler runtime calls.
165 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
166 static const struct {
167 const RTLIB::Libcall Op;
168 const char * const Name;
169 const CallingConv::ID CC;
170 } LibraryCalls[] = {
171 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
172 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
173 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
174 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
175 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
176 };
177
178 for (const auto &LC : LibraryCalls) {
179 setLibcallName(LC.Op, LC.Name);
180 setLibcallCallingConv(LC.Op, LC.CC);
181 }
182 }
183
184 if (Subtarget.canUseCMPXCHG16B())
186 else if (Subtarget.canUseCMPXCHG8B())
188 else
190
191 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
192
194
195 // Set up the register classes.
196 addRegisterClass(MVT::i8, &X86::GR8RegClass);
197 addRegisterClass(MVT::i16, &X86::GR16RegClass);
198 addRegisterClass(MVT::i32, &X86::GR32RegClass);
199 if (Subtarget.is64Bit())
200 addRegisterClass(MVT::i64, &X86::GR64RegClass);
201
202 for (MVT VT : MVT::integer_valuetypes())
204
205 // We don't accept any truncstore of integer registers.
206 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
209 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
212
213 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
214
215 // SETOEQ and SETUNE require checking two conditions.
216 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
219 }
220
221 // Integer absolute.
222 if (Subtarget.canUseCMOV()) {
223 setOperationAction(ISD::ABS , MVT::i16 , Custom);
224 setOperationAction(ISD::ABS , MVT::i32 , Custom);
225 if (Subtarget.is64Bit())
226 setOperationAction(ISD::ABS , MVT::i64 , Custom);
227 }
228
229 // Absolute difference.
230 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
231 setOperationAction(Op , MVT::i8 , Custom);
232 setOperationAction(Op , MVT::i16 , Custom);
233 setOperationAction(Op , MVT::i32 , Custom);
234 if (Subtarget.is64Bit())
235 setOperationAction(Op , MVT::i64 , Custom);
236 }
237
238 // Signed saturation subtraction.
242 if (Subtarget.is64Bit())
244
245 // Funnel shifts.
246 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
247 // For slow shld targets we only lower for code size.
248 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
249
250 setOperationAction(ShiftOp , MVT::i8 , Custom);
251 setOperationAction(ShiftOp , MVT::i16 , Custom);
252 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
253 if (Subtarget.is64Bit())
254 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
255 }
256
257 if (!Subtarget.useSoftFloat()) {
258 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
259 // operation.
264 // We have an algorithm for SSE2, and we turn this into a 64-bit
265 // FILD or VCVTUSI2SS/SD for other targets.
268 // We have an algorithm for SSE2->double, and we turn this into a
269 // 64-bit FILD followed by conditional FADD for other targets.
272
273 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
274 // this operation.
277 // SSE has no i16 to fp conversion, only i32. We promote in the handler
278 // to allow f80 to use i16, and f64 to use i16 when only SSE1 is available.
281 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
284 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
285 // are Legal, f80 is custom lowered.
288
289 // Promote i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
290 // this operation.
292 // FIXME: This doesn't generate invalid exception when it should. PR44019.
298 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
299 // are Legal, f80 is custom lowered.
302
303 // Handle FP_TO_UINT by promoting the destination to a larger signed
304 // conversion.
306 // FIXME: This doesn't generate invalid exception when it should. PR44019.
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
315
320
321 if (!Subtarget.is64Bit()) {
324 }
325 }
326
327 if (Subtarget.hasSSE2()) {
328 // Custom lowering for saturating float to int conversions.
329 // We handle promotion to larger result types manually.
330 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
333 }
334 if (Subtarget.is64Bit()) {
337 }
338 }
339
340 // Handle address space casts between mixed sized pointers.
343
344 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
345 if (!Subtarget.hasSSE2()) {
348 if (Subtarget.is64Bit()) {
350 // Without SSE, i64->f64 goes through memory.
352 }
353 } else if (!Subtarget.is64Bit())
355
356 // Scalar integer divide and remainder are lowered to use operations that
357 // produce two results, to match the available instructions. This exposes
358 // the two-result form to trivial CSE, which is able to combine x/y and x%y
359 // into a single instruction.
360 //
361 // Scalar integer multiply-high is also lowered to use two-result
362 // operations, to match the available instructions. However, plain multiply
363 // (low) operations are left as Legal, as there are single-result
364 // instructions for this in x86. Using the two-result multiply instructions
365 // when both high and low results are needed must be arranged by dagcombine.
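 // For example, x86's DIV/IDIV leaves the quotient in EAX and the remainder
 // in EDX, so once the two-result node is CSE'd both x/y and x%y are served
 // by a single instruction.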
366 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 }
374
375 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
377 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
378 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
381 }
382 if (Subtarget.is64Bit())
387
388 setOperationAction(ISD::FREM , MVT::f32 , Expand);
389 setOperationAction(ISD::FREM , MVT::f64 , Expand);
390 setOperationAction(ISD::FREM , MVT::f80 , Expand);
391 setOperationAction(ISD::FREM , MVT::f128 , Expand);
392
393 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
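 // (The promoted form operates on a 32-bit register and is emitted as
 // "rep bsf"/tzcnt, avoiding the 16-bit false dependency.)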
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
410
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
414 if (Subtarget.is64Bit()) {
415 setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
416 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
418 }
419 }
420
421 if (Subtarget.hasLZCNT()) {
422 // When promoting the i8 variants, force them to i32 for a shorter
423 // encoding.
424 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
426 } else {
427 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
428 if (VT == MVT::i64 && !Subtarget.is64Bit())
429 continue;
432 }
433 }
434
437 // Special handling for half-precision floating point conversions.
438 // If we don't have F16C support, then lower half float conversions
439 // into library calls.
441 Op, MVT::f32,
442 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
443 // There's never any support for operations beyond MVT::f32.
444 setOperationAction(Op, MVT::f64, Expand);
445 setOperationAction(Op, MVT::f80, Expand);
446 setOperationAction(Op, MVT::f128, Expand);
447 }
448
449 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
452 }
453
454 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
455 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
456 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
457 setTruncStoreAction(VT, MVT::f16, Expand);
458 setTruncStoreAction(VT, MVT::bf16, Expand);
459
462 }
463
467 if (Subtarget.is64Bit())
469 if (Subtarget.hasPOPCNT()) {
470 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
471 // popcntw is longer to encode than popcntl and also has a false dependency
472 // on the dest that popcntl hasn't had since Cannon Lake.
473 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
474 } else {
479 }
480
482
483 if (!Subtarget.hasMOVBE())
485
486 // X86 wants to expand cmov itself.
487 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
492 }
493 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
494 if (VT == MVT::i64 && !Subtarget.is64Bit())
495 continue;
498 }
499
500 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
503
505 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
506 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
510 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
511 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
512
513 // Darwin ABI issue.
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
523 }
524
525 // 64-bit shl, sra, srl (iff 32-bit x86)
526 for (auto VT : { MVT::i32, MVT::i64 }) {
527 if (VT == MVT::i64 && !Subtarget.is64Bit())
528 continue;
532 }
533
534 if (Subtarget.hasSSEPrefetch())
536
538
539 // Expand certain atomics
540 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
548 }
549
550 if (!Subtarget.is64Bit())
552
553 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
554 // All CPUs supporting AVX will atomically load/store aligned 128-bit
555 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
558 }
559
560 if (Subtarget.canUseCMPXCHG16B())
562
563 // FIXME - use subtarget debug flags
564 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
565 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
566 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 }
569
572
575
576 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 if (Subtarget.isTargetPS())
580 else
582
583 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VAEND , MVT::Other, Expand);
586 bool Is64Bit = Subtarget.is64Bit();
587 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
588 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
589
592
594
595 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
598
600
601 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
602 setOperationAction(ISD::FABS, VT, Action);
603 setOperationAction(ISD::FNEG, VT, Action);
605 setOperationAction(ISD::FREM, VT, Action);
606 setOperationAction(ISD::FMA, VT, Action);
607 setOperationAction(ISD::FMINNUM, VT, Action);
608 setOperationAction(ISD::FMAXNUM, VT, Action);
611 setOperationAction(ISD::FSIN, VT, Action);
612 setOperationAction(ISD::FCOS, VT, Action);
613 setOperationAction(ISD::FSINCOS, VT, Action);
614 setOperationAction(ISD::FTAN, VT, Action);
615 setOperationAction(ISD::FSQRT, VT, Action);
616 setOperationAction(ISD::FPOW, VT, Action);
617 setOperationAction(ISD::FLOG, VT, Action);
618 setOperationAction(ISD::FLOG2, VT, Action);
619 setOperationAction(ISD::FLOG10, VT, Action);
620 setOperationAction(ISD::FEXP, VT, Action);
621 setOperationAction(ISD::FEXP2, VT, Action);
622 setOperationAction(ISD::FEXP10, VT, Action);
623 setOperationAction(ISD::FCEIL, VT, Action);
624 setOperationAction(ISD::FFLOOR, VT, Action);
626 setOperationAction(ISD::FRINT, VT, Action);
627 setOperationAction(ISD::BR_CC, VT, Action);
628 setOperationAction(ISD::SETCC, VT, Action);
631 setOperationAction(ISD::FROUND, VT, Action);
633 setOperationAction(ISD::FTRUNC, VT, Action);
634 setOperationAction(ISD::FLDEXP, VT, Action);
635 };
636
637 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
638 // f16, f32 and f64 use SSE.
639 // Set up the FP register classes.
640 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
641 : &X86::FR16RegClass);
642 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
643 : &X86::FR32RegClass);
644 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
645 : &X86::FR64RegClass);
646
647 // Disable f32->f64 extload as we can only generate this in one instruction
648 // under optsize. So it's easier to pattern match (fpext (load)) for that
649 // case instead of needing to emit 2 instructions for extload in the
650 // non-optsize case.
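 // (Under optsize the extension folds into a single cvtss2sd with a memory
 // operand; otherwise a separate load followed by a register-to-register
 // cvtss2sd is preferred.)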
651 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
652
653 for (auto VT : { MVT::f32, MVT::f64 }) {
654 // Use ANDPD to simulate FABS.
656
657 // Use XORP to simulate FNEG.
659
660 // Use ANDPD and ORPD to simulate FCOPYSIGN.
662
663 // These might be better off as horizontal vector ops.
666
667 // We don't support sin/cos/fmod
671 }
672
673 // Half type will be promoted by default.
674 setF16Action(MVT::f16, Promote);
682
712
713 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
714 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
715
716 // Lower this to MOVMSK plus an AND.
719
720 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
721 (UseX87 || Is64Bit)) {
722 // Use SSE for f32, x87 for f64.
723 // Set up the FP register classes.
724 addRegisterClass(MVT::f32, &X86::FR32RegClass);
725 if (UseX87)
726 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
727
728 // Use ANDPS to simulate FABS.
730
731 // Use XORP to simulate FNEG.
733
734 if (UseX87)
736
737 // Use ANDPS and ORPS to simulate FCOPYSIGN.
738 if (UseX87)
741
742 // We don't support sin/cos/fmod
746
747 if (UseX87) {
748 // Always expand sin/cos functions even though x87 has an instruction.
752 }
753 } else if (UseX87) {
754 // f32 and f64 in x87.
755 // Set up the FP register classes.
756 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
757 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
758
759 for (auto VT : { MVT::f32, MVT::f64 }) {
762
763 // Always expand sin/cos functions even though x87 has an instruction.
767 }
768 }
769
770 // Expand FP32 immediates into loads from the stack, save special cases.
771 if (isTypeLegal(MVT::f32)) {
772 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
773 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
774 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
775 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
776 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
777 } else // SSE immediates.
778 addLegalFPImmediate(APFloat(+0.0f)); // xorps
779 }
780 // Expand FP64 immediates into loads from the stack, save special cases.
781 if (isTypeLegal(MVT::f64)) {
782 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
783 addLegalFPImmediate(APFloat(+0.0)); // FLD0
784 addLegalFPImmediate(APFloat(+1.0)); // FLD1
785 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
786 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
787 } else // SSE immediates.
788 addLegalFPImmediate(APFloat(+0.0)); // xorpd
789 }
790 // Support fp16 0 immediate.
791 if (isTypeLegal(MVT::f16))
792 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
793
794 // Handle constrained floating-point operations of scalar.
807
808 // We don't support FMA.
811
812 // f80 always uses X87.
813 if (UseX87) {
814 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
817 {
819 addLegalFPImmediate(TmpFlt); // FLD0
820 TmpFlt.changeSign();
821 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
822
823 bool ignored;
824 APFloat TmpFlt2(+1.0);
826 &ignored);
827 addLegalFPImmediate(TmpFlt2); // FLD1
828 TmpFlt2.changeSign();
829 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
830 }
831
832 // Always expand sin/cos functions even though x87 has an instruction.
833 // clang-format off
844 // clang-format on
845
857
858 // Handle constrained floating-point operations of scalar.
864 if (isTypeLegal(MVT::f16)) {
867 } else {
869 }
870 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
871 // as Custom.
873 }
874
875 // f128 uses xmm registers, but most operations require libcalls.
876 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
877 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
878 : &X86::VR128RegClass);
879
880 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
881
892
896
897 // clang-format off
905 // clang-format on
906 // No STRICT_FSINCOS
909
912 // We need to custom handle any FP_ROUND with an f128 input, but
913 // LegalizeDAG uses the result type to know when to run a custom handler.
914 // So we have to list all legal floating point result types here.
915 if (isTypeLegal(MVT::f32)) {
918 }
919 if (isTypeLegal(MVT::f64)) {
922 }
923 if (isTypeLegal(MVT::f80)) {
926 }
927
929
930 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
931 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
932 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
933 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
934 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
935 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
936 }
937
938 // Always use a library call for pow.
939 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
940 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
941 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
942 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
943
952
953 // Some FP actions are always expanded for vector types.
954 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
955 MVT::v4f32, MVT::v8f32, MVT::v16f32,
956 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
957 // clang-format off
971 // clang-format on
972 }
973
974 // First set operation action for all vector types to either promote
975 // (for widening) or expand (for scalarization). Then we will selectively
976 // turn on ones that can be effectively codegen'd.
1016 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1017 setTruncStoreAction(InnerVT, VT, Expand);
1018
1019 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1020 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1021
1022 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1023 // types, we have to deal with them whether we ask for Expansion or not.
1024 // Setting Expand causes its own optimisation problems though, so leave
1025 // them legal.
1026 if (VT.getVectorElementType() == MVT::i1)
1027 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1028
1029 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1030 // split/scalarized right now.
1031 if (VT.getVectorElementType() == MVT::f16 ||
1032 VT.getVectorElementType() == MVT::bf16)
1033 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1034 }
1035 }
1036
1037 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1038 // with -msoft-float, disable use of MMX as well.
1039 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1040 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1041 // No operations on x86mmx supported, everything uses intrinsics.
1042 }
1043
1044 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1045 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1046 : &X86::VR128RegClass);
1047
1050
1051 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1052 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1059
1060 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1061 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1062
1068 }
1069
1070 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1071 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1072 : &X86::VR128RegClass);
1073
1074 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1075 // registers cannot be used even for integer operations.
1076 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1077 : &X86::VR128RegClass);
1078 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1079 : &X86::VR128RegClass);
1080 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1081 : &X86::VR128RegClass);
1082 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1083 : &X86::VR128RegClass);
1084 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1085 : &X86::VR128RegClass);
1086
1087 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1090 }
1091
1092 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1093 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1098 }
1099
1100 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1101 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1102 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1103
1104 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1105 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1106 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1107 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1108 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1109 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1110 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1111 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1112 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1113 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1116
1117 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1118 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1119 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1120
1121 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1122 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1124
1125 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1126
1127 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1128 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1129 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1130 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1131 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1132 }
1133
1144
1149
1150 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1156
1157 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1158 // setcc all the way to isel and prefer SETGT in some isel patterns.
1161 }
1162
1163 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1164 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1169
1170 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1176 }
1177
1178 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1182
1183 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1184 continue;
1185
1188 }
1189 setF16Action(MVT::v8f16, Expand);
1190 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1191 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1192 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1193 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1194 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1195 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1197
1198 // Custom lower v2i64 and v2f64 selects.
1205
1212
1213 // Custom legalize these to avoid over promotion or custom promotion.
1214 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1219 }
1220
1225
1228
1231
1232 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1237
1242
1243 // We want to legalize this to an f64 load rather than an i64 load on
1244 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1245 // store.
1246 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1247 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1248 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1249 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1250 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1252
1253 // Add 32-bit vector stores to help vectorization opportunities.
1254 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1256
1260 if (!Subtarget.hasAVX512())
1262
1266
1268
1285
1286 // In the customized shift lowering, the legal v4i32/v2i64 cases
1287 // in AVX2 will be recognized.
1288 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1292 if (VT == MVT::v2i64) continue;
1297 }
1298
1304 }
1305
1306 if (Subtarget.hasGFNI()) {
1311 }
1312
1313 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1314 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1315 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1316 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1317
1318 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1321 }
1322
1323 // These might be better off as horizontal vector ops.
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1331 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1334 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1338 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1344
1346 }
1347
1348 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1349 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1350 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1351 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1352 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1353 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1354 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1355 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1356
1360
1361 // FIXME: Do we need to handle scalar-to-vector here?
1362 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1363 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1364
1365 // We directly match byte blends in the backend as they match the VSELECT
1366 // condition form.
1368
1369 // SSE41 brings specific instructions for doing vector sign extend even in
1370 // cases where we don't have SRA.
1371 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1374 }
1375
1376 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1377 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1378 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1379 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1380 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1381 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1382 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1383 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1384 }
1385
1386 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1387 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1388 // do the pre and post work in the vector domain.
1391 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1392 // so that DAG combine doesn't try to turn it into uint_to_fp.
1395 }
1396 }
1397
1398 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1400 }
1401
1402 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1403 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1404 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1407 }
1408
1409 // XOP can efficiently perform BITREVERSE with VPPERM.
1410 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1412 }
1413
1414 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1415 bool HasInt256 = Subtarget.hasInt256();
1416
1417 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1418 : &X86::VR256RegClass);
1419 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1420 : &X86::VR256RegClass);
1421 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1422 : &X86::VR256RegClass);
1423 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1424 : &X86::VR256RegClass);
1425 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1426 : &X86::VR256RegClass);
1427 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1428 : &X86::VR256RegClass);
1429 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1430 : &X86::VR256RegClass);
1431
1432 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1445
1447
1451
1454 }
1455
1456 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1457 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1458
1459 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1460 // even though v8i16 is a legal type.
1461 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1462 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1468
1475
1487
1488 if (!Subtarget.hasAVX512())
1490
1491 // In the customized shift lowering, the legal v8i32/v4i64 cases
1492 // in AVX2 will be recognized.
1493 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1499 if (VT == MVT::v4i64) continue;
1504 }
1505
1506 // These types need custom splitting if their input is a 128-bit vector.
1511
1515 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1516 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1519
1520 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 }
1525
1530
1531 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1536
1537 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1538 // setcc all the way to isel and prefer SETGT in some isel patterns.
1541 }
1542
1543 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1544 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1549
1550 if (Subtarget.hasAnyFMA()) {
1551 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1552 MVT::v2f64, MVT::v4f64 }) {
1555 }
1556 }
1557
1558 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1559 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1560 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1561 }
1562
1563 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1564 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1565 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1566 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1567
1568 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1569 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1570 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1572 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1573 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1574 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1575 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1576
1577 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1578 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1579
1580 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1581 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1582 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1583 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1584 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1585
1586 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1588 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1592 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1593 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1598
1599 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1600 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1601 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1603 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1604 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1605 }
1606
1607 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1610 }
1611
1612 if (HasInt256) {
1613 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1614 // when we have a 256-bit-wide blend with immediate.
1617
1618 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1619 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1620 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1621 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1622 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1623 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1624 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1625 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1626 }
1627 }
1628
1629 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1630 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1631 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1633 }
1634
1635 // Extract subvector is special because the value type
1636 // (result) is 128-bit but the source is 256-bit wide.
1637 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1638 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1640 }
1641
1642 // Custom lower several nodes for 256-bit types.
1643 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1644 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1654 }
1655 setF16Action(MVT::v16f16, Expand);
1656 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1657 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1659 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1660 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1661 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1662 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1663
1664 if (HasInt256) {
1666
1667 // Custom legalize 2x32 to get a little better code.
1670
1671 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1672 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1674 }
1675 }
1676
1677 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1678 Subtarget.hasF16C()) {
1679 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1682 }
1683 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1686 }
1687 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1688 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1689 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1690 }
1691 }
1692
1693 // This block controls legalization of the mask vector sizes that are
1694 // available with AVX512. 512-bit vectors are in a separate block controlled
1695 // by useAVX512Regs.
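 // The vXi1 mask types registered below live in the AVX-512 k-registers
 // (VK1..VK16 register classes).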
1696 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1697 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1698 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1699 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1700 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1701 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1702
1706
1707 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1708 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1709 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1710 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1711 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1712 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1713 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1714 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1719
1720 // There is no byte-sized k-register load or store without AVX512DQ.
1721 if (!Subtarget.hasDQI()) {
1722 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1723 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1724 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1725 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1726
1731 }
1732
1733 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1734 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1738 }
1739
1740 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1742
1743 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1747
1754 }
1755
1756 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1758 }
1759 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1760 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1763 }
1764 }
1765
1766 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1767 // elements. 512-bit vectors can be disabled based on prefer-vector-width and
1768 // required-vector-width function attributes.
1769 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1770 bool HasBWI = Subtarget.hasBWI();
1771
1772 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1773 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1774 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1775 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1776 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1777 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1778 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1779
1780 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1781 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1782 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1783 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1784 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1785 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1786 if (HasBWI)
1787 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1788 }
1789
1790 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1798 }
1799 setOperationAction(ISD::LRINT, MVT::v16f32,
1800 Subtarget.hasDQI() ? Legal : Custom);
1801 setOperationAction(ISD::LRINT, MVT::v8f64,
1802 Subtarget.hasDQI() ? Legal : Custom);
1803 if (Subtarget.hasDQI())
1804 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1805
1806 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1811 }
1812
1813 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1818 }
1819
1826
1838
1839 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1840 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1841 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1842 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1843 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1844 if (HasBWI)
1845 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1846
1847 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1848 // to 512-bit rather than use the AVX2 instructions so that we can use
1849 // k-masks.
1850 if (!Subtarget.hasVLX()) {
1851 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1852 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1855 }
1856 }
1857
1859 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1860 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1870
1871 if (HasBWI) {
1872 // Extends from v64i1 masks to 512-bit vectors.
1876 }
1877
1878 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1891
1893 }
1894
1895 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1898 }
1899
1900 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1901 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1902 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1903 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1904
1905 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1906 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1907 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1908 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1909
1910 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1911 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1912 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1913 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1914 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1915 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1916 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1917 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1918
1919 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1920 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1921
1922 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1932
1933 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1934 // setcc all the way to isel and prefer SETGT in some isel patterns.
1937 }
1938
1939 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1940 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1945
1946 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1953 }
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1956 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1957 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1959 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1962 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1967 }
1968
1969 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1970 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1971 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1972 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1973 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1974 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1975
1976 if (Subtarget.hasDQI()) {
1980 setOperationAction(Opc, MVT::v8i64, Custom);
1981 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1982 }
1983
1984 if (Subtarget.hasCDI()) {
1985 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1986 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1988 }
1989 } // Subtarget.hasCDI()
1990
1991 if (Subtarget.hasVPOPCNTDQ()) {
1992 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1994 }
1995
1996 // Extract subvector is special because the value type
1997 // (result) is 256-bit but the source is 512-bit wide.
1998 // 128-bit was made Legal under AVX1.
1999 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2000 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2002
2003 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2004 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2014 }
2015 setF16Action(MVT::v32f16, Expand);
2020 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2021 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2022
2023 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2028 }
2029 if (HasBWI) {
2030 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2033 }
2034 } else {
2035 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2036 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2037 }
2038
2039 if (Subtarget.hasVBMI2()) {
2040 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2043 }
2044
2045 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2046 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2047 }
2048
2049 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2050 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2052 }// useAVX512Regs
2053
2054 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2055 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2056 MVT::v4i64}) {
2059 }
2060 }
2061
2062 // This block controls legalization for operations that don't have
2063 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2064 // narrower widths.
2065 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2066 // These operations are handled on non-VLX by artificially widening in
2067 // isel patterns.
2068
2072
2073 if (Subtarget.hasDQI()) {
2074 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2075 // v2f32 UINT_TO_FP is already custom under SSE2.
2078 "Unexpected operation action!");
2079 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2084 }
2085
2086 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2092 }
2093
2094 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2097 }
2098
2099 // Custom legalize 2x32 to get a little better code.
2102
2103 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2104 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2106
2107 if (Subtarget.hasDQI()) {
2111 setOperationAction(Opc, MVT::v2i64, Custom);
2112 setOperationAction(Opc, MVT::v4i64, Custom);
2113 }
2114 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2115 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2116 }
2117
2118 if (Subtarget.hasCDI()) {
2119 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2121 }
2122 } // Subtarget.hasCDI()
2123
2124 if (Subtarget.hasVPOPCNTDQ()) {
2125 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2127 }
2128 }
2129
2130 // This block controls legalization of v32i1/v64i1, which are available with
2131 // AVX512BW.
2132 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2133 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2134 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2135
2136 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2147 }
2148
2149 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2151
2152 // Extends from v32i1 masks to 256-bit vectors.
2156
2157 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2158 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2159 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2160 }
2161
2162 // These operations are handled on non-VLX by artificially widening in
2163 // isel patterns.
2164 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2165
2166 if (Subtarget.hasBITALG()) {
2167 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2169 }
2170 }
2171
2172 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2173 auto setGroup = [&] (MVT VT) {
2184
2197
2199
2202
2208
2214
2218 };
2219
2220 // AVX512_FP16 scalar operations
2221 setGroup(MVT::f16);
2235
2238
2239 if (Subtarget.useAVX512Regs()) {
2240 setGroup(MVT::v32f16);
2246 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2253
2258 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2260 MVT::v32i16);
2261 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2263 MVT::v32i16);
2264 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2266 MVT::v32i16);
2267 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2269 MVT::v32i16);
2270
2274
2275 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2276 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2277 }
2278
2279 if (Subtarget.hasVLX()) {
2280 setGroup(MVT::v8f16);
2281 setGroup(MVT::v16f16);
2282
2293
2304
2305 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2308
2312
2313 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2314 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2315 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2316 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2317
2318 // Need to custom widen these to prevent scalarization.
2319 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2320 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2321 }
2322 }
2323
2324 if (!Subtarget.useSoftFloat() &&
2325 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2326 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2327 : &X86::VR128RegClass);
2328 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2329 : &X86::VR256RegClass);
2330 // We set the type action of bf16 to TypeSoftPromoteHalf, but there is no
2331 // promotion rule for BUILD_VECTOR and INSERT_VECTOR_ELT, so mark these
2332 // operations Custom and do the lowering later.
2335 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2336 setF16Action(VT, Expand);
2341 }
2342 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2343 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2344 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2345 }
2347 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2348 }
2349
2350 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2351 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2352 setF16Action(MVT::v32bf16, Expand);
2353 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2354 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2356 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2360 }
2361
2362 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2363 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2364 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2365 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2366 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2367 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2368
2369 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2370 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2371 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2372 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2373 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2374
2375 if (Subtarget.hasBWI()) {
2376 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2377 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2378 }
2379
2380 if (Subtarget.hasFP16()) {
2381 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2390 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2399 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2404 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2409 }
2410 }
2411
2412 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2413 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2414 }
2415
2416 // We want to custom lower some of our intrinsics.
2420 if (!Subtarget.is64Bit()) {
2422 }
2423
2424 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2425 // handle type legalization for these operations here.
2426 //
2427 // FIXME: We really should do custom legalization for addition and
2428 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2429 // than generic legalization for 64-bit multiplication-with-overflow, though.
2430 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2431 if (VT == MVT::i64 && !Subtarget.is64Bit())
2432 continue;
2433 // Add/Sub/Mul with overflow operations are custom lowered.
2440
2441 // Support carry in as value rather than glue.
2447 }
2448
2449 // Combine sin / cos into _sincos_stret if it is available.
2450 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2451 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2454 }
2455
2456 if (Subtarget.isTargetWin64()) {
2457 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2458 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2459 setOperationAction(ISD::SREM, MVT::i128, Custom);
2460 setOperationAction(ISD::UREM, MVT::i128, Custom);
2469 }
2470
2471 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2472 // is. We should promote the value to 64 bits to solve this.
2473 // This is what the CRT headers do - `fmodf` is an inline header
2474 // function casting to f64 and calling `fmod`.
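 // In effect, fmodf(x, y) is lowered as (float)fmod((double)x, (double)y).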
2475 if (Subtarget.is32Bit() &&
2476 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2477 // clang-format off
2478 for (ISD::NodeType Op :
2495 if (isOperationExpand(Op, MVT::f32))
2496 setOperationAction(Op, MVT::f32, Promote);
2497 // clang-format on
2498
2499 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2500 // it, but it's just a wrapper around ldexp.
2501 if (Subtarget.isOSWindows()) {
2503 if (isOperationExpand(Op, MVT::f32))
2504 setOperationAction(Op, MVT::f32, Promote);
2505 }
2506
2507 // We have target-specific dag combine patterns for the following nodes:
2518 ISD::SHL,
2519 ISD::SRA,
2520 ISD::SRL,
2521 ISD::OR,
2522 ISD::AND,
2528 ISD::ADD,
2529 ISD::FADD,
2530 ISD::FSUB,
2531 ISD::FNEG,
2532 ISD::FMA,
2536 ISD::SUB,
2537 ISD::LOAD,
2538 ISD::LRINT,
2540 ISD::MLOAD,
2541 ISD::STORE,
2555 ISD::SETCC,
2556 ISD::MUL,
2557 ISD::XOR,
2568
2570
2571 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2573 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2575 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2577
2578 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2579  // that needs to be benchmarked and balanced with the potential use of vector
2580 // load/store types (PR33329, PR33914).
2583
2584 // Default loop alignment, which can be overridden by -align-loops.
2586
2587 // An out-of-order CPU can speculatively execute past a predictable branch,
2588 // but a conditional move could be stalled by an expensive earlier operation.
2589 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2590 EnableExtLdPromotion = true;
2592
2594
2595 // Default to having -disable-strictnode-mutation on
2596 IsStrictFPEnabled = true;
2597}
2598
2599// This has so far only been implemented for 64-bit MachO.
2601 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2602}
2603
2605 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2606 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2607}
2608
2610 const SDLoc &DL) const {
2611 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2612 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2613 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2614 return SDValue(Node, 0);
2615}
2616
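// Choose the preferred legalization action for illegal vector types: split
// v32i1/v64i1 masks when AVX512BW is unavailable, split f16 vectors when F16C
// is missing, and widen all other vectors except i1 vectors.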
2619 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2620 !Subtarget.hasBWI())
2621 return TypeSplitVector;
2622
2623 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2624 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2625 return TypeSplitVector;
2626
2627 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2628 VT.getVectorElementType() != MVT::i1)
2629 return TypeWidenVector;
2630
2632}
2633
2634FastISel *
2636 const TargetLibraryInfo *libInfo) const {
2637 return X86::createFastISel(funcInfo, libInfo);
2638}
2639
2640//===----------------------------------------------------------------------===//
2641// Other Lowering Hooks
2642//===----------------------------------------------------------------------===//
2643
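// X86::mayFoldLoad - return true if Op is a plain (non-extending) load whose
// single use allows it to be folded into the user's memory operand. Unaligned
// 128-bit loads are only considered foldable with AVX or when the subtarget
// tolerates unaligned SSE memory operands.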
2645 bool AssumeSingleUse) {
2646 if (!AssumeSingleUse && !Op.hasOneUse())
2647 return false;
2648 if (!ISD::isNormalLoad(Op.getNode()))
2649 return false;
2650
2651 // If this is an unaligned vector, make sure the target supports folding it.
2652 auto *Ld = cast<LoadSDNode>(Op.getNode());
2653 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2654 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2655 return false;
2656
2657 // TODO: If this is a non-temporal load and the target has an instruction
2658 // for it, it should not be folded. See "useNonTemporalLoad()".
2659
2660 return true;
2661}
2662
2664 const X86Subtarget &Subtarget,
2665 bool AssumeSingleUse) {
2666 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2667 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2668 return false;
2669
2670  // We cannot replace a wide volatile load with a broadcast-from-memory,
2671 // because that would narrow the load, which isn't legal for volatiles.
2672 auto *Ld = cast<LoadSDNode>(Op.getNode());
2673 return !Ld->isVolatile() ||
2674 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2675}
2676
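// Return true if the only use of Op is a normal (non-truncating, unindexed)
// store.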
2678 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2679}
2680
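// Return true if the only use of Op is a ZERO_EXTEND node.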
2682 if (Op.hasOneUse()) {
2683 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2684 return (ISD::ZERO_EXTEND == Opcode);
2685 }
2686 return false;
2687}
2688
2689static bool isLogicOp(unsigned Opcode) {
2690 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2691 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2692}
2693
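// Return true if Opcode is one of the X86-specific shuffle/permute node kinds.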
2694static bool isTargetShuffle(unsigned Opcode) {
2695 switch(Opcode) {
2696 default: return false;
2697 case X86ISD::BLENDI:
2698 case X86ISD::PSHUFB:
2699 case X86ISD::PSHUFD:
2700 case X86ISD::PSHUFHW:
2701 case X86ISD::PSHUFLW:
2702 case X86ISD::SHUFP:
2703 case X86ISD::INSERTPS:
2704 case X86ISD::EXTRQI:
2705 case X86ISD::INSERTQI:
2706 case X86ISD::VALIGN:
2707 case X86ISD::PALIGNR:
2708 case X86ISD::VSHLDQ:
2709 case X86ISD::VSRLDQ:
2710 case X86ISD::MOVLHPS:
2711 case X86ISD::MOVHLPS:
2712 case X86ISD::MOVSHDUP:
2713 case X86ISD::MOVSLDUP:
2714 case X86ISD::MOVDDUP:
2715 case X86ISD::MOVSS:
2716 case X86ISD::MOVSD:
2717 case X86ISD::MOVSH:
2718 case X86ISD::UNPCKL:
2719 case X86ISD::UNPCKH:
2720 case X86ISD::VBROADCAST:
2721 case X86ISD::VPERMILPI:
2722 case X86ISD::VPERMILPV:
2723 case X86ISD::VPERM2X128:
2724 case X86ISD::SHUF128:
2725 case X86ISD::VPERMIL2:
2726 case X86ISD::VPERMI:
2727 case X86ISD::VPPERM:
2728 case X86ISD::VPERMV:
2729 case X86ISD::VPERMV3:
2730 case X86ISD::VZEXT_MOVL:
2731 return true;
2732 }
2733}
2734
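// Return true for shuffles whose mask is supplied by a variable operand
// (register or memory) rather than an immediate; plain AND/OR/ANDNP are
// treated as 'faux' variable-mask shuffles.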
2735static bool isTargetShuffleVariableMask(unsigned Opcode) {
2736 switch (Opcode) {
2737 default: return false;
2738 // Target Shuffles.
2739 case X86ISD::PSHUFB:
2740 case X86ISD::VPERMILPV:
2741 case X86ISD::VPERMIL2:
2742 case X86ISD::VPPERM:
2743 case X86ISD::VPERMV:
2744 case X86ISD::VPERMV3:
2745 return true;
2746 // 'Faux' Target Shuffles.
2747 case ISD::OR:
2748 case ISD::AND:
2749 case X86ISD::ANDNP:
2750 return true;
2751 }
2752}
2753
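// Create (or reuse) the fixed frame object that holds the return address and
// return a frame index SDValue for it.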
2756 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2758 int ReturnAddrIndex = FuncInfo->getRAIndex();
2759
2760 if (ReturnAddrIndex == 0) {
2761 // Set up a frame object for the return address.
2762 unsigned SlotSize = RegInfo->getSlotSize();
2763 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2764 -(int64_t)SlotSize,
2765 false);
2766 FuncInfo->setRAIndex(ReturnAddrIndex);
2767 }
2768
2769 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2770}
2771
2773 bool HasSymbolicDisplacement) {
2774  // The offset should fit into a 32-bit immediate field.
2775 if (!isInt<32>(Offset))
2776 return false;
2777
2778  // If we don't have a symbolic displacement, we don't have any extra
2779  // restrictions.
2780 if (!HasSymbolicDisplacement)
2781 return true;
2782
2783 // We can fold large offsets in the large code model because we always use
2784 // 64-bit offsets.
2785 if (CM == CodeModel::Large)
2786 return true;
2787
2788  // For the kernel code model we know that all objects reside in the negative
2789  // half of the 32-bit address space. We must not accept negative offsets, since
2790  // they could push the address out of that range, but we may accept pretty
2791 if (CM == CodeModel::Kernel)
2792 return Offset >= 0;
2793
2794  // For the remaining non-large code models we assume that the last small
2795  // object ends at least 16MB below the 2GB (31-bit) boundary. We may also
2796  // accept pretty large negative offsets, knowing that all objects are in the
2797  // positive half of the address space.
2798 return Offset < 16 * 1024 * 1024;
2799}
2800
2801  /// Return true if the condition is a signed comparison operation.
2802static bool isX86CCSigned(unsigned X86CC) {
2803 switch (X86CC) {
2804 default:
2805 llvm_unreachable("Invalid integer condition!");
2806 case X86::COND_E:
2807 case X86::COND_NE:
2808 case X86::COND_B:
2809 case X86::COND_A:
2810 case X86::COND_BE:
2811 case X86::COND_AE:
2812 return false;
2813 case X86::COND_G:
2814 case X86::COND_GE:
2815 case X86::COND_L:
2816 case X86::COND_LE:
2817 return true;
2818 }
2819}
2820
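/// Translate an integer ISD::CondCode into the equivalent X86 condition code,
/// e.g. ISD::SETULT becomes X86::COND_B and ISD::SETGT becomes X86::COND_G.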
2822 switch (SetCCOpcode) {
2823 // clang-format off
2824 default: llvm_unreachable("Invalid integer condition!");
2825 case ISD::SETEQ: return X86::COND_E;
2826 case ISD::SETGT: return X86::COND_G;
2827 case ISD::SETGE: return X86::COND_GE;
2828 case ISD::SETLT: return X86::COND_L;
2829 case ISD::SETLE: return X86::COND_LE;
2830 case ISD::SETNE: return X86::COND_NE;
2831 case ISD::SETULT: return X86::COND_B;
2832 case ISD::SETUGT: return X86::COND_A;
2833 case ISD::SETULE: return X86::COND_BE;
2834 case ISD::SETUGE: return X86::COND_AE;
2835 // clang-format on
2836 }
2837}
2838
2839  /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2840/// condition code, returning the condition code and the LHS/RHS of the
2841/// comparison to make.
2843 bool isFP, SDValue &LHS, SDValue &RHS,
2844 SelectionDAG &DAG) {
2845 if (!isFP) {
2846 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2847 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2848  // X > -1 -> compare X against 0 and jump on !sign (NS).
2849 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2850 return X86::COND_NS;
2851 }
2852 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2853  // X < 0 -> compare X against 0 and jump on sign (S).
2854 return X86::COND_S;
2855 }
2856 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2857  // X >= 0 -> compare X against 0 and jump on !sign (NS).
2858 return X86::COND_NS;
2859 }
2860 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2861 // X < 1 -> X <= 0
2862 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2863 return X86::COND_LE;
2864 }
2865 }
2866
2867 return TranslateIntegerX86CC(SetCCOpcode);
2868 }
2869
2870  // First determine whether it is required or profitable to flip the operands.
2871
2872 // If LHS is a foldable load, but RHS is not, flip the condition.
2873 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2874 !ISD::isNON_EXTLoad(RHS.getNode())) {
2875 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2876 std::swap(LHS, RHS);
2877 }
2878
2879 switch (SetCCOpcode) {
2880 default: break;
2881 case ISD::SETOLT:
2882 case ISD::SETOLE:
2883 case ISD::SETUGT:
2884 case ISD::SETUGE:
2885 std::swap(LHS, RHS);
2886 break;
2887 }
2888
2889 // On a floating point condition, the flags are set as follows:
2890  //   ZF | PF | CF |  op
2891  //    0 |  0 |  0 |  X > Y
2892  //    0 |  0 |  1 |  X < Y
2893  //    1 |  0 |  0 |  X == Y
2894  //    1 |  1 |  1 |  unordered
2895 switch (SetCCOpcode) {
2896 // clang-format off
2897 default: llvm_unreachable("Condcode should be pre-legalized away");
2898 case ISD::SETUEQ:
2899 case ISD::SETEQ: return X86::COND_E;
2900 case ISD::SETOLT: // flipped
2901 case ISD::SETOGT:
2902 case ISD::SETGT: return X86::COND_A;
2903 case ISD::SETOLE: // flipped
2904 case ISD::SETOGE:
2905 case ISD::SETGE: return X86::COND_AE;
2906 case ISD::SETUGT: // flipped
2907 case ISD::SETULT:
2908 case ISD::SETLT: return X86::COND_B;
2909 case ISD::SETUGE: // flipped
2910 case ISD::SETULE:
2911 case ISD::SETLE: return X86::COND_BE;
2912 case ISD::SETONE:
2913 case ISD::SETNE: return X86::COND_NE;
2914 case ISD::SETUO: return X86::COND_P;
2915 case ISD::SETO: return X86::COND_NP;
2916 case ISD::SETOEQ:
2917 case ISD::SETUNE: return X86::COND_INVALID;
2918 // clang-format on
2919 }
2920}
2921
2922/// Is there a floating point cmov for the specific X86 condition code?
2923  /// The current x86 ISA includes the following FP cmov instructions:
2924  /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2925static bool hasFPCMov(unsigned X86CC) {
2926 switch (X86CC) {
2927 default:
2928 return false;
2929 case X86::COND_B:
2930 case X86::COND_BE:
2931 case X86::COND_E:
2932 case X86::COND_P:
2933 case X86::COND_A:
2934 case X86::COND_AE:
2935 case X86::COND_NE:
2936 case X86::COND_NP:
2937 return true;
2938 }
2939}
2940
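// VPTERNLOG-based lowering is usable when the type is already 512 bits wide,
// when the subtarget has VLX, or when the operation can be extended to a
// 512-bit DQ operation.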
2941static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2942 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2943 VT.is512BitVector();
2944}
2945
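// Describe how the x86 memory intrinsics below touch memory (pointer operand,
// memory type, alignment and access flags) so that appropriate memory operands
// can be attached when building the DAG.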
2947 const CallInst &I,
2948 MachineFunction &MF,
2949 unsigned Intrinsic) const {
2951 Info.offset = 0;
2952
2953 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2954 if (!IntrData) {
2955 switch (Intrinsic) {
2956 case Intrinsic::x86_aesenc128kl:
2957 case Intrinsic::x86_aesdec128kl:
2959 Info.ptrVal = I.getArgOperand(1);
2960 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2961 Info.align = Align(1);
2963 return true;
2964 case Intrinsic::x86_aesenc256kl:
2965 case Intrinsic::x86_aesdec256kl:
2967 Info.ptrVal = I.getArgOperand(1);
2968 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2969 Info.align = Align(1);
2971 return true;
2972 case Intrinsic::x86_aesencwide128kl:
2973 case Intrinsic::x86_aesdecwide128kl:
2975 Info.ptrVal = I.getArgOperand(0);
2976 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2977 Info.align = Align(1);
2979 return true;
2980 case Intrinsic::x86_aesencwide256kl:
2981 case Intrinsic::x86_aesdecwide256kl:
2983 Info.ptrVal = I.getArgOperand(0);
2984 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2985 Info.align = Align(1);
2987 return true;
2988 case Intrinsic::x86_cmpccxadd32:
2989 case Intrinsic::x86_cmpccxadd64:
2990 case Intrinsic::x86_atomic_bts:
2991 case Intrinsic::x86_atomic_btc:
2992 case Intrinsic::x86_atomic_btr: {
2994 Info.ptrVal = I.getArgOperand(0);
2995 unsigned Size = I.getType()->getScalarSizeInBits();
2996 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2997 Info.align = Align(Size);
3000 return true;
3001 }
3002 case Intrinsic::x86_atomic_bts_rm:
3003 case Intrinsic::x86_atomic_btc_rm:
3004 case Intrinsic::x86_atomic_btr_rm: {
3006 Info.ptrVal = I.getArgOperand(0);
3007 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3008 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3009 Info.align = Align(Size);
3012 return true;
3013 }
3014 case Intrinsic::x86_aadd32:
3015 case Intrinsic::x86_aadd64:
3016 case Intrinsic::x86_aand32:
3017 case Intrinsic::x86_aand64:
3018 case Intrinsic::x86_aor32:
3019 case Intrinsic::x86_aor64:
3020 case Intrinsic::x86_axor32:
3021 case Intrinsic::x86_axor64:
3022 case Intrinsic::x86_atomic_add_cc:
3023 case Intrinsic::x86_atomic_sub_cc:
3024 case Intrinsic::x86_atomic_or_cc:
3025 case Intrinsic::x86_atomic_and_cc:
3026 case Intrinsic::x86_atomic_xor_cc: {
3028 Info.ptrVal = I.getArgOperand(0);
3029 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3030 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3031 Info.align = Align(Size);
3034 return true;
3035 }
3036 }
3037 return false;
3038 }
3039
3040 switch (IntrData->Type) {
3043 case TRUNCATE_TO_MEM_VI32: {
3045 Info.ptrVal = I.getArgOperand(0);
3046 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3048 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3049 ScalarVT = MVT::i8;
3050 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3051 ScalarVT = MVT::i16;
3052 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3053 ScalarVT = MVT::i32;
3054
3055 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3056 Info.align = Align(1);
3058 break;
3059 }
3060 case GATHER:
3061 case GATHER_AVX2: {
3063 Info.ptrVal = nullptr;
3064 MVT DataVT = MVT::getVT(I.getType());
3065 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3066 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3067 IndexVT.getVectorNumElements());
3068 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3069 Info.align = Align(1);
3071 break;
3072 }
3073 case SCATTER: {
3075 Info.ptrVal = nullptr;
3076 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3077 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3078 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3079 IndexVT.getVectorNumElements());
3080 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3081 Info.align = Align(1);
3083 break;
3084 }
3085 default:
3086 return false;
3087 }
3088
3089 return true;
3090}
3091
3092/// Returns true if the target can instruction select the
3093/// specified FP immediate natively. If false, the legalizer will
3094/// materialize the FP immediate as a load from a constant pool.
3096 bool ForCodeSize) const {
3097 for (const APFloat &FPImm : LegalFPImmediates)
3098 if (Imm.bitwiseIsEqual(FPImm))
3099 return true;
3100 return false;
3101}
3102
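// Decide whether narrowing a load is worthwhile: GOTTPOFF (TLS) loads must
// keep their width, and wide AVX loads whose non-chain uses are all
// extract-subvector + store are better left whole so the extracts can be
// store-folded.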
3104 ISD::LoadExtType ExtTy,
3105 EVT NewVT) const {
3106 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3107
3108  // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3109  // relocation must target a movq or addq instruction: don't let the load shrink.
3110 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3111 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3112 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3113 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3114
3115  // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3116  // those uses are extracted directly into a store, then each extract + store
3117  // can be store-folded. Therefore, it's probably not worth splitting the load.
3118 EVT VT = Load->getValueType(0);
3119 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3120 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3121 // Skip uses of the chain value. Result 0 of the node is the load value.
3122 if (UI.getUse().getResNo() != 0)
3123 continue;
3124
3125 // If this use is not an extract + store, it's probably worth splitting.
3126 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3127 UI->use_begin()->getOpcode() != ISD::STORE)
3128 return true;
3129 }
3130 // All non-chain uses are extract + store.
3131 return false;
3132 }
3133
3134 return true;
3135}
3136
3137/// Returns true if it is beneficial to convert a load of a constant
3138/// to just the constant itself.
3140 Type *Ty) const {
3141 assert(Ty->isIntegerTy());
3142
3143 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3144 if (BitSize == 0 || BitSize > 64)
3145 return false;
3146 return true;
3147}
3148