1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelLowering.h"
15 #include "Utils/X86ShuffleDecode.h"
16 #include "X86CallingConv.h"
17 #include "X86FrameLowering.h"
18 #include "X86InstrBuilder.h"
19 #include "X86IntrinsicsInfo.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86TargetMachine.h"
22 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallSet.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/ADT/StringSwitch.h"
38 #include "llvm/IR/CallSite.h"
39 #include "llvm/IR/CallingConv.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DerivedTypes.h"
42 #include "llvm/IR/DiagnosticInfo.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/GlobalAlias.h"
45 #include "llvm/IR/GlobalVariable.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/Intrinsics.h"
48 #include "llvm/MC/MCAsmInfo.h"
49 #include "llvm/MC/MCContext.h"
50 #include "llvm/MC/MCExpr.h"
51 #include "llvm/MC/MCSymbol.h"
53 #include "llvm/Support/Debug.h"
55 #include "llvm/Support/KnownBits.h"
58 #include <algorithm>
59 #include <bitset>
60 #include <cctype>
61 #include <numeric>
62 using namespace llvm;
63 
64 #define DEBUG_TYPE "x86-isel"
65 
66 STATISTIC(NumTailCalls, "Number of tail calls");
67 
69  "x86-experimental-pref-loop-alignment", cl::init(4),
70  cl::desc(
71  "Sets the preferable loop alignment for experiments (as log2 bytes)"
72  "(the last x86-experimental-pref-loop-alignment bits"
73  " of the loop header PC will be 0)."),
74  cl::Hidden);
75 
76 // Added in 10.0.
78  "x86-enable-old-knl-abi", cl::init(false),
79  cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
80  "one ZMM register on AVX512F, but not AVX512BW targets."),
81  cl::Hidden);
82 
84  "mul-constant-optimization", cl::init(true),
85  cl::desc("Replace 'mul x, Const' with more effective instructions like "
86  "SHIFT, LEA, etc."),
87  cl::Hidden);
88 
90  "x86-experimental-unordered-atomic-isel", cl::init(false),
91  cl::desc("Use LoadSDNode and StoreSDNode instead of "
92  "AtomicSDNode for unordered atomic loads and "
93  "stores respectively."),
94  cl::Hidden);
95 
96 /// Call this when the user attempts to do something unsupported, like
97 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98 /// report_fatal_error, so calling code should attempt to recover without
99 /// crashing.
100 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101  const char *Msg) {
103  DAG.getContext()->diagnose(
105 }
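// Example (a sketch based on how this file uses the helper): a call such as
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// reports the message through LLVMContext::diagnose() and lets codegen
// continue, in contrast to report_fatal_error.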
106 
107 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108  const X86Subtarget &STI)
109  : TargetLowering(TM), Subtarget(STI) {
110  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111  X86ScalarSSEf64 = Subtarget.hasSSE2();
112  X86ScalarSSEf32 = Subtarget.hasSSE1();
114 
115  // Set up the TargetLowering object.
116 
117  // X86 is weird. It always uses i8 for shift amounts and setcc results.
119  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
121 
122  // For 64-bit, since we have so many registers, use the ILP scheduler.
123  // For 32-bit, use the register pressure specific scheduling.
124  // For Atom, always use ILP scheduling.
125  if (Subtarget.isAtom())
127  else if (Subtarget.is64Bit())
129  else
131  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
133 
134  // Bypass expensive divides and use cheaper ones.
135  if (TM.getOptLevel() >= CodeGenOpt::Default) {
136  if (Subtarget.hasSlowDivide32())
137  addBypassSlowDiv(32, 8);
138  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
139  addBypassSlowDiv(64, 32);
140  }
141 
142  if (Subtarget.isTargetWindowsMSVC() ||
143  Subtarget.isTargetWindowsItanium()) {
144  // Setup Windows compiler runtime calls.
145  setLibcallName(RTLIB::SDIV_I64, "_alldiv");
146  setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
147  setLibcallName(RTLIB::SREM_I64, "_allrem");
148  setLibcallName(RTLIB::UREM_I64, "_aullrem");
149  setLibcallName(RTLIB::MUL_I64, "_allmul");
155  }
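  // Sketch of the effect (assuming a 32-bit MSVC-style target): a plain C
  // 64-bit division
  //   long long q = a / b;
  // becomes a call to the CRT helper _alldiv rather than an inline divide
  // sequence, and the other i64 operations above map to their helpers the
  // same way.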
156 
157  if (Subtarget.isTargetDarwin()) {
158  // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
159  setUseUnderscoreSetJmp(false);
161  } else if (Subtarget.isTargetWindowsGNU()) {
162  // MS runtime is weird: it exports _setjmp, but longjmp!
165  } else {
168  }
169 
 170  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
171  // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
 172  // FIXME: Should we be limiting the atomic size on other configs? Default is
173  // 1024.
174  if (!Subtarget.hasCmpxchg8b())
176 
177  // Set up the register classes.
178  addRegisterClass(MVT::i8, &X86::GR8RegClass);
179  addRegisterClass(MVT::i16, &X86::GR16RegClass);
180  addRegisterClass(MVT::i32, &X86::GR32RegClass);
181  if (Subtarget.is64Bit())
182  addRegisterClass(MVT::i64, &X86::GR64RegClass);
183 
184  for (MVT VT : MVT::integer_valuetypes())
186 
187  // We don't accept any truncstore of integer registers.
194 
196 
197  // SETOEQ and SETUNE require checking two conditions.
204 
205  // Integer absolute.
206  if (Subtarget.hasCMov()) {
209  }
211 
212  // Funnel shifts.
213  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
214  setOperationAction(ShiftOp , MVT::i16 , Custom);
215  setOperationAction(ShiftOp , MVT::i32 , Custom);
216  if (Subtarget.is64Bit())
217  setOperationAction(ShiftOp , MVT::i64 , Custom);
218  }
219 
220  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
221  // operation.
225 
226  if (!Subtarget.useSoftFloat()) {
227  // We have an algorithm for SSE2->double, and we turn this into a
228  // 64-bit FILD followed by conditional FADD for other targets.
230  // We have an algorithm for SSE2, and we turn this into a 64-bit
231  // FILD or VCVTUSI2SS/SD for other targets.
233  } else {
235  }
236 
237  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
238  // this operation.
241 
242  if (!Subtarget.useSoftFloat()) {
243  // SSE has no i16 to fp conversion, only i32.
244  if (X86ScalarSSEf32) {
246  // f32 and f64 cases are Legal, f80 case is not
248  } else {
251  }
252  } else {
255  }
256 
257  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
258  // this operation.
261 
262  if (!Subtarget.useSoftFloat()) {
263  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
264  // are Legal, f80 is custom lowered.
267 
270  } else {
274  }
275 
276  // Handle FP_TO_UINT by promoting the destination to a larger signed
277  // conversion.
281 
282  if (!Subtarget.useSoftFloat()) {
285  }
286 
287  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
288  if (!X86ScalarSSEf64) {
291  if (Subtarget.is64Bit()) {
293  // Without SSE, i64->f64 goes through memory.
295  }
296  } else if (!Subtarget.is64Bit())
298 
299  // Scalar integer divide and remainder are lowered to use operations that
300  // produce two results, to match the available instructions. This exposes
301  // the two-result form to trivial CSE, which is able to combine x/y and x%y
302  // into a single instruction.
303  //
304  // Scalar integer multiply-high is also lowered to use two-result
305  // operations, to match the available instructions. However, plain multiply
306  // (low) operations are left as Legal, as there are single-result
307  // instructions for this in x86. Using the two-result multiply instructions
308  // when both high and low results are needed must be arranged by dagcombine.
309  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
316  }
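  // Sketch of the CSE described above: for
  //   unsigned q = x / y, r = x % y;
  // both results are taken from one DIV node, so a single divide instruction
  // is emitted with the quotient in EAX and the remainder in EDX.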
317 
320  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
324  }
325  if (Subtarget.is64Bit())
330 
336 
337  // Promote the i8 variants and force them on up to i32 which has a shorter
338  // encoding.
341  if (!Subtarget.hasBMI()) {
346  if (Subtarget.is64Bit()) {
349  }
350  }
351 
352  if (Subtarget.hasLZCNT()) {
353  // When promoting the i8 variants, force them to i32 for a shorter
354  // encoding.
357  } else {
364  if (Subtarget.is64Bit()) {
367  }
368  }
369 
370  // Special handling for half-precision floating point conversions.
371  // If we don't have F16C support, then lower half float conversions
372  // into library calls.
373  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
376  }
377 
378  // There's never any support for operations beyond MVT::f32.
385 
394 
395  if (Subtarget.hasPOPCNT()) {
397  } else {
401  if (Subtarget.is64Bit())
403  else
405  }
406 
408 
409  if (!Subtarget.hasMOVBE())
411 
412  // These should be promoted to a larger select which is supported.
414  // X86 wants to expand cmov itself.
415  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
418  }
419  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
420  if (VT == MVT::i64 && !Subtarget.is64Bit())
421  continue;
424  }
425 
426  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
429 
431  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
432  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
437  setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
438 
439  // Darwin ABI issue.
440  for (auto VT : { MVT::i32, MVT::i64 }) {
441  if (VT == MVT::i64 && !Subtarget.is64Bit())
442  continue;
449  }
450 
451  // 64-bit shl, sra, srl (iff 32-bit x86)
452  for (auto VT : { MVT::i32, MVT::i64 }) {
453  if (VT == MVT::i64 && !Subtarget.is64Bit())
454  continue;
458  }
459 
460  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
462 
464 
465  // Expand certain atomics
466  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
474  }
475 
476  if (!Subtarget.is64Bit())
478 
479  if (Subtarget.hasCmpxchg16b()) {
481  }
482 
483  // FIXME - use subtarget debug flags
484  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
485  !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
488  }
489 
492 
495 
498 
499  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
502  bool Is64Bit = Subtarget.is64Bit();
504  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
505 
508 
510 
511  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
514 
515  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
516  // f32 and f64 use SSE.
517  // Set up the FP register classes.
518  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
519  : &X86::FR32RegClass);
520  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
521  : &X86::FR64RegClass);
522 
523  // Disable f32->f64 extload as we can only generate this in one instruction
 524  // under optsize. So it's easier to pattern match (fpext (load)) for that
525  // case instead of needing to emit 2 instructions for extload in the
526  // non-optsize case.
528 
529  for (auto VT : { MVT::f32, MVT::f64 }) {
530  // Use ANDPD to simulate FABS.
532 
533  // Use XORP to simulate FNEG.
535 
536  // Use ANDPD and ORPD to simulate FCOPYSIGN.
538 
539  // These might be better off as horizontal vector ops.
542 
543  // We don't support sin/cos/fmod
544  setOperationAction(ISD::FSIN , VT, Expand);
545  setOperationAction(ISD::FCOS , VT, Expand);
546  setOperationAction(ISD::FSINCOS, VT, Expand);
547  }
548 
549  // Lower this to MOVMSK plus an AND.
552 
553  } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
554  // Use SSE for f32, x87 for f64.
555  // Set up the FP register classes.
556  addRegisterClass(MVT::f32, &X86::FR32RegClass);
557  if (UseX87)
558  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
559 
560  // Use ANDPS to simulate FABS.
562 
563  // Use XORP to simulate FNEG.
565 
566  if (UseX87)
568 
569  // Use ANDPS and ORPS to simulate FCOPYSIGN.
570  if (UseX87)
573 
574  // We don't support sin/cos/fmod
578 
579  if (UseX87) {
580  // Always expand sin/cos functions even though x87 has an instruction.
584  }
585  } else if (UseX87) {
586  // f32 and f64 in x87.
587  // Set up the FP register classes.
588  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
589  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
590 
591  for (auto VT : { MVT::f32, MVT::f64 }) {
592  setOperationAction(ISD::UNDEF, VT, Expand);
593  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
594 
595  // Always expand sin/cos functions even though x87 has an instruction.
596  setOperationAction(ISD::FSIN , VT, Expand);
597  setOperationAction(ISD::FCOS , VT, Expand);
598  setOperationAction(ISD::FSINCOS, VT, Expand);
599  }
600  }
601 
602  // Expand FP32 immediates into loads from the stack, save special cases.
603  if (isTypeLegal(MVT::f32)) {
604  if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
605  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
606  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
607  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
608  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
609  } else // SSE immediates.
610  addLegalFPImmediate(APFloat(+0.0f)); // xorps
611  }
612  // Expand FP64 immediates into loads from the stack, save special cases.
613  if (isTypeLegal(MVT::f64)) {
614  if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
615  addLegalFPImmediate(APFloat(+0.0)); // FLD0
616  addLegalFPImmediate(APFloat(+1.0)); // FLD1
617  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
618  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
619  } else // SSE immediates.
620  addLegalFPImmediate(APFloat(+0.0)); // xorpd
621  }
622 
623  // We don't support FMA.
626 
627  // f80 always uses X87.
628  if (UseX87) {
629  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
632  {
634  addLegalFPImmediate(TmpFlt); // FLD0
635  TmpFlt.changeSign();
636  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
637 
638  bool ignored;
639  APFloat TmpFlt2(+1.0);
641  &ignored);
642  addLegalFPImmediate(TmpFlt2); // FLD1
643  TmpFlt2.changeSign();
644  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
645  }
646 
647  // Always expand sin/cos functions even though x87 has an instruction.
651 
662  }
663 
664  // f128 uses xmm registers, but most operations require libcalls.
665  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
666  addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
667  : &X86::VR128RegClass);
668 
669  addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
670 
676 
680 
685 
687  // We need to custom handle any FP_ROUND with an f128 input, but
688  // LegalizeDAG uses the result type to know when to run a custom handler.
689  // So we have to list all legal floating point result types here.
690  if (isTypeLegal(MVT::f32)) {
693  }
694  if (isTypeLegal(MVT::f64)) {
697  }
698  if (isTypeLegal(MVT::f80)) {
701  }
702 
704 
711  }
712 
713  // Always use a library call for pow.
718 
726 
727  // Some FP actions are always expanded for vector types.
728  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
730  setOperationAction(ISD::FSIN, VT, Expand);
731  setOperationAction(ISD::FSINCOS, VT, Expand);
732  setOperationAction(ISD::FCOS, VT, Expand);
733  setOperationAction(ISD::FREM, VT, Expand);
734  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
735  setOperationAction(ISD::FPOW, VT, Expand);
736  setOperationAction(ISD::FLOG, VT, Expand);
737  setOperationAction(ISD::FLOG2, VT, Expand);
738  setOperationAction(ISD::FLOG10, VT, Expand);
739  setOperationAction(ISD::FEXP, VT, Expand);
740  setOperationAction(ISD::FEXP2, VT, Expand);
741  }
742 
743  // First set operation action for all vector types to either promote
744  // (for widening) or expand (for scalarization). Then we will selectively
745  // turn on ones that can be effectively codegen'd.
746  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
747  setOperationAction(ISD::SDIV, VT, Expand);
748  setOperationAction(ISD::UDIV, VT, Expand);
749  setOperationAction(ISD::SREM, VT, Expand);
750  setOperationAction(ISD::UREM, VT, Expand);
755  setOperationAction(ISD::FMA, VT, Expand);
756  setOperationAction(ISD::FFLOOR, VT, Expand);
757  setOperationAction(ISD::FCEIL, VT, Expand);
758  setOperationAction(ISD::FTRUNC, VT, Expand);
759  setOperationAction(ISD::FRINT, VT, Expand);
760  setOperationAction(ISD::FNEARBYINT, VT, Expand);
761  setOperationAction(ISD::SMUL_LOHI, VT, Expand);
762  setOperationAction(ISD::MULHS, VT, Expand);
763  setOperationAction(ISD::UMUL_LOHI, VT, Expand);
764  setOperationAction(ISD::MULHU, VT, Expand);
765  setOperationAction(ISD::SDIVREM, VT, Expand);
766  setOperationAction(ISD::UDIVREM, VT, Expand);
767  setOperationAction(ISD::CTPOP, VT, Expand);
768  setOperationAction(ISD::CTTZ, VT, Expand);
769  setOperationAction(ISD::CTLZ, VT, Expand);
770  setOperationAction(ISD::ROTL, VT, Expand);
771  setOperationAction(ISD::ROTR, VT, Expand);
772  setOperationAction(ISD::BSWAP, VT, Expand);
773  setOperationAction(ISD::SETCC, VT, Expand);
774  setOperationAction(ISD::FP_TO_UINT, VT, Expand);
775  setOperationAction(ISD::FP_TO_SINT, VT, Expand);
776  setOperationAction(ISD::UINT_TO_FP, VT, Expand);
777  setOperationAction(ISD::SINT_TO_FP, VT, Expand);
779  setOperationAction(ISD::TRUNCATE, VT, Expand);
782  setOperationAction(ISD::ANY_EXTEND, VT, Expand);
783  setOperationAction(ISD::SELECT_CC, VT, Expand);
784  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
785  setTruncStoreAction(InnerVT, VT, Expand);
786 
787  setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
788  setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
789 
790  // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
791  // types, we have to deal with them whether we ask for Expansion or not.
792  // Setting Expand causes its own optimisation problems though, so leave
793  // them legal.
794  if (VT.getVectorElementType() == MVT::i1)
795  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
796 
797  // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
798  // split/scalarized right now.
799  if (VT.getVectorElementType() == MVT::f16)
800  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
801  }
802  }
803 
804  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
805  // with -msoft-float, disable use of MMX as well.
806  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
807  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
808  // No operations on x86mmx supported, everything uses intrinsics.
809  }
810 
811  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
812  addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
813  : &X86::VR128RegClass);
814 
824 
827 
829  }
830 
831  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
832  addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
833  : &X86::VR128RegClass);
834 
835  // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
836  // registers cannot be used even for integer operations.
837  addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
838  : &X86::VR128RegClass);
839  addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
840  : &X86::VR128RegClass);
841  addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
842  : &X86::VR128RegClass);
843  addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
844  : &X86::VR128RegClass);
845 
846  for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
852  }
853 
857 
871 
872  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
874  setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
875  setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
876  setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
877  }
878 
891 
895 
896  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
900 
901  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
902  // setcc all the way to isel and prefer SETGT in some isel patterns.
905  }
906 
907  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
913  }
914 
915  for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
919 
920  if (VT == MVT::v2i64 && !Subtarget.is64Bit())
921  continue;
922 
925  }
926 
927  // Custom lower v2i64 and v2f64 selects.
933 
936 
937  // Custom legalize these to avoid over promotion or custom promotion.
948 
 949  // Marking FP_TO_SINT v8i16 as Custom tricks type legalization into
 950  // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
951  // split again based on the input type, this will cause an AssertSExt i16 to
952  // be emitted instead of an AssertZExt. This will allow packssdw followed by
953  // packuswb to be used to truncate to v8i8. This is necessary since packusdw
954  // isn't available until sse4.1.
956 
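  // Rough shape of the lowering this enables (a sketch, not the exact DAG):
  // the FP_TO_SINT results are narrowed with PACKSSDW and then PACKUSWB down
  // to v8i8, which is only valid because the values carry an AssertSExt
  // rather than an AssertZExt.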
959 
961 
962  // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
964 
967 
968  // We want to legalize this to an f64 load rather than an i64 load on
969  // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
970  // store.
977 
981  if (!Subtarget.hasAVX512())
983 
987 
989 
996 
997  // In the customized shift lowering, the legal v4i32/v2i64 cases
998  // in AVX2 will be recognized.
999  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1003  }
1004 
1007 
1008  // With AVX512, expanding (and promoting the shifts) is better.
1009  if (!Subtarget.hasAVX512())
1011  }
1012 
1013  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1022 
1023  // These might be better off as horizontal vector ops.
1028  }
1029 
1030  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1031  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1032  setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1033  setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1034  setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1035  setOperationAction(ISD::FRINT, RoundedTy, Legal);
1037  }
1038 
1047 
1048  // FIXME: Do we need to handle scalar-to-vector here?
1050 
1051  // We directly match byte blends in the backend as they match the VSELECT
1052  // condition form.
1054 
1055  // SSE41 brings specific instructions for doing vector sign extend even in
1056  // cases where we don't have SRA.
1057  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1060  }
1061 
1062  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1063  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1070  }
1071 
1072  // i8 vectors are custom because the source register and source
1073  // source memory operand types are not the same width.
1075  }
1076 
1077  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1078  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1081 
1082  // XOP can efficiently perform BITREVERSE with VPPERM.
1083  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1085 
1086  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1089  }
1090 
1091  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1092  bool HasInt256 = Subtarget.hasInt256();
1093 
1094  addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1095  : &X86::VR256RegClass);
1096  addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1097  : &X86::VR256RegClass);
1098  addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1099  : &X86::VR256RegClass);
1100  addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1101  : &X86::VR256RegClass);
1102  addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1103  : &X86::VR256RegClass);
1104  addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1105  : &X86::VR256RegClass);
1106 
1107  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1116  }
1117 
1118  // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1119  // even though v8i16 is a legal type.
1123 
1125 
1127 
1128  if (!Subtarget.hasAVX512())
1130 
1131  // In the customized shift lowering, the legal v8i32/v4i64 cases
1132  // in AVX2 will be recognized.
1133  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1137  }
1138 
1139  // These types need custom splitting if their input is a 128-bit vector.
1144 
1147 
 1148  // With BWI, expanding (and promoting the shifts) is better.
1149  if (!Subtarget.hasBWI())
1151 
1158 
1159  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1163  }
1164 
1169 
1170  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1174 
1175  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176  // setcc all the way to isel and prefer SETGT in some isel patterns.
1179  }
1180 
1181  if (Subtarget.hasAnyFMA()) {
1182  for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1185  }
1186 
1187  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1188  setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1189  setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1190  }
1191 
1194  setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1196 
1199  setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1200  setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1203 
1209 
1210  setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1211  setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1212  setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1213  setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1214  setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1215  setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1216  setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1217  setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1218 
1219  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1220  setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1221  setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1222  setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1223  setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1224  setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1225  }
1226 
1227  for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1230  }
1231 
1232  if (HasInt256) {
1233  // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1234  // when we have a 256bit-wide blend with immediate.
1236 
1237  // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1238  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1245  }
1246  }
1247 
1248  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1250  setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1252  }
1253 
1254  // Extract subvector is special because the value type
1255  // (result) is 128-bit but the source is 256-bit wide.
1256  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1257  MVT::v4f32, MVT::v2f64 }) {
1259  }
1260 
1261  // Custom lower several nodes for 256-bit types.
1263  MVT::v8f32, MVT::v4f64 }) {
1266  setOperationAction(ISD::VSELECT, VT, Custom);
1272  setOperationAction(ISD::STORE, VT, Custom);
1273  }
1274 
1275  if (HasInt256) {
1277 
1278  // Custom legalize 2x32 to get a little better code.
1281 
1282  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1284  setOperationAction(ISD::MGATHER, VT, Custom);
1285  }
1286  }
1287 
1288  // This block controls legalization of the mask vector sizes that are
1289  // available with AVX512. 512-bit vectors are in a separate block controlled
1290  // by useAVX512Regs.
1291  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1292  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1293  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1294  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1295  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1296  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1297 
1301 
1308 
1309  // There is no byte sized k-register load or store without AVX512DQ.
1310  if (!Subtarget.hasDQI()) {
1315 
1320  }
1321 
1322  // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1323  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1327  }
1328 
1329  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1340 
1347  setOperationAction(ISD::VSELECT, VT, Expand);
1348  }
1349 
1350  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1352  }
1353 
1354  // This block controls legalization for 512-bit operations with 32/64 bit
1355  // elements. 512-bits can be disabled based on prefer-vector-width and
1356  // required-vector-width function attributes.
1357  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1358  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1359  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1360  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1361  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1362 
1363  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1369  }
1370 
1371  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1376  }
1377 
1388 
1390 
1396 
1397  // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1398  // to 512-bit rather than use the AVX2 instructions so that we can use
1399  // k-masks.
1400  if (!Subtarget.hasVLX()) {
1401  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405  }
1406  }
1407 
1416 
1417  // Need to custom widen this if we don't have AVX512BW.
1421 
1422  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1428 
1430  }
1431 
1432  // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1433  for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
1436  }
1437 
1442 
1445 
1448 
1449  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1463 
1464  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1465  // setcc all the way to isel and prefer SETGT in some isel patterns.
1468  }
1469 
1470  if (Subtarget.hasDQI()) {
1475 
1477  }
1478 
1479  if (Subtarget.hasCDI()) {
1480  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1481  for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1483  }
1484  } // Subtarget.hasCDI()
1485 
1486  if (Subtarget.hasVPOPCNTDQ()) {
1487  for (auto VT : { MVT::v16i32, MVT::v8i64 })
1489  }
1490 
1491  // Extract subvector is special because the value type
1492  // (result) is 256-bit but the source is 512-bit wide.
1493  // 128-bit was made Legal under AVX1.
1494  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1497 
1498  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1510  }
1511  if (!Subtarget.hasBWI()) {
1512  // Need to custom split v32i16/v64i8 bitcasts.
1515 
1516  // Better to split these into two 256-bit ops.
1519  }
1520 
1521  if (Subtarget.hasVBMI2()) {
1522  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1525  }
1526  }
1527  }// has AVX-512
1528 
1529  // This block controls legalization for operations that don't have
1530  // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1531  // narrower widths.
1532  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1533  // These operations are handled on non-VLX by artificially widening in
1534  // isel patterns.
1535  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1536 
1542 
1543  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1549  }
1550 
1551  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1554  }
1555 
1556  // Custom legalize 2x32 to get a little better code.
1559 
1560  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1563 
1564  if (Subtarget.hasDQI()) {
1565  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1570 
1572  }
1573  }
1574 
1575  if (Subtarget.hasCDI()) {
1576  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1578  }
1579  } // Subtarget.hasCDI()
1580 
1581  if (Subtarget.hasVPOPCNTDQ()) {
1582  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1584  }
1585  }
1586 
 1587  // This block controls legalization of v32i1/v64i1, which are available with
1588  // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1589  // useBWIRegs.
1590  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1591  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1592  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1593 
1594  for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1598  setOperationAction(ISD::VSELECT, VT, Expand);
1603 
1611  }
1612 
1617  for (auto VT : { MVT::v16i1, MVT::v32i1 })
1619 
1620  // Extends from v32i1 masks to 256-bit vectors.
1624  }
1625 
1626  // This block controls legalization for v32i16 and v64i8. 512-bits can be
1627  // disabled based on prefer-vector-width and required-vector-width function
1628  // attributes.
1629  if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1630  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1631  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1632 
1633  // Extends from v64i1 masks to 512-bit vectors.
1637 
1661 
1664 
1666 
1667  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1688 
1689  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1690  // setcc all the way to isel and prefer SETGT in some isel patterns.
1693  }
1694 
1695  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697  }
1698 
1699  if (Subtarget.hasBITALG()) {
1700  for (auto VT : { MVT::v64i8, MVT::v32i16 })
1702  }
1703 
1704  if (Subtarget.hasVBMI2()) {
1707  }
1708  }
1709 
1710  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1711  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1712  setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1713  setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1714  }
1715 
1716  // These operations are handled on non-VLX by artificially widening in
1717  // isel patterns.
1718  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1719 
1720  if (Subtarget.hasBITALG()) {
1721  for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1723  }
1724  }
1725 
1726  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1732 
1738 
1739  if (Subtarget.hasDQI()) {
1740  // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1741  // v2f32 UINT_TO_FP is already custom under SSE2.
1744  "Unexpected operation action!");
1745  // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1748  }
1749 
1750  if (Subtarget.hasBWI()) {
1753  }
1754 
1755  if (Subtarget.hasVBMI2()) {
1756  // TODO: Make these legal even without VLX?
1757  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1761  }
1762  }
1763 
1767  }
1768 
1769  // We want to custom lower some of our intrinsics.
1773  if (!Subtarget.is64Bit()) {
1775  }
1776 
1777  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1778  // handle type legalization for these operations here.
1779  //
1780  // FIXME: We really should do custom legalization for addition and
1781  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1782  // than generic legalization for 64-bit multiplication-with-overflow, though.
1783  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1784  if (VT == MVT::i64 && !Subtarget.is64Bit())
1785  continue;
1786  // Add/Sub/Mul with overflow operations are custom lowered.
1793 
1794  // Support carry in as value rather than glue.
1798  }
1799 
1800  if (!Subtarget.is64Bit()) {
1801  // These libcalls are not available in 32-bit.
1802  setLibcallName(RTLIB::SHL_I128, nullptr);
1803  setLibcallName(RTLIB::SRL_I128, nullptr);
1804  setLibcallName(RTLIB::SRA_I128, nullptr);
1805  setLibcallName(RTLIB::MUL_I128, nullptr);
1806  }
1807 
1808  // Combine sin / cos into _sincos_stret if it is available.
1809  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1810  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1813  }
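  // Sketch (on targets such as Darwin where these libcalls exist): computing
  //   float s = sinf(x), c = cosf(x);
  // for the same x can be folded into a single sincos_stret-style libcall
  // that returns both values, instead of two separate calls.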
1814 
1815  if (Subtarget.isTargetWin64()) {
1822  }
1823 
 1824  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1825  // is. We should promote the value to 64-bits to solve this.
1826  // This is what the CRT headers do - `fmodf` is an inline header
1827  // function casting to f64 and calling `fmod`.
1828  if (Subtarget.is32Bit() &&
1829  (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1830  for (ISD::NodeType Op :
1835 
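  // Sketch of the promotion above (32-bit MSVC-style targets): a call
  //   float r = fmodf(a, b);
  // is lowered as if it were
  //   float r = (float)fmod((double)a, (double)b);
  // mirroring the CRT's inline fmodf wrapper.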
1836  // We have target-specific dag combine patterns for the following nodes:
1878 
1880 
1881  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1883  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1885  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1887 
1888  // TODO: These control memcmp expansion in CGP and could be raised higher, but
 1889  // that needs to be benchmarked and balanced with the potential use of vector
1890  // load/store types (PR33329, PR33914).
1891  MaxLoadsPerMemcmp = 2;
1893 
1894  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1896 
1897  // An out-of-order CPU can speculatively execute past a predictable branch,
1898  // but a conditional move could be stalled by an expensive earlier operation.
1899  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1900  EnableExtLdPromotion = true;
1902 
1904 }
1905 
1906 // This has so far only been implemented for 64-bit MachO.
1908  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1909 }
1910 
1912  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1913  return Subtarget.getTargetTriple().isOSMSVCRT();
1914 }
1915 
1917  const SDLoc &DL) const {
1918  EVT PtrTy = getPointerTy(DAG.getDataLayout());
1919  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1920  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1921  return SDValue(Node, 0);
1922 }
1923 
1926  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1927  return TypeSplitVector;
1928 
1929  if (VT.getVectorNumElements() != 1 &&
1930  VT.getVectorElementType() != MVT::i1)
1931  return TypeWidenVector;
1932 
1934 }
1935 
1937  CallingConv::ID CC,
1938  EVT VT) const {
1939  // v32i1 vectors should be promoted to v32i8 to match avx2.
1940  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1941  return MVT::v32i8;
1942  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1943  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1944  Subtarget.hasAVX512() &&
1946  (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1947  (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
1948  return MVT::i8;
1949  // FIXME: Should we just make these types legal and custom split operations?
1950  if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
1951  Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
1952  return MVT::v16i32;
1953  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1954 }
1955 
1957  CallingConv::ID CC,
1958  EVT VT) const {
1959  // v32i1 vectors should be promoted to v32i8 to match avx2.
1960  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1961  return 1;
1962  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1963  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1964  Subtarget.hasAVX512() &&
1966  (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1967  (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
1968  return VT.getVectorNumElements();
1969  // FIXME: Should we just make these types legal and custom split operations?
1970  if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
1971  Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
1972  return 1;
1973  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1974 }
1975 
1977  LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1978  unsigned &NumIntermediates, MVT &RegisterVT) const {
1979  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
1980  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
1981  Subtarget.hasAVX512() &&
1983  (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
1984  (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
1985  RegisterVT = MVT::i8;
1986  IntermediateVT = MVT::i1;
1987  NumIntermediates = VT.getVectorNumElements();
1988  return NumIntermediates;
1989  }
1990 
1991  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
1992  NumIntermediates, RegisterVT);
1993 }
1994 
1997  EVT VT) const {
1998  if (!VT.isVector())
1999  return MVT::i8;
2000 
2001  if (Subtarget.hasAVX512()) {
2002  const unsigned NumElts = VT.getVectorNumElements();
2003 
2004  // Figure out what this type will be legalized to.
2005  EVT LegalVT = VT;
2006  while (getTypeAction(Context, LegalVT) != TypeLegal)
2007  LegalVT = getTypeToTransformTo(Context, LegalVT);
2008 
2009  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2010  if (LegalVT.getSimpleVT().is512BitVector())
2011  return EVT::getVectorVT(Context, MVT::i1, NumElts);
2012 
2013  if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2014  // If we legalized to less than a 512-bit vector, then we will use a vXi1
2015  // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2016  // vXi16/vXi8.
2017  MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2018  if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2019  return EVT::getVectorVT(Context, MVT::i1, NumElts);
2020  }
2021  }
2022 
2024 }
2025 
2026 /// Helper for getByValTypeAlignment to determine
2027 /// the desired ByVal argument alignment.
2028 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
2029  if (MaxAlign == 16)
2030  return;
2031  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2032  if (VTy->getBitWidth() == 128)
2033  MaxAlign = 16;
2034  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2035  unsigned EltAlign = 0;
2036  getMaxByValAlign(ATy->getElementType(), EltAlign);
2037  if (EltAlign > MaxAlign)
2038  MaxAlign = EltAlign;
2039  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2040  for (auto *EltTy : STy->elements()) {
2041  unsigned EltAlign = 0;
2042  getMaxByValAlign(EltTy, EltAlign);
2043  if (EltAlign > MaxAlign)
2044  MaxAlign = EltAlign;
2045  if (MaxAlign == 16)
2046  break;
2047  }
2048  }
2049 }
2050 
2051 /// Return the desired alignment for ByVal aggregate
2052 /// function arguments in the caller parameter area. For X86, aggregates
2053 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2054 /// are at 4-byte boundaries.
2056  const DataLayout &DL) const {
2057  if (Subtarget.is64Bit()) {
2058  // Max of 8 and alignment of type.
2059  unsigned TyAlign = DL.getABITypeAlignment(Ty);
2060  if (TyAlign > 8)
2061  return TyAlign;
2062  return 8;
2063  }
2064 
2065  unsigned Align = 4;
2066  if (Subtarget.hasSSE1())
2067  getMaxByValAlign(Ty, Align);
2068  return Align;
2069 }
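// Sketch (32-bit target with SSE enabled, hypothetical type): a byval argument
// of type
//   struct Payload { __m128 Vec; int Tag; };
// contains a 128-bit vector, so it is aligned to 16 bytes, while an all-scalar
// aggregate keeps the default 4-byte boundary.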
2070 
2071 /// Returns the target specific optimal type for load
2072 /// and store operations as a result of memset, memcpy, and memmove
 2073 /// lowering. If DstAlign is zero, it is safe to assume the destination
 2074 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
 2075 /// is no need to check it against an alignment requirement,
2076 /// probably because the source does not need to be loaded. If 'IsMemset' is
2077 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
2078 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
2079 /// source is constant so it does not need to be loaded.
2080 /// It returns EVT::Other if the type should be determined using generic
2081 /// target-independent logic.
2082 /// For vector ops we check that the overall size isn't larger than our
2083 /// preferred vector width.
2085  uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
2086  bool ZeroMemset, bool MemcpyStrSrc,
2087  const AttributeList &FuncAttributes) const {
2088  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2089  if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
2090  ((DstAlign == 0 || DstAlign >= 16) &&
2091  (SrcAlign == 0 || SrcAlign >= 16)))) {
2092  // FIXME: Check if unaligned 64-byte accesses are slow.
2093  if (Size >= 64 && Subtarget.hasAVX512() &&
2094  (Subtarget.getPreferVectorWidth() >= 512)) {
2095  return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2096  }
2097  // FIXME: Check if unaligned 32-byte accesses are slow.
2098  if (Size >= 32 && Subtarget.hasAVX() &&
2099  (Subtarget.getPreferVectorWidth() >= 256)) {
2100  // Although this isn't a well-supported type for AVX1, we'll let
2101  // legalization and shuffle lowering produce the optimal codegen. If we
2102  // choose an optimal type with a vector element larger than a byte,
2103  // getMemsetStores() may create an intermediate splat (using an integer
2104  // multiply) before we splat as a vector.
2105  return MVT::v32i8;
2106  }
2107  if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2108  return MVT::v16i8;
2109  // TODO: Can SSE1 handle a byte vector?
2110  // If we have SSE1 registers we should be able to use them.
2111  if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2112  (Subtarget.getPreferVectorWidth() >= 128))
2113  return MVT::v4f32;
2114  } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
2115  !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2116  // Do not use f64 to lower memcpy if source is string constant. It's
2117  // better to use i32 to avoid the loads.
2118  // Also, do not use f64 to lower memset unless this is a memset of zeros.
2119  // The gymnastics of splatting a byte value into an XMM register and then
2120  // only using 8-byte stores (because this is a CPU with slow unaligned
2121  // 16-byte accesses) makes that a loser.
2122  return MVT::f64;
2123  }
2124  }
2125  // This is a compromise. If we reach here, unaligned accesses may be slow on
2126  // this target. However, creating smaller, aligned accesses could be even
2127  // slower and would certainly be a lot more code.
2128  if (Subtarget.is64Bit() && Size >= 8)
2129  return MVT::i64;
2130  return MVT::i32;
2131 }
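// Sketch of the choices above: with AVX2 and sufficiently aligned (or fast
// unaligned) operands, a 32-byte-or-larger memcpy/memset is expanded with
// v32i8 stores; with only SSE2 it uses v16i8; and if vector registers may not
// be used at all, a 64-bit target falls back to i64 chunks.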
2132 
2134  if (VT == MVT::f32)
2135  return X86ScalarSSEf32;
2136  else if (VT == MVT::f64)
2137  return X86ScalarSSEf64;
2138  return true;
2139 }
2140 
2142  EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
2143  bool *Fast) const {
2144  if (Fast) {
2145  switch (VT.getSizeInBits()) {
2146  default:
2147  // 8-byte and under are always assumed to be fast.
2148  *Fast = true;
2149  break;
2150  case 128:
2151  *Fast = !Subtarget.isUnalignedMem16Slow();
2152  break;
2153  case 256:
2154  *Fast = !Subtarget.isUnalignedMem32Slow();
2155  break;
2156  // TODO: What about AVX-512 (512-bit) accesses?
2157  }
2158  }
2159  // NonTemporal vector memory ops must be aligned.
2160  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
 2161  // NT loads can only be vector aligned, so if it's less aligned than the
2162  // minimum vector size (which we can split the vector down to), we might as
2163  // well use a regular unaligned vector load.
2164  // We don't have any NT loads pre-SSE41.
2165  if (!!(Flags & MachineMemOperand::MOLoad))
2166  return (Align < 16 || !Subtarget.hasSSE41());
2167  return false;
2168  }
2169  // Misaligned accesses of any size are always allowed.
2170  return true;
2171 }
2172 
2173 /// Return the entry encoding for a jump table in the
2174 /// current function. The returned value is a member of the
2175 /// MachineJumpTableInfo::JTEntryKind enum.
2177  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2178  // symbol.
2179  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2181 
2182  // Otherwise, use the normal jump table encoding heuristics.
2184 }
2185 
2187  return Subtarget.useSoftFloat();
2188 }
2189 
2191  ArgListTy &Args) const {
2192 
2193  // Only relabel X86-32 for C / Stdcall CCs.
2194  if (Subtarget.is64Bit())
2195  return;
2196  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2197  return;
2198  unsigned ParamRegs = 0;
2199  if (auto *M = MF->getFunction().getParent())
2200  ParamRegs = M->getNumberRegisterParameters();
2201 
2202  // Mark the first N int arguments as having reg
2203  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2204  Type *T = Args[Idx].Ty;
2205  if (T->isIntOrPtrTy())
2206  if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2207  unsigned numRegs = 1;
2208  if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2209  numRegs = 2;
2210  if (ParamRegs < numRegs)
2211  return;
2212  ParamRegs -= numRegs;
2213  Args[Idx].IsInReg = true;
2214  }
2215  }
2216 }
2217 
2218 const MCExpr *
2220  const MachineBasicBlock *MBB,
2221  unsigned uid,MCContext &Ctx) const{
2222  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2223  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2224  // entries.
2225  return MCSymbolRefExpr::create(MBB->getSymbol(),
2227 }
2228 
2229 /// Returns relocation base for the given PIC jumptable.
2231  SelectionDAG &DAG) const {
2232  if (!Subtarget.is64Bit())
2233  // This doesn't have SDLoc associated with it, but is not really the
2234  // same as a Register.
2235  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2236  getPointerTy(DAG.getDataLayout()));
2237  return Table;
2238 }
2239 
2240 /// This returns the relocation base for the given PIC jumptable,
2241 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2244  MCContext &Ctx) const {
2245  // X86-64 uses RIP relative addressing based on the jump table label.
2246  if (Subtarget.isPICStyleRIPRel())
2247  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2248 
2249  // Otherwise, the reference is relative to the PIC base.
2250  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2251 }
2252 
2253 std::pair<const TargetRegisterClass *, uint8_t>
2255  MVT VT) const {
2256  const TargetRegisterClass *RRC = nullptr;
2257  uint8_t Cost = 1;
2258  switch (VT.SimpleTy) {
2259  default:
2261  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2262  RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2263  break;
2264  case MVT::x86mmx:
2265  RRC = &X86::VR64RegClass;
2266  break;
2267  case MVT::f32: case MVT::f64:
2268  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2269  case MVT::v4f32: case MVT::v2f64:
2270  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2271  case MVT::v8f32: case MVT::v4f64:
2272  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2273  case MVT::v16f32: case MVT::v8f64:
2274  RRC = &X86::VR128XRegClass;
2275  break;
2276  }
2277  return std::make_pair(RRC, Cost);
2278 }
2279 
2280 unsigned X86TargetLowering::getAddressSpace() const {
2281  if (Subtarget.is64Bit())
2282  return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2283  return 256;
2284 }
2285 
2286 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2287  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2288  (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2289 }
2290 
2292  unsigned Offset, unsigned AddressSpace) {
2295  Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2296 }
2297 
2299  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2300  // tcbhead_t; use it instead of the usual global variable (see
2301  // sysdeps/{i386,x86_64}/nptl/tls.h)
2302  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2303  if (Subtarget.isTargetFuchsia()) {
2304  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2305  return SegmentOffset(IRB, 0x10, getAddressSpace());
2306  } else {
2307  // %fs:0x28, unless we're using a Kernel code model, in which case
2308  // it's %gs:0x28. gs:0x14 on i386.
2309  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2310  return SegmentOffset(IRB, Offset, getAddressSpace());
2311  }
2312  }
2313 
2314  return TargetLowering::getIRStackGuard(IRB);
2315 }
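// Sketch (x86_64 glibc, default code model): the stack-protector check then
// loads the canary directly from the thread control block, e.g.
//   movq %fs:0x28, %rax
// rather than going through a __stack_chk_guard global.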
2316 
 2318  // MSVC CRT provides functionality for stack protection.
2319  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2321  // MSVC CRT has a global variable holding security cookie.
2322  M.getOrInsertGlobal("__security_cookie",
2324 
2325  // MSVC CRT has a function to validate security cookie.
2326  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2327  "__security_check_cookie", Type::getVoidTy(M.getContext()),
2329  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2330  F->setCallingConv(CallingConv::X86_FastCall);
2331  F->addAttribute(1, Attribute::AttrKind::InReg);
2332  }
2333  return;
2334  }
2335  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2336  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2337  return;
2339 }
2340 
2342  // MSVC CRT has a global variable holding security cookie.
2343  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2345  return M.getGlobalVariable("__security_cookie");
2346  }
2348 }
2349 
2351  // MSVC CRT has a function to validate security cookie.
2352  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2354  return M.getFunction("__security_check_cookie");
2355  }
2357 }
2358 
2360  if (Subtarget.getTargetTriple().isOSContiki())
2361  return getDefaultSafeStackPointerLocation(IRB, false);
2362 
2363  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2364  // definition of TLS_SLOT_SAFESTACK in
2365  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2366  if (Subtarget.isTargetAndroid()) {
2367  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2368  // %gs:0x24 on i386
2369  unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2370  return SegmentOffset(IRB, Offset, getAddressSpace());
2371  }
2372 
2373  // Fuchsia is similar.
2374  if (Subtarget.isTargetFuchsia()) {
2375  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2376  return SegmentOffset(IRB, 0x18, getAddressSpace());
2377  }
2378 
2380 }
2381 
2383  unsigned DestAS) const {
2384  assert(SrcAS != DestAS && "Expected different address spaces!");
2385 
2386  return SrcAS < 256 && DestAS < 256;
2387 }
2388 
2389 //===----------------------------------------------------------------------===//
2390 // Return Value Calling Convention Implementation
2391 //===----------------------------------------------------------------------===//
2392 
2393 bool X86TargetLowering::CanLowerReturn(
2394  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2395  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2397  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2398  return CCInfo.CheckReturn(Outs, RetCC_X86);
2399 }
2400 
2401 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2402  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2403  return ScratchRegs;
2404 }
2405 
 2406 /// Lowers mask values (v*i1) to the local register values
2407 /// \returns DAG node after lowering to register type
2408 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2409  const SDLoc &Dl, SelectionDAG &DAG) {
2410  EVT ValVT = ValArg.getValueType();
2411 
2412  if (ValVT == MVT::v1i1)
2413  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2414  DAG.getIntPtrConstant(0, Dl));
2415 
2416  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2417  (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2418  // Two stage lowering might be required
2419  // bitcast: v8i1 -> i8 / v16i1 -> i16
2420  // anyextend: i8 -> i32 / i16 -> i32
2421  EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2422  SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2423  if (ValLoc == MVT::i32)
2424  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2425  return ValToCopy;
2426  }
2427 
2428  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2429  (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2430  // One stage lowering is required
2431  // bitcast: v32i1 -> i32 / v64i1 -> i64
2432  return DAG.getBitcast(ValLoc, ValArg);
2433  }
2434 
2435  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2436 }
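// A standalone scalar sketch of the two-stage lowering above, assuming the
// v8i1 mask is modelled as one bool per element (the helper is illustrative
// only): the mask is first packed into its natural width (i8 for v8i1) and
// then widened to the 32-bit location type.
static unsigned examplePackV8i1ToI32(const bool Bits[8]) {
  unsigned char Packed = 0;                             // "bitcast" v8i1 -> i8
  for (int I = 0; I != 8; ++I)
    Packed |= static_cast<unsigned char>(Bits[I]) << I;
  return Packed;                                        // "any-extend" i8 -> i32
}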
2437 
2438 /// Breaks a v64i1 value into two registers and adds the new node to the DAG.
2439 static void Passv64i1ArgInRegs(
2440  const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2441  SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
2442  CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2443  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2444  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2445  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2446  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2447  "The value should reside in two registers");
2448 
2449  // Before splitting the value we cast it to i64
2450  Arg = DAG.getBitcast(MVT::i64, Arg);
2451 
2452  // Splitting the value into two i32 types
2453  SDValue Lo, Hi;
2454  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2455  DAG.getConstant(0, Dl, MVT::i32));
2456  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2457  DAG.getConstant(1, Dl, MVT::i32));
2458 
2459  // Attach the two i32 types into corresponding registers
2460  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2461  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2462 }
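// A standalone sketch of the split performed above, assuming the v64i1 mask
// is modelled as a plain 64-bit integer (the helper is illustrative only):
// elements 0..31 go to the first GR32 and elements 32..63 to the second,
// mirroring the EXTRACT_ELEMENT 0 / 1 pair.
static void exampleSplitMask64(unsigned long long Mask, unsigned &Lo,
                               unsigned &Hi) {
  Lo = static_cast<unsigned>(Mask);       // low half  -> VA.getLocReg()
  Hi = static_cast<unsigned>(Mask >> 32); // high half -> NextVA.getLocReg()
}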
2463 
2464 SDValue
2465 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2466  bool isVarArg,
2467  const SmallVectorImpl<ISD::OutputArg> &Outs,
2468  const SmallVectorImpl<SDValue> &OutVals,
2469  const SDLoc &dl, SelectionDAG &DAG) const {
2470  MachineFunction &MF = DAG.getMachineFunction();
2471  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2472 
2473  // In some cases we need to disable registers from the default CSR list.
2474  // For example, when they are used for argument passing.
2475  bool ShouldDisableCalleeSavedRegister =
2476  CallConv == CallingConv::X86_RegCall ||
2477  MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2478 
2479  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2480  report_fatal_error("X86 interrupts may not return any value");
2481 
2482  SmallVector<CCValAssign, 16> RVLocs;
2483  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2484  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2485 
2486  SDValue Flag;
2487  SmallVector<SDValue, 6> RetOps;
2488  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2489  // Operand #1 = Bytes To Pop
2490  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2491  MVT::i32));
2492 
2493  // Copy the result values into the output registers.
2494  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2495  ++I, ++OutsIndex) {
2496  CCValAssign &VA = RVLocs[I];
2497  assert(VA.isRegLoc() && "Can only return in registers!");
2498 
2499  // Add the register to the CalleeSaveDisableRegs list.
2500  if (ShouldDisableCalleeSavedRegister)
2501  MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2502 
2503  SDValue ValToCopy = OutVals[OutsIndex];
2504  EVT ValVT = ValToCopy.getValueType();
2505 
2506  // Promote values to the appropriate types.
2507  if (VA.getLocInfo() == CCValAssign::SExt)
2508  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2509  else if (VA.getLocInfo() == CCValAssign::ZExt)
2510  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2511  else if (VA.getLocInfo() == CCValAssign::AExt) {
2512  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2513  ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2514  else
2515  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2516  }
2517  else if (VA.getLocInfo() == CCValAssign::BCvt)
2518  ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2519 
2520  assert(VA.getLocInfo() != CCValAssign::FPExt &&
2521  "Unexpected FP-extend for return value.");
2522 
2523  // If this is x86-64, and we disabled SSE, we can't return FP values,
2524  // or SSE or MMX vectors.
2525  if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2526  VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2527  (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2528  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2529  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2530  } else if (ValVT == MVT::f64 &&
2531  (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2532  // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2533  // llvm-gcc has never done it right and no one has noticed, so this
2534  // should be OK for now.
2535  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2536  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2537  }
2538 
2539  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2540  // the RET instruction and handled by the FP Stackifier.
2541  if (VA.getLocReg() == X86::FP0 ||
2542  VA.getLocReg() == X86::FP1) {
2543  // If this is a copy from an xmm register to ST(0), use an FPExtend to
2544  // change the value to the FP stack register class.
2545  if (isScalarFPTypeInSSEReg(VA.getValVT()))
2546  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2547  RetOps.push_back(ValToCopy);
2548  // Don't emit a copytoreg.
2549  continue;
2550  }
2551 
2552  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2553  // which is returned in RAX / RDX.
2554  if (Subtarget.is64Bit()) {
2555  if (ValVT == MVT::x86mmx) {
2556  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2557  ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2558  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2559  ValToCopy);
2560  // If we don't have SSE2 available, convert to v4f32 so the generated
2561  // register is legal.
2562  if (!Subtarget.hasSSE2())
2563  ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2564  }
2565  }
2566  }
2567 
2568  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2569 
2570  if (VA.needsCustom()) {
2571  assert(VA.getValVT() == MVT::v64i1 &&
2572  "Currently the only custom case is when we split v64i1 to 2 regs");
2573 
2574  Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
2575  Subtarget);
2576 
2577  assert(2 == RegsToPass.size() &&
2578  "Expecting two registers after Pass64BitArgInRegs");
2579 
2580  // Add the second register to the CalleeSaveDisableRegs list.
2581  if (ShouldDisableCalleeSavedRegister)
2582  MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2583  } else {
2584  RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2585  }
2586 
2587  // Add nodes to the DAG and add the values into the RetOps list
2588  for (auto &Reg : RegsToPass) {
2589  Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2590  Flag = Chain.getValue(1);
2591  RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2592  }
2593  }
2594 
2595  // Swift calling convention does not require we copy the sret argument
2596  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2597 
2598  // All x86 ABIs require that for returning structs by value we copy
2599  // the sret argument into %rax/%eax (depending on ABI) for the return.
2600  // We saved the argument into a virtual register in the entry block,
2601  // so now we copy the value out and into %rax/%eax.
2602  //
2603  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2604  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2605  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2606  // either case FuncInfo->setSRetReturnReg() will have been called.
2607  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2608  // When we have both sret and another return value, we should use the
2609  // original Chain stored in RetOps[0], instead of the current Chain updated
2610  // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2611 
2612  // For the case of sret and another return value, we have
2613  // Chain_0 at the function entry
2614  // Chain_1 = getCopyToReg(Chain_0) in the above loop
2615  // If we use Chain_1 in getCopyFromReg, we will have
2616  // Val = getCopyFromReg(Chain_1)
2617  // Chain_2 = getCopyToReg(Chain_1, Val) from below
2618 
2619  // getCopyToReg(Chain_0) will be glued together with
2620  // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2621  // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2622  // Data dependency from Unit B to Unit A due to usage of Val in
2623  // getCopyToReg(Chain_1, Val)
2624  // Chain dependency from Unit A to Unit B
2625 
2626  // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2627  SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2628  getPointerTy(MF.getDataLayout()));
2629 
2630  unsigned RetValReg
2631  = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2632  X86::RAX : X86::EAX;
2633  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2634  Flag = Chain.getValue(1);
2635 
2636  // RAX/EAX now acts like a return value.
2637  RetOps.push_back(
2638  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2639 
2640  // Add the returned register to the CalleeSaveDisableRegs list.
2641  if (ShouldDisableCalleeSavedRegister)
2642  MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2643  }
2644 
2645  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2646  const MCPhysReg *I =
2647  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2648  if (I) {
2649  for (; *I; ++I) {
2650  if (X86::GR64RegClass.contains(*I))
2651  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2652  else
2653  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2654  }
2655  }
2656 
2657  RetOps[0] = Chain; // Update chain.
2658 
2659  // Add the flag if we have it.
2660  if (Flag.getNode())
2661  RetOps.push_back(Flag);
2662 
2663  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2664  if (CallConv == CallingConv::X86_INTR)
2665  opcode = X86ISD::IRET;
2666  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2667 }
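// A standalone sketch of the sret convention handled above (the type and
// function are illustrative only): for a by-value return that does not fit
// in registers, the caller passes a hidden destination pointer, the callee
// fills it, and the same pointer is returned in %rax/%eax.
struct ExampleBigReturn { long long Data[4]; };
static ExampleBigReturn exampleMakeBig() {
  // Conceptually lowered as void exampleMakeBig(ExampleBigReturn *SRet),
  // with SRet copied back into the return-value register.
  ExampleBigReturn R = {};
  return R;
}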
2668 
2669 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2670  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2671  return false;
2672 
2673  SDValue TCChain = Chain;
2674  SDNode *Copy = *N->use_begin();
2675  if (Copy->getOpcode() == ISD::CopyToReg) {
2676  // If the copy has a glue operand, we conservatively assume it isn't safe to
2677  // perform a tail call.
2678  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2679  return false;
2680  TCChain = Copy->getOperand(0);
2681  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2682  return false;
2683 
2684  bool HasRet = false;
2685  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2686  UI != UE; ++UI) {
2687  if (UI->getOpcode() != X86ISD::RET_FLAG)
2688  return false;
2689  // If we are returning more than one value, we can definitely
2690  // not make a tail call see PR19530
2691  if (UI->getNumOperands() > 4)
2692  return false;
2693  if (UI->getNumOperands() == 4 &&
2694  UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2695  return false;
2696  HasRet = true;
2697  }
2698 
2699  if (!HasRet)
2700  return false;
2701 
2702  Chain = TCChain;
2703  return true;
2704 }
2705 
2706 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2707  ISD::NodeType ExtendKind) const {
2708  MVT ReturnMVT = MVT::i32;
2709 
2710  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2711  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2712  // The ABI does not require i1, i8 or i16 to be extended.
2713  //
2714  // On Darwin, there is code in the wild relying on Clang's old behaviour of
2715  // always extending i8/i16 return values, so keep doing that for now.
2716  // (PR26665).
2717  ReturnMVT = MVT::i8;
2718  }
2719 
2720  EVT MinVT = getRegisterType(Context, ReturnMVT);
2721  return VT.bitsLT(MinVT) ? MinVT : VT;
2722 }
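// A standalone sketch of the policy above (illustrative only): on most x86
// targets a function returning an i8 value may leave the upper bits of the
// return register undefined, so callers cannot rely on them; only Darwin
// keeps extending i8/i16 returns for compatibility with older Clang output.
static unsigned char exampleNarrowReturn(int X) {
  return static_cast<unsigned char>(X); // only the low 8 bits are meaningful
}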
2723 
2724 /// Reads two 32 bit registers and creates a 64 bit mask value.
2725 /// \param VA The current 32 bit value that needs to be assigned.
2726 /// \param NextVA The next 32 bit value that needs to be assigned.
2727 /// \param Root The parent DAG node.
2728 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2729 /// for glue purposes. If the DAG is already using a
2730 /// physical register instead of a virtual one, we should glue
2731 /// our new SDValue to the InFlag SDValue.
2732 /// \return a new SDValue of size 64 bits.
2733 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2734  SDValue &Root, SelectionDAG &DAG,
2735  const SDLoc &Dl, const X86Subtarget &Subtarget,
2736  SDValue *InFlag = nullptr) {
2737  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2738  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2739  assert(VA.getValVT() == MVT::v64i1 &&
2740  "Expecting first location of 64 bit width type");
2741  assert(NextVA.getValVT() == VA.getValVT() &&
2742  "The locations should have the same type");
2743  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2744  "The values should reside in two registers");
2745 
2746  SDValue Lo, Hi;
2747  SDValue ArgValueLo, ArgValueHi;
2748 
2749  MachineFunction &MF = DAG.getMachineFunction();
2750  const TargetRegisterClass *RC = &X86::GR32RegClass;
2751 
2752  // Read a 32 bit value from the registers.
2753  if (nullptr == InFlag) {
2754  // When no physical register is present,
2755  // create an intermediate virtual register.
2756  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2757  ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2758  Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2759  ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2760  } else {
2761  // When a physical register is available read the value from it and glue
2762  // the reads together.
2763  ArgValueLo =
2764  DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2765  *InFlag = ArgValueLo.getValue(2);
2766  ArgValueHi =
2767  DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2768  *InFlag = ArgValueHi.getValue(2);
2769  }
2770 
2771  // Convert the i32 type into v32i1 type.
2772  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2773 
2774  // Convert the i32 type into v32i1 type.
2775  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2776 
2777  // Concatenate the two values together.
2778  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2779 }
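// A standalone sketch of the reassembly above, with the mask again modelled
// as a plain 64-bit integer (illustrative only): the first GR32 supplies
// bits 0..31 and the second supplies bits 32..63, mirroring CONCAT_VECTORS.
static unsigned long long exampleJoinMask64(unsigned Lo, unsigned Hi) {
  return (static_cast<unsigned long long>(Hi) << 32) | Lo;
}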
2780 
2781 /// Lowers a register of various sizes (8/16/32/64)
2782 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2783 /// \returns a DAG node containing the operand after lowering to the mask type.
2784 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2785  const EVT &ValLoc, const SDLoc &Dl,
2786  SelectionDAG &DAG) {
2787  SDValue ValReturned = ValArg;
2788 
2789  if (ValVT == MVT::v1i1)
2790  return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2791 
2792  if (ValVT == MVT::v64i1) {
2793  // On 32 bit targets this case is handled by getv64i1Argument.
2794  assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2795  // On 64 bit targets there is no need to truncate the value, only bitcast it.
2796  } else {
2797  MVT maskLen;
2798  switch (ValVT.getSimpleVT().SimpleTy) {
2799  case MVT::v8i1:
2800  maskLen = MVT::i8;
2801  break;
2802  case MVT::v16i1:
2803  maskLen = MVT::i16;
2804  break;
2805  case MVT::v32i1:
2806  maskLen = MVT::i32;
2807  break;
2808  default:
2809  llvm_unreachable("Expecting a vector of i1 types");
2810  }
2811 
2812  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2813  }
2814  return DAG.getBitcast(ValVT, ValReturned);
2815 }
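// A standalone scalar sketch of the lowering above for the v16i1 case
// (illustrative only): the incoming i32 is truncated to the mask's natural
// width (i16) and its bits are then reinterpreted one per element.
static void exampleUnpackI32ToV16i1(unsigned In, bool Bits[16]) {
  unsigned short Truncated = static_cast<unsigned short>(In); // i32 -> i16
  for (int I = 0; I != 16; ++I)
    Bits[I] = ((Truncated >> I) & 1) != 0;                    // i16 -> v16i1
}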
2816 
2817 /// Lower the result values of a call into the
2818 /// appropriate copies out of appropriate physical registers.
2819 ///
2820 SDValue X86TargetLowering::LowerCallResult(
2821  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2822  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2823  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2824  uint32_t *RegMask) const {
2825 
2826  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2827  // Assign locations to each value returned by this call.
2828  SmallVector<CCValAssign, 16> RVLocs;
2829  bool Is64Bit = Subtarget.is64Bit();
2830  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2831  *DAG.getContext());
2832  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2833 
2834  // Copy all of the result registers out of their specified physreg.
2835  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2836  ++I, ++InsIndex) {
2837  CCValAssign &VA = RVLocs[I];
2838  EVT CopyVT = VA.getLocVT();
2839 
2840  // In some calling conventions we need to remove the used registers
2841  // from the register mask.
2842  if (RegMask) {
2843  for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2844  SubRegs.isValid(); ++SubRegs)
2845  RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2846  }
2847 
2848  // If this is x86-64, and we disabled SSE, we can't return FP values
2849  if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2850  ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2851  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2852  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2853  } else if (CopyVT == MVT::f64 &&
2854  (Is64Bit && !Subtarget.hasSSE2())) {
2855  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2856  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2857  }
2858 
2859  // If we prefer to use the value in xmm registers, copy it out as f80 and
2860  // use a truncate to move it from fp stack reg to xmm reg.
2861  bool RoundAfterCopy = false;
2862  if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2863      isScalarFPTypeInSSEReg(VA.getValVT())) {
2864  if (!Subtarget.hasX87())
2865  report_fatal_error("X87 register return with X87 disabled");
2866  CopyVT = MVT::f80;
2867  RoundAfterCopy = (CopyVT != VA.getLocVT());
2868  }
2869 
2870  SDValue Val;
2871  if (VA.needsCustom()) {
2872  assert(VA.getValVT() == MVT::v64i1 &&
2873  "Currently the only custom case is when we split v64i1 to 2 regs");
2874  Val =
2875  getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2876  } else {
2877  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2878  .getValue(1);
2879  Val = Chain.getValue(0);
2880  InFlag = Chain.getValue(2);
2881  }
2882 
2883  if (RoundAfterCopy)
2884  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2885  // This truncation won't change the value.
2886  DAG.getIntPtrConstant(1, dl));
2887 
2888  if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2889  if (VA.getValVT().isVector() &&
2890  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2891  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2892  // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2893  Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2894  } else
2895  Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2896  }
2897 
2898  InVals.push_back(Val);
2899  }
2900 
2901  return Chain;
2902 }
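// A standalone sketch of the round-after-copy step above (illustrative
// only), assuming the callee left the result in an x87 register: the value
// is read at 80-bit precision and then rounded to the type the caller
// expects, which is what the FP_ROUND node models.
static double exampleRoundFromX87(long double ST0) {
  return static_cast<double>(ST0); // f80 -> f64
}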
2903 
2904 //===----------------------------------------------------------------------===//
2905 // C & StdCall & Fast Calling Convention implementation
2906 //===----------------------------------------------------------------------===//
2907 // The StdCall calling convention is standard for many Windows API routines.
2908 // It differs from the C calling convention only slightly: the callee cleans
2909 // up the stack instead of the caller, and symbols are decorated differently.
2910 // It does not support vector arguments.
2911 // For info on the fast calling convention see the Fast Calling Convention
2912 // (tail call) implementation, LowerX86_32FastCCCallTo.
2913 
2914 /// CallIsStructReturn - Determines whether a call uses struct return
2915 /// semantics.
2916 enum StructReturnType {
2917  NotStructReturn,
2918  RegStructReturn,
2919  StackStructReturn
2920 };
2921 static StructReturnType
2922 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2923  if (Outs.empty())
2924  return NotStructReturn;
2925 
2926  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2927  if (!Flags.isSRet())
2928  return NotStructReturn;
2929  if (Flags.isInReg() || IsMCU)
2930  return RegStructReturn;
2931  return StackStructReturn;
2932 }
2933 
2934 /// Determines whether a function uses struct return semantics.
2935 static StructReturnType
2936 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2937  if (Ins.empty())
2938  return NotStructReturn;
2939 
2940  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2941  if (!Flags.isSRet())
2942  return NotStructReturn;
2943  if (Flags.isInReg() || IsMCU)
2944  return RegStructReturn;
2945  return StackStructReturn;
2946 }
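// A standalone sketch of the three classifications above, assuming the
// relevant argument flags have already been collected into booleans (the
// enum and helper are illustrative only).
enum ExampleSRetKind { ExampleNoSRet, ExampleSRetInReg, ExampleSRetOnStack };
static ExampleSRetKind exampleClassifySRet(bool HasSRet, bool InReg,
                                           bool IsMCU) {
  if (!HasSRet)
    return ExampleNoSRet;
  // MCU targets and an explicit 'inreg' pass the hidden pointer in a register.
  return (InReg || IsMCU) ? ExampleSRetInReg : ExampleSRetOnStack;
}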
2947 
2948 /// Make a copy of an aggregate at address specified by "Src" to address
2949 /// "Dst" with size and alignment information specified by the specific
2950 /// parameter attribute. The copy will be passed as a byval function parameter.
2951 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2952  SDValue Chain, ISD::ArgFlagsTy Flags,
2953  SelectionDAG &DAG, const SDLoc &dl) {
2954  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2955 
2956  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2957  /*isVolatile*/false, /*AlwaysInline=*/true,
2958  /*isTailCall*/false,
2959  MachinePointerInfo(), MachinePointerInfo());
2960 }
2961 
2962 /// Return true if the calling convention is one that we can guarantee TCO for.
2963 static bool canGuaranteeTCO(CallingConv::ID CC) {
2964  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2965  CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2966  CC == CallingConv::HHVM || CC == CallingConv::Tail);
2967 }
2968 
2969 /// Return true if we might ever do TCO for calls with this calling convention.
2970 static bool mayTailCallThisCC(CallingConv::ID CC) {
2971  switch (CC) {
2972  // C calling conventions:
2973  case CallingConv::C:
2974  case CallingConv::Win64:
2975  case CallingConv::X86_64_SysV:
2976  // Callee pop conventions:
2977  case CallingConv::X86_ThisCall:
2978  case CallingConv::X86_StdCall:
2979  case CallingConv::X86_VectorCall:
2980  case CallingConv::X86_FastCall:
2981  // Swift:
2982  case CallingConv::Swift:
2983  return true;
2984  default:
2985  return canGuaranteeTCO(CC);
2986  }
2987 }
2988 
2989 /// Return true if the function is being made into a tailcall target by
2990 /// changing its ABI.
2991 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2992  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
2993 }
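// A standalone boolean sketch of the decision above (illustrative only),
// with the calling-convention test reduced to a flag: the 'tail' convention
// always guarantees TCO, everything else needs both -tailcallopt and a
// convention accepted by canGuaranteeTCO.
static bool exampleGuaranteesTCO(bool GuaranteedTailCallOpt, bool CCAllowsTCO,
                                 bool IsTailCC) {
  return (GuaranteedTailCallOpt && CCAllowsTCO) || IsTailCC;
}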
2994 
2995 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2996  auto Attr =
2997  CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2998  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2999  return false;
3000 
3001  ImmutableCallSite CS(CI);
3002  CallingConv::ID CalleeCC = CS.getCallingConv();
3003  if (!mayTailCallThisCC(CalleeCC))
3004  return false;
3005 
3006  return true;
3007 }
3008 
3009 SDValue
3010 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3011  const SmallVectorImpl<ISD::InputArg> &Ins,
3012  const SDLoc &dl, SelectionDAG &DAG,
3013  const CCValAssign &VA,
3014  MachineFrameInfo &MFI, unsigned i) const {
3015  // Create the nodes corresponding to a load from this parameter slot.
3016  ISD::ArgFlagsTy Flags = Ins[i].Flags;
3017  bool AlwaysUseMutable = shouldGuaranteeTCO(
3018  CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3019  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3020  EVT ValVT;
3021  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3022 
3023  // If the value is passed by pointer, we have the address passed instead of
3024  // the value itself. No need to extend if the mask value and location share
3025  // the same absolute size.
3026  bool ExtendedInMem =
3027  VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3028  VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3029 
3030  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3031  ValVT = VA.getLocVT();
3032  else
3033  ValVT = VA.getValVT();
3034 
3035  // FIXME: For now, all byval parameter objects are marked mutable. This can be
3036  // changed with more analysis.
3037  // In case of tail call optimization, mark all arguments mutable, since they
3038  // could be overwritten by the lowering of arguments in case of a tail call.
3039  if (Flags.isByVal()) {
3040  unsigned Bytes = Flags.getByValSize();
3041  if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3042 
3043  // FIXME: For now, all byval parameter objects are marked as aliasing. This
3044  // can be improved with deeper analysis.
3045  int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3046  /*isAliased=*/true);
3047  return DAG.getFrameIndex(FI, PtrVT);
3048  }
3049 
3050  // This is an argument in memory. We might be able to perform copy elision.
3051  // If the argument is passed directly in memory without any extension, then we
3052  // can perform copy elision. Large vector types, for example, may be passed
3053  // indirectly by pointer.
3054  if (Flags.isCopyElisionCandidate() &&
3055  VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
3056  EVT ArgVT = Ins[i].ArgVT;
3057  SDValue PartAddr;
3058  if (Ins[i].PartOffset == 0) {
3059  // If this is a one-part value or the first part of a multi-part value,
3060  // create a stack object for the entire argument value type and return a
3061  // load from our portion of it. This assumes that if the first part of an
3062  // argument is in memory, the rest will also be in memory.
3063  int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3064  /*IsImmutable=*/false);
3065  PartAddr = DAG.getFrameIndex(FI, PtrVT);
3066  return DAG.getLoad(
3067  ValVT, dl, Chain, PartAddr,
3068  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3069  } else {
3070  // This is not the first piece of an argument in memory. See if there is
3071  // already a fixed stack object including this offset. If so, assume it
3072  // was created by the PartOffset == 0 branch above and create a load from
3073  // the appropriate offset into it.
3074  int64_t PartBegin = VA.getLocMemOffset();
3075  int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3076  int FI = MFI.getObjectIndexBegin();
3077  for (; MFI.isFixedObjectIndex(FI); ++FI) {
3078  int64_t ObjBegin = MFI.getObjectOffset(FI);
3079  int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3080  if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3081  break;
3082  }
3083  if (MFI.isFixedObjectIndex(FI)) {
3084  SDValue Addr =
3085  DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3086  DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3087  return DAG.getLoad(
3088  ValVT, dl, Chain, Addr,
3089  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3090  Ins[i].PartOffset));
3091  }
3092  }
3093  }
3094 
3095  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3096  VA.getLocMemOffset(), isImmutable);
3097 
3098  // Set SExt or ZExt flag.
3099  if (VA.getLocInfo() == CCValAssign::ZExt) {
3100  MFI.setObjectZExt(FI, true);
3101  } else if (VA.getLocInfo() == CCValAssign::SExt) {
3102  MFI.setObjectSExt(FI, true);
3103  }
3104 
3105  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3106  SDValue Val = DAG.getLoad(
3107  ValVT, dl, Chain, FIN,
3108  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3109  return ExtendedInMem
3110  ? (VA.getValVT().isVector()
3111  ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3112  : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3113  : Val;
3114 }
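// A standalone sketch of the containment test used by the copy-elision path
// above (illustrative only): a later piece of a split argument may reuse an
// existing fixed stack object only if its byte range lies entirely inside
// that object.
static bool exampleRangeContained(long long ObjBegin, long long ObjEnd,
                                  long long PartBegin, long long PartEnd) {
  return ObjBegin <= PartBegin && PartEnd <= ObjEnd;
}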
3115 
3116 // FIXME: Get this from tablegen.
3117 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3118  const X86Subtarget &Subtarget) {
3119  assert(Subtarget.is64Bit());
3120 
3121  if (Subtarget.isCallingConvWin64(CallConv)) {
3122  static const MCPhysReg GPR64ArgRegsWin64[] = {
3123  X86::RCX, X86::RDX, X86::R8, X86::R9
3124  };
3125  return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3126  }
3127 
3128  static const MCPhysReg GPR64ArgRegs64Bit[] = {
3129  X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3130  };
3131  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3132 }
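// A standalone sketch of the two register orders above, reduced to the first
// integer argument register and written with plain name strings rather than
// MCPhysReg IDs (illustrative only).
static const char *exampleFirstIntArgReg(bool IsWin64) {
  return IsWin64 ? "rcx" : "rdi"; // Win64 starts at RCX, the SysV ABI at RDI
}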
3133 
3134 // FIXME: Get this from tablegen.
3135 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3136  CallingConv::ID CallConv,
3137  const X86Subtarget &Subtarget) {
3138  assert(Subtarget.is64Bit());
3139  if (Subtarget.isCallingConvWin64(CallConv)) {
3140  // The XMM registers which might contain var arg parameters are shadowed
3141  // in their paired GPR. So we only need to save the GPR to their home
3142  // slots.
3143  // TODO: __vectorcall will change this.
3144  return None;
3145  }
3146 
3147  const Function &F = MF.getFunction();
3148  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3149  bool isSoftFloat = Subtarget.useSoftFloat();
3150  assert(!(isSoftFloat && NoImplicitFloatOps) &&
3151  "SSE register cannot be used when SSE is disabled!");
3152  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3153  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3154  // registers.
3155  return None;
3156 
3157  static const MCPhysReg XMMArgRegs64Bit[] = {
3158  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3159  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3160  };
3161  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3162 }
3163 
3164 #ifndef NDEBUG
3165 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3166  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
3167  [](const CCValAssign &A, const CCValAssign &B) -> bool {
3168  return A.getValNo() < B.getValNo();
3169  });
3170 }
3171 #endif
3172 
3173 SDValue X86TargetLowering::LowerFormalArguments(
3174  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3175  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3176  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3177  MachineFunction &MF = DAG.getMachineFunction();
3178  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3179  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3180 
3181  const Function &F = MF.getFunction();
3182  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3183  F.getName() == "main")
3184  FuncInfo->setForceFramePointer(true);
3185 
3186  MachineFrameInfo &MFI = MF.getFrameInfo();
3187  bool Is64Bit = Subtarget.is64Bit();
3188  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3189 
3190  assert(
3191  !(isVarArg && canGuaranteeTCO(CallConv)) &&
3192  "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3193 
3194  // Assign locations to all of the incoming arguments.
3195  SmallVector<CCValAssign, 16> ArgLocs;
3196  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3197 
3198  // Allocate shadow area for Win64.
3199  if (IsWin64)
3200  CCInfo.AllocateStack(32, 8);
3201 
3202  CCInfo.AnalyzeArguments(Ins, CC_X86);
3203 
3204  // In vectorcall calling convention a second pass is required for the HVA
3205  // types.
3206  if (CallingConv::X86_VectorCall == CallConv) {
3207  CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3208  }
3209 
3210  // The next loop assumes that the locations are in the same order of the
3211  // input arguments.
3212  assert(isSortedByValueNo(ArgLocs) &&
3213  "Argument Location list must be sorted before lowering");
3214 
3215  SDValue ArgValue;
3216  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3217  ++I, ++InsIndex) {
3218  assert(InsIndex < Ins.size() && "Invalid Ins index");
3219  CCValAssign &VA = ArgLocs[I];
3220 
3221  if (VA.isRegLoc()) {
3222  EVT RegVT = VA.getLocVT();
3223  if (VA.needsCustom()) {
3224  assert(
3225  VA.getValVT() == MVT::v64i1 &&
3226  "Currently the only custom case is when we split v64i1 to 2 regs");
3227 
3228  // v64i1 values, in regcall calling convention, that are
3229  // compiled to 32 bit arch, are split up into two registers.
3230  ArgValue =
3231  getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3232  } else {
3233  const TargetRegisterClass *RC;
3234  if (RegVT == MVT::i8)
3235  RC = &X86::GR8RegClass;
3236  else if (RegVT == MVT::i16)
3237  RC = &X86::GR16RegClass;
3238  else if (RegVT == MVT::i32)
3239  RC = &X86::GR32RegClass;
3240  else if (Is64Bit && RegVT == MVT::i64)
3241  RC = &X86::GR64RegClass;
3242  else if (RegVT == MVT::f32)
3243  RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3244  else if (RegVT == MVT::f64)
3245  RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3246  else if (RegVT == MVT::f80)
3247  RC = &X86::RFP80RegClass;
3248  else if (RegVT == MVT::f128)
3249  RC = &X86::VR128RegClass;
3250  else if (RegVT.is512BitVector())
3251  RC = &X86::VR512RegClass;
3252  else if (RegVT.is256BitVector())
3253  RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3254  else if (RegVT.is128BitVector())
3255  RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3256  else if (RegVT == MVT::x86mmx)
3257  RC = &X86::VR64RegClass;
3258  else if (RegVT == MVT::v1i1)
3259  RC = &X86::VK1RegClass;
3260  else if (RegVT == MVT::v8i1)
3261  RC = &X86::VK8RegClass;
3262  else if (RegVT == MVT::v16i1)
3263  RC = &X86::VK16RegClass;
3264  else if (RegVT == MVT::v32i1)
3265  RC = &X86::VK32RegClass;
3266  else if (RegVT == MVT::v64i1)
3267  RC = &X86::VK64RegClass;
3268  else
3269  llvm_unreachable("Unknown argument type!");
3270 
3271  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3272  ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3273  }
3274 
3275  // If this is an 8 or 16-bit value, it is really passed promoted to 32
3276  // bits. Insert an assert[sz]ext to capture this, then truncate to the
3277  // right size.
3278  if (VA.getLocInfo() == CCValAssign::SExt)
3279  ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3280  DAG.getValueType(VA.getValVT()));
3281  else if (VA.getLocInfo() == CCValAssign::ZExt)
3282  ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3283  DAG.getValueType(VA.getValVT()));
3284  else if (VA.getLocInfo() == CCValAssign::BCvt)
3285  ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3286 
3287  if (VA.isExtInLoc()) {
3288  // Handle MMX values passed in XMM regs.
3289  if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3290  ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3291  else if (VA.getValVT().isVector() &&
3292  VA.getValVT().getScalarType() == MVT::i1 &&
3293  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3294  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3295  // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3296  ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3297  } else
3298  ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3299  }
3300  } else {
3301  assert(VA.isMemLoc());
3302  ArgValue =
3303  LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3304  }
3305 
3306  // If value is passed via pointer - do a load.
3307  if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3308  ArgValue =
3309  DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3310 
3311  InVals.push_back(ArgValue);
3312  }
3313 
3314  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3315  // Swift calling convention does not require we copy the sret argument
3316  // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3317  if (CallConv == CallingConv::Swift)
3318  continue;
3319 
3320  // All x86 ABIs require that for returning structs by value we copy the
3321  // sret argument into %rax/%eax (depending on ABI) for the return. Save
3322  // the argument into a virtual register so that we can access it from the
3323  // return points.
3324  if (Ins[I].Flags.isSRet()) {
3325  unsigned Reg = FuncInfo->getSRetReturnReg();
3326  if (!Reg) {
3327  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3328  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3329  FuncInfo->setSRetReturnReg(Reg);
3330  }
3331  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3332  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3333  break;
3334  }
3335  }
3336 
3337  unsigned StackSize = CCInfo.getNextStackOffset();
3338  // Align stack specially for tail calls.
3339  if (shouldGuaranteeTCO(CallConv,
3340  MF.getTarget().Options.GuaranteedTailCallOpt))
3341  StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3342 
3343  // If the function takes variable number of arguments, make a frame index for
3344  // the start of the first vararg value... for expansion of llvm.va_start. We
3345  // can skip this if there are no va_start calls.
3346  if (MFI.hasVAStart() &&
3347  (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3348  CallConv != CallingConv::X86_ThisCall))) {
3349  FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3350  }
3351 
3352  // Figure out if XMM registers are in use.
3353  assert(!(Subtarget.useSoftFloat() &&
3354  F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3355  "SSE register cannot be used when SSE is disabled!");
3356 
3357  // 64-bit calling conventions support varargs and register parameters, so we
3358  // have to do extra work to spill them in the prologue.
3359  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3360  // Find the first unallocated argument registers.
3361  ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3362  ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3363  unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3364  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3365  assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3366  "SSE register cannot be used when SSE is disabled!");
3367 
3368  // Gather all the live in physical registers.
3369  SmallVector<SDValue, 6> LiveGPRs;
3370  SmallVector<SDValue, 8> LiveXMMRegs;
3371  SDValue ALVal;
3372  for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3373  unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3374  LiveGPRs.push_back(
3375  DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3376  }
3377  if (!ArgXMMs.empty()) {
3378  unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3379  ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3380  for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3381  unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3382  LiveXMMRegs.push_back(
3383  DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3384  }
3385  }
3386 
3387  if (IsWin64) {
3388  // Get to the caller-allocated home save location. Add 8 to account
3389  // for the return address.
3390  int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3391  FuncInfo->setRegSaveFrameIndex(
3392  MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3393  // Fixup to set vararg frame on shadow area (4 x i64).
3394  if (NumIntRegs < 4)
3395  FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3396  } else {
3397  // For X86-64, if there are vararg parameters that are passed via
3398  // registers, then we must store them to their spots on the stack so
3399  // they may be loaded by dereferencing the result of va_next.
3400  FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3401  FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3402  FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3403  ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3404  }
3405 
3406  // Store the integer parameter registers.
3407  SmallVector<SDValue, 8> MemOps;
3408  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3409  getPointerTy(DAG.getDataLayout()));
3410  unsigned Offset = FuncInfo->getVarArgsGPOffset();
3411  for (SDValue Val : LiveGPRs) {
3412  SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3413  RSFIN, DAG.getIntPtrConstant(Offset, dl));
3414  SDValue Store =
3415  DAG.getStore(Val.getValue(1), dl, Val, FIN,
3416  MachinePointerInfo::getFixedStack(
3417  DAG.getMachineFunction(),
3418  FuncInfo->getRegSaveFrameIndex(), Offset));
3419  MemOps.push_back(Store);
3420  Offset += 8;
3421  }
3422 
3423  if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3424  // Now store the XMM (fp + vector) parameter registers.
3425  SmallVector<SDValue, 12> SaveXMMOps;
3426  SaveXMMOps.push_back(Chain);
3427  SaveXMMOps.push_back(ALVal);
3428  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3429  FuncInfo->getRegSaveFrameIndex(), dl));
3430  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3431  FuncInfo->getVarArgsFPOffset(), dl));
3432  SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3433  LiveXMMRegs.end());
3434  MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3435  MVT::Other, SaveXMMOps));
3436  }
3437 
3438  if (!MemOps.empty())
3439  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3440  }
3441 
3442  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3443  // Find the largest legal vector type.
3444  MVT VecVT = MVT::Other;
3445  // FIXME: Only some x86_32 calling conventions support AVX512.
3446  if (Subtarget.useAVX512Regs() &&
3447  (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3448  CallConv == CallingConv::Intel_OCL_BI)))
3449  VecVT = MVT::v16f32;
3450  else if (Subtarget.hasAVX())
3451  VecVT = MVT::v8f32;
3452  else if (Subtarget.hasSSE2())
3453  VecVT = MVT::v4f32;
3454 
3455  // We forward some GPRs and some vector types.
3456  SmallVector<MVT, 2> RegParmTypes;
3457  MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3458  RegParmTypes.push_back(IntVT);
3459  if (VecVT != MVT::Other)
3460  RegParmTypes.push_back(VecVT);
3461 
3462  // Compute the set of forwarded registers. The rest are scratch.
3463  SmallVectorImpl<ForwardedRegister> &Forwards =
3464  FuncInfo->getForwardedMustTailRegParms();
3465  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3466 
3467  // Conservatively forward AL on x86_64, since it might be used for varargs.
3468  if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3469  unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3470  Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3471  }
3472 
3473  // Copy all forwards from physical to virtual registers.
3474  for (ForwardedRegister &FR : Forwards) {
3475  // FIXME: Can we use a less constrained schedule?
3476  SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
3477  FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
3478  Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
3479  }
3480  }
3481 
3482  // Some CCs need callee pop.
3483  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3484  MF.getTarget().Options.GuaranteedTailCallOpt)) {
3485  FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3486  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3487  // X86 interrupts must pop the error code (and the alignment padding) if
3488  // present.
3489  FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3490  } else {
3491  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3492  // If this is an sret function, the return should pop the hidden pointer.
3493  if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3494  !Subtarget.getTargetTriple().isOSMSVCRT() &&
3495  argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3496  FuncInfo->setBytesToPopOnReturn(4);
3497  }
3498 
3499  if (!Is64Bit) {
3500  // RegSaveFrameIndex is X86-64 only.
3501  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3502  if (CallConv == CallingConv::X86_FastCall ||
3503  CallConv == CallingConv::X86_ThisCall)
3504  // fastcc functions can't have varargs.
3505  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3506  }
3507 
3508  FuncInfo->setArgumentStackSize(StackSize);
3509 
3510  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3511  EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3512  if (Personality == EHPersonality::CoreCLR) {
3513  assert(Is64Bit);
3514  // TODO: Add a mechanism to frame lowering that will allow us to indicate
3515  // that we'd prefer this slot be allocated towards the bottom of the frame
3516  // (i.e. near the stack pointer after allocating the frame). Every
3517  // funclet needs a copy of this slot in its (mostly empty) frame, and the
3518  // offset from the bottom of this and each funclet's frame must be the
3519  // same, so the size of funclets' (mostly empty) frames is dictated by
3520  // how far this slot is from the bottom (since they allocate just enough
3521  // space to accommodate holding this slot at the correct offset).
3522  int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3523  EHInfo->PSPSymFrameIdx = PSPSymFI;
3524  }
3525  }
3526 
3527  if (CallConv == CallingConv::X86_RegCall ||
3528  F.hasFnAttribute("no_caller_saved_registers")) {
3529  MachineRegisterInfo &MRI = MF.getRegInfo();
3530  for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3531  MRI.disableCalleeSavedRegister(Pair.first);
3532  }
3533 
3534  return Chain;
3535 }
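// A standalone sketch of the System V x86-64 va_list record that the
// register save area built above feeds (field names follow the ABI document;
// the struct itself is illustrative only).
struct ExampleSysVVaList {
  unsigned GPOffset;     // next unused GPR slot in the save area (0..48)
  unsigned FPOffset;     // next unused XMM slot in the save area (48..176)
  void *OverflowArgArea; // stack area for arguments that did not fit
  void *RegSaveArea;     // the spill area created by LowerFormalArguments
};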
3536 
3537 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3538  SDValue Arg, const SDLoc &dl,
3539  SelectionDAG &DAG,
3540  const CCValAssign &VA,
3541  ISD::ArgFlagsTy Flags) const {
3542  unsigned LocMemOffset = VA.getLocMemOffset();
3543  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3544  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3545  StackPtr, PtrOff);
3546  if (Flags.isByVal())
3547  return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3548 
3549  return DAG.getStore(
3550  Chain, dl, Arg, PtrOff,
3551  MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3552 }
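// A standalone sketch of the address computation above (illustrative only):
// an outgoing stack argument lives at a fixed byte offset from the stack
// pointer at the call site.
static char *exampleOutgoingArgAddr(char *StackPtr, unsigned LocMemOffset) {
  return StackPtr + LocMemOffset;
}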
3553 
3554 /// Emit a load of return address if tail call
3555 /// optimization is performed and it is required.
3556 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3557  SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3558  bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3559  // Adjust the Return address stack slot.
3560  EVT VT = getPointerTy(DAG.getDataLayout());
3561  OutRetAddr = getReturnAddressFrameIndex(DAG);
3562 
3563  // Load the "old"