LLVM  6.0.0svn
X86ISelLowering.cpp
1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/DiagnosticInfo.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/GlobalAlias.h"
46 #include "llvm/IR/GlobalVariable.h"
47 #include "llvm/IR/Instructions.h"
48 #include "llvm/IR/Intrinsics.h"
49 #include "llvm/MC/MCAsmInfo.h"
50 #include "llvm/MC/MCContext.h"
51 #include "llvm/MC/MCExpr.h"
52 #include "llvm/MC/MCSymbol.h"
54 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/KnownBits.h"
60 #include <algorithm>
61 #include <bitset>
62 #include <cctype>
63 #include <numeric>
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "x86-isel"
67 
68 STATISTIC(NumTailCalls, "Number of tail calls");
69 
71  "x86-experimental-vector-widening-legalization", cl::init(false),
72  cl::desc("Enable an experimental vector type legalization through widening "
73  "rather than promotion."),
74  cl::Hidden);
75 
77  "x86-experimental-pref-loop-alignment", cl::init(4),
78  cl::desc("Sets the preferable loop alignment for experiments "
79  "(the last x86-experimental-pref-loop-alignment bits"
80  " of the loop header PC will be 0)."),
81  cl::Hidden);
82 
84  "mul-constant-optimization", cl::init(true),
85  cl::desc("Replace 'mul x, Const' with more effective instructions like "
86  "SHIFT, LEA, etc."),
87  cl::Hidden);
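// The cl::opt flags above are ordinary backend options. As a usage sketch
// (not part of this file), they can be set on the llc command line or
// forwarded from clang via -mllvm, e.g.:
//   llc -x86-experimental-pref-loop-alignment=5 test.ll
//   clang -O2 -mllvm -mul-constant-optimization=false test.c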
88 
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
92 /// crashing.
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94  const char *Msg) {
96  DAG.getContext()->diagnose(
98 }
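// Typical use, as seen in the return-value lowering further down:
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// The message goes through LLVMContext::diagnose() instead of aborting, so
// lowering continues with a recovery value (e.g. rerouting the return to FP0).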
99 
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101  const X86Subtarget &STI)
102  : TargetLowering(TM), Subtarget(STI) {
103  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104  X86ScalarSSEf64 = Subtarget.hasSSE2();
105  X86ScalarSSEf32 = Subtarget.hasSSE1();
106  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
107 
108  // Set up the TargetLowering object.
109 
110  // X86 is weird. It always uses i8 for shift amounts and setcc results.
112  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
114 
115  // For 64-bit, since we have so many registers, use the ILP scheduler.
116  // For 32-bit, use the register pressure specific scheduling.
117  // For Atom, always use ILP scheduling.
118  if (Subtarget.isAtom())
120  else if (Subtarget.is64Bit())
122  else
124  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
126 
127  // Bypass expensive divides and use cheaper ones.
128  if (TM.getOptLevel() >= CodeGenOpt::Default) {
129  if (Subtarget.hasSlowDivide32())
130  addBypassSlowDiv(32, 8);
131  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132  addBypassSlowDiv(64, 32);
133  }
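// Sketch of the bypass: addBypassSlowDiv(32, 8) asks CodeGenPrepare to guard
// each 32-bit divide with a cheap run-time test and, when both operands fit
// in 8 bits (roughly, ((a | b) >> 8) == 0), to use the much faster 8-bit
// divide instead of the full 32-bit one.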
134 
135  if (Subtarget.isTargetKnownWindowsMSVC() ||
136  Subtarget.isTargetWindowsItanium()) {
137  // Setup Windows compiler runtime calls.
138  setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139  setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140  setLibcallName(RTLIB::SREM_I64, "_allrem");
141  setLibcallName(RTLIB::UREM_I64, "_aullrem");
142  setLibcallName(RTLIB::MUL_I64, "_allmul");
148  }
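// With these names installed, e.g. an i64 sdiv on 32-bit Windows is lowered
// to a call to the MSVC runtime helper _alldiv rather than the default
// __divdi3 from compiler-rt/libgcc.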
149 
150  if (Subtarget.isTargetDarwin()) {
151  // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152  setUseUnderscoreSetJmp(false);
154  } else if (Subtarget.isTargetWindowsGNU()) {
155  // MS runtime is weird: it exports _setjmp, but longjmp!
158  } else {
161  }
162 
163  // Set up the register classes.
164  addRegisterClass(MVT::i8, &X86::GR8RegClass);
165  addRegisterClass(MVT::i16, &X86::GR16RegClass);
166  addRegisterClass(MVT::i32, &X86::GR32RegClass);
167  if (Subtarget.is64Bit())
168  addRegisterClass(MVT::i64, &X86::GR64RegClass);
169 
170  for (MVT VT : MVT::integer_valuetypes())
172 
173  // We don't accept any truncstore of integer registers.
180 
182 
183  // SETOEQ and SETUNE require checking two conditions.
190 
191  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
192  // operation.
196 
197  if (Subtarget.is64Bit()) {
198  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
199  // f32/f64 are legal, f80 is custom.
201  else
204  } else if (!Subtarget.useSoftFloat()) {
205  // We have an algorithm for SSE2->double, and we turn this into a
206  // 64-bit FILD followed by conditional FADD for other targets.
208  // We have an algorithm for SSE2, and we turn this into a 64-bit
209  // FILD or VCVTUSI2SS/SD for other targets.
211  }
212 
213  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
214  // this operation.
217 
218  if (!Subtarget.useSoftFloat()) {
219  // SSE has no i16 to fp conversion, only i32.
220  if (X86ScalarSSEf32) {
222  // f32 and f64 cases are Legal, f80 case is not
224  } else {
227  }
228  } else {
231  }
232 
233  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
234  // this operation.
237 
238  if (!Subtarget.useSoftFloat()) {
239  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
240  // are Legal, f80 is custom lowered.
243 
244  if (X86ScalarSSEf32) {
246  // f32 and f64 cases are Legal, f80 case is not
248  } else {
251  }
252  } else {
256  }
257 
258  // Handle FP_TO_UINT by promoting the destination to a larger signed
259  // conversion.
263 
264  if (Subtarget.is64Bit()) {
265  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
266  // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
269  } else {
272  }
273  } else if (!Subtarget.useSoftFloat()) {
274  // Since AVX is a superset of SSE3, only check for SSE here.
275  if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
276  // Expand FP_TO_UINT into a select.
277  // FIXME: We would like to use a Custom expander here eventually to do
278  // the optimal thing for SSE vs. the default expansion in the legalizer.
280  else
281  // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
282  // With SSE3 we can use fisttpll to convert to a signed i64; without
283  // SSE, we're stuck with a fistpll.
285 
287  }
288 
289  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
290  if (!X86ScalarSSEf64) {
293  if (Subtarget.is64Bit()) {
295  // Without SSE, i64->f64 goes through memory.
297  }
298  } else if (!Subtarget.is64Bit())
300 
301  // Scalar integer divide and remainder are lowered to use operations that
302  // produce two results, to match the available instructions. This exposes
303  // the two-result form to trivial CSE, which is able to combine x/y and x%y
304  // into a single instruction.
305  //
306  // Scalar integer multiply-high is also lowered to use two-result
307  // operations, to match the available instructions. However, plain multiply
308  // (low) operations are left as Legal, as there are single-result
309  // instructions for this in x86. Using the two-result multiply instructions
310  // when both high and low results are needed must be arranged by dagcombine.
311  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
318  }
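// For example, the IR pair
//   %q = sdiv i32 %x, %y
//   %r = srem i32 %x, %y
// is combined into one ISD::SDIVREM node and selected as a single IDIV,
// which yields the quotient in EAX and the remainder in EDX.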
319 
322  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
326  }
327  if (Subtarget.is64Bit())
333 
338 
339  // Promote the i8 variants and force them on up to i32 which has a shorter
340  // encoding.
343  if (!Subtarget.hasBMI()) {
348  if (Subtarget.is64Bit()) {
351  }
352  }
353 
354  if (Subtarget.hasLZCNT()) {
355  // When promoting the i8 variants, force them to i32 for a shorter
356  // encoding.
359  } else {
366  if (Subtarget.is64Bit()) {
369  }
370  }
371 
372  // Special handling for half-precision floating point conversions.
373  // If we don't have F16C support, then lower half float conversions
374  // into library calls.
375  if (Subtarget.useSoftFloat() ||
376  (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
379  }
380 
381  // There's never any support for operations beyond MVT::f32.
386 
393 
394  if (Subtarget.hasPOPCNT()) {
396  } else {
400  if (Subtarget.is64Bit())
402  }
403 
405 
406  if (!Subtarget.hasMOVBE())
408 
409  // These should be promoted to a larger select which is supported.
411  // X86 wants to expand cmov itself.
412  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
415  }
416  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
417  if (VT == MVT::i64 && !Subtarget.is64Bit())
418  continue;
421  }
423  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
424  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
425  // support continuations, user-level threading, etc. As a result, no
426  // other SjLj exception interfaces are implemented and please don't build
427  // your own exception handling based on them.
428  // LLVM/Clang supports zero-cost DWARF exception handling.
433  setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
434 
435  // Darwin ABI issue.
436  for (auto VT : { MVT::i32, MVT::i64 }) {
437  if (VT == MVT::i64 && !Subtarget.is64Bit())
438  continue;
445  }
446 
447  // 64-bit shl, sra, srl (iff 32-bit x86)
448  for (auto VT : { MVT::i32, MVT::i64 }) {
449  if (VT == MVT::i64 && !Subtarget.is64Bit())
450  continue;
454  }
455 
456  if (Subtarget.hasSSE1())
458 
460 
461  // Expand certain atomics
462  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
470  }
471 
472  if (Subtarget.hasCmpxchg16b()) {
474  }
475 
476  // FIXME - use subtarget debug flags
477  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
478  !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
481  }
482 
485 
488 
491 
492  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
495  bool Is64Bit = Subtarget.is64Bit();
497  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
498 
501 
503 
504  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
507 
508  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
509  // f32 and f64 use SSE.
510  // Set up the FP register classes.
511  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
512  : &X86::FR32RegClass);
513  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
514  : &X86::FR64RegClass);
515 
516  for (auto VT : { MVT::f32, MVT::f64 }) {
517  // Use ANDPD to simulate FABS.
519 
520  // Use XORP to simulate FNEG.
522 
523  // Use ANDPD and ORPD to simulate FCOPYSIGN.
525 
526  // We don't support sin/cos/fmod
527  setOperationAction(ISD::FSIN , VT, Expand);
528  setOperationAction(ISD::FCOS , VT, Expand);
529  setOperationAction(ISD::FSINCOS, VT, Expand);
530  }
531 
532  // Lower this to MOVMSK plus an AND.
535 
536  // Expand FP immediates into loads from the stack, except for the special
537  // cases we handle.
538  addLegalFPImmediate(APFloat(+0.0)); // xorpd
539  addLegalFPImmediate(APFloat(+0.0f)); // xorps
540  } else if (UseX87 && X86ScalarSSEf32) {
541  // Use SSE for f32, x87 for f64.
542  // Set up the FP register classes.
543  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
544  : &X86::FR32RegClass);
545  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
546 
547  // Use ANDPS to simulate FABS.
549 
550  // Use XORP to simulate FNEG.
552 
554 
555  // Use ANDPS and ORPS to simulate FCOPYSIGN.
558 
559  // We don't support sin/cos/fmod
563 
564  // Special cases we handle for FP constants.
565  addLegalFPImmediate(APFloat(+0.0f)); // xorps
566  addLegalFPImmediate(APFloat(+0.0)); // FLD0
567  addLegalFPImmediate(APFloat(+1.0)); // FLD1
568  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
569  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
570 
571  if (!TM.Options.UnsafeFPMath) {
575  }
576  } else if (UseX87) {
577  // f32 and f64 in x87.
578  // Set up the FP register classes.
579  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
580  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
581 
582  for (auto VT : { MVT::f32, MVT::f64 }) {
583  setOperationAction(ISD::UNDEF, VT, Expand);
584  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
585 
586  if (!TM.Options.UnsafeFPMath) {
587  setOperationAction(ISD::FSIN , VT, Expand);
588  setOperationAction(ISD::FCOS , VT, Expand);
589  setOperationAction(ISD::FSINCOS, VT, Expand);
590  }
591  }
592  addLegalFPImmediate(APFloat(+0.0)); // FLD0
593  addLegalFPImmediate(APFloat(+1.0)); // FLD1
594  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
595  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
596  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
597  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
598  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
599  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
600  }
601 
602  // We don't support FMA.
605 
606  // Long double always uses X87, except f128 in MMX.
607  if (UseX87) {
608  if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
609  addRegisterClass(MVT::f128, &X86::FR128RegClass);
614  }
615 
616  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
619  {
621  addLegalFPImmediate(TmpFlt); // FLD0
622  TmpFlt.changeSign();
623  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
624 
625  bool ignored;
626  APFloat TmpFlt2(+1.0);
628  &ignored);
629  addLegalFPImmediate(TmpFlt2); // FLD1
630  TmpFlt2.changeSign();
631  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
632  }
633 
634  if (!TM.Options.UnsafeFPMath) {
638  }
639 
646  }
647 
648  // Always use a library call for pow.
652 
660 
661  // Some FP actions are always expanded for vector types.
662  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
664  setOperationAction(ISD::FSIN, VT, Expand);
665  setOperationAction(ISD::FSINCOS, VT, Expand);
666  setOperationAction(ISD::FCOS, VT, Expand);
667  setOperationAction(ISD::FREM, VT, Expand);
668  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
669  setOperationAction(ISD::FPOW, VT, Expand);
670  setOperationAction(ISD::FLOG, VT, Expand);
671  setOperationAction(ISD::FLOG2, VT, Expand);
672  setOperationAction(ISD::FLOG10, VT, Expand);
673  setOperationAction(ISD::FEXP, VT, Expand);
674  setOperationAction(ISD::FEXP2, VT, Expand);
675  }
676 
677  // First set operation action for all vector types to either promote
678  // (for widening) or expand (for scalarization). Then we will selectively
679  // turn on ones that can be effectively codegen'd.
680  for (MVT VT : MVT::vector_valuetypes()) {
681  setOperationAction(ISD::SDIV, VT, Expand);
682  setOperationAction(ISD::UDIV, VT, Expand);
683  setOperationAction(ISD::SREM, VT, Expand);
684  setOperationAction(ISD::UREM, VT, Expand);
689  setOperationAction(ISD::FMA, VT, Expand);
690  setOperationAction(ISD::FFLOOR, VT, Expand);
691  setOperationAction(ISD::FCEIL, VT, Expand);
692  setOperationAction(ISD::FTRUNC, VT, Expand);
693  setOperationAction(ISD::FRINT, VT, Expand);
694  setOperationAction(ISD::FNEARBYINT, VT, Expand);
695  setOperationAction(ISD::SMUL_LOHI, VT, Expand);
696  setOperationAction(ISD::MULHS, VT, Expand);
697  setOperationAction(ISD::UMUL_LOHI, VT, Expand);
698  setOperationAction(ISD::MULHU, VT, Expand);
699  setOperationAction(ISD::SDIVREM, VT, Expand);
700  setOperationAction(ISD::UDIVREM, VT, Expand);
701  setOperationAction(ISD::CTPOP, VT, Expand);
702  setOperationAction(ISD::CTTZ, VT, Expand);
703  setOperationAction(ISD::CTLZ, VT, Expand);
704  setOperationAction(ISD::ROTL, VT, Expand);
705  setOperationAction(ISD::ROTR, VT, Expand);
706  setOperationAction(ISD::BSWAP, VT, Expand);
707  setOperationAction(ISD::SETCC, VT, Expand);
708  setOperationAction(ISD::FP_TO_UINT, VT, Expand);
709  setOperationAction(ISD::FP_TO_SINT, VT, Expand);
710  setOperationAction(ISD::UINT_TO_FP, VT, Expand);
711  setOperationAction(ISD::SINT_TO_FP, VT, Expand);
713  setOperationAction(ISD::TRUNCATE, VT, Expand);
716  setOperationAction(ISD::ANY_EXTEND, VT, Expand);
717  setOperationAction(ISD::SELECT_CC, VT, Expand);
718  for (MVT InnerVT : MVT::vector_valuetypes()) {
719  setTruncStoreAction(InnerVT, VT, Expand);
720 
721  setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
722  setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
723 
724  // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
725  // types, we have to deal with them whether we ask for Expansion or not.
726  // Setting Expand causes its own optimisation problems though, so leave
727  // them legal.
728  if (VT.getVectorElementType() == MVT::i1)
729  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
730 
731  // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
732  // split/scalarized right now.
733  if (VT.getVectorElementType() == MVT::f16)
734  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
735  }
736  }
737 
738  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
739  // with -msoft-float, disable use of MMX as well.
740  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
741  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
742  // No operations on x86mmx supported, everything uses intrinsics.
743  }
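// E.g. an MMX byte add is only reachable through the @llvm.x86.mmx.padd.b
// intrinsic; a plain <8 x i8> add is never mapped onto the x86mmx type.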
744 
745  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
746  addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
747  : &X86::VR128RegClass);
748 
758  }
759 
760  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
761  addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
762  : &X86::VR128RegClass);
763 
764  // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
765  // registers cannot be used even for integer operations.
766  addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
767  : &X86::VR128RegClass);
768  addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
769  : &X86::VR128RegClass);
770  addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
771  : &X86::VR128RegClass);
772  addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
773  : &X86::VR128RegClass);
774 
788 
793 
797 
798  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
802  }
803 
804  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
810  }
811 
812  // We support custom legalizing of sext and anyext loads for specific
813  // memory vector types which we can load as a scalar (or sequence of
814  // scalars) and extend in-register to a legal 128-bit vector type. For sext
815  // loads these must work with a single scalar load.
816  for (MVT VT : MVT::integer_vector_valuetypes()) {
826  }
827 
828  for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
832 
833  if (VT == MVT::v2i64 && !Subtarget.is64Bit())
834  continue;
835 
838  }
839 
840  // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
841  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
847  }
848 
849  // Custom lower v2i64 and v2f64 selects.
852 
855 
858 
862 
863  // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
865 
868 
869  for (MVT VT : MVT::fp_vector_valuetypes())
871 
875 
879 
880  // In the customized shift lowering, the legal v4i32/v2i64 cases
881  // in AVX2 will be recognized.
882  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
886  }
887  }
888 
889  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
898  }
899 
900  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
901  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
902  setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
903  setOperationAction(ISD::FCEIL, RoundedTy, Legal);
904  setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
905  setOperationAction(ISD::FRINT, RoundedTy, Legal);
907  }
908 
917 
918  // FIXME: Do we need to handle scalar-to-vector here?
920 
921  // We directly match byte blends in the backend as they match the VSELECT
922  // condition form.
924 
925  // SSE41 brings specific instructions for doing vector sign extend even in
926  // cases where we don't have SRA.
927  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
930  }
931 
932  for (MVT VT : MVT::integer_vector_valuetypes()) {
936  }
937 
938  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
939  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
946  }
947 
948  // i8 vectors are custom because the source register and source
949  // memory operand types are not the same width.
951  }
952 
953  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
954  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
957 
958  // XOP can efficiently perform BITREVERSE with VPPERM.
959  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
961 
962  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
965  }
966 
967  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
968  bool HasInt256 = Subtarget.hasInt256();
969 
970  addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
971  : &X86::VR256RegClass);
972  addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
973  : &X86::VR256RegClass);
974  addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
975  : &X86::VR256RegClass);
976  addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
977  : &X86::VR256RegClass);
978  addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
979  : &X86::VR256RegClass);
980  addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
981  : &X86::VR256RegClass);
982 
983  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
992  }
993 
994  // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
995  // even though v8i16 is a legal type.
999 
1003 
1006 
1007  for (MVT VT : MVT::fp_vector_valuetypes())
1009 
1010  // In the customized shift lowering, the legal v8i32/v4i64 cases
1011  // in AVX2 will be recognized.
1012  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1016  }
1017 
1021 
1022  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1026  }
1027 
1032 
1033  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1038  }
1039 
1040  if (Subtarget.hasAnyFMA()) {
1041  for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1044  }
1045 
1046  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1047  setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1048  setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1049  }
1050 
1053  setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1055 
1058 
1059  setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1060  setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1063 
1064  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1065  setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1066  setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1067  setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1068  setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1069  setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1070  }
1071 
1072  if (HasInt256) {
1076 
1077  // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1078  // when we have a 256bit-wide blend with immediate.
1080 
1081  // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1082  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1089  }
1090  }
1091 
1092  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1096  }
1097 
1098  // Extract subvector is special because the value type
1099  // (result) is 128-bit but the source is 256-bit wide.
1100  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1101  MVT::v4f32, MVT::v2f64 }) {
1103  }
1104 
1105  // Custom lower several nodes for 256-bit types.
1107  MVT::v8f32, MVT::v4f64 }) {
1110  setOperationAction(ISD::VSELECT, VT, Custom);
1116  }
1117 
1118  if (HasInt256)
1120 
1121  // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1122  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1128  }
1129  }
1130 
1131  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1132  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1133  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1134  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1135  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1136 
1137  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1138  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1139  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1140 
1141  for (MVT VT : MVT::fp_vector_valuetypes())
1143 
1144  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1151  }
1152 
1156  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1157  setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1158  setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1159  setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1160  setTruncStoreAction(VT, MaskVT, Custom);
1161  }
1162 
1163  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1168  }
1169 
1195 
1201  if (Subtarget.hasVLX()){
1207 
1213  } else {
1214  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1218  }
1219  }
1222 
1223  if (Subtarget.hasDQI()) {
1224  for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1229  }
1230  if (Subtarget.hasVLX()) {
1231  // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1235  }
1236  }
1237  if (Subtarget.hasVLX()) {
1249 
1250  // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1261  }
1262 
1273 
1274  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1280  }
1281 
1284 
1285  // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1288 
1294 
1296 
1303 
1305 
1306  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1309 
1310  for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1317 
1322  setOperationAction(ISD::VSELECT, VT, Expand);
1323  }
1324 
1325  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1336  }
1337 
1338  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1340  MVT::v8i64}) {
1343  }
1344 
1345  // Need to promote to 64-bit even though we have 32-bit masked instructions
1346  // because the IR optimizers rearrange bitcasts around logic ops leaving
1347  // too many variations to handle if we don't promote them.
1351 
1352  if (Subtarget.hasCDI()) {
1353  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1354  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1355  MVT::v4i64, MVT::v8i64}) {
1358  }
1359  } // Subtarget.hasCDI()
1360 
1361  if (Subtarget.hasDQI()) {
1362  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1366  }
1367 
1368  if (Subtarget.hasVPOPCNTDQ()) {
1369  // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1370  // version of popcntd/q.
1371  for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1374  }
1375 
1376  // Custom lower several nodes.
1377  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1381  }
1382  // Extract subvector is special because the value type
1383  // (result) is 256-bit but the source is 512-bit wide.
1384  // 128-bit was made Custom under AVX1.
1385  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391 
1392  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1404  }
1405  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408  }
1409  }// has AVX-512
1410 
1411  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1412  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1413  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1414 
1415  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1416  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1417 
1424 
1470 
1472 
1474  if (Subtarget.hasVLX()) {
1477  }
1478 
1479  LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1480  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1481  setOperationAction(ISD::MLOAD, VT, Action);
1482  setOperationAction(ISD::MSTORE, VT, Action);
1483  }
1484 
1485  if (Subtarget.hasCDI()) {
1488  }
1489 
1490  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1505 
1509  }
1510 
1511  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1513  if (Subtarget.hasVLX()) {
1514  // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1517  }
1518  }
1519  }
1520 
1521  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1522  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1523  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1524 
1525  for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1529  setOperationAction(ISD::VSELECT, VT, Expand);
1530 
1538  }
1539 
1544 
1545  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1550  }
1551  }
1552 
1553  // We want to custom lower some of our intrinsics.
1557  if (!Subtarget.is64Bit()) {
1560  }
1561 
1562  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1563  // handle type legalization for these operations here.
1564  //
1565  // FIXME: We really should do custom legalization for addition and
1566  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1567  // than generic legalization for 64-bit multiplication-with-overflow, though.
1568  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1569  if (VT == MVT::i64 && !Subtarget.is64Bit())
1570  continue;
1571  // Add/Sub/Mul with overflow operations are custom lowered.
1578 
1579  // Support carry in as value rather than glue.
1583  }
1584 
1585  if (!Subtarget.is64Bit()) {
1586  // These libcalls are not available in 32-bit.
1587  setLibcallName(RTLIB::SHL_I128, nullptr);
1588  setLibcallName(RTLIB::SRL_I128, nullptr);
1589  setLibcallName(RTLIB::SRA_I128, nullptr);
1590  }
1591 
1592  // Combine sin / cos into one node or libcall if possible.
1593  if (Subtarget.hasSinCos()) {
1594  setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1595  setLibcallName(RTLIB::SINCOS_F64, "sincos");
1596  if (Subtarget.isTargetDarwin()) {
1597  // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1598  // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1601  }
1602  }
1603 
1604  if (Subtarget.isTargetWin64()) {
1611  }
1612 
1613  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1614  // is. We should promote the value to 64-bits to solve this.
1615  // This is what the CRT headers do - `fmodf` is an inline header
1616  // function casting to f64 and calling `fmod`.
1617  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1618  Subtarget.isTargetWindowsItanium()))
1619  for (ISD::NodeType Op :
1624 
1625  // We have target-specific dag combine patterns for the following nodes:
1663 
1665 
1666  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1668  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1670  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1672 
1673  // TODO: These control memcmp expansion in CGP and could be raised higher, but
1674  // that needs to benchmarked and balanced with the potential use of vector
1675  // load/store types (PR33329).
1676  MaxLoadsPerMemcmp = 4;
1678 
1679  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1681 
1682  // An out-of-order CPU can speculatively execute past a predictable branch,
1683  // but a conditional move could be stalled by an expensive earlier operation.
1684  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1685  EnableExtLdPromotion = true;
1686  setPrefFunctionAlignment(4); // 2^4 bytes.
1687 
1689 }
1690 
1691 // This has so far only been implemented for 64-bit MachO.
1693  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1694 }
1695 
1699  VT.getVectorNumElements() != 1 &&
1701  return TypeWidenVector;
1702 
1704 }
1705 
1708  EVT VT) const {
1709  if (!VT.isVector())
1710  return MVT::i8;
1711 
1712  if (VT.isSimple()) {
1713  MVT VVT = VT.getSimpleVT();
1714  const unsigned NumElts = VVT.getVectorNumElements();
1715  MVT EltVT = VVT.getVectorElementType();
1716  if (VVT.is512BitVector()) {
1717  if (Subtarget.hasAVX512())
1718  if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1719  EltVT == MVT::f32 || EltVT == MVT::f64)
1720  switch(NumElts) {
1721  case 8: return MVT::v8i1;
1722  case 16: return MVT::v16i1;
1723  }
1724  if (Subtarget.hasBWI())
1725  if (EltVT == MVT::i8 || EltVT == MVT::i16)
1726  switch(NumElts) {
1727  case 32: return MVT::v32i1;
1728  case 64: return MVT::v64i1;
1729  }
1730  }
1731 
1732  if (Subtarget.hasBWI() && Subtarget.hasVLX())
1733  return MVT::getVectorVT(MVT::i1, NumElts);
1734 
1735  if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1736  EVT LegalVT = getTypeToTransformTo(Context, VT);
1737  EltVT = LegalVT.getVectorElementType().getSimpleVT();
1738  }
1739 
1740  if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1741  switch(NumElts) {
1742  case 2: return MVT::v2i1;
1743  case 4: return MVT::v4i1;
1744  case 8: return MVT::v8i1;
1745  }
1746  }
1747 
1749 }
1750 
1751 /// Helper for getByValTypeAlignment to determine
1752 /// the desired ByVal argument alignment.
1753 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1754  if (MaxAlign == 16)
1755  return;
1756  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1757  if (VTy->getBitWidth() == 128)
1758  MaxAlign = 16;
1759  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1760  unsigned EltAlign = 0;
1761  getMaxByValAlign(ATy->getElementType(), EltAlign);
1762  if (EltAlign > MaxAlign)
1763  MaxAlign = EltAlign;
1764  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1765  for (auto *EltTy : STy->elements()) {
1766  unsigned EltAlign = 0;
1767  getMaxByValAlign(EltTy, EltAlign);
1768  if (EltAlign > MaxAlign)
1769  MaxAlign = EltAlign;
1770  if (MaxAlign == 16)
1771  break;
1772  }
1773  }
1774 }
1775 
1776 /// Return the desired alignment for ByVal aggregate
1777 /// function arguments in the caller parameter area. For X86, aggregates
1778 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1779 /// are at 4-byte boundaries.
1781  const DataLayout &DL) const {
1782  if (Subtarget.is64Bit()) {
1783  // Max of 8 and alignment of type.
1784  unsigned TyAlign = DL.getABITypeAlignment(Ty);
1785  if (TyAlign > 8)
1786  return TyAlign;
1787  return 8;
1788  }
1789 
1790  unsigned Align = 4;
1791  if (Subtarget.hasSSE1())
1792  getMaxByValAlign(Ty, Align);
1793  return Align;
1794 }
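// Illustration: on 32-bit x86 with SSE enabled, a byval struct that contains
// a <4 x float> member is placed at a 16-byte boundary, while a struct of
// plain i32 fields keeps the default 4-byte alignment.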
1795 
1796 /// Returns the target specific optimal type for load
1797 /// and store operations as a result of memset, memcpy, and memmove
1798 /// lowering. If DstAlign is zero, it is safe to assume the destination
1799 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1800 /// means there is no need to check it against the alignment requirement,
1801 /// probably because the source does not need to be loaded. If 'IsMemset' is
1802 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1803 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1804 /// source is constant so it does not need to be loaded.
1805 /// It returns EVT::Other if the type should be determined using generic
1806 /// target-independent logic.
1807 EVT
1809  unsigned DstAlign, unsigned SrcAlign,
1810  bool IsMemset, bool ZeroMemset,
1811  bool MemcpyStrSrc,
1812  MachineFunction &MF) const {
1813  const Function *F = MF.getFunction();
1814  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1815  if (Size >= 16 &&
1816  (!Subtarget.isUnalignedMem16Slow() ||
1817  ((DstAlign == 0 || DstAlign >= 16) &&
1818  (SrcAlign == 0 || SrcAlign >= 16)))) {
1819  // FIXME: Check if unaligned 32-byte accesses are slow.
1820  if (Size >= 32 && Subtarget.hasAVX()) {
1821  // Although this isn't a well-supported type for AVX1, we'll let
1822  // legalization and shuffle lowering produce the optimal codegen. If we
1823  // choose an optimal type with a vector element larger than a byte,
1824  // getMemsetStores() may create an intermediate splat (using an integer
1825  // multiply) before we splat as a vector.
1826  return MVT::v32i8;
1827  }
1828  if (Subtarget.hasSSE2())
1829  return MVT::v16i8;
1830  // TODO: Can SSE1 handle a byte vector?
1831  if (Subtarget.hasSSE1())
1832  return MVT::v4f32;
1833  } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1834  !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1835  // Do not use f64 to lower memcpy if source is string constant. It's
1836  // better to use i32 to avoid the loads.
1837  // Also, do not use f64 to lower memset unless this is a memset of zeros.
1838  // The gymnastics of splatting a byte value into an XMM register and then
1839  // only using 8-byte stores (because this is a CPU with slow unaligned
1840  // 16-byte accesses) makes that a loser.
1841  return MVT::f64;
1842  }
1843  }
1844  // This is a compromise. If we reach here, unaligned accesses may be slow on
1845  // this target. However, creating smaller, aligned accesses could be even
1846  // slower and would certainly be a lot more code.
1847  if (Subtarget.is64Bit() && Size >= 8)
1848  return MVT::i64;
1849  return MVT::i32;
1850 }
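// Net effect (sketch): a 32-byte memcpy on an AVX target with sufficiently
// aligned or fast-unaligned operands becomes a single v32i8 load/store pair,
// whereas a target with slow unaligned 16-byte accesses falls back to plain
// i64/i32 copies.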
1851 
1853  if (VT == MVT::f32)
1854  return X86ScalarSSEf32;
1855  else if (VT == MVT::f64)
1856  return X86ScalarSSEf64;
1857  return true;
1858 }
1859 
1860 bool
1862  unsigned,
1863  unsigned,
1864  bool *Fast) const {
1865  if (Fast) {
1866  switch (VT.getSizeInBits()) {
1867  default:
1868  // 8-byte and under are always assumed to be fast.
1869  *Fast = true;
1870  break;
1871  case 128:
1872  *Fast = !Subtarget.isUnalignedMem16Slow();
1873  break;
1874  case 256:
1875  *Fast = !Subtarget.isUnalignedMem32Slow();
1876  break;
1877  // TODO: What about AVX-512 (512-bit) accesses?
1878  }
1879  }
1880  // Misaligned accesses of any size are always allowed.
1881  return true;
1882 }
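// Note that the access is always allowed; the Fast out-parameter only tells
// the caller whether e.g. an unaligned 16-byte load is expected to be cheap,
// which is false when isUnalignedMem16Slow() is set for the subtarget.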
1883 
1884 /// Return the entry encoding for a jump table in the
1885 /// current function. The returned value is a member of the
1886 /// MachineJumpTableInfo::JTEntryKind enum.
1888  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1889  // symbol.
1890  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1892 
1893  // Otherwise, use the normal jump table encoding heuristics.
1895 }
1896 
1898  return Subtarget.useSoftFloat();
1899 }
1900 
1902  ArgListTy &Args) const {
1903 
1904  // Only relabel X86-32 for C / Stdcall CCs.
1905  if (Subtarget.is64Bit())
1906  return;
1907  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1908  return;
1909  unsigned ParamRegs = 0;
1910  if (auto *M = MF->getFunction()->getParent())
1911  ParamRegs = M->getNumberRegisterParameters();
1912 
1913  // Mark the first N integer/pointer arguments as being passed in registers.
1914  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1915  Type *T = Args[Idx].Ty;
1916  if (T->isPointerTy() || T->isIntegerTy())
1917  if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1918  unsigned numRegs = 1;
1919  if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1920  numRegs = 2;
1921  if (ParamRegs < numRegs)
1922  return;
1923  ParamRegs -= numRegs;
1924  Args[Idx].IsInReg = true;
1925  }
1926  }
1927 }
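// Net effect: when the module requests N register parameters (32-bit regparm),
// the first integer/pointer arguments that fit are tagged IsInReg so an
// emitted libcall receives them in registers rather than on the stack.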
1928 
1929 const MCExpr *
1931  const MachineBasicBlock *MBB,
1932  unsigned uid,MCContext &Ctx) const{
1933  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1934  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1935  // entries.
1936  return MCSymbolRefExpr::create(MBB->getSymbol(),
1938 }
1939 
1940 /// Returns relocation base for the given PIC jumptable.
1942  SelectionDAG &DAG) const {
1943  if (!Subtarget.is64Bit())
1944  // This doesn't have SDLoc associated with it, but is not really the
1945  // same as a Register.
1946  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1947  getPointerTy(DAG.getDataLayout()));
1948  return Table;
1949 }
1950 
1951 /// This returns the relocation base for the given PIC jumptable,
1952 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1955  MCContext &Ctx) const {
1956  // X86-64 uses RIP relative addressing based on the jump table label.
1957  if (Subtarget.isPICStyleRIPRel())
1958  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1959 
1960  // Otherwise, the reference is relative to the PIC base.
1961  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1962 }
1963 
1964 std::pair<const TargetRegisterClass *, uint8_t>
1966  MVT VT) const {
1967  const TargetRegisterClass *RRC = nullptr;
1968  uint8_t Cost = 1;
1969  switch (VT.SimpleTy) {
1970  default:
1972  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1973  RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1974  break;
1975  case MVT::x86mmx:
1976  RRC = &X86::VR64RegClass;
1977  break;
1978  case MVT::f32: case MVT::f64:
1979  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1980  case MVT::v4f32: case MVT::v2f64:
1981  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1982  case MVT::v8f32: case MVT::v4f64:
1983  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1984  case MVT::v16f32: case MVT::v8f64:
1985  RRC = &X86::VR128XRegClass;
1986  break;
1987  }
1988  return std::make_pair(RRC, Cost);
1989 }
1990 
1991 unsigned X86TargetLowering::getAddressSpace() const {
1992  if (Subtarget.is64Bit())
1993  return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1994  return 256;
1995 }
1996 
1997 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1998  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
1999  (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2000 }
2001 
2003  unsigned Offset, unsigned AddressSpace) {
2006  Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2007 }
2008 
2010  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2011  // tcbhead_t; use it instead of the usual global variable (see
2012  // sysdeps/{i386,x86_64}/nptl/tls.h)
2013  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2014  if (Subtarget.isTargetFuchsia()) {
2015  // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2016  return SegmentOffset(IRB, 0x10, getAddressSpace());
2017  } else {
2018  // %fs:0x28, unless we're using a Kernel code model, in which case
2019  // it's %gs:0x28. gs:0x14 on i386.
2020  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2021  return SegmentOffset(IRB, Offset, getAddressSpace());
2022  }
2023  }
2024 
2025  return TargetLowering::getIRStackGuard(IRB);
2026 }
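// On a glibc x86-64 target this yields an %fs-relative guard slot; the IR
// built by SegmentOffset is roughly (address space 257 == FS):
//   inttoptr (i32 40 to i8* addrspace(257)*)   ; i.e. %fs:0x28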
2027 
2029  // MSVC CRT provides functionalities for stack protection.
2030  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2031  // MSVC CRT has a global variable holding security cookie.
2032  M.getOrInsertGlobal("__security_cookie",
2034 
2035  // MSVC CRT has a function to validate security cookie.
2036  auto *SecurityCheckCookie = cast<Function>(
2037  M.getOrInsertFunction("__security_check_cookie",
2040  SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2041  SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2042  return;
2043  }
2044  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2045  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2046  return;
2048 }
2049 
2051  // MSVC CRT has a global variable holding security cookie.
2052  if (Subtarget.getTargetTriple().isOSMSVCRT())
2053  return M.getGlobalVariable("__security_cookie");
2055 }
2056 
2058  // MSVC CRT has a function to validate security cookie.
2059  if (Subtarget.getTargetTriple().isOSMSVCRT())
2060  return M.getFunction("__security_check_cookie");
2062 }
2063 
2065  if (Subtarget.getTargetTriple().isOSContiki())
2066  return getDefaultSafeStackPointerLocation(IRB, false);
2067 
2068  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2069  // definition of TLS_SLOT_SAFESTACK in
2070  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2071  if (Subtarget.isTargetAndroid()) {
2072  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48.
2073  // %gs:0x24 on i386.
2074  unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2075  return SegmentOffset(IRB, Offset, getAddressSpace());
2076  }
2077 
2078  // Fuchsia is similar.
2079  if (Subtarget.isTargetFuchsia()) {
2080  // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2081  return SegmentOffset(IRB, 0x18, getAddressSpace());
2082  }
2083 
2085 }
2086 
2088  unsigned DestAS) const {
2089  assert(SrcAS != DestAS && "Expected different address spaces!");
2090 
2091  return SrcAS < 256 && DestAS < 256;
2092 }
2093 
2094 //===----------------------------------------------------------------------===//
2095 // Return Value Calling Convention Implementation
2096 //===----------------------------------------------------------------------===//
2097 
2098 #include "X86GenCallingConv.inc"
2099 
2100 bool X86TargetLowering::CanLowerReturn(
2101  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2102  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2104  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2105  return CCInfo.CheckReturn(Outs, RetCC_X86);
2106 }
2107 
2108 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2109  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2110  return ScratchRegs;
2111 }
2112 
2113 /// Lowers mask values (v*i1) to the local register values.
2114 /// \returns DAG node after lowering to register type
2115 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2116  const SDLoc &Dl, SelectionDAG &DAG) {
2117  EVT ValVT = ValArg.getValueType();
2118 
2119  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2120  (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2121  // Two stage lowering might be required
2122  // bitcast: v8i1 -> i8 / v16i1 -> i16
2123  // anyextend: i8 -> i32 / i16 -> i32
2124  EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2125  SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2126  if (ValLoc == MVT::i32)
2127  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2128  return ValToCopy;
2129  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2130  (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2131  // One stage lowering is required
2132  // bitcast: v32i1 -> i32 / v64i1 -> i64
2133  return DAG.getBitcast(ValLoc, ValArg);
2134  } else
2135  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2136 }
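// Example of the two-stage path: a v8i1 mask returned in a 32-bit location is
// lowered as v8i1 --bitcast--> i8 --any_extend--> i32, while v32i1 -> i32
// needs only the single bitcast.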
2137 
2138 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2140  const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2141  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2142  CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2143  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2144  "Expected AVX512BW or AVX512BMI target!");
2145  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2146  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2147  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2148  "The value should reside in two registers");
2149 
2150  // Before splitting the value we cast it to i64
2151  Arg = DAG.getBitcast(MVT::i64, Arg);
2152 
2153  // Splitting the value into two i32 types
2154  SDValue Lo, Hi;
2155  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2156  DAG.getConstant(0, Dl, MVT::i32));
2157  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2158  DAG.getConstant(1, Dl, MVT::i32));
2159 
2160  // Attach the two i32 types into corresponding registers
2161  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2162  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2163 }
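// On a 32-bit target a v64i1 argument therefore travels in two GPRs: the mask
// is bitcast to i64, split into Lo/Hi i32 halves with EXTRACT_ELEMENT, and
// each half is pinned to its register from the calling convention.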
2164 
2165 SDValue
2166 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2167  bool isVarArg,
2168  const SmallVectorImpl<ISD::OutputArg> &Outs,
2169  const SmallVectorImpl<SDValue> &OutVals,
2170  const SDLoc &dl, SelectionDAG &DAG) const {
2171  MachineFunction &MF = DAG.getMachineFunction();
2173 
2174  // In some cases we need to disable registers from the default CSR list.
2175  // For example, when they are used for argument passing.
2176  bool ShouldDisableCalleeSavedRegister =
2177  CallConv == CallingConv::X86_RegCall ||
2178  MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2179 
2180  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2181  report_fatal_error("X86 interrupts may not return any value");
2182 
2184  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2185  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2186 
2187  SDValue Flag;
2188  SmallVector<SDValue, 6> RetOps;
2189  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2190  // Operand #1 = Bytes To Pop
2191  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2192  MVT::i32));
2193 
2194  // Copy the result values into the output registers.
2195  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2196  ++I, ++OutsIndex) {
2197  CCValAssign &VA = RVLocs[I];
2198  assert(VA.isRegLoc() && "Can only return in registers!");
2199 
2200  // Add the register to the CalleeSaveDisableRegs list.
2201  if (ShouldDisableCalleeSavedRegister)
2203 
2204  SDValue ValToCopy = OutVals[OutsIndex];
2205  EVT ValVT = ValToCopy.getValueType();
2206 
2207  // Promote values to the appropriate types.
2208  if (VA.getLocInfo() == CCValAssign::SExt)
2209  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2210  else if (VA.getLocInfo() == CCValAssign::ZExt)
2211  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2212  else if (VA.getLocInfo() == CCValAssign::AExt) {
2213  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2214  ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2215  else
2216  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2217  }
2218  else if (VA.getLocInfo() == CCValAssign::BCvt)
2219  ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2220 
2222  "Unexpected FP-extend for return value.");
2223 
2224  // If this is x86-64, and we disabled SSE, we can't return FP values,
2225  // or SSE or MMX vectors.
2226  if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2227  VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2228  (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2229  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2230  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2231  } else if (ValVT == MVT::f64 &&
2232  (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2233  // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2234  // llvm-gcc has never done it right and no one has noticed, so this
2235  // should be OK for now.
2236  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2237  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2238  }
2239 
2240  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2241  // the RET instruction and handled by the FP Stackifier.
2242  if (VA.getLocReg() == X86::FP0 ||
2243  VA.getLocReg() == X86::FP1) {
2244  // If this is a copy from an xmm register to ST(0), use an FPExtend to
2245  // change the value to the FP stack register class.
2246  if (isScalarFPTypeInSSEReg(VA.getValVT()))
2247  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2248  RetOps.push_back(ValToCopy);
2249  // Don't emit a copytoreg.
2250  continue;
2251  }
2252 
2253  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2254  // which is returned in RAX / RDX.
2255  if (Subtarget.is64Bit()) {
2256  if (ValVT == MVT::x86mmx) {
2257  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2258  ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2259  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2260  ValToCopy);
2261  // If we don't have SSE2 available, convert to v4f32 so the generated
2262  // register is legal.
2263  if (!Subtarget.hasSSE2())
2264  ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2265  }
2266  }
2267  }
2268 
2270 
2271  if (VA.needsCustom()) {
2272  assert(VA.getValVT() == MVT::v64i1 &&
2273  "Currently the only custom case is when we split v64i1 to 2 regs");
2274 
2275  Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2276  Subtarget);
2277 
2278  assert(2 == RegsToPass.size() &&
2279  "Expecting two registers after Pass64BitArgInRegs");
2280 
2281  // Add the second register to the CalleeSaveDisableRegs list.
2282  if (ShouldDisableCalleeSavedRegister)
2283  MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2284  } else {
2285  RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2286  }
2287 
2288  // Add nodes to the DAG and add the values into the RetOps list
2289  for (auto &Reg : RegsToPass) {
2290  Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2291  Flag = Chain.getValue(1);
2292  RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2293  }
2294  }
2295 
2296  // Swift calling convention does not require we copy the sret argument
2297  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2298 
2299  // All x86 ABIs require that for returning structs by value we copy
2300  // the sret argument into %rax/%eax (depending on ABI) for the return.
2301  // We saved the argument into a virtual register in the entry block,
2302  // so now we copy the value out and into %rax/%eax.
2303  //
2304  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2305  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2306  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2307  // either case FuncInfo->setSRetReturnReg() will have been called.
2308  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2309  // When we have both sret and another return value, we should use the
2310  // original Chain stored in RetOps[0], instead of the current Chain updated
2311  // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2312 
2313  // For the case of sret and another return value, we have
2314  // Chain_0 at the function entry
2315  // Chain_1 = getCopyToReg(Chain_0) in the above loop
2316  // If we use Chain_1 in getCopyFromReg, we will have
2317  // Val = getCopyFromReg(Chain_1)
2318  // Chain_2 = getCopyToReg(Chain_1, Val) from below
2319 
2320  // getCopyToReg(Chain_0) will be glued together with
2321  // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2322  // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2323  // Data dependency from Unit B to Unit A due to usage of Val in
2324  // getCopyToReg(Chain_1, Val)
2325  // Chain dependency from Unit A to Unit B
2326 
2327  // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2328  SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2329  getPointerTy(MF.getDataLayout()));
2330 
2331  unsigned RetValReg
2332  = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2333  X86::RAX : X86::EAX;
2334  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2335  Flag = Chain.getValue(1);
2336 
2337  // RAX/EAX now acts like a return value.
2338  RetOps.push_back(
2339  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2340 
2341  // Add the returned register to the CalleeSaveDisableRegs list.
2342  if (ShouldDisableCalleeSavedRegister)
2343  MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2344  }
2345 
2346  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2347  const MCPhysReg *I =
2348  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2349  if (I) {
2350  for (; *I; ++I) {
2351  if (X86::GR64RegClass.contains(*I))
2352  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2353  else
2354  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2355  }
2356  }
2357 
2358  RetOps[0] = Chain; // Update chain.
2359 
2360  // Add the flag if we have it.
2361  if (Flag.getNode())
2362  RetOps.push_back(Flag);
2363 
2364  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2365  if (CallConv == CallingConv::X86_INTR)
2366  opcode = X86ISD::IRET;
2367  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2368 }
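// Illustrative sketch (hypothetical IR and assembly, not taken from a test):
// a function that returns a struct by value carries an sret pointer argument,
// and the code above copies that pointer back into the return register, so
//   define void @f(%struct.S* sret %out) { ... ret void }
// ends up on x86-64 as roughly
//   movq %rdi, %rax   ; sret pointer handed back in RAX
//   retq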
2369 
2370 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2371  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2372  return false;
2373 
2374  SDValue TCChain = Chain;
2375  SDNode *Copy = *N->use_begin();
2376  if (Copy->getOpcode() == ISD::CopyToReg) {
2377  // If the copy has a glue operand, we conservatively assume it isn't safe to
2378  // perform a tail call.
2379  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2380  return false;
2381  TCChain = Copy->getOperand(0);
2382  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2383  return false;
2384 
2385  bool HasRet = false;
2386  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2387  UI != UE; ++UI) {
2388  if (UI->getOpcode() != X86ISD::RET_FLAG)
2389  return false;
2390  // If we are returning more than one value, we can definitely
2391  // not make a tail call; see PR19530.
2392  if (UI->getNumOperands() > 4)
2393  return false;
2394  if (UI->getNumOperands() == 4 &&
2395  UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2396  return false;
2397  HasRet = true;
2398  }
2399 
2400  if (!HasRet)
2401  return false;
2402 
2403  Chain = TCChain;
2404  return true;
2405 }
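// One DAG shape accepted above (node names are illustrative):
//   Copy = CopyToReg(TCChain, Reg, N)          ; no glue operand
//   Ret  = X86ISD::RET_FLAG(Copy, ..., glue)
// i.e. N only feeds a register copy whose sole users are RET_FLAG nodes, so
// the call producing N is used exclusively by the return and may be emitted
// as a tail call.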
2406 
2407 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2408  ISD::NodeType ExtendKind) const {
2409  MVT ReturnMVT = MVT::i32;
2410 
2411  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2412  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2413  // The ABI does not require i1, i8 or i16 to be extended.
2414  //
2415  // On Darwin, there is code in the wild relying on Clang's old behaviour of
2416  // always extending i8/i16 return values, so keep doing that for now.
2417  // (PR26665).
2418  ReturnMVT = MVT::i8;
2419  }
2420 
2421  EVT MinVT = getRegisterType(Context, ReturnMVT);
2422  return VT.bitsLT(MinVT) ? MinVT : VT;
2423 }
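// Worked example of the rule above: an i16 return value is widened to i32 on
// Darwin (ReturnMVT stays i32 there), but left as i16 elsewhere because MinVT
// is only i8; an i1 return value is widened to i8 on every target.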
2424 
2425 /// Reads two 32 bit registers and creates a 64 bit mask value.
2426 /// \param VA The current 32 bit value that needs to be assigned.
2427 /// \param NextVA The next 32 bit value that needs to be assigned.
2428 /// \param Root The parent DAG node.
2429 /// \param [in,out] InFlag Represents SDValue in the parent DAG node for
2430 /// glue purposes. In case the DAG is already using a
2431 /// physical register instead of a virtual one, we should glue
2432 /// our new SDValue to the InFlag SDValue.
2433 /// \return A new 64 bit SDValue.
2434 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2435  SDValue &Root, SelectionDAG &DAG,
2436  const SDLoc &Dl, const X86Subtarget &Subtarget,
2437  SDValue *InFlag = nullptr) {
2438  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2439  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2440  assert(VA.getValVT() == MVT::v64i1 &&
2441  "Expecting first location of 64 bit width type");
2442  assert(NextVA.getValVT() == VA.getValVT() &&
2443  "The locations should have the same type");
2444  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2445  "The values should reside in two registers");
2446 
2447  SDValue Lo, Hi;
2448  unsigned Reg;
2449  SDValue ArgValueLo, ArgValueHi;
2450 
2451  MachineFunction &MF = DAG.getMachineFunction();
2452  const TargetRegisterClass *RC = &X86::GR32RegClass;
2453 
2454  // Read a 32 bit value from the registers
2455  if (nullptr == InFlag) {
2456  // When no physical register is present,
2457  // create an intermediate virtual register
2458  Reg = MF.addLiveIn(VA.getLocReg(), RC);
2459  ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2460  Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2461  ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2462  } else {
2463  // When a physical register is available read the value from it and glue
2464  // the reads together.
2465  ArgValueLo =
2466  DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2467  *InFlag = ArgValueLo.getValue(2);
2468  ArgValueHi =
2469  DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2470  *InFlag = ArgValueHi.getValue(2);
2471  }
2472 
2473  // Convert the low i32 value into a v32i1 mask
2474  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2475 
2476  // Convert the high i32 value into a v32i1 mask
2477  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2478 
2479  // Concatenate the two values together
2480  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2481 }
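// Rough sketch of the DAG built above for a v64i1 argument split across two
// GR32 registers (the register names are illustrative):
//   t0: i32   = CopyFromReg %eax      ; low 32 mask bits
//   t1: i32   = CopyFromReg %ecx      ; high 32 mask bits
//   t2: v32i1 = bitcast t0
//   t3: v32i1 = bitcast t1
//   t4: v64i1 = concat_vectors t2, t3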
2482 
2483 /// The function will lower a register of various sizes (8/16/32/64)
2484 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2485 /// \returns a DAG node containing the operand after lowering it to a mask type.
2486 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2487  const EVT &ValLoc, const SDLoc &Dl,
2488  SelectionDAG &DAG) {
2489  SDValue ValReturned = ValArg;
2490 
2491  if (ValVT == MVT::v1i1)
2492  return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2493 
2494  if (ValVT == MVT::v64i1) {
2495  // On 32 bit targets this case is handled by getv64i1Argument.
2496  assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2497  // On 64 bit targets there is no need to truncate the value, only to bitcast it.
2498  } else {
2499  MVT maskLen;
2500  switch (ValVT.getSimpleVT().SimpleTy) {
2501  case MVT::v8i1:
2502  maskLen = MVT::i8;
2503  break;
2504  case MVT::v16i1:
2505  maskLen = MVT::i16;
2506  break;
2507  case MVT::v32i1:
2508  maskLen = MVT::i32;
2509  break;
2510  default:
2511  llvm_unreachable("Expecting a vector of i1 types");
2512  }
2513 
2514  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2515  }
2516  return DAG.getBitcast(ValVT, ValReturned);
2517 }
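// Example of the path above: a v16i1 value whose location type is i32 is
// truncated to i16 (the mask's width in bits) and then bitcast to v16i1,
// while a v64i1 value arriving as i64 skips the truncate and is only bitcast.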
2518 
2519 /// Lower the result values of a call into the
2520 /// appropriate copies out of appropriate physical registers.
2521 ///
2522 SDValue X86TargetLowering::LowerCallResult(
2523  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2524  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2525  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2526  uint32_t *RegMask) const {
2527 
2528  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2529  // Assign locations to each value returned by this call.
2530  SmallVector<CCValAssign, 16> RVLocs;
2531  bool Is64Bit = Subtarget.is64Bit();
2532  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2533  *DAG.getContext());
2534  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2535 
2536  // Copy all of the result registers out of their specified physreg.
2537  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2538  ++I, ++InsIndex) {
2539  CCValAssign &VA = RVLocs[I];
2540  EVT CopyVT = VA.getLocVT();
2541 
2542  // In some calling conventions we need to remove the used registers
2543  // from the register mask.
2544  if (RegMask) {
2545  for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2546  SubRegs.isValid(); ++SubRegs)
2547  RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2548  }
2549 
2550  // If this is x86-64, and we disabled SSE, we can't return FP values
2551  if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2552  ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2553  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2554  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2555  }
2556 
2557  // If we prefer to use the value in xmm registers, copy it out as f80 and
2558  // use a truncate to move it from fp stack reg to xmm reg.
2559  bool RoundAfterCopy = false;
2560  if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2561  isScalarFPTypeInSSEReg(VA.getValVT())) {
2562  if (!Subtarget.hasX87())
2563  report_fatal_error("X87 register return with X87 disabled");
2564  CopyVT = MVT::f80;
2565  RoundAfterCopy = (CopyVT != VA.getLocVT());
2566  }
2567 
2568  SDValue Val;
2569  if (VA.needsCustom()) {
2570  assert(VA.getValVT() == MVT::v64i1 &&
2571  "Currently the only custom case is when we split v64i1 to 2 regs");
2572  Val =
2573  getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2574  } else {
2575  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2576  .getValue(1);
2577  Val = Chain.getValue(0);
2578  InFlag = Chain.getValue(2);
2579  }
2580 
2581  if (RoundAfterCopy)
2582  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2583  // This truncation won't change the value.
2584  DAG.getIntPtrConstant(1, dl));
2585 
2586  if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2587  if (VA.getValVT().isVector() &&
2588  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2589  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2590  // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2591  Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2592  } else
2593  Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2594  }
2595 
2596  InVals.push_back(Val);
2597  }
2598 
2599  return Chain;
2600 }
2601 
2602 //===----------------------------------------------------------------------===//
2603 // C & StdCall & Fast Calling Convention implementation
2604 //===----------------------------------------------------------------------===//
2605 // The StdCall calling convention is used by most of the Windows API
2606 // routines. It differs from the C calling convention only slightly: the
2607 // callee cleans up the stack instead of the caller, and symbols are
2608 // decorated with an argument-size suffix. It doesn't support any vector
2609 // arguments. For info on the fast calling convention see the Fast Calling
2610 // Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2611 
2612 /// CallIsStructReturn - Determines whether a call uses struct return
2613 /// semantics.
2614 enum StructReturnType {
2615  NotStructReturn,
2616  RegStructReturn,
2617  StackStructReturn
2618 };
2619 static StructReturnType
2620 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2621  if (Outs.empty())
2622  return NotStructReturn;
2623 
2624  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2625  if (!Flags.isSRet())
2626  return NotStructReturn;
2627  if (Flags.isInReg() || IsMCU)
2628  return RegStructReturn;
2629  return StackStructReturn;
2630 }
2631 
2632 /// Determines whether a function uses struct return semantics.
2633 static StructReturnType
2634 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2635  if (Ins.empty())
2636  return NotStructReturn;
2637 
2638  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2639  if (!Flags.isSRet())
2640  return NotStructReturn;
2641  if (Flags.isInReg() || IsMCU)
2642  return RegStructReturn;
2643  return StackStructReturn;
2644 }
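// Classification examples for the two helpers above: a first argument marked
// 'sret' alone yields StackStructReturn, 'sret' combined with 'inreg' (or any
// sret argument on an MCU target) yields RegStructReturn, and anything else
// is NotStructReturn.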
2645 
2646 /// Make a copy of an aggregate at address specified by "Src" to address
2647 /// "Dst" with size and alignment information specified by the specific
2648 /// parameter attribute. The copy will be passed as a byval function parameter.
2649 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2650  SDValue Chain, ISD::ArgFlagsTy Flags,
2651  SelectionDAG &DAG, const SDLoc &dl) {
2652  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2653 
2654  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2655  /*isVolatile*/false, /*AlwaysInline=*/true,
2656  /*isTailCall*/false,
2657  MachinePointerInfo(), MachinePointerInfo());
2658 }
2659 
2660 /// Return true if the calling convention is one that we can guarantee TCO for.
2661 static bool canGuaranteeTCO(CallingConv::ID CC) {
2662  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2663  CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2664  CC == CallingConv::HHVM);
2665 }
2666 
2667 /// Return true if we might ever do TCO for calls with this calling convention.
2668 static bool mayTailCallThisCC(CallingConv::ID CC) {
2669  switch (CC) {
2670  // C calling conventions:
2671  case CallingConv::C:
2672  case CallingConv::Win64:
2673  case CallingConv::X86_64_SysV:
2674  // Callee pop conventions:
2675  case CallingConv::X86_ThisCall:
2676  case CallingConv::X86_StdCall:
2677  case CallingConv::X86_VectorCall:
2678  case CallingConv::X86_FastCall:
2679  return true;
2680  default:
2681  return canGuaranteeTCO(CC);
2682  }
2683 }
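// For example, a fastcc or GHC call site can be guaranteed a tail call
// (canGuaranteeTCO), while a plain C or stdcall call site may still be
// emitted as a sibcall when the usual eligibility checks succeed, which is
// all that mayTailCallThisCC promises.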
2684 
2685 /// Return true if the function is being made into a tailcall target by
2686 /// changing its ABI.
2687 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2688  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2689 }
2690 
2691 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2692  auto Attr =
2693  CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2694  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2695  return false;
2696 
2697  ImmutableCallSite CS(CI);
2698  CallingConv::ID CalleeCC = CS.getCallingConv();
2699  if (!mayTailCallThisCC(CalleeCC))
2700  return false;
2701 
2702  return true;
2703 }
2704 
2705 SDValue
2706 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2707  const SmallVectorImpl<ISD::InputArg> &Ins,
2708  const SDLoc &dl, SelectionDAG &DAG,
2709  const CCValAssign &VA,
2710  MachineFrameInfo &MFI, unsigned i) const {
2711  // Create the nodes corresponding to a load from this parameter slot.
2712  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2713  bool AlwaysUseMutable = shouldGuaranteeTCO(
2714  CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2715  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2716  EVT ValVT;
2717  MVT PtrVT = getPointerTy(DAG.getDataLayout());
2718 
2719  // If the value is passed by pointer we have the address passed instead of
2720  // the value itself. No need to extend if the mask value and its location
2721  // share the same absolute size.
2722  bool ExtendedInMem =
2723  VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2724  VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2725 
2726  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2727  ValVT = VA.getLocVT();
2728  else
2729  ValVT = VA.getValVT();
2730 
2731  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2732  // taken by a return address.
2733  int Offset = 0;
2734  if (CallConv == CallingConv::X86_INTR) {
2735  // X86 interrupts may take one or two arguments.
2736  // On the stack there will be no return address as in a regular call.
2737  // The offset of the last argument needs to be set to -4/-8 bytes.
2738  // The offset of the first of the two arguments should be set to 0 bytes.
2739  Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2740  if (Subtarget.is64Bit() && Ins.size() == 2) {
2741  // The stack pointer needs to be realigned for 64 bit handlers with error
2742  // code, so the argument offset changes by 8 bytes.
2743  Offset += 8;
2744  }
2745  }
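// Worked example of the offsets computed above for a 64 bit handler that
// receives an error code (Ins.size() == 2): the base formula yields 0 for the
// first argument and -8 for the second, and the +8 realignment fixup moves
// them to +8 and 0 respectively, i.e. the error code sits at the top of the
// stack and the hardware interrupt frame starts 8 bytes above it.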
2746 
2747  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2748  // changed with more analysis.
2749  // In case of tail call optimization, mark all arguments mutable, since they
2750  // could be overwritten by the lowering of arguments in case of a tail call.
2751  if (Flags.isByVal()) {
2752  unsigned Bytes = Flags.getByValSize();
2753  if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2754  int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2755  // Adjust SP offset of interrupt parameter.
2756  if (CallConv == CallingConv::X86_INTR) {
2757  MFI.setObjectOffset(FI, Offset);
2758  }
2759  return DAG.getFrameIndex(FI, PtrVT);
2760  }
2761 
2762  // This is an argument in memory. We might be able to perform copy elision.
2763  if (Flags.isCopyElisionCandidate()) {
2764  EVT ArgVT = Ins[i].ArgVT;
2765  SDValue PartAddr;
2766  if (Ins[i].PartOffset == 0) {
2767  // If this is a one-part value or the first part of a multi-part value,
2768  // create a stack object for the entire argument value type and return a
2769  // load from our portion of it. This assumes that if the first part of an
2770  // argument is in memory, the rest will also be in memory.
2771  int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2772  /*Immutable=*/false);
2773  PartAddr = DAG.getFrameIndex(FI, PtrVT);
2774  return DAG.getLoad(
2775  ValVT, dl, Chain, PartAddr,
2776  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2777  } else {
2778  // This is not the first piece of an argument in memory. See if there is
2779  // already a fixed stack object including this offset. If so, assume it
2780  // was created by the PartOffset == 0 branch above and create a load from
2781  // the appropriate offset into it.
2782  int64_t PartBegin = VA.getLocMemOffset();
2783  int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2784  int FI = MFI.getObjectIndexBegin();
2785  for (; MFI.isFixedObjectIndex(FI); ++FI) {
2786  int64_t ObjBegin = MFI.getObjectOffset(FI);
2787  int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2788  if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2789  break;
2790  }
2791  if (MFI.isFixedObjectIndex(FI)) {
2792  SDValue Addr =
2793  DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2794  DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2795  return DAG.getLoad(
2796  ValVT, dl, Chain, Addr,
2797  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2798  Ins[i].PartOffset));
2799  }
2800  }
2801  }
2802 
2803  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2804  VA.getLocMemOffset(), isImmutable);
2805 
2806  // Set SExt or ZExt flag.
2807  if (VA.getLocInfo() == CCValAssign::ZExt) {
2808  MFI.setObjectZExt(FI, true);
2809  } else if (VA.getLocInfo() == CCValAssign::SExt) {
2810  MFI.setObjectSExt(FI, true);
2811  }
2812 
2813  // Adjust SP offset of interrupt parameter.
2814  if (CallConv == CallingConv::X86_INTR) {
2815  MFI.setObjectOffset(FI, Offset);
2816  }
2817 
2818  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2819  SDValue Val = DAG.getLoad(
2820  ValVT, dl, Chain, FIN,
2821  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2822  return ExtendedInMem
2823  ? (VA.getValVT().isVector()
2824  ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2825  : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2826  : Val;
2827 }
2828 
2829 // FIXME: Get this from tablegen.
2830 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2831  const X86Subtarget &Subtarget) {
2832  assert(Subtarget.is64Bit());
2833 
2834  if (Subtarget.isCallingConvWin64(CallConv)) {
2835  static const MCPhysReg GPR64ArgRegsWin64[] = {
2836  X86::RCX, X86::RDX, X86::R8, X86::R9
2837  };
2838  return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2839  }
2840 
2841  static const MCPhysReg GPR64ArgRegs64Bit[] = {
2842  X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2843  };
2844  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2845 }
2846 
2847 // FIXME: Get this from tablegen.
2848 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2849  CallingConv::ID CallConv,
2850  const X86Subtarget &Subtarget) {
2851  assert(Subtarget.is64Bit());
2852  if (Subtarget.isCallingConvWin64(CallConv)) {
2853  // The XMM registers which might contain var arg parameters are shadowed
2854  // in their paired GPR. So we only need to save the GPR to their home
2855  // slots.
2856  // TODO: __vectorcall will change this.
2857  return None;
2858  }
2859 
2860  const Function *Fn = MF.getFunction();
2861  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2862  bool isSoftFloat = Subtarget.useSoftFloat();
2863  assert(!(isSoftFloat && NoImplicitFloatOps) &&
2864  "SSE register cannot be used when SSE is disabled!");
2865  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2866  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2867  // registers.
2868  return None;
2869 
2870  static const MCPhysReg XMMArgRegs64Bit[] = {
2871  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2872  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2873  };
2874  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2875 }
2876 
2877 #ifndef NDEBUG
2878 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2879  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2880  [](const CCValAssign &A, const CCValAssign &B) -> bool {
2881  return A.getValNo() < B.getValNo();
2882  });
2883 }
2884 #endif
2885 
2886 SDValue X86TargetLowering::LowerFormalArguments(
2887  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2888  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2889  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2890  MachineFunction &MF = DAG.getMachineFunction();
2891  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2892  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2893 
2894  const Function *Fn = MF.getFunction();
2895  if (Fn->hasExternalLinkage() &&
2896  Subtarget.isTargetCygMing() &&
2897  Fn->getName() == "main")
2898  FuncInfo->setForceFramePointer(true);
2899 
2900  MachineFrameInfo &MFI = MF.getFrameInfo();
2901  bool Is64Bit = Subtarget.is64Bit();
2902  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2903 
2904  assert(
2905  !(isVarArg && canGuaranteeTCO(CallConv)) &&
2906  "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2907 
2908  if (CallConv == CallingConv::X86_INTR) {
2909  bool isLegal = Ins.size() == 1 ||
2910  (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2911  (!Is64Bit && Ins[1].VT == MVT::i32)));
2912  if (!isLegal)
2913  report_fatal_error("X86 interrupts may take one or two arguments");
2914  }
2915 
2916  // Assign locations to all of the incoming arguments.
2917  SmallVector<CCValAssign, 16> ArgLocs;
2918  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2919 
2920  // Allocate shadow area for Win64.
2921  if (IsWin64)
2922  CCInfo.AllocateStack(32, 8);
2923 
2924  CCInfo.AnalyzeArguments(Ins, CC_X86);
2925 
2926  // In vectorcall calling convention a second pass is required for the HVA
2927  // types.
2928  if (CallingConv::X86_VectorCall == CallConv) {
2929  CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2930  }
2931 
2932  // The next loop assumes that the locations are in the same order as the
2933  // input arguments.
2934  assert(isSortedByValueNo(ArgLocs) &&
2935  "Argument Location list must be sorted before lowering");
2936 
2937  SDValue ArgValue;
2938  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2939  ++I, ++InsIndex) {
2940  assert(InsIndex < Ins.size() && "Invalid Ins index");
2941  CCValAssign &VA = ArgLocs[I];
2942 
2943  if (VA.isRegLoc()) {
2944  EVT RegVT = VA.getLocVT();
2945  if (VA.needsCustom()) {
2946  assert(
2947  VA.getValVT() == MVT::v64i1 &&
2948  "Currently the only custom case is when we split v64i1 to 2 regs");
2949 
2950  // In the regcall calling convention on a 32 bit target, v64i1
2951  // values are split up into two registers.
2952  ArgValue =
2953  getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2954  } else {
2955  const TargetRegisterClass *RC;
2956  if (RegVT == MVT::i32)
2957  RC = &X86::GR32RegClass;
2958  else if (Is64Bit && RegVT == MVT::i64)
2959  RC = &X86::GR64RegClass;
2960  else if (RegVT == MVT::f32)
2961  RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2962  else if (RegVT == MVT::f64)
2963  RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2964  else if (RegVT == MVT::f80)
2965  RC = &X86::RFP80RegClass;
2966  else if (RegVT == MVT::f128)
2967  RC = &X86::FR128RegClass;
2968  else if (RegVT.is512BitVector())
2969  RC = &X86::VR512RegClass;
2970  else if (RegVT.is256BitVector())
2971  RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2972  else if (RegVT.is128BitVector())
2973  RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2974  else if (RegVT == MVT::x86mmx)
2975  RC = &X86::VR64RegClass;
2976  else if (RegVT == MVT::v1i1)
2977  RC = &X86::VK1RegClass;
2978  else if (RegVT == MVT::v8i1)
2979  RC = &X86::VK8RegClass;
2980  else if (RegVT == MVT::v16i1)
2981  RC = &X86::VK16RegClass;
2982  else if (RegVT == MVT::v32i1)
2983  RC = &X86::VK32RegClass;
2984  else if (RegVT == MVT::v64i1)
2985  RC = &X86::VK64RegClass;
2986  else
2987  llvm_unreachable("Unknown argument type!");
2988 
2989  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2990  ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2991  }
2992 
2993  // If this is an 8 or 16-bit value, it is really passed promoted to 32
2994  // bits. Insert an assert[sz]ext to capture this, then truncate to the
2995  // right size.
2996  if (VA.getLocInfo() == CCValAssign::SExt)
2997  ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2998  DAG.getValueType(VA.getValVT()));
2999  else if (VA.getLocInfo() == CCValAssign::ZExt)
3000  ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3001  DAG.getValueType(VA.getValVT()));
3002  else if (VA.getLocInfo() == CCValAssign::BCvt)
3003  ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3004 
3005  if (VA.isExtInLoc()) {
3006  // Handle MMX values passed in XMM regs.
3007  if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3008  ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3009  else if (VA.getValVT().isVector() &&
3010  VA.getValVT().getScalarType() == MVT::i1 &&
3011  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3012  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3013  // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3014  ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3015  } else
3016  ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3017  }
3018  } else {
3019  assert(VA.isMemLoc());
3020  ArgValue =
3021  LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3022  }
3023 
3024  // If value is passed via pointer - do a load.
3025  if (VA.getLocInfo() == CCValAssign::Indirect)
3026  ArgValue =
3027  DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3028 
3029  InVals.push_back(ArgValue);
3030  }
3031 
3032  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3033  // Swift calling convention does not require we copy the sret argument
3034  // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3035  if (CallConv == CallingConv::Swift)
3036  continue;
3037 
3038  // All x86 ABIs require that for returning structs by value we copy the
3039  // sret argument into %rax/%eax (depending on ABI) for the return. Save
3040  // the argument into a virtual register so that we can access it from the
3041  // return points.
3042  if (Ins[I].Flags.isSRet()) {
3043  unsigned Reg = FuncInfo->getSRetReturnReg();
3044  if (!Reg) {
3045  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3046  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3047  FuncInfo->setSRetReturnReg(Reg);
3048  }
3049  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3050  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3051  break;
3052  }
3053  }
3054 
3055  unsigned StackSize = CCInfo.getNextStackOffset();
3056  // Align stack specially for tail calls.
3057  if (shouldGuaranteeTCO(CallConv,
3058  MF.getTarget().Options.GuaranteedTailCallOpt))
3059  StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3060 
3061  // If the function takes a variable number of arguments, make a frame index for
3062  // the start of the first vararg value... for expansion of llvm.va_start. We
3063  // can skip this if there are no va_start calls.
3064  if (MFI.hasVAStart() &&
3065  (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3066  CallConv != CallingConv::X86_ThisCall))) {
3067  FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3068  }
3069 
3070  // Figure out if XMM registers are in use.
3071  assert(!(Subtarget.useSoftFloat() &&
3072  Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3073  "SSE register cannot be used when SSE is disabled!");
3074 
3075  // 64-bit calling conventions support varargs and register parameters, so we
3076  // have to do extra work to spill them in the prologue.
3077  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3078  // Find the first unallocated argument registers.
3079  ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3080  ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3081  unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3082  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3083  assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3084  "SSE register cannot be used when SSE is disabled!");
3085 
3086  // Gather all the live in physical registers.
3087  SmallVector<SDValue, 6> LiveGPRs;
3088  SmallVector<SDValue, 8> LiveXMMRegs;
3089  SDValue ALVal;
3090  for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3091  unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3092  LiveGPRs.push_back(
3093  DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3094  }
3095  if (!ArgXMMs.empty()) {
3096  unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3097  ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3098  for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3099  unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3100  LiveXMMRegs.push_back(
3101  DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3102  }
3103  }
3104 
3105  if (IsWin64) {
3106  // Get to the caller-allocated home save location. Add 8 to account
3107  // for the return address.
3108  int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3109  FuncInfo->setRegSaveFrameIndex(
3110  MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3111  // Fixup to set vararg frame on shadow area (4 x i64).
3112  if (NumIntRegs < 4)
3113  FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3114  } else {
3115  // For X86-64, if there are vararg parameters that are passed via
3116  // registers, then we must store them to their spots on the stack so
3117  // they may be loaded by dereferencing the result of va_next.
3118  FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3119  FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3120  FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3121  ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3122  }
3123 
3124  // Store the integer parameter registers.
3125  SmallVector<SDValue, 8> MemOps;
3126  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3127  getPointerTy(DAG.getDataLayout()));
3128  unsigned Offset = FuncInfo->getVarArgsGPOffset();
3129  for (SDValue Val : LiveGPRs) {
3130  SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3131  RSFIN, DAG.getIntPtrConstant(Offset, dl));
3132  SDValue Store =
3133  DAG.getStore(Val.getValue(1), dl, Val, FIN,
3134  MachinePointerInfo::getFixedStack(
3135  DAG.getMachineFunction(),
3136  FuncInfo->getRegSaveFrameIndex(), Offset));
3137  MemOps.push_back(Store);
3138  Offset += 8;
3139  }
3140 
3141  if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3142  // Now store the XMM (fp + vector) parameter registers.
3143  SmallVector<SDValue, 12> SaveXMMOps;
3144  SaveXMMOps.push_back(Chain);
3145  SaveXMMOps.push_back(ALVal);
3146  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3147  FuncInfo->getRegSaveFrameIndex(), dl));
3148  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3149  FuncInfo->getVarArgsFPOffset(), dl));
3150  SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3151  LiveXMMRegs.end());
3152  MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3153  MVT::Other, SaveXMMOps));
3154  }
3155 
3156  if (!MemOps.empty())
3157  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3158  }
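// Worked example of the register save area built above for x86-64 SysV: the
// six GPR argument registers occupy 6 * 8 = 48 bytes, followed (when the XMM
// registers are available) by 8 * 16 = 128 bytes of XMM state, a 176 byte
// block in total; va_start then sees gp_offset = NumIntRegs * 8 and
// fp_offset = 48 + NumXMMRegs * 16 into that block.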
3159 
3160  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3161  // Find the largest legal vector type.
3162  MVT VecVT = MVT::Other;
3163  // FIXME: Only some x86_32 calling conventions support AVX512.
3164  if (Subtarget.hasAVX512() &&
3165  (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3166  CallConv == CallingConv::Intel_OCL_BI)))
3167  VecVT = MVT::v16f32;
3168  else if (Subtarget.hasAVX())
3169  VecVT = MVT::v8f32;
3170  else if (Subtarget.hasSSE2())
3171  VecVT = MVT::v4f32;
3172 
3173  // We forward some GPRs and some vector types.
3174  SmallVector<MVT, 2> RegParmTypes;
3175  MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3176  RegParmTypes.push_back(IntVT);
3177  if (VecVT != MVT::Other)
3178  RegParmTypes.push_back(VecVT);
3179 
3180  // Compute the set of forwarded registers. The rest are scratch.
3181  SmallVectorImpl<ForwardedRegister> &Forwards =
3182  FuncInfo->getForwardedMustTailRegParms();
3183  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3184 
3185  // Conservatively forward AL on x86_64, since it might be used for varargs.
3186  if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3187  unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3188  Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3189  }
3190 
3191  // Copy all forwards from physical to virtual registers.
3192  for (ForwardedRegister &F : Forwards) {
3193  // FIXME: Can we use a less constrained schedule?
3194  SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3195  F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3196  Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3197  }
3198  }
3199 
3200  // Some CCs need callee pop.
3201  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3202  MF.getTarget().Options.GuaranteedTailCallOpt)) {
3203  FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3204  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3205  // X86 interrupts must pop the error code (and the alignment padding) if
3206  // present.
3207  FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3208  } else {
3209  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3210  // If this is an sret function, the return should pop the hidden pointer.
3211  if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3212  !Subtarget.getTargetTriple().isOSMSVCRT() &&
3213  argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3214  FuncInfo->setBytesToPopOnReturn(4);
3215  }
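// For example, a 32 bit stdcall function pops its entire argument area on
// return, while a 32 bit non-MSVCRT function that returns a struct through an
// sret pointer pops just that hidden pointer, so its epilogue ends in
// 'ret $4' instead of a plain 'ret'.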
3216 
3217  if (!Is64Bit) {
3218  // RegSaveFrameIndex is X86-64 only.
3219  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3220  if (CallConv == CallingConv::X86_FastCall ||
3221  CallConv == CallingConv::X86_ThisCall)
3222  // fastcc functions can't have varargs.
3223  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3224  }
3225 
3226  FuncInfo->setArgumentStackSize(StackSize);
3227 
3228  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3229  EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3230  if (Personality == EHPersonality::CoreCLR) {
3231  assert(Is64Bit);
3232  // TODO: Add a mechanism to frame lowering that will allow us to indicate
3233  // that we'd prefer this slot be allocated towards the bottom of the frame
3234  // (i.e. near the stack pointer after allocating the frame). Every
3235  // funclet needs a copy of this slot in its (mostly empty) frame, and the
3236  // offset from the bottom of this and each funclet's frame must be the
3237  // same, so the size of funclets' (mostly empty) frames is dictated by
3238  // how far this slot is from the bottom (since they allocate just enough
3239  // space to accommodate holding this slot at the correct offset).
3240  int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3241  EHInfo->PSPSymFrameIdx = PSPSymFI;
3242  }
3243  }
3244 
3245  if (CallConv == CallingConv::X86_RegCall ||
3246  Fn->hasFnAttribute("no_caller_saved_registers")) {
3247  const MachineRegisterInfo &MRI = MF.getRegInfo();
3248  for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3249  MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3250  }
3251 
3252  return Chain;
3253 }
3254 
3255 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3256  SDValue Arg, const SDLoc &dl,
3257  SelectionDAG &DAG,
3258  const CCValAssign &VA,
3259  ISD::ArgFlagsTy Flags) const {
3260  unsigned LocMemOffset = VA.getLocMemOffset();
3261  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3262  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3263  StackPtr, PtrOff);
3264  if (Flags.isByVal())
3265  return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3266 
3267  return DAG.getStore(
3268  Chain, dl, Arg, PtrOff,
3269  MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3270 }
3271 
3272 /// Emit a load of return address if tail call
3273 /// optimization is performed and it is required.
3274 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3275  SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3276  bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3277  // Adjust the Return address stack slot.
3278  EVT VT = getPointerTy(DAG.getDataLayout());
3279  OutRetAddr = getReturnAddressFrameIndex(DAG);
3280 
3281  // Load the "old" Return address.
3282  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3283  return SDValue(OutRetAddr.getNode(), 1);
3284 }
3285 
3286 /// Emit a store of the return address if tail call
3287 /// optimization is performed and it is required (FPDiff!=0).
3288 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3289  SDValue Chain, SDValue RetAddrFrIdx,
3290  EVT PtrVT, unsigned SlotSize,
3291  int FPDiff, const SDLoc &dl) {
3292  // Store the return address to the appropriate stack slot.
3293  if (!FPDiff) return Chain;
3294  // Calculate the new stack slot for the return address.
3295  int NewReturnAddrFI =
3296  MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3297  false);
3298  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3299  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3300  MachinePointerInfo::getFixedStack(
3301  DAG.getMachineFunction(), NewReturnAddrFI));
3302  return Chain;
3303 }
3304 
3305 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3306 /// operation of specified width.
3307 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3308  SDValue V2) {
3309  unsigned NumElems = VT.getVectorNumElements();
3310  SmallVector<int, 8> Mask;
3311  Mask.push_back(NumElems);
3312  for (unsigned i = 1; i != NumElems; ++i)
3313  Mask.push_back(i);
3314  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3315 }
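// Example of the mask built above: for v2i64 the mask is <2, 1>, i.e. element
// 0 is taken from V2 and element 1 from V1, which is the movsd/movss "move
// low element" pattern; for v4f32 the mask is <4, 1, 2, 3>.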
3316 
3317 SDValue
3318 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3319  SmallVectorImpl<SDValue> &InVals) const {
3320  SelectionDAG &DAG = CLI.DAG;
3321  SDLoc &dl = CLI.DL;
3322  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3323  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3324  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3325  SDValue Chain = CLI.Chain;
3326  SDValue Callee = CLI.Callee;
3327  CallingConv::ID CallConv = CLI.CallConv;
3328  bool &isTailCall = CLI.IsTailCall;
3329  bool isVarArg = CLI.IsVarArg;
3330 
3331  MachineFunction &MF = DAG.getMachineFunction();
3332  bool Is64Bit = Subtarget.is64Bit();
3333  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3334  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3335  bool IsSibcall = false;
3336  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3337  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3338  const CallInst *CI =
3339  CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3340  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3341  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3342  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3343 
3344  if (CallConv == CallingConv::X86_INTR)
3345  report_fatal_error("X86 interrupts may not be called directly");
3346 
3347  if (Attr.getValueAsString() == "true")
3348  isTailCall = false;
3349 
3350  if (Subtarget.isPICStyleGOT() &&
3351  !MF.getTarget().Options.GuaranteedTailCallOpt) {
3352  // If we are using a GOT, disable tail calls to external symbols with
3353  // default visibility. Tail calling such a symbol requires using a GOT
3354  // relocation, which forces early binding of the symbol. This breaks code
3355  // that requires lazy function symbol resolution. Using musttail or
3356  // GuaranteedTailCallOpt will override this.
3357  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3358  if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3359  G->getGlobal()->hasDefaultVisibility()))
3360  isTailCall = false;
3361  }
3362 
3363  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3364  if (IsMustTail) {
3365  // Force this to be a tail call. The verifier rules are enough to ensure
3366  // that we can lower this successfully without moving the return address
3367  // around.
3368  isTailCall = true;
3369  } else if (isTailCall) {
3370  // Check if it's really possible to do a tail call.
3371  isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3372  isVarArg, SR != NotStructReturn,
3373  MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3374  Outs, OutVals, Ins, DAG);
3375 
3376  // Sibcalls are automatically detected tailcalls which do not require
3377  // ABI changes.
3378  if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3379  IsSibcall = true;
3380 
3381  if (isTailCall)
3382  ++NumTailCalls;
3383  }
3384 
3385  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3386  "Var args not supported with calling convention fastcc, ghc or hipe");
3387 
3388  // Analyze operands of the call, assigning locations to each operand.
3389  SmallVector<CCValAssign, 16> ArgLocs;
3390  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3391 
3392  // Allocate shadow area for Win64.
3393  if (IsWin64)
3394  CCInfo.AllocateStack(32, 8);
3395 
3396  CCInfo.AnalyzeArguments(Outs, CC_X86);
3397 
3398  // In vectorcall calling convention a second pass is required for the HVA
3399  // types.
3400  if (CallingConv::X86_VectorCall == CallConv) {
3401  CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3402  }
3403 
3404  // Get a count of how many bytes are to be pushed on the stack.
3405  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3406  if (IsSibcall)
3407  // This is a sibcall. The memory operands are available in caller's
3408  // own caller's stack.
3409  NumBytes = 0;
3410  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3411  canGuaranteeTCO(CallConv))
3412  NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3413 
3414  int FPDiff = 0;
3415  if (isTailCall && !IsSibcall && !IsMustTail) {
3416  // Lower arguments at fp - stackoffset + fpdiff.
3417  unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3418 
3419  FPDiff = NumBytesCallerPushed - NumBytes;
3420 
3421  // Set the delta of movement of the returnaddr stackslot.
3422  // But only update it if the new delta is smaller than the previous one.
3423  if (FPDiff < X86Info->getTCReturnAddrDelta())
3424  X86Info->setTCReturnAddrDelta(FPDiff);
3425  }
3426 
3427  unsigned NumBytesToPush = NumBytes;
3428  unsigned NumBytesToPop = NumBytes;
3429 
3430  // If we have an inalloca argument, all stack space has already been allocated
3431  // for us and is right at the top of the stack. We don't support multiple
3432  // arguments passed in memory when using inalloca.
3433  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3434  NumBytesToPush = 0;
3435  if (!ArgLocs.back().isMemLoc())
3436  report_fatal_error("cannot use inalloca attribute on a register "
3437  "parameter");
3438  if (ArgLocs.back().getLocMemOffset() != 0)
3439  report_fatal_error("any parameter with the inalloca attribute must be "
3440  "the only memory argument");
3441  }
3442 
3443  if (!IsSibcall)
3444  Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3445  NumBytes - NumBytesToPush, dl);
3446 
3447  SDValue RetAddrFrIdx;
3448  // Load return address for tail calls.
3449  if (isTailCall && FPDiff)
3450  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3451  Is64Bit, FPDiff, dl);
3452 
3453  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3454  SmallVector<SDValue, 8> MemOpChains;
3455  SDValue StackPtr;
3456 
3457  // The next loop assumes that the locations are in the same order as the
3458  // input arguments.
3459  assert(isSortedByValueNo(ArgLocs) &&
3460  "Argument Location list must be sorted before lowering");
3461 
3462  // Walk the register/memloc assignments, inserting copies/loads. In the case
3463  // of tail call optimization, arguments are handled later.
3464  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3465  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3466  ++I, ++OutIndex) {
3467  assert(OutIndex < Outs.size() && "Invalid Out index");
3468  // Skip inalloca arguments, they have already been written.
3469  ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3470  if (Flags.isInAlloca())
3471  continue;
3472 
3473  CCValAssign &VA = ArgLocs[I];
3474  EVT RegVT = VA.getLocVT();
3475  SDValue Arg = OutVals[OutIndex];
3476  bool isByVal = Flags.isByVal();
3477 
3478  // Promote the value if needed.
3479  switch (VA.getLocInfo()) {
3480  default: llvm_unreachable("Unknown loc info!");
3481  case CCValAssign::Full: break;
3482  case CCValAssign::SExt:
3483  Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3484  break;
3485  case CCValAssign::ZExt:
3486  Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3487  break;
3488  case CCValAssign::AExt:
3489  if (Arg.getValueType().isVector() &&
3490  Arg.getValueType().getVectorElementType() == MVT::i1)
3491  Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3492  else if (RegVT.is128BitVector()) {
3493  // Special case: passing MMX values in XMM registers.
3494  Arg = DAG.getBitcast(MVT::i64, Arg);
3495  Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3496  Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3497  } else
3498  Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3499  break;
3500  case CCValAssign::BCvt:
3501  Arg = DAG.getBitcast(RegVT, Arg);
3502  break;
3503  case CCValAssign::Indirect: {
3504  // Store the argument.
3505  SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3506  int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3507  Chain = DAG.getStore(
3508  Chain, dl, Arg, SpillSlot,
3509  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3510  Arg = SpillSlot;
3511  break;
3512  }
3513  }
3514 
3515  if (VA.needsCustom()) {
3516  assert(VA.getValVT() == MVT::v64i1 &&
3517  "Currently the only custom case is when we split v64i1 to 2 regs");
3518  // Split v64i1 value into two registers
3519  Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3520  Subtarget);
3521  } else if (VA.isRegLoc()) {
3522  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3523  if (isVarArg && IsWin64) {
3524  // Win64 ABI requires argument XMM reg to be copied to the corresponding
3525  // shadow reg if callee is a varargs function.
3526  unsigned ShadowReg = 0;
3527  switch (VA.getLocReg()) {
3528  case X86::XMM0: ShadowReg = X86::RCX; break;
3529  case X86::XMM1: ShadowReg = X86::RDX; break;
3530  case X86::XMM2: ShadowReg = X86::R8; break;
3531  case X86::XMM3: ShadowReg = X86::R9; break;
3532  }
3533  if (ShadowReg)
3534  RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3535  }
3536  } else if (!IsSibcall && (!isTailCall || isByVal)) {
3537  assert(VA.isMemLoc());
3538  if (!StackPtr.getNode())
3539  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3540  getPointerTy(DAG.getDataLayout()));
3541  MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3542  dl, DAG, VA, Flags));
3543  }
3544  }
3545 
3546  if (!MemOpChains.empty())
3547  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3548 
3549  if (Subtarget.isPICStyleGOT()) {
3550  // ELF / PIC requires GOT in the EBX register before function calls via PLT
3551  // GOT pointer.
3552  if (!isTailCall) {
3553  RegsToPass.push_back(std::make_pair(
3554  unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3555  getPointerTy(DAG.getDataLayout()))));
3556  } else {
3557  // If we are tail calling and generating PIC/GOT style code load the
3558  // address of the callee into ECX. The value in ecx is used as target of
3559  // the tail jump. This is done to circumvent the ebx/callee-saved problem
3560  // for tail calls on PIC/GOT architectures. Normally we would just put the
3561  // address of GOT into ebx and then call target@PLT. But for tail calls
3562  // ebx would be restored (since ebx is callee saved) before jumping to the
3563  // target@PLT.
3564 
3565  // Note: The actual moving to ECX is done further down.
3566  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3567  if (G && !G->getGlobal()->hasLocalLinkage() &&
3568  G->getGlobal()->hasDefaultVisibility())
3569  Callee = LowerGlobalAddress(Callee, DAG);
3570  else if (isa<ExternalSymbolSDNode>(Callee))
3571  Callee = LowerExternalSymbol(Callee, DAG);
3572  }
3573  }
3574 
3575  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3576  // From AMD64 ABI document:
3577  // For calls that may call functions that use varargs or stdargs
3578  // (prototype-less calls or calls to functions containing ellipsis (...) in
3579  // the declaration) %al is used as hidden argument to specify the number
3580  // of SSE registers used. The contents of %al do not need to match exactly
3581  // the number of registers, but must be an upper bound on the number of SSE
3582  // registers used and is in the range 0 - 8 inclusive.
3583 
3584  // Count the number of XMM registers allocated.
3585  static const MCPhysReg XMMArgRegs[] = {
3586  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3587  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3588  };
3589  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3590  assert((Subtarget.hasSSE1() || !NumXMMRegs)
3591  && "SSE registers cannot be used when SSE is disabled");
3592 
3593  RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3594  DAG.getConstant(NumXMMRegs, dl,
3595  MVT::i8)));
3596  }
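// For example, a varargs call that passes a single double in XMM0 is preceded
// by roughly
//   movb $1, %al
// telling the callee's va_arg machinery an upper bound on how many of the
// eight XMM registers may hold arguments, per the ABI text quoted above.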
3597 
3598  if (isVarArg && IsMustTail) {
3599  const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3600  for (const auto &F : Forwards) {
3601  SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3602  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3603  }
3604  }
3605 
3606  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3607  // don't need this because the eligibility check rejects calls that require
3608  // shuffling arguments passed in memory.
3609  if (!IsSibcall && isTailCall) {
3610  // Force all the incoming stack arguments to be loaded from the stack
3611  // before any new outgoing arguments are stored to the stack, because the
3612  // outgoing stack slots may alias the incoming argument stack slots, and
3613  // the alias isn't otherwise explicit. This is slightly more conservative
3614  // than necessary, because it means that each store effectively depends
3615  // on every argument instead of just those arguments it would clobber.
3616  SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3617 
3618  SmallVector<SDValue, 8> MemOpChains2;
3619  SDValue FIN;
3620  int FI = 0;
3621  for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3622  ++I, ++OutsIndex) {
3623  CCValAssign &VA = ArgLocs[I];
3624 
3625  if (VA.isRegLoc()) {
3626  if (VA.needsCustom()) {
3627  assert((CallConv == CallingConv::X86_RegCall) &&
3628  "Expecting custom case only in regcall calling convention");
3629  // This means that we are in a special case where one argument was
3630  // passed through two register locations - skip the next location.
3631  ++I;
3632  }
3633 
3634  continue;
3635  }
3636 
3637  assert(VA.isMemLoc());
3638  SDValue Arg = OutVals[OutsIndex];
3639  ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3640  // Skip inalloca arguments. They don't require any work.
3641  if (Flags.isInAlloca())
3642  continue;
3643  // Create frame index.
3644  int32_t Offset = VA.getLocMemOffset()+FPDiff;
3645  uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3646  FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3647  FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3648 
3649  if (Flags.isByVal()) {
3650  // Copy relative to framepointer.
3651  SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3652  if (!StackPtr.getNode())
3653  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3654  getPointerTy(DAG.getDataLayout()));
3655  Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3656  StackPtr, Source);
3657 
3658  MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3659  ArgChain,
3660  Flags, DAG, dl));
3661  } else {
3662  // Store relative to framepointer.
3663  MemOpChains2.push_back(DAG.getStore(
3664  ArgChain, dl, Arg, FIN,
3665  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3666  }
3667  }
3668 
3669  if (!MemOpChains2.empty())
3670  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3671 
3672  // Store the return address to the appropriate stack slot.
3673  Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3674  getPointerTy(DAG.getDataLayout()),
3675  RegInfo->getSlotSize(), FPDiff, dl);
3676  }
3677 
3678  // Build a sequence of copy-to-reg nodes chained together with token chain
3679  // and flag operands which copy the outgoing args into registers.
3680  SDValue InFlag;
3681  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3682  Chain = DAG.g