1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/DiagnosticInfo.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/GlobalAlias.h"
46 #include "llvm/IR/GlobalVariable.h"
47 #include "llvm/IR/Instructions.h"
48 #include "llvm/IR/Intrinsics.h"
49 #include "llvm/MC/MCAsmInfo.h"
50 #include "llvm/MC/MCContext.h"
51 #include "llvm/MC/MCExpr.h"
52 #include "llvm/MC/MCSymbol.h"
54 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/KnownBits.h"
60 #include <algorithm>
61 #include <bitset>
62 #include <cctype>
63 #include <numeric>
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "x86-isel"
67 
68 STATISTIC(NumTailCalls, "Number of tail calls");
69 
70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
71     "x86-experimental-vector-widening-legalization", cl::init(false),
72  cl::desc("Enable an experimental vector type legalization through widening "
73  "rather than promotion."),
74  cl::Hidden);
75 
76 static cl::opt<int> ExperimentalPrefLoopAlignment(
77     "x86-experimental-pref-loop-alignment", cl::init(4),
78  cl::desc("Sets the preferable loop alignment for experiments "
79  "(the last x86-experimental-pref-loop-alignment bits"
80  " of the loop header PC will be 0)."),
81  cl::Hidden);
82 
83 static cl::opt<bool> MulConstantOptimization(
84     "mul-constant-optimization", cl::init(true),
85  cl::desc("Replace 'mul x, Const' with more effective instructions like "
86  "SHIFT, LEA, etc."),
87  cl::Hidden);
88 
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
92 /// crashing.
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94  const char *Msg) {
95   MachineFunction &MF = DAG.getMachineFunction();
96   DAG.getContext()->diagnose(
97       DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
98 }
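// Illustrative use (a sketch, mirroring calls that appear later in this file):
// lowering code reports a recoverable diagnostic like
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// and then substitutes a safe value (e.g. FP0) so compilation can continue.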
99 
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101                                      const X86Subtarget &STI)
102  : TargetLowering(TM), Subtarget(STI) {
103  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104  X86ScalarSSEf64 = Subtarget.hasSSE2();
105  X86ScalarSSEf32 = Subtarget.hasSSE1();
106  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
107 
108  // Set up the TargetLowering object.
109 
110   // X86 is weird. It always uses i8 for shift amounts and setcc results.
111   setBooleanContents(ZeroOrOneBooleanContent);
112   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
114 
115  // For 64-bit, since we have so many registers, use the ILP scheduler.
116  // For 32-bit, use the register pressure specific scheduling.
117  // For Atom, always use ILP scheduling.
118   if (Subtarget.isAtom())
119     setSchedulingPreference(Sched::ILP);
120   else if (Subtarget.is64Bit())
121     setSchedulingPreference(Sched::ILP);
122   else
123     setSchedulingPreference(Sched::RegPressure);
124   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
126 
127  // Bypass expensive divides and use cheaper ones.
128  if (TM.getOptLevel() >= CodeGenOpt::Default) {
129  if (Subtarget.hasSlowDivide32())
130  addBypassSlowDiv(32, 8);
131  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132  addBypassSlowDiv(64, 32);
133  }
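  // For example, addBypassSlowDiv(64, 32) lets CodeGenPrepare emit a run-time
  // check and use a 32-bit divide whenever both operands of a 64-bit division
  // fit in 32 bits, avoiding the slow 64-bit DIV on these subtargets.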
134 
135  if (Subtarget.isTargetKnownWindowsMSVC() ||
136  Subtarget.isTargetWindowsItanium()) {
137  // Setup Windows compiler runtime calls.
138  setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139  setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140  setLibcallName(RTLIB::SREM_I64, "_allrem");
141  setLibcallName(RTLIB::UREM_I64, "_aullrem");
142     setLibcallName(RTLIB::MUL_I64, "_allmul");
143     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
148   }
149 
150   if (Subtarget.isTargetDarwin()) {
151     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152     setUseUnderscoreSetJmp(false);
153     setUseUnderscoreLongJmp(false);
154   } else if (Subtarget.isTargetWindowsGNU()) {
155     // MS runtime is weird: it exports _setjmp, but longjmp!
156     setUseUnderscoreSetJmp(true);
157     setUseUnderscoreLongJmp(false);
158   } else {
159     setUseUnderscoreSetJmp(true);
160     setUseUnderscoreLongJmp(true);
161   }
162 
163  // Set up the register classes.
164  addRegisterClass(MVT::i8, &X86::GR8RegClass);
165  addRegisterClass(MVT::i16, &X86::GR16RegClass);
166  addRegisterClass(MVT::i32, &X86::GR32RegClass);
167  if (Subtarget.is64Bit())
168  addRegisterClass(MVT::i64, &X86::GR64RegClass);
169 
170   for (MVT VT : MVT::integer_valuetypes())
171     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
172 
173   // We don't accept any truncstore of integer registers.
174   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179   setTruncStoreAction(MVT::i16, MVT::i8 , Expand);
180 
181   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
182 
183   // SETOEQ and SETUNE require checking two conditions.
184   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
190 
191  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
192  // operation.
196 
197  if (Subtarget.is64Bit()) {
198  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
199  // f32/f64 are legal, f80 is custom.
201  else
204  } else if (!Subtarget.useSoftFloat()) {
205  // We have an algorithm for SSE2->double, and we turn this into a
206  // 64-bit FILD followed by conditional FADD for other targets.
208  // We have an algorithm for SSE2, and we turn this into a 64-bit
209  // FILD or VCVTUSI2SS/SD for other targets.
211  }
212 
213  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
214  // this operation.
217 
218  if (!Subtarget.useSoftFloat()) {
219  // SSE has no i16 to fp conversion, only i32.
220  if (X86ScalarSSEf32) {
222  // f32 and f64 cases are Legal, f80 case is not
224  } else {
227  }
228  } else {
231  }
232 
233  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
234  // this operation.
237 
238  if (!Subtarget.useSoftFloat()) {
239  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
240  // are Legal, f80 is custom lowered.
243 
244  if (X86ScalarSSEf32) {
246  // f32 and f64 cases are Legal, f80 case is not
248  } else {
251  }
252  } else {
256  }
257 
258  // Handle FP_TO_UINT by promoting the destination to a larger signed
259  // conversion.
263 
264  if (Subtarget.is64Bit()) {
265  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
266  // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
269  } else {
272  }
273  } else if (!Subtarget.useSoftFloat()) {
274  // Since AVX is a superset of SSE3, only check for SSE here.
275  if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
276  // Expand FP_TO_UINT into a select.
277  // FIXME: We would like to use a Custom expander here eventually to do
278  // the optimal thing for SSE vs. the default expansion in the legalizer.
280  else
281  // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
282  // With SSE3 we can use fisttpll to convert to a signed i64; without
283  // SSE, we're stuck with a fistpll.
285 
287  }
288 
289  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
290  if (!X86ScalarSSEf64) {
293  if (Subtarget.is64Bit()) {
295  // Without SSE, i64->f64 goes through memory.
297  }
298  } else if (!Subtarget.is64Bit())
300 
301  // Scalar integer divide and remainder are lowered to use operations that
302  // produce two results, to match the available instructions. This exposes
303  // the two-result form to trivial CSE, which is able to combine x/y and x%y
304  // into a single instruction.
305  //
306  // Scalar integer multiply-high is also lowered to use two-result
307  // operations, to match the available instructions. However, plain multiply
308  // (low) operations are left as Legal, as there are single-result
309  // instructions for this in x86. Using the two-result multiply instructions
310  // when both high and low results are needed must be arranged by dagcombine.
311  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
318  }
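  // For example, 'c = a / b; d = a % b;' should CSE into a single hardware
  // divide, which produces the quotient and the remainder in one instruction
  // (EAX/EDX for a 32-bit idiv).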
319 
322  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
326  }
327  if (Subtarget.is64Bit())
333 
338 
339  // Promote the i8 variants and force them on up to i32 which has a shorter
340  // encoding.
343  if (!Subtarget.hasBMI()) {
348  if (Subtarget.is64Bit()) {
351  }
352  }
353 
354  if (Subtarget.hasLZCNT()) {
355  // When promoting the i8 variants, force them to i32 for a shorter
356  // encoding.
359  } else {
366  if (Subtarget.is64Bit()) {
369  }
370  }
371 
372  // Special handling for half-precision floating point conversions.
373  // If we don't have F16C support, then lower half float conversions
374  // into library calls.
375  if (Subtarget.useSoftFloat() ||
376  (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
379  }
380 
381  // There's never any support for operations beyond MVT::f32.
386 
393 
394  if (Subtarget.hasPOPCNT()) {
396  } else {
400  if (Subtarget.is64Bit())
402  }
403 
405 
406  if (!Subtarget.hasMOVBE())
408 
409  // These should be promoted to a larger select which is supported.
411  // X86 wants to expand cmov itself.
412  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
415  }
416  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
417  if (VT == MVT::i64 && !Subtarget.is64Bit())
418  continue;
421  }
422 
423  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
426 
428  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
429  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
430   // support continuation, user-level threading, and so on. As a result, no
431   // other SjLj exception interfaces are implemented, and please don't build
432   // your own exception handling based on them.
433  // LLVM/Clang supports zero-cost DWARF exception handling.
438  setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
439 
440  // Darwin ABI issue.
441  for (auto VT : { MVT::i32, MVT::i64 }) {
442  if (VT == MVT::i64 && !Subtarget.is64Bit())
443  continue;
450  }
451 
452  // 64-bit shl, sra, srl (iff 32-bit x86)
453  for (auto VT : { MVT::i32, MVT::i64 }) {
454  if (VT == MVT::i64 && !Subtarget.is64Bit())
455  continue;
459  }
460 
461  if (Subtarget.hasSSE1())
463 
465 
466  // Expand certain atomics
467  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
475  }
476 
477  if (Subtarget.hasCmpxchg16b()) {
479  }
480 
481  // FIXME - use subtarget debug flags
482  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
483  !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
486  }
487 
490 
493 
496 
497  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
500  bool Is64Bit = Subtarget.is64Bit();
502  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
503 
506 
508 
509  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
512 
513  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
514  // f32 and f64 use SSE.
515  // Set up the FP register classes.
516  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
517  : &X86::FR32RegClass);
518  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
519  : &X86::FR64RegClass);
520 
521  for (auto VT : { MVT::f32, MVT::f64 }) {
522  // Use ANDPD to simulate FABS.
524 
525  // Use XORP to simulate FNEG.
527 
528  // Use ANDPD and ORPD to simulate FCOPYSIGN.
530 
531  // We don't support sin/cos/fmod
532  setOperationAction(ISD::FSIN , VT, Expand);
533  setOperationAction(ISD::FCOS , VT, Expand);
534  setOperationAction(ISD::FSINCOS, VT, Expand);
535  }
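    // Illustrative effect of the custom handling above: fabs(x) is lowered to
    // an AND that clears the sign bit and fneg(x) to an XOR that flips it,
    // both using the packed SSE logic instructions mentioned in the comments.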
536 
537  // Lower this to MOVMSK plus an AND.
540 
541  // Expand FP immediates into loads from the stack, except for the special
542  // cases we handle.
543  addLegalFPImmediate(APFloat(+0.0)); // xorpd
544  addLegalFPImmediate(APFloat(+0.0f)); // xorps
545  } else if (UseX87 && X86ScalarSSEf32) {
546  // Use SSE for f32, x87 for f64.
547  // Set up the FP register classes.
548  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
549  : &X86::FR32RegClass);
550  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
551 
552  // Use ANDPS to simulate FABS.
554 
555  // Use XORP to simulate FNEG.
557 
559 
560  // Use ANDPS and ORPS to simulate FCOPYSIGN.
563 
564  // We don't support sin/cos/fmod
568 
569  // Special cases we handle for FP constants.
570  addLegalFPImmediate(APFloat(+0.0f)); // xorps
571  addLegalFPImmediate(APFloat(+0.0)); // FLD0
572  addLegalFPImmediate(APFloat(+1.0)); // FLD1
573  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
574  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
575 
576  // Always expand sin/cos functions even though x87 has an instruction.
580  } else if (UseX87) {
581  // f32 and f64 in x87.
582  // Set up the FP register classes.
583  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
584  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
585 
586  for (auto VT : { MVT::f32, MVT::f64 }) {
587  setOperationAction(ISD::UNDEF, VT, Expand);
588  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
589 
590  // Always expand sin/cos functions even though x87 has an instruction.
591  setOperationAction(ISD::FSIN , VT, Expand);
592  setOperationAction(ISD::FCOS , VT, Expand);
593  setOperationAction(ISD::FSINCOS, VT, Expand);
594  }
595  addLegalFPImmediate(APFloat(+0.0)); // FLD0
596  addLegalFPImmediate(APFloat(+1.0)); // FLD1
597  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
598  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
599  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
600  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
601  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
602  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
603  }
604 
605  // We don't support FMA.
608 
609  // Long double always uses X87, except f128 in MMX.
610  if (UseX87) {
611  if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
612  addRegisterClass(MVT::f128, &X86::FR128RegClass);
617  }
618 
619  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
622  {
624  addLegalFPImmediate(TmpFlt); // FLD0
625  TmpFlt.changeSign();
626  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
627 
628  bool ignored;
629  APFloat TmpFlt2(+1.0);
631  &ignored);
632  addLegalFPImmediate(TmpFlt2); // FLD1
633  TmpFlt2.changeSign();
634  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
635  }
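    // In other words, +/-0.0 and +/-1.0 in x87 long double can be materialized
    // with FLD0/FLD1 (plus FCHS for the negated forms) instead of loading an
    // f80 constant from the constant pool.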
636 
637  // Always expand sin/cos functions even though x87 has an instruction.
641 
648  }
649 
650  // Always use a library call for pow.
654 
662 
663  // Some FP actions are always expanded for vector types.
664  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
666  setOperationAction(ISD::FSIN, VT, Expand);
667  setOperationAction(ISD::FSINCOS, VT, Expand);
668  setOperationAction(ISD::FCOS, VT, Expand);
669  setOperationAction(ISD::FREM, VT, Expand);
670  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
671  setOperationAction(ISD::FPOW, VT, Expand);
672  setOperationAction(ISD::FLOG, VT, Expand);
673  setOperationAction(ISD::FLOG2, VT, Expand);
674  setOperationAction(ISD::FLOG10, VT, Expand);
675  setOperationAction(ISD::FEXP, VT, Expand);
676  setOperationAction(ISD::FEXP2, VT, Expand);
677  }
678 
679  // First set operation action for all vector types to either promote
680  // (for widening) or expand (for scalarization). Then we will selectively
681  // turn on ones that can be effectively codegen'd.
682  for (MVT VT : MVT::vector_valuetypes()) {
683  setOperationAction(ISD::SDIV, VT, Expand);
684  setOperationAction(ISD::UDIV, VT, Expand);
685  setOperationAction(ISD::SREM, VT, Expand);
686  setOperationAction(ISD::UREM, VT, Expand);
691  setOperationAction(ISD::FMA, VT, Expand);
692  setOperationAction(ISD::FFLOOR, VT, Expand);
693  setOperationAction(ISD::FCEIL, VT, Expand);
694  setOperationAction(ISD::FTRUNC, VT, Expand);
695  setOperationAction(ISD::FRINT, VT, Expand);
696  setOperationAction(ISD::FNEARBYINT, VT, Expand);
697  setOperationAction(ISD::SMUL_LOHI, VT, Expand);
698  setOperationAction(ISD::MULHS, VT, Expand);
699  setOperationAction(ISD::UMUL_LOHI, VT, Expand);
700  setOperationAction(ISD::MULHU, VT, Expand);
701  setOperationAction(ISD::SDIVREM, VT, Expand);
702  setOperationAction(ISD::UDIVREM, VT, Expand);
703  setOperationAction(ISD::CTPOP, VT, Expand);
704  setOperationAction(ISD::CTTZ, VT, Expand);
705  setOperationAction(ISD::CTLZ, VT, Expand);
706  setOperationAction(ISD::ROTL, VT, Expand);
707  setOperationAction(ISD::ROTR, VT, Expand);
708  setOperationAction(ISD::BSWAP, VT, Expand);
709  setOperationAction(ISD::SETCC, VT, Expand);
710  setOperationAction(ISD::FP_TO_UINT, VT, Expand);
711  setOperationAction(ISD::FP_TO_SINT, VT, Expand);
712  setOperationAction(ISD::UINT_TO_FP, VT, Expand);
713  setOperationAction(ISD::SINT_TO_FP, VT, Expand);
715  setOperationAction(ISD::TRUNCATE, VT, Expand);
718  setOperationAction(ISD::ANY_EXTEND, VT, Expand);
719  setOperationAction(ISD::SELECT_CC, VT, Expand);
720  for (MVT InnerVT : MVT::vector_valuetypes()) {
721  setTruncStoreAction(InnerVT, VT, Expand);
722 
723  setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
724  setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
725 
726  // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
727  // types, we have to deal with them whether we ask for Expansion or not.
728  // Setting Expand causes its own optimisation problems though, so leave
729  // them legal.
730  if (VT.getVectorElementType() == MVT::i1)
731  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
732 
733  // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
734  // split/scalarized right now.
735  if (VT.getVectorElementType() == MVT::f16)
736  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
737  }
738  }
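  // Net effect: any vector operation that is not explicitly re-enabled below
  // for the subtarget's legal vector types keeps the Expand default and is
  // split or scalarized by the legalizer.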
739 
740  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
741  // with -msoft-float, disable use of MMX as well.
742  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
743  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
744  // No operations on x86mmx supported, everything uses intrinsics.
745  }
746 
747  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
748  addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
749  : &X86::VR128RegClass);
750 
760  }
761 
762  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
763  addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
764  : &X86::VR128RegClass);
765 
766  // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
767  // registers cannot be used even for integer operations.
768  addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
769  : &X86::VR128RegClass);
770  addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
771  : &X86::VR128RegClass);
772  addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
773  : &X86::VR128RegClass);
774  addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
775  : &X86::VR128RegClass);
776 
790 
795 
799 
800  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
804  }
805 
806  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
812  }
813 
814  // We support custom legalizing of sext and anyext loads for specific
815  // memory vector types which we can load as a scalar (or sequence of
816  // scalars) and extend in-register to a legal 128-bit vector type. For sext
817  // loads these must work with a single scalar load.
818  for (MVT VT : MVT::integer_vector_valuetypes()) {
828  }
829 
830  for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
834 
835  if (VT == MVT::v2i64 && !Subtarget.is64Bit())
836  continue;
837 
840  }
841 
842  // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
843  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
849  }
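    // e.g. (and v4i32 a, b) is bitcast to v2i64, ANDed as v2i64, and bitcast
    // back, so only the v2i64 patterns are needed for these bitwise ops.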
850 
851  // Custom lower v2i64 and v2f64 selects.
854 
857 
860 
864 
865  // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
867 
870 
871  for (MVT VT : MVT::fp_vector_valuetypes())
873 
877 
881 
882  // In the customized shift lowering, the legal v4i32/v2i64 cases
883  // in AVX2 will be recognized.
884  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
888  }
889  }
890 
891  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
900  }
901 
902  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
903  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
904  setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
905  setOperationAction(ISD::FCEIL, RoundedTy, Legal);
906  setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
907  setOperationAction(ISD::FRINT, RoundedTy, Legal);
909  }
910 
919 
920  // FIXME: Do we need to handle scalar-to-vector here?
922 
923  // We directly match byte blends in the backend as they match the VSELECT
924  // condition form.
926 
927  // SSE41 brings specific instructions for doing vector sign extend even in
928  // cases where we don't have SRA.
929  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
932  }
933 
934  for (MVT VT : MVT::integer_vector_valuetypes()) {
938  }
939 
940  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
941  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
949  }
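    // e.g. a sextload from <4 x i8> memory to v4i32 can then be selected as a
    // single PMOVSXBD of a 32-bit memory operand rather than a scalar load
    // followed by unpacks and shifts.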
950 
951  // i8 vectors are custom because the source register and source
952     // memory operand types are not the same width.
954  }
955 
956  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
957  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
960 
961  // XOP can efficiently perform BITREVERSE with VPPERM.
962  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
964 
965  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
968  }
969 
970  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
971  bool HasInt256 = Subtarget.hasInt256();
972 
973  addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
974  : &X86::VR256RegClass);
975  addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
976  : &X86::VR256RegClass);
977  addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
978  : &X86::VR256RegClass);
979  addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
980  : &X86::VR256RegClass);
981  addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
982  : &X86::VR256RegClass);
983  addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
984  : &X86::VR256RegClass);
985 
986  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
995  }
996 
997  // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
998  // even though v8i16 is a legal type.
1002 
1006 
1009 
1010  for (MVT VT : MVT::fp_vector_valuetypes())
1012 
1013  // In the customized shift lowering, the legal v8i32/v4i64 cases
1014  // in AVX2 will be recognized.
1015  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1019  }
1020 
1024 
1025  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1029  }
1030 
1035 
1036  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1041  }
1042 
1043  if (Subtarget.hasAnyFMA()) {
1044  for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1047  }
1048 
1049  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1050  setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1051  setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1052  }
1053 
1056  setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1058 
1061 
1062  setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1063  setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1066 
1067  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1068  setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1069  setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1070  setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1071  setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1072  setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1073  }
1074 
1075  if (HasInt256) {
1079 
1080  // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1081  // when we have a 256bit-wide blend with immediate.
1083 
1084  // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1085  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1092  }
1093  }
1094 
1095  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1099  }
1100 
1101  // Extract subvector is special because the value type
1102  // (result) is 128-bit but the source is 256-bit wide.
1103  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1104  MVT::v4f32, MVT::v2f64 }) {
1106  }
1107 
1108  // Custom lower several nodes for 256-bit types.
1110  MVT::v8f32, MVT::v4f64 }) {
1113  setOperationAction(ISD::VSELECT, VT, Custom);
1119  }
1120 
1121  if (HasInt256)
1123 
1124  // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1125  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1131  }
1132  }
1133 
1134  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1135  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1136  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1137  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1138  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1139 
1140  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1141  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1142  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1143 
1144  for (MVT VT : MVT::fp_vector_valuetypes())
1146 
1147  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1154  }
1155 
1159  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1160  setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1161  setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1162  setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1163  setTruncStoreAction(VT, MaskVT, Custom);
1164  }
1165 
1166  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171  }
1172 
1200 
1206  if (Subtarget.hasVLX()){
1212 
1218  } else {
1219  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1223  }
1224  }
1227 
1228  if (Subtarget.hasDQI()) {
1229  for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1234  }
1235  if (Subtarget.hasVLX()) {
1236  // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1240  }
1241  }
1242  if (Subtarget.hasVLX()) {
1254 
1255     // FIXME: These instructions are available on SSE/AVX2; add relevant patterns.
1266  }
1267 
1278 
1279  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1285  }
1286 
1289 
1290  // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1293 
1299 
1301 
1308 
1310 
1311  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1314 
1315  for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1322 
1327  setOperationAction(ISD::VSELECT, VT, Expand);
1328  }
1329 
1330  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1341  }
1342 
1343  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1345  MVT::v8i64}) {
1348  }
1349 
1350  // Need to promote to 64-bit even though we have 32-bit masked instructions
1351  // because the IR optimizers rearrange bitcasts around logic ops leaving
1352  // too many variations to handle if we don't promote them.
1356 
1357  if (Subtarget.hasCDI()) {
1358  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1359  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1360  MVT::v4i64, MVT::v8i64}) {
1363  }
1364  } // Subtarget.hasCDI()
1365 
1366  if (Subtarget.hasDQI()) {
1367  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1371  }
1372 
1373  if (Subtarget.hasVPOPCNTDQ()) {
1374  // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1375  // version of popcntd/q.
1376  for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1379  }
1380 
1381  // Custom lower several nodes.
1382  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1386  }
1387 
1389 
1390  // Extract subvector is special because the value type
1391  // (result) is 256-bit but the source is 512-bit wide.
1392  // 128-bit was made Legal under AVX1.
1393  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1396  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1399 
1400  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1412  }
1413  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1416  }
1417  }// has AVX-512
1418 
1419  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1420  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1421  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1422 
1423  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1424  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1425 
1432 
1478 
1480 
1482  if (Subtarget.hasVLX()) {
1485  }
1486 
1487  LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1488  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1489  setOperationAction(ISD::MLOAD, VT, Action);
1490  setOperationAction(ISD::MSTORE, VT, Action);
1491  }
1492 
1493  if (Subtarget.hasCDI()) {
1496  }
1497 
1498  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1513 
1517  }
1518 
1519  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1521  if (Subtarget.hasVLX()) {
1522       // FIXME: These instructions are available on SSE/AVX2; add relevant patterns.
1525  }
1526  }
1527  }
1528 
1529  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1530  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1531  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1532 
1533  for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1537  setOperationAction(ISD::VSELECT, VT, Expand);
1538 
1546  }
1547 
1552 
1553  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558  }
1559  }
1560 
1561  // We want to custom lower some of our intrinsics.
1565  if (!Subtarget.is64Bit()) {
1568  }
1569 
1570  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1571  // handle type legalization for these operations here.
1572  //
1573  // FIXME: We really should do custom legalization for addition and
1574  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1575  // than generic legalization for 64-bit multiplication-with-overflow, though.
1576  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1577  if (VT == MVT::i64 && !Subtarget.is64Bit())
1578  continue;
1579  // Add/Sub/Mul with overflow operations are custom lowered.
1586 
1587  // Support carry in as value rather than glue.
1591  }
1592 
1593  if (!Subtarget.is64Bit()) {
1594  // These libcalls are not available in 32-bit.
1595  setLibcallName(RTLIB::SHL_I128, nullptr);
1596  setLibcallName(RTLIB::SRL_I128, nullptr);
1597  setLibcallName(RTLIB::SRA_I128, nullptr);
1598  }
1599 
1600  // Combine sin / cos into one node or libcall if possible.
1601  if (Subtarget.hasSinCos()) {
1602  setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1603  setLibcallName(RTLIB::SINCOS_F64, "sincos");
1604  if (Subtarget.isTargetDarwin()) {
1605  // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1606  // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1609  }
1610  }
1611 
1612  if (Subtarget.isTargetWin64()) {
1619  }
1620 
1621   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1622   // is. We should promote the value to 64 bits to solve this.
1623  // This is what the CRT headers do - `fmodf` is an inline header
1624  // function casting to f64 and calling `fmod`.
1625  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1626  Subtarget.isTargetWindowsItanium()))
1627  for (ISD::NodeType Op :
1632 
1633  // We have target-specific dag combine patterns for the following nodes:
1672 
1674 
1675  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1677  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1679  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1681 
1682  // TODO: These control memcmp expansion in CGP and could be raised higher, but
1683   // that needs to be benchmarked and balanced with the potential use of vector
1684  // load/store types (PR33329, PR33914).
1685  MaxLoadsPerMemcmp = 2;
1687 
1688  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1690 
1691  // An out-of-order CPU can speculatively execute past a predictable branch,
1692  // but a conditional move could be stalled by an expensive earlier operation.
1693  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1694  EnableExtLdPromotion = true;
1695  setPrefFunctionAlignment(4); // 2^4 bytes.
1696 
1698 }
1699 
1700 // This has so far only been implemented for 64-bit MachO.
1701 bool X86TargetLowering::useLoadStackGuardNode() const {
1702   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1703 }
1704 
1705 TargetLoweringBase::LegalizeTypeAction
1706 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1707   if (ExperimentalVectorWideningLegalization &&
1708       VT.getVectorNumElements() != 1 &&
1709       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1710     return TypeWidenVector;
1711 
1712   return TargetLoweringBase::getPreferredVectorAction(VT);
1713 }
1714 
1715 MVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1716                                           LLVMContext& Context,
1717                                           EVT VT) const {
1718  if (!VT.isVector())
1719  return MVT::i8;
1720 
1721  if (VT.isSimple()) {
1722  MVT VVT = VT.getSimpleVT();
1723  const unsigned NumElts = VVT.getVectorNumElements();
1724  MVT EltVT = VVT.getVectorElementType();
1725  if (VVT.is512BitVector()) {
1726  if (Subtarget.hasAVX512())
1727  if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1728  EltVT == MVT::f32 || EltVT == MVT::f64)
1729  switch(NumElts) {
1730  case 8: return MVT::v8i1;
1731  case 16: return MVT::v16i1;
1732  }
1733  if (Subtarget.hasBWI())
1734  if (EltVT == MVT::i8 || EltVT == MVT::i16)
1735  switch(NumElts) {
1736  case 32: return MVT::v32i1;
1737  case 64: return MVT::v64i1;
1738  }
1739  }
1740 
1741  if (Subtarget.hasBWI() && Subtarget.hasVLX())
1742  return MVT::getVectorVT(MVT::i1, NumElts);
1743 
1744  if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1745  EVT LegalVT = getTypeToTransformTo(Context, VT);
1746  EltVT = LegalVT.getVectorElementType().getSimpleVT();
1747  }
1748 
1749  if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1750  switch(NumElts) {
1751  case 2: return MVT::v2i1;
1752  case 4: return MVT::v4i1;
1753  case 8: return MVT::v8i1;
1754  }
1755  }
1756 
1758 }
1759 
1760 /// Helper for getByValTypeAlignment to determine
1761 /// the desired ByVal argument alignment.
1762 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1763  if (MaxAlign == 16)
1764  return;
1765  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1766  if (VTy->getBitWidth() == 128)
1767  MaxAlign = 16;
1768  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1769  unsigned EltAlign = 0;
1770  getMaxByValAlign(ATy->getElementType(), EltAlign);
1771  if (EltAlign > MaxAlign)
1772  MaxAlign = EltAlign;
1773  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1774  for (auto *EltTy : STy->elements()) {
1775  unsigned EltAlign = 0;
1776  getMaxByValAlign(EltTy, EltAlign);
1777  if (EltAlign > MaxAlign)
1778  MaxAlign = EltAlign;
1779  if (MaxAlign == 16)
1780  break;
1781  }
1782  }
1783 }
1784 
1785 /// Return the desired alignment for ByVal aggregate
1786 /// function arguments in the caller parameter area. For X86, aggregates
1787 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1788 /// are at 4-byte boundaries.
1790  const DataLayout &DL) const {
1791  if (Subtarget.is64Bit()) {
1792  // Max of 8 and alignment of type.
1793  unsigned TyAlign = DL.getABITypeAlignment(Ty);
1794  if (TyAlign > 8)
1795  return TyAlign;
1796  return 8;
1797  }
1798 
1799  unsigned Align = 4;
1800  if (Subtarget.hasSSE1())
1801  getMaxByValAlign(Ty, Align);
1802  return Align;
1803 }
1804 
1805 /// Returns the target specific optimal type for load
1806 /// and store operations as a result of memset, memcpy, and memmove
1807 /// lowering. If DstAlign is zero that means it's safe to destination
1808 /// lowering. If DstAlign is zero, it is safe to assume that any destination
1809 /// alignment constraint can be satisfied. Similarly, if SrcAlign is zero it
1810 /// means there is no need to check it against an alignment requirement,
1811 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1812 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1813 /// source is constant so it does not need to be loaded.
1814 /// It returns EVT::Other if the type should be determined using generic
1815 /// target-independent logic.
1816 EVT
1817 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1818                                        unsigned DstAlign, unsigned SrcAlign,
1819  bool IsMemset, bool ZeroMemset,
1820  bool MemcpyStrSrc,
1821  MachineFunction &MF) const {
1822  const Function *F = MF.getFunction();
1823  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1824  if (Size >= 16 &&
1825  (!Subtarget.isUnalignedMem16Slow() ||
1826  ((DstAlign == 0 || DstAlign >= 16) &&
1827  (SrcAlign == 0 || SrcAlign >= 16)))) {
1828  // FIXME: Check if unaligned 32-byte accesses are slow.
1829  if (Size >= 32 && Subtarget.hasAVX()) {
1830  // Although this isn't a well-supported type for AVX1, we'll let
1831  // legalization and shuffle lowering produce the optimal codegen. If we
1832  // choose an optimal type with a vector element larger than a byte,
1833  // getMemsetStores() may create an intermediate splat (using an integer
1834  // multiply) before we splat as a vector.
1835  return MVT::v32i8;
1836  }
1837  if (Subtarget.hasSSE2())
1838  return MVT::v16i8;
1839  // TODO: Can SSE1 handle a byte vector?
1840  if (Subtarget.hasSSE1())
1841  return MVT::v4f32;
1842  } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1843  !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1844  // Do not use f64 to lower memcpy if source is string constant. It's
1845  // better to use i32 to avoid the loads.
1846  // Also, do not use f64 to lower memset unless this is a memset of zeros.
1847  // The gymnastics of splatting a byte value into an XMM register and then
1848  // only using 8-byte stores (because this is a CPU with slow unaligned
1849  // 16-byte accesses) makes that a loser.
1850  return MVT::f64;
1851  }
1852  }
1853  // This is a compromise. If we reach here, unaligned accesses may be slow on
1854  // this target. However, creating smaller, aligned accesses could be even
1855  // slower and would certainly be a lot more code.
1856  if (Subtarget.is64Bit() && Size >= 8)
1857  return MVT::i64;
1858  return MVT::i32;
1859 }
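// Illustrative consequence: a 64-byte memcpy on an AVX subtarget with aligned
// (or fast-unaligned) operands is emitted as two 32-byte v32i8 load/store
// pairs; with only SSE1 it falls back to v4f32, and without SSE vectors to
// plain i64/i32 stores.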
1860 
1861 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1862   if (VT == MVT::f32)
1863  return X86ScalarSSEf32;
1864  else if (VT == MVT::f64)
1865  return X86ScalarSSEf64;
1866  return true;
1867 }
1868 
1869 bool
1870 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1871                                                   unsigned,
1872  unsigned,
1873  bool *Fast) const {
1874  if (Fast) {
1875  switch (VT.getSizeInBits()) {
1876  default:
1877  // 8-byte and under are always assumed to be fast.
1878  *Fast = true;
1879  break;
1880  case 128:
1881  *Fast = !Subtarget.isUnalignedMem16Slow();
1882  break;
1883  case 256:
1884  *Fast = !Subtarget.isUnalignedMem32Slow();
1885  break;
1886  // TODO: What about AVX-512 (512-bit) accesses?
1887  }
1888  }
1889  // Misaligned accesses of any size are always allowed.
1890  return true;
1891 }
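// Note that misaligned accesses are always *allowed* here; *Fast only tells
// the caller whether they are expected to be cheap, e.g. a misaligned 128-bit
// access is reported slow when the subtarget sets isUnalignedMem16Slow().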
1892 
1893 /// Return the entry encoding for a jump table in the
1894 /// current function. The returned value is a member of the
1895 /// MachineJumpTableInfo::JTEntryKind enum.
1896 unsigned X86TargetLowering::getJumpTableEncoding() const {
1897   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1898   // symbol.
1899   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1900     return MachineJumpTableInfo::EK_Custom32;
1901 
1902   // Otherwise, use the normal jump table encoding heuristics.
1903   return TargetLowering::getJumpTableEncoding();
1904 }
1905 
1906 bool X86TargetLowering::useSoftFloat() const {
1907   return Subtarget.useSoftFloat();
1908 }
1909 
1910 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1911                                               ArgListTy &Args) const {
1912 
1913  // Only relabel X86-32 for C / Stdcall CCs.
1914  if (Subtarget.is64Bit())
1915  return;
1916  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1917  return;
1918  unsigned ParamRegs = 0;
1919  if (auto *M = MF->getFunction()->getParent())
1920  ParamRegs = M->getNumberRegisterParameters();
1921 
1922  // Mark the first N int arguments as having reg
1923  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1924  Type *T = Args[Idx].Ty;
1925  if (T->isPointerTy() || T->isIntegerTy())
1926  if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1927  unsigned numRegs = 1;
1928  if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1929  numRegs = 2;
1930  if (ParamRegs < numRegs)
1931  return;
1932  ParamRegs -= numRegs;
1933  Args[Idx].IsInReg = true;
1934  }
1935  }
1936 }
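// Rough illustration (an assumed scenario, not a guarantee): if the module was
// built with -mregparm=3, the first few pointer- or integer-sized libcall
// arguments get IsInReg set here, and the 32-bit C calling convention then
// passes them in registers instead of on the stack.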
1937 
1938 const MCExpr *
1940  const MachineBasicBlock *MBB,
1941  unsigned uid,MCContext &Ctx) const{
1942  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1943  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1944  // entries.
1945  return MCSymbolRefExpr::create(MBB->getSymbol(),
1947 }
1948 
1949 /// Returns relocation base for the given PIC jumptable.
1951  SelectionDAG &DAG) const {
1952  if (!Subtarget.is64Bit())
1953  // This doesn't have SDLoc associated with it, but is not really the
1954  // same as a Register.
1955  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1956  getPointerTy(DAG.getDataLayout()));
1957  return Table;
1958 }
1959 
1960 /// This returns the relocation base for the given PIC jumptable,
1961 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1964  MCContext &Ctx) const {
1965  // X86-64 uses RIP relative addressing based on the jump table label.
1966  if (Subtarget.isPICStyleRIPRel())
1967  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1968 
1969  // Otherwise, the reference is relative to the PIC base.
1970  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1971 }
1972 
1973 std::pair<const TargetRegisterClass *, uint8_t>
1975  MVT VT) const {
1976  const TargetRegisterClass *RRC = nullptr;
1977  uint8_t Cost = 1;
1978  switch (VT.SimpleTy) {
1979  default:
1981  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1982  RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1983  break;
1984  case MVT::x86mmx:
1985  RRC = &X86::VR64RegClass;
1986  break;
1987  case MVT::f32: case MVT::f64:
1988  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1989  case MVT::v4f32: case MVT::v2f64:
1990  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1991  case MVT::v8f32: case MVT::v4f64:
1992  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1993  case MVT::v16f32: case MVT::v8f64:
1994  RRC = &X86::VR128XRegClass;
1995  break;
1996  }
1997  return std::make_pair(RRC, Cost);
1998 }
1999 
2000 unsigned X86TargetLowering::getAddressSpace() const {
2001  if (Subtarget.is64Bit())
2002  return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2003  return 256;
2004 }
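// In X86 IR, address space 256 corresponds to the GS segment and 257 to FS, so
// the value returned above selects which thread-local segment register the
// stack-guard and safe-stack loads below go through.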
2005 
2006 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2007  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2008  (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2009 }
2010 
2012  unsigned Offset, unsigned AddressSpace) {
2015  Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2016 }
2017 
2019  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2020  // tcbhead_t; use it instead of the usual global variable (see
2021  // sysdeps/{i386,x86_64}/nptl/tls.h)
2022  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2023  if (Subtarget.isTargetFuchsia()) {
2024  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2025  return SegmentOffset(IRB, 0x10, getAddressSpace());
2026  } else {
2027  // %fs:0x28, unless we're using a Kernel code model, in which case
2028  // it's %gs:0x28. gs:0x14 on i386.
2029  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2030  return SegmentOffset(IRB, Offset, getAddressSpace());
2031  }
2032  }
2033 
2034  return TargetLowering::getIRStackGuard(IRB);
2035 }
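// Illustrative result: on x86-64 Linux/glibc the guard load built from this
// address ends up as something like 'movq %fs:0x28, %rax' (gs:0x14 on i386).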
2036 
2038  // MSVC CRT provides functionalities for stack protection.
2039  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2040  // MSVC CRT has a global variable holding security cookie.
2041  M.getOrInsertGlobal("__security_cookie",
2043 
2044  // MSVC CRT has a function to validate security cookie.
2045  auto *SecurityCheckCookie = cast<Function>(
2046  M.getOrInsertFunction("__security_check_cookie",
2049  SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2050  SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2051  return;
2052  }
2053  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2054  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2055  return;
2057 }
2058 
2060  // MSVC CRT has a global variable holding security cookie.
2061  if (Subtarget.getTargetTriple().isOSMSVCRT())
2062  return M.getGlobalVariable("__security_cookie");
2064 }
2065 
2067  // MSVC CRT has a function to validate security cookie.
2068  if (Subtarget.getTargetTriple().isOSMSVCRT())
2069  return M.getFunction("__security_check_cookie");
2071 }
2072 
2074  if (Subtarget.getTargetTriple().isOSContiki())
2075  return getDefaultSafeStackPointerLocation(IRB, false);
2076 
2077  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2078  // definition of TLS_SLOT_SAFESTACK in
2079  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2080  if (Subtarget.isTargetAndroid()) {
2081  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2082  // %gs:0x24 on i386
2083  unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2084  return SegmentOffset(IRB, Offset, getAddressSpace());
2085  }
2086 
2087  // Fuchsia is similar.
2088  if (Subtarget.isTargetFuchsia()) {
2089  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2090  return SegmentOffset(IRB, 0x18, getAddressSpace());
2091  }
2092 
2094 }
2095 
2097  unsigned DestAS) const {
2098  assert(SrcAS != DestAS && "Expected different address spaces!");
2099 
2100  return SrcAS < 256 && DestAS < 256;
2101 }
2102 
2103 //===----------------------------------------------------------------------===//
2104 // Return Value Calling Convention Implementation
2105 //===----------------------------------------------------------------------===//
2106 
2107 #include "X86GenCallingConv.inc"
2108 
2109 bool X86TargetLowering::CanLowerReturn(
2110  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2111  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2113  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2114  return CCInfo.CheckReturn(Outs, RetCC_X86);
2115 }
2116 
2117 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2118  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2119  return ScratchRegs;
2120 }
2121 
2122 /// Lowers mask values (v*i1) to the local register values
2123 /// \returns DAG node after lowering to register type
2124 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2125  const SDLoc &Dl, SelectionDAG &DAG) {
2126  EVT ValVT = ValArg.getValueType();
2127 
2128  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2129  (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2130  // Two stage lowering might be required
2131  // bitcast: v8i1 -> i8 / v16i1 -> i16
2132  // anyextend: i8 -> i32 / i16 -> i32
2133  EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2134  SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2135  if (ValLoc == MVT::i32)
2136  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2137  return ValToCopy;
2138  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2139  (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2140  // One stage lowering is required
2141  // bitcast: v32i1 -> i32 / v64i1 -> i64
2142  return DAG.getBitcast(ValLoc, ValArg);
2143  } else
2144  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2145 }
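// Example of the two-stage path above: an AVX-512 v16i1 mask destined for a
// 32-bit location is first bitcast to i16 and then any-extended to i32 before
// the copy to the physical register is emitted.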
2146 
2147 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2149  const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2150  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2151  CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2152  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2153  "Expected AVX512BW or AVX512BMI target!");
2154  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2155  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2156  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2157  "The value should reside in two registers");
2158 
2159  // Before splitting the value we cast it to i64
2160  Arg = DAG.getBitcast(MVT::i64, Arg);
2161 
2162  // Splitting the value into two i32 types
2163  SDValue Lo, Hi;
2164  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2165  DAG.getConstant(0, Dl, MVT::i32));
2166  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2167  DAG.getConstant(1, Dl, MVT::i32));
2168 
2169  // Attach the two i32 types into corresponding registers
2170  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2171  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2172 }
2173 
2174 SDValue
2175 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2176  bool isVarArg,
2177  const SmallVectorImpl<ISD::OutputArg> &Outs,
2178  const SmallVectorImpl<SDValue> &OutVals,
2179  const SDLoc &dl, SelectionDAG &DAG) const {
2180  MachineFunction &MF = DAG.getMachineFunction();
2182 
2183  // In some cases we need to disable registers from the default CSR list.
2184  // For example, when they are used for argument passing.
2185  bool ShouldDisableCalleeSavedRegister =
2186  CallConv == CallingConv::X86_RegCall ||
2187  MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2188 
2189  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2190  report_fatal_error("X86 interrupts may not return any value");
2191 
2193  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2194  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2195 
2196  SDValue Flag;
2197  SmallVector<SDValue, 6> RetOps;
2198  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2199  // Operand #1 = Bytes To Pop
2200  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2201  MVT::i32));
2202 
2203  // Copy the result values into the output registers.
2204  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2205  ++I, ++OutsIndex) {
2206  CCValAssign &VA = RVLocs[I];
2207  assert(VA.isRegLoc() && "Can only return in registers!");
2208 
2209  // Add the register to the CalleeSaveDisableRegs list.
2210  if (ShouldDisableCalleeSavedRegister)
2212 
2213  SDValue ValToCopy = OutVals[OutsIndex];
2214  EVT ValVT = ValToCopy.getValueType();
2215 
2216  // Promote values to the appropriate types.
2217  if (VA.getLocInfo() == CCValAssign::SExt)
2218  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2219  else if (VA.getLocInfo() == CCValAssign::ZExt)
2220  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2221  else if (VA.getLocInfo() == CCValAssign::AExt) {
2222  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2223  ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2224  else
2225  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2226  }
2227  else if (VA.getLocInfo() == CCValAssign::BCvt)
2228  ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2229 
2231  "Unexpected FP-extend for return value.");
2232 
2233  // If this is x86-64, and we disabled SSE, we can't return FP values,
2234  // or SSE or MMX vectors.
2235  if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2236  VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2237  (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2238  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2239  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2240  } else if (ValVT == MVT::f64 &&
2241  (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2242  // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2243  // llvm-gcc has never done it right and no one has noticed, so this
2244  // should be OK for now.
2245  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2246  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2247  }
2248 
2249  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2250  // the RET instruction and handled by the FP Stackifier.
2251  if (VA.getLocReg() == X86::FP0 ||
2252  VA.getLocReg() == X86::FP1) {
2253  // If this is a copy from an xmm register to ST(0), use an FPExtend to
2254  // change the value to the FP stack register class.
2255  if (isScalarFPTypeInSSEReg(VA.getValVT()))
2256  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2257  RetOps.push_back(ValToCopy);
2258  // Don't emit a copytoreg.
2259  continue;
2260  }
2261 
2262  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2263  // which is returned in RAX / RDX.
2264  if (Subtarget.is64Bit()) {
2265  if (ValVT == MVT::x86mmx) {
2266  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2267  ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2268  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2269  ValToCopy);
2270  // If we don't have SSE2 available, convert to v4f32 so the generated
2271  // register is legal.
2272  if (!Subtarget.hasSSE2())
2273  ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2274  }
2275  }
2276  }
2277 
2279 
2280  if (VA.needsCustom()) {
2281  assert(VA.getValVT() == MVT::v64i1 &&
2282  "Currently the only custom case is when we split v64i1 to 2 regs");
2283 
2284  Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2285  Subtarget);
2286 
2287  assert(2 == RegsToPass.size() &&
2288  "Expecting two registers after Pass64BitArgInRegs");
2289 
2290  // Add the second register to the CalleeSaveDisableRegs list.
2291  if (ShouldDisableCalleeSavedRegister)
2292  MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2293  } else {
2294  RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2295  }
2296 
2297  // Add nodes to the DAG and add the values into the RetOps list
2298  for (auto &Reg : RegsToPass) {
2299  Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2300  Flag = Chain.getValue(1);
2301  RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2302  }
2303  }
2304 
2305  // Swift calling convention does not require we copy the sret argument
2306  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2307 
2308  // All x86 ABIs require that for returning structs by value we copy
2309  // the sret argument into %rax/%eax (depending on ABI) for the return.
2310  // We saved the argument into a virtual register in the entry block,
2311  // so now we copy the value out and into %rax/%eax.
2312  //
2313  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2314  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2315  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2316  // either case FuncInfo->setSRetReturnReg() will have been called.
2317  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2318  // When we have both sret and another return value, we should use the
2319  // original Chain stored in RetOps[0], instead of the current Chain updated
2320  // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2321 
2322  // For the case of sret and another return value, we have
2323  // Chain_0 at the function entry
2324  // Chain_1 = getCopyToReg(Chain_0) in the above loop
2325  // If we use Chain_1 in getCopyFromReg, we will have
2326  // Val = getCopyFromReg(Chain_1)
2327  // Chain_2 = getCopyToReg(Chain_1, Val) from below
2328 
2329  // getCopyToReg(Chain_0) will be glued together with
2330  // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2331  // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2332  // Data dependency from Unit B to Unit A due to usage of Val in
2333  // getCopyToReg(Chain_1, Val)
2334  // Chain dependency from Unit A to Unit B
2335 
2336  // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2337  SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2338  getPointerTy(MF.getDataLayout()));
2339 
2340  unsigned RetValReg
2341  = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2342  X86::RAX : X86::EAX;
2343  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2344  Flag = Chain.getValue(1);
2345 
2346  // RAX/EAX now acts like a return value.
2347  RetOps.push_back(
2348  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2349 
2350  // Add the returned register to the CalleeSaveDisableRegs list.
2351  if (ShouldDisableCalleeSavedRegister)
2352  MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2353  }
2354 
2355  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2356  const MCPhysReg *I =
2357  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2358  if (I) {
2359  for (; *I; ++I) {
2360  if (X86::GR64RegClass.contains(*I))
2361  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2362  else
2363  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2364  }
2365  }
2366 
2367  RetOps[0] = Chain; // Update chain.
2368 
2369  // Add the flag if we have it.
2370  if (Flag.getNode())
2371  RetOps.push_back(Flag);
2372 
2373  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2374  if (CallConv == CallingConv::X86_INTR)
2375  opcode = X86ISD::IRET;
2376  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2377 }
2378 
2379 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2380  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2381  return false;
2382 
2383  SDValue TCChain = Chain;
2384  SDNode *Copy = *N->use_begin();
2385  if (Copy->getOpcode() == ISD::CopyToReg) {
2386  // If the copy has a glue operand, we conservatively assume it isn't safe to
2387  // perform a tail call.
2388  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2389  return false;
2390  TCChain = Copy->getOperand(0);
2391  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2392  return false;
2393 
2394  bool HasRet = false;
2395  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2396  UI != UE; ++UI) {
2397  if (UI->getOpcode() != X86ISD::RET_FLAG)
2398  return false;
2399  // If we are returning more than one value, we can definitely
2400  // not make a tail call; see PR19530.
2401  if (UI->getNumOperands() > 4)
2402  return false;
2403  if (UI->getNumOperands() == 4 &&
2404  UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2405  return false;
2406  HasRet = true;
2407  }
2408 
2409  if (!HasRet)
2410  return false;
2411 
2412  Chain = TCChain;
2413  return true;
2414 }
2415 
2416 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2417  ISD::NodeType ExtendKind) const {
2418  MVT ReturnMVT = MVT::i32;
2419 
2420  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2421  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2422  // The ABI does not require i1, i8 or i16 to be extended.
2423  //
2424  // On Darwin, there is code in the wild relying on Clang's old behaviour of
2425  // always extending i8/i16 return values, so keep doing that for now.
2426  // (PR26665).
2427  ReturnMVT = MVT::i8;
2428  }
2429 
2430  EVT MinVT = getRegisterType(Context, ReturnMVT);
2431  return VT.bitsLT(MinVT) ? MinVT : VT;
2432 }
2433 
2434 /// Reads two 32 bit registers and creates a 64 bit mask value.
2435 /// \param VA The current 32 bit value that needs to be assigned.
2436 /// \param NextVA The next 32 bit value that needs to be assigned.
2437 /// \param Root The parent DAG node.
2438 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2439 /// glue purposes. In case the DAG is already using a
2440 /// physical register instead of a virtual one, we should glue
2441 /// our new SDValue to the InFlag SDValue.
2442 /// \return a new 64-bit SDValue.
2443 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2444  SDValue &Root, SelectionDAG &DAG,
2445  const SDLoc &Dl, const X86Subtarget &Subtarget,
2446  SDValue *InFlag = nullptr) {
2447  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2448  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2449  assert(VA.getValVT() == MVT::v64i1 &&
2450  "Expecting first location of 64 bit width type");
2451  assert(NextVA.getValVT() == VA.getValVT() &&
2452  "The locations should have the same type");
2453  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2454  "The values should reside in two registers");
2455 
2456  SDValue Lo, Hi;
2457  unsigned Reg;
2458  SDValue ArgValueLo, ArgValueHi;
2459 
2460  MachineFunction &MF = DAG.getMachineFunction();
2461  const TargetRegisterClass *RC = &X86::GR32RegClass;
2462 
2463  // Read a 32 bit value from the registers
2464  if (nullptr == InFlag) {
2465  // When no physical register is present,
2466  // create an intermediate virtual register
2467  Reg = MF.addLiveIn(VA.getLocReg(), RC);
2468  ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2469  Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2470  ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2471  } else {
2472  // When a physical register is available read the value from it and glue
2473  // the reads together.
2474  ArgValueLo =
2475  DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2476  *InFlag = ArgValueLo.getValue(2);
2477  ArgValueHi =
2478  DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2479  *InFlag = ArgValueHi.getValue(2);
2480  }
2481 
2482  // Convert the i32 type into v32i1 type
2483  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2484 
2485  // Convert the i32 type into v32i1 type
2486  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2487 
2488  // Concatenate the two values together
2489  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2490 }
2491 
2492 /// The function will lower a register of various sizes (8/16/32/64)
2493 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2494 /// \returns a DAG node containing the operand after lowering to mask type.
2495 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2496  const EVT &ValLoc, const SDLoc &Dl,
2497  SelectionDAG &DAG) {
2498  SDValue ValReturned = ValArg;
2499 
2500  if (ValVT == MVT::v1i1)
2501  return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2502 
2503  if (ValVT == MVT::v64i1) {
2504  // On a 32-bit target, this case is handled by getv64i1Argument.
2505  assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2506  // On a 64-bit target, there is no need to truncate the value, only bitcast it.
2507  } else {
2508  MVT maskLen;
2509  switch (ValVT.getSimpleVT().SimpleTy) {
2510  case MVT::v8i1:
2511  maskLen = MVT::i8;
2512  break;
2513  case MVT::v16i1:
2514  maskLen = MVT::i16;
2515  break;
2516  case MVT::v32i1:
2517  maskLen = MVT::i32;
2518  break;
2519  default:
2520  llvm_unreachable("Expecting a vector of i1 types");
2521  }
2522 
2523  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2524  }
2525  return DAG.getBitcast(ValVT, ValReturned);
2526 }
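// For illustration: a hypothetical v16i1 value that arrives in a 32-bit GPR
// (ValLoc == i32) takes the default path above, first truncating to i16
// (maskLen) and then bitcasting to v16i1, while a v64i1 value arriving as a
// single i64 on a 64-bit target only needs the final bitcast.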
2527 
2528 /// Lower the result values of a call into the
2529 /// appropriate copies out of appropriate physical registers.
2530 ///
2531 SDValue X86TargetLowering::LowerCallResult(
2532  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2533  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2534  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2535  uint32_t *RegMask) const {
2536 
2537  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2538  // Assign locations to each value returned by this call.
2539  SmallVector<CCValAssign, 16> RVLocs;
2540  bool Is64Bit = Subtarget.is64Bit();
2541  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2542  *DAG.getContext());
2543  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2544 
2545  // Copy all of the result registers out of their specified physreg.
2546  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2547  ++I, ++InsIndex) {
2548  CCValAssign &VA = RVLocs[I];
2549  EVT CopyVT = VA.getLocVT();
2550 
2551  // In some calling conventions we need to remove the used registers
2552  // from the register mask.
2553  if (RegMask) {
2554  for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2555  SubRegs.isValid(); ++SubRegs)
2556  RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2557  }
2558 
2559  // If this is x86-64, and we disabled SSE, we can't return FP values
2560  if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2561  ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2562  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2563  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2564  }
2565 
2566  // If we prefer to use the value in xmm registers, copy it out as f80 and
2567  // use a truncate to move it from fp stack reg to xmm reg.
2568  bool RoundAfterCopy = false;
2569  if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2570  isScalarFPTypeInSSEReg(VA.getValVT())) {
2571  if (!Subtarget.hasX87())
2572  report_fatal_error("X87 register return with X87 disabled");
2573  CopyVT = MVT::f80;
2574  RoundAfterCopy = (CopyVT != VA.getLocVT());
2575  }
2576 
2577  SDValue Val;
2578  if (VA.needsCustom()) {
2579  assert(VA.getValVT() == MVT::v64i1 &&
2580  "Currently the only custom case is when we split v64i1 to 2 regs");
2581  Val =
2582  getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2583  } else {
2584  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2585  .getValue(1);
2586  Val = Chain.getValue(0);
2587  InFlag = Chain.getValue(2);
2588  }
2589 
2590  if (RoundAfterCopy)
2591  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2592  // This truncation won't change the value.
2593  DAG.getIntPtrConstant(1, dl));
2594 
2595  if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2596  if (VA.getValVT().isVector() &&
2597  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2598  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2599  // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2600  Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2601  } else
2602  Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2603  }
2604 
2605  InVals.push_back(Val);
2606  }
2607 
2608  return Chain;
2609 }
2610 
2611 //===----------------------------------------------------------------------===//
2612 // C & StdCall & Fast Calling Convention implementation
2613 //===----------------------------------------------------------------------===//
2614 // The StdCall calling convention seems to be standard for many Windows API
2615 // routines and the like. It differs from the C calling convention just a
2616 // little: the callee should clean up the stack, not the caller, and symbols
2617 // are decorated in a particular way. It doesn't support any vector arguments.
2618 // For info on fast calling convention see Fast Calling Convention (tail call)
2619 // implementation LowerX86_32FastCCCallTo.
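// For example (illustrative declaration, not from this file): a function
//   int __stdcall Foo(int a, int b);
// is emitted as the decorated symbol _Foo@8 on 32-bit Windows and returns
// with 'ret 8', so the callee pops its own 8 bytes of arguments.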
2620 
2621 /// CallIsStructReturn - Determines whether a call uses struct return
2622 /// semantics.
2623 enum StructReturnType {
2624  NotStructReturn,
2625  RegStructReturn,
2626  StackStructReturn
2627 };
2628 static StructReturnType
2629 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2630  if (Outs.empty())
2631  return NotStructReturn;
2632 
2633  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2634  if (!Flags.isSRet())
2635  return NotStructReturn;
2636  if (Flags.isInReg() || IsMCU)
2637  return RegStructReturn;
2638  return StackStructReturn;
2639 }
2640 
2641 /// Determines whether a function uses struct return semantics.
2642 static StructReturnType
2643 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2644  if (Ins.empty())
2645  return NotStructReturn;
2646 
2647  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2648  if (!Flags.isSRet())
2649  return NotStructReturn;
2650  if (Flags.isInReg() || IsMCU)
2651  return RegStructReturn;
2652  return StackStructReturn;
2653 }
2654 
2655 /// Make a copy of an aggregate at address specified by "Src" to address
2656 /// "Dst" with size and alignment information specified by the specific
2657 /// parameter attribute. The copy will be passed as a byval function parameter.
2658 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2659  SDValue Chain, ISD::ArgFlagsTy Flags,
2660  SelectionDAG &DAG, const SDLoc &dl) {
2661  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2662 
2663  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2664  /*isVolatile*/false, /*AlwaysInline=*/true,
2665  /*isTailCall*/false,
2666  MachinePointerInfo(), MachinePointerInfo());
2667 }
2668 
2669 /// Return true if the calling convention is one that we can guarantee TCO for.
2670 static bool canGuaranteeTCO(CallingConv::ID CC) {
2671  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2672  CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2673  CC == CallingConv::HHVM);
2674 }
2675 
2676 /// Return true if we might ever do TCO for calls with this calling convention.
2677 static bool mayTailCallThisCC(CallingConv::ID CC) {
2678  switch (CC) {
2679  // C calling conventions:
2680  case CallingConv::C:
2681  case CallingConv::Win64:
2682  case CallingConv::X86_64_SysV:
2683  // Callee pop conventions:
2684  case CallingConv::X86_ThisCall:
2685  case CallingConv::X86_StdCall:
2686  case CallingConv::X86_VectorCall:
2687  case CallingConv::X86_FastCall:
2688  return true;
2689  default:
2690  return canGuaranteeTCO(CC);
2691  }
2692 }
2693 
2694 /// Return true if the function is being made into a tailcall target by
2695 /// changing its ABI.
2696 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2697  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2698 }
2699 
2700 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2701  auto Attr =
2702  CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2703  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2704  return false;
2705 
2706  ImmutableCallSite CS(CI);
2707  CallingConv::ID CalleeCC = CS.getCallingConv();
2708  if (!mayTailCallThisCC(CalleeCC))
2709  return false;
2710 
2711  return true;
2712 }
2713 
2714 SDValue
2715 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2716  const SmallVectorImpl<ISD::InputArg> &Ins,
2717  const SDLoc &dl, SelectionDAG &DAG,
2718  const CCValAssign &VA,
2719  MachineFrameInfo &MFI, unsigned i) const {
2720  // Create the nodes corresponding to a load from this parameter slot.
2721  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2722  bool AlwaysUseMutable = shouldGuaranteeTCO(
2723  CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2724  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2725  EVT ValVT;
2726  MVT PtrVT = getPointerTy(DAG.getDataLayout());
2727 
2728  // If the value is passed by pointer, we have the address passed instead of
2729  // the value itself. There is no need to extend if the mask value and location
2730  // share the same absolute size.
2731  bool ExtendedInMem =
2732  VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2733  VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2734 
2735  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2736  ValVT = VA.getLocVT();
2737  else
2738  ValVT = VA.getValVT();
2739 
2740  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2741  // taken by a return address.
2742  int Offset = 0;
2743  if (CallConv == CallingConv::X86_INTR) {
2744  // X86 interrupts may take one or two arguments.
2745  // On the stack there will be no return address as in a regular call.
2746  // The offset of the last argument needs to be set to -4/-8 bytes, while
2747  // the offset of the first argument (when there are two) should be set to 0 bytes.
2748  Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2749  if (Subtarget.is64Bit() && Ins.size() == 2) {
2750  // The stack pointer needs to be realigned for 64 bit handlers with error
2751  // code, so the argument offset changes by 8 bytes.
2752  Offset += 8;
2753  }
2754  }
2755 
2756  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2757  // changed with more analysis.
2758  // In case of tail call optimization, mark all arguments mutable, since they
2759  // could be overwritten by the lowering of arguments in case of a tail call.
2760  if (Flags.isByVal()) {
2761  unsigned Bytes = Flags.getByValSize();
2762  if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2763  int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2764  // Adjust SP offset of interrupt parameter.
2765  if (CallConv == CallingConv::X86_INTR) {
2766  MFI.setObjectOffset(FI, Offset);
2767  }
2768  return DAG.getFrameIndex(FI, PtrVT);
2769  }
2770 
2771  // This is an argument in memory. We might be able to perform copy elision.
2772  if (Flags.isCopyElisionCandidate()) {
2773  EVT ArgVT = Ins[i].ArgVT;
2774  SDValue PartAddr;
2775  if (Ins[i].PartOffset == 0) {
2776  // If this is a one-part value or the first part of a multi-part value,
2777  // create a stack object for the entire argument value type and return a
2778  // load from our portion of it. This assumes that if the first part of an
2779  // argument is in memory, the rest will also be in memory.
2780  int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2781  /*Immutable=*/false);
2782  PartAddr = DAG.getFrameIndex(FI, PtrVT);
2783  return DAG.getLoad(
2784  ValVT, dl, Chain, PartAddr,
2785  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2786  } else {
2787  // This is not the first piece of an argument in memory. See if there is
2788  // already a fixed stack object including this offset. If so, assume it
2789  // was created by the PartOffset == 0 branch above and create a load from
2790  // the appropriate offset into it.
2791  int64_t PartBegin = VA.getLocMemOffset();
2792  int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2793  int FI = MFI.getObjectIndexBegin();
2794  for (; MFI.isFixedObjectIndex(FI); ++FI) {
2795  int64_t ObjBegin = MFI.getObjectOffset(FI);
2796  int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2797  if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2798  break;
2799  }
2800  if (MFI.isFixedObjectIndex(FI)) {
2801  SDValue Addr =
2802  DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2803  DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2804  return DAG.getLoad(
2805  ValVT, dl, Chain, Addr,
2806  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2807  Ins[i].PartOffset));
2808  }
2809  }
2810  }
2811 
2812  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2813  VA.getLocMemOffset(), isImmutable);
2814 
2815  // Set SExt or ZExt flag.
2816  if (VA.getLocInfo() == CCValAssign::ZExt) {
2817  MFI.setObjectZExt(FI, true);
2818  } else if (VA.getLocInfo() == CCValAssign::SExt) {
2819  MFI.setObjectSExt(FI, true);
2820  }
2821 
2822  // Adjust SP offset of interrupt parameter.
2823  if (CallConv == CallingConv::X86_INTR) {
2824  MFI.setObjectOffset(FI, Offset);
2825  }
2826 
2827  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2828  SDValue Val = DAG.getLoad(
2829  ValVT, dl, Chain, FIN,
2830  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2831  return ExtendedInMem
2832  ? (VA.getValVT().isVector()
2833  ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2834  : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2835  : Val;
2836 }
2837 
2838 // FIXME: Get this from tablegen.
2839 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2840  const X86Subtarget &Subtarget) {
2841  assert(Subtarget.is64Bit());
2842 
2843  if (Subtarget.isCallingConvWin64(CallConv)) {
2844  static const MCPhysReg GPR64ArgRegsWin64[] = {
2845  X86::RCX, X86::RDX, X86::R8, X86::R9
2846  };
2847  return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2848  }
2849 
2850  static const MCPhysReg GPR64ArgRegs64Bit[] = {
2851  X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2852  };
2853  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2854 }
2855 
2856 // FIXME: Get this from tablegen.
2857 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2858  CallingConv::ID CallConv,
2859  const X86Subtarget &Subtarget) {
2860  assert(Subtarget.is64Bit());
2861  if (Subtarget.isCallingConvWin64(CallConv)) {
2862  // The XMM registers which might contain var arg parameters are shadowed
2863  // in their paired GPR. So we only need to save the GPR to their home
2864  // slots.
2865  // TODO: __vectorcall will change this.
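// For example (illustrative): a double passed as the second argument to a
// Win64 varargs callee is duplicated in both XMM1 and RDX, so spilling only
// the GPR home slot is enough for va_arg to find the value.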
2866  return None;
2867  }
2868 
2869  const Function *Fn = MF.getFunction();
2870  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2871  bool isSoftFloat = Subtarget.useSoftFloat();
2872  assert(!(isSoftFloat && NoImplicitFloatOps) &&
2873  "SSE register cannot be used when SSE is disabled!");
2874  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2875  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2876  // registers.
2877  return None;
2878 
2879  static const MCPhysReg XMMArgRegs64Bit[] = {
2880  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2881  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2882  };
2883  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2884 }
2885 
2886 #ifndef NDEBUG
2887 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2888  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2889  [](const CCValAssign &A, const CCValAssign &B) -> bool {
2890  return A.getValNo() < B.getValNo();
2891  });
2892 }
2893 #endif
2894 
2895 SDValue X86TargetLowering::LowerFormalArguments(
2896  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2897  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2898  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2899  MachineFunction &MF = DAG.getMachineFunction();
2900  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2901  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2902 
2903  const Function *Fn = MF.getFunction();
2904  if (Fn->hasExternalLinkage() &&
2905  Subtarget.isTargetCygMing() &&
2906  Fn->getName() == "main")
2907  FuncInfo->setForceFramePointer(true);
2908 
2909  MachineFrameInfo &MFI = MF.getFrameInfo();
2910  bool Is64Bit = Subtarget.is64Bit();
2911  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2912 
2913  assert(
2914  !(isVarArg && canGuaranteeTCO(CallConv)) &&
2915  "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2916 
2917  if (CallConv == CallingConv::X86_INTR) {
2918  bool isLegal = Ins.size() == 1 ||
2919  (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2920  (!Is64Bit && Ins[1].VT == MVT::i32)));
2921  if (!isLegal)
2922  report_fatal_error("X86 interrupts may take one or two arguments");
2923  }
2924 
2925  // Assign locations to all of the incoming arguments.
2926  SmallVector<CCValAssign, 16> ArgLocs;
2927  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2928 
2929  // Allocate shadow area for Win64.
2930  if (IsWin64)
2931  CCInfo.AllocateStack(32, 8);
2932 
2933  CCInfo.AnalyzeArguments(Ins, CC_X86);
2934 
2935  // In vectorcall calling convention a second pass is required for the HVA
2936  // types.
2937  if (CallingConv::X86_VectorCall == CallConv) {
2938  CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2939  }
2940 
2941  // The next loop assumes that the locations are in the same order as the
2942  // input arguments.
2943  assert(isSortedByValueNo(ArgLocs) &&
2944  "Argument Location list must be sorted before lowering");
2945 
2946  SDValue ArgValue;
2947  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2948  ++I, ++InsIndex) {
2949  assert(InsIndex < Ins.size() && "Invalid Ins index");
2950  CCValAssign &VA = ArgLocs[I];
2951 
2952  if (VA.isRegLoc()) {
2953  EVT RegVT = VA.getLocVT();
2954  if (VA.needsCustom()) {
2955  assert(
2956  VA.getValVT() == MVT::v64i1 &&
2957  "Currently the only custom case is when we split v64i1 to 2 regs");
2958 
2959  // v64i1 values, in the regcall calling convention, that are
2960  // compiled for a 32-bit arch, are split up into two registers.
2961  ArgValue =
2962  getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2963  } else {
2964  const TargetRegisterClass *RC;
2965  if (RegVT == MVT::i32)
2966  RC = &X86::GR32RegClass;
2967  else if (Is64Bit && RegVT == MVT::i64)
2968  RC = &X86::GR64RegClass;
2969  else if (RegVT == MVT::f32)
2970  RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2971  else if (RegVT == MVT::f64)
2972  RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2973  else if (RegVT == MVT::f80)
2974  RC = &X86::RFP80RegClass;
2975  else if (RegVT == MVT::f128)
2976  RC = &X86::FR128RegClass;
2977  else if (RegVT.is512BitVector())
2978  RC = &X86::VR512RegClass;
2979  else if (RegVT.is256BitVector())
2980  RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2981  else if (RegVT.is128BitVector())
2982  RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2983  else if (RegVT == MVT::x86mmx)
2984  RC = &X86::VR64RegClass;
2985  else if (RegVT == MVT::v1i1)
2986  RC = &X86::VK1RegClass;
2987  else if (RegVT == MVT::v8i1)
2988  RC = &X86::VK8RegClass;
2989  else if (RegVT == MVT::v16i1)
2990  RC = &X86::VK16RegClass;
2991  else if (RegVT == MVT::v32i1)
2992  RC = &X86::VK32RegClass;
2993  else if (RegVT == MVT::v64i1)
2994  RC = &X86::VK64RegClass;
2995  else
2996  llvm_unreachable("Unknown argument type!");
2997 
2998  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2999  ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3000  }
3001 
3002  // If this is an 8 or 16-bit value, it is really passed promoted to 32
3003  // bits. Insert an assert[sz]ext to capture this, then truncate to the
3004  // right size.
3005  if (VA.getLocInfo() == CCValAssign::SExt)
3006  ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3007  DAG.getValueType(VA.getValVT()));
3008  else if (VA.getLocInfo() == CCValAssign::ZExt)
3009  ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3010  DAG.getValueType(VA.getValVT()));
3011  else if (VA.getLocInfo() == CCValAssign::BCvt)
3012  ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3013 
3014  if (VA.isExtInLoc()) {
3015  // Handle MMX values passed in XMM regs.
3016  if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3017  ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3018  else if (VA.getValVT().isVector() &&
3019  VA.getValVT().getScalarType() == MVT::i1 &&
3020  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3021  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3022  // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3023  ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3024  } else
3025  ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3026  }
3027  } else {
3028  assert(VA.isMemLoc());
3029  ArgValue =
3030  LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3031  }
3032 
3033  // If value is passed via pointer - do a load.
3034  if (VA.getLocInfo() == CCValAssign::Indirect)
3035  ArgValue =
3036  DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3037 
3038  InVals.push_back(ArgValue);
3039  }
3040 
3041  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3042  // Swift calling convention does not require we copy the sret argument
3043  // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3044  if (CallConv == CallingConv::Swift)
3045  continue;
3046 
3047  // All x86 ABIs require that for returning structs by value we copy the
3048  // sret argument into %rax/%eax (depending on ABI) for the return. Save
3049  // the argument into a virtual register so that we can access it from the
3050  // return points.
3051  if (Ins[I].Flags.isSRet()) {
3052  unsigned Reg = FuncInfo->getSRetReturnReg();
3053  if (!Reg) {
3054  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3055  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3056  FuncInfo->setSRetReturnReg(Reg);
3057  }
3058  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3059  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3060  break;
3061  }
3062  }
3063 
3064  unsigned StackSize = CCInfo.getNextStackOffset();
3065  // Align stack specially for tail calls.
3066  if (shouldGuaranteeTCO(CallConv,
3067  MF.getTarget().Options.GuaranteedTailCallOpt))
3068  StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3069 
3070  // If the function takes variable number of arguments, make a frame index for
3071  // the start of the first vararg value... for expansion of llvm.va_start. We
3072  // can skip this if there are no va_start calls.
3073  if (MFI.hasVAStart() &&
3074  (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3075  CallConv != CallingConv::X86_ThisCall))) {
3076  FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3077  }
3078 
3079  // Figure out if XMM registers are in use.
3080  assert(!(Subtarget.useSoftFloat() &&
3081  Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3082  "SSE register cannot be used when SSE is disabled!");
3083 
3084  // 64-bit calling conventions support varargs and register parameters, so we
3085  // have to do extra work to spill them in the prologue.
3086  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3087  // Find the first unallocated argument registers.
3088  ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3089  ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3090  unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3091  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3092  assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3093  "SSE register cannot be used when SSE is disabled!");
3094 
3095  // Gather all the live in physical registers.
3096  SmallVector<SDValue, 6> LiveGPRs;
3097  SmallVector<SDValue, 8> LiveXMMRegs;
3098  SDValue ALVal;
3099  for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3100  unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3101  LiveGPRs.push_back(
3102  DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3103  }
3104  if (!ArgXMMs.empty()) {
3105  unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3106  ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3107  for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3108  unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3109  LiveXMMRegs.push_back(
3110  DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3111  }
3112  }
3113 
3114  if (IsWin64) {
3115  // Get to the caller-allocated home save location. Add 8 to account
3116  // for the return address.
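// Illustrative layout on entry to a Win64 function: [RSP] holds the return
// address and [RSP+8 .. RSP+40) is the caller-allocated 32-byte home area
// for RCX/RDX/R8/R9, which is why HomeOffset adds 8 to the local area offset.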
3117  int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3118  FuncInfo->setRegSaveFrameIndex(
3119  MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3120  // Fixup to set vararg frame on shadow area (4 x i64).
3121  if (NumIntRegs < 4)
3122  FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3123  } else {
3124  // For X86-64, if there are vararg parameters that are passed via
3125  // registers, then we must store them to their spots on the stack so
3126  // they may be loaded by dereferencing the result of va_next.
3127  FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3128  FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3129  FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3130  ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3131  }
3132 
3133  // Store the integer parameter registers.
3134  SmallVector<SDValue, 8> MemOps;
3135  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3136  getPointerTy(DAG.getDataLayout()));
3137  unsigned Offset = FuncInfo->getVarArgsGPOffset();
3138  for (SDValue Val : LiveGPRs) {
3139  SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3140  RSFIN, DAG.getIntPtrConstant(Offset, dl));
3141  SDValue Store =
3142  DAG.getStore(Val.getValue(1), dl, Val, FIN,
3143  MachinePointerInfo::getFixedStack(
3144  DAG.getMachineFunction(),
3145  FuncInfo->getRegSaveFrameIndex(), Offset));
3146  MemOps.push_back(Store);
3147  Offset += 8;
3148  }
3149 
3150  if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3151  // Now store the XMM (fp + vector) parameter registers.
3152  SmallVector<SDValue, 12> SaveXMMOps;
3153  SaveXMMOps.push_back(Chain);
3154  SaveXMMOps.push_back(ALVal);
3155  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3156  FuncInfo->getRegSaveFrameIndex(), dl));
3157  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3158  FuncInfo->getVarArgsFPOffset(), dl));
3159  SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3160  LiveXMMRegs.end());
3161  MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3162  MVT::Other, SaveXMMOps));
3163  }
3164 
3165  if (!MemOps.empty())
3166  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3167  }
3168 
3169  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3170  // Find the largest legal vector type.
3171  MVT VecVT = MVT::Other;
3172  // FIXME: Only some x86_32 calling conventions support AVX512.
3173  if (Subtarget.hasAVX512() &&
3174  (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3175  CallConv == CallingConv::Intel_OCL_BI)))
3176  VecVT = MVT::v16f32;
3177  else if (Subtarget.hasAVX())
3178  VecVT = MVT::v8f32;
3179  else if (Subtarget.hasSSE2())
3180  VecVT = MVT::v4f32;
3181 
3182  // We forward some GPRs and some vector types.
3183  SmallVector<MVT, 2> RegParmTypes;
3184  MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3185  RegParmTypes.push_back(IntVT);
3186  if (VecVT != MVT::Other)
3187  RegParmTypes.push_back(VecVT);
3188 
3189  // Compute the set of forwarded registers. The rest are scratch.
3190  SmallVectorImpl<ForwardedRegister> &Forwards =
3191  FuncInfo->getForwardedMustTailRegParms();
3192  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3193 
3194  // Conservatively forward AL on x86_64, since it might be used for varargs.
3195  if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3196  unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3197  Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3198  }
3199 
3200  // Copy all forwards from physical to virtual registers.
3201  for (ForwardedRegister &F : Forwards) {
3202  // FIXME: Can we use a less constrained schedule?
3203  SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3204  F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3205  Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3206  }
3207  }
3208 
3209  // Some CCs need callee pop.
3210  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3211  MF.getTarget().Options.GuaranteedTailCallOpt)) {
3212  FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3213  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3214  // X86 interrupts must pop the error code (and the alignment padding) if
3215  // present.
3216  FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3217  } else {
3218  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3219  // If this is an sret function, the return should pop the hidden pointer.
3220  if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3221  !Subtarget.getTargetTriple().isOSMSVCRT() &&
3222  argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3223  FuncInfo->setBytesToPopOnReturn(4);
3224  }
3225 
3226  if (!Is64Bit) {
3227  // RegSaveFrameIndex is X86-64 only.
3228  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3229  if (CallConv == CallingConv::X86_FastCall ||
3230  CallConv == CallingConv::X86_ThisCall)
3231  // fastcc functions can't have varargs.
3232  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3233  }
3234 
3235  FuncInfo->setArgumentStackSize(StackSize);
3236 
3237  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3238  EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3239  if (Personality == EHPersonality::CoreCLR) {
3240  assert(Is64Bit);
3241  // TODO: Add a mechanism to frame lowering that will allow us to indicate
3242  // that we'd prefer this slot be allocated towards the bottom of the frame
3243  // (i.e. near the stack pointer after allocating the frame). Every
3244  // funclet needs a copy of this slot in its (mostly empty) frame, and the
3245  // offset from the bottom of this and each funclet's frame must be the
3246  // same, so the size of funclets' (mostly empty) frames is dictated by
3247  // how far this slot is from the bottom (since they allocate just enough
3248  // space to accommodate holding this slot at the correct offset).
3249  int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3250  EHInfo->PSPSymFrameIdx = PSPSymFI;
3251  }
3252  }
3253 
3254  if (CallConv == CallingConv::X86_RegCall ||
3255  Fn->hasFnAttribute("no_caller_saved_registers")) {
3256  const MachineRegisterInfo &MRI = MF.getRegInfo();
3257  for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3258  MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3259  }
3260 
3261  return Chain;
3262 }
3263 
3264 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3265  SDValue Arg, const SDLoc &dl,
3266  SelectionDAG &DAG,
3267  const CCValAssign &VA,
3268  ISD::ArgFlagsTy Flags) const {
3269  unsigned LocMemOffset = VA.getLocMemOffset();
3270  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3271  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3272  StackPtr, PtrOff);
3273  if (Flags.isByVal())
3274  return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3275 
3276  return DAG.getStore(
3277  Chain, dl, Arg, PtrOff,
3278  MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3279 }
3280 
3281 /// Emit a load of return address if tail call
3282 /// optimization is performed and it is required.
3283 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3284  SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3285  bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3286  // Adjust the Return address stack slot.
3287  EVT VT = getPointerTy(DAG.getDataLayout());
3288  OutRetAddr = getReturnAddressFrameIndex(DAG);
3289 
3290  // Load the "old" Return address.
3291  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3292  return SDValue(OutRetAddr.getNode(), 1);
3293 }
3294 
3295 /// Emit a store of the return address if tail call
3296 /// optimization is performed and it is required (FPDiff!=0).
3297 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3298  SDValue Chain, SDValue RetAddrFrIdx,
3299  EVT PtrVT, unsigned SlotSize,
3300  int FPDiff, const SDLoc &dl) {
3301  // Store the return address to the appropriate stack slot.
3302  if (!FPDiff) return Chain;
3303  // Calculate the new stack slot for the return address.
3304  int NewReturnAddrFI =
3305  MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3306  false);
3307  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3308  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3309  MachinePointerInfo::getFixedStack(
3310  DAG.getMachineFunction(), NewReturnAddrFI));
3311  return Chain;
3312 }
3313 
3314 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3315 /// operation of specified width.
3316 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3317  SDValue V2) {
3318  unsigned NumElems = VT.getVectorNumElements();
3319  SmallVector<int, 8> Mask;
3320  Mask.push_back(NumElems);
3321  for (unsigned i = 1; i != NumElems; ++i)
3322  Mask.push_back(i);
3323  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3324 }
3325 
3326 SDValue
3327 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3328  SmallVectorImpl<SDValue> &InVals) const {
3329  SelectionDAG &DAG = CLI.DAG;
3330  SDLoc &dl = CLI.DL;
3331  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3332  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3333  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3334  SDValue Chain = CLI.Chain;
3335  SDValue Callee = CLI.Callee;
3336  CallingConv::ID CallConv = CLI.CallConv;
3337  bool &isTailCall = CLI.IsTailCall;
3338  bool isVarArg = CLI.IsVarArg;
3339 
3340  MachineFunction &MF = DAG.getMachineFunction();
3341  bool Is64Bit = Subtarget.is64Bit();
3342  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3343  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3344  bool IsSibcall = false;
3345  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3346  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3347  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3348  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3349  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3350  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3351 
3352  if (CallConv == CallingConv::X86_INTR)
3353  report_fatal_error("X86 interrupts may not be called directly");
3354 
3355  if (Attr.getValueAsString() == "true")
3356  isTailCall = false;
3357 
3358  if (Subtarget.isPICStyleGOT() &&
3359  !MF.getTarget().Options.GuaranteedTailCallOpt) {
3360  // If we are using a GOT, disable tail calls to external symbols with
3361  // default visibility. Tail calling such a symbol requires using a GOT
3362  // relocation, which forces early binding of the symbol. This breaks code
3363  // that requires lazy function symbol resolution. Using musttail or
3364  // GuaranteedTailCallOpt will override this.
3365  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3366  if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3367  G->getGlobal()->hasDefaultVisibility()))
3368  isTailCall = false;
3369  }
3370 
3371  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3372  if (IsMustTail) {
3373  // Force this to be a tail call. The verifier rules are enough to ensure
3374  // that we can lower this successfully without moving the return address
3375  // around.
3376  isTailCall = true;
3377  } else if (isTailCall) {
3378  // Check if it's really possible to do a tail call.
3379  isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3380  isVarArg, SR != NotStructReturn,
3381  MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3382  Outs, OutVals, Ins, DAG);
3383 
3384  // Sibcalls are automatically detected tailcalls which do not require
3385  // ABI changes.
3386  if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3387  IsSibcall = true;
3388 
3389  if (isTailCall)
3390  ++NumTailCalls;
3391  }
3392 
3393  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3394  "Var args not supported with calling convention fastcc, ghc or hipe");
3395 
3396  // Analyze operands of the call, assigning locations to each operand.
3397  SmallVector<CCValAssign, 16> ArgLocs;
3398  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3399 
3400  // Allocate shadow area for Win64.
3401  if (IsWin64)
3402  CCInfo.AllocateStack(32, 8);
3403 
3404  CCInfo.AnalyzeArguments(Outs, CC_X86);
3405 
3406  // In vectorcall calling convention a second pass is required for the HVA
3407  // types.
3408  if (CallingConv::X86_VectorCall == CallConv) {
3409  CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3410  }
3411 
3412  // Get a count of how many bytes are to be pushed on the stack.
3413  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3414  if (IsSibcall)
3415  // This is a sibcall. The memory operands are available in caller's
3416  // own caller's stack.
3417  NumBytes = 0;
3418  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3419  canGuaranteeTCO(CallConv))
3420  NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3421 
3422  int FPDiff = 0;
3423  if (isTailCall && !IsSibcall && !IsMustTail) {
3424  // Lower arguments at fp - stackoffset + fpdiff.
3425  unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3426 
3427  FPDiff = NumBytesCallerPushed - NumBytes;
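// Illustrative example: if the caller was entered with 32 bytes of incoming
// arguments (NumBytesCallerPushed) but this tail call needs 48 bytes of
// outgoing arguments (NumBytes), FPDiff is -16 and the return address slot
// has to be moved 16 bytes further down so the larger argument area fits.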
3428 
3429  // Set the delta of movement of the returnaddr stackslot.
3430  // But only set if delta is greater than previous delta.
3431  if (FPDiff < X86Info->getTCReturnAddrDelta())
3432  X86Info->setTCReturnAddrDelta(FPDiff);
3433  }
3434 
3435  unsigned NumBytesToPush = NumBytes;
3436  unsigned NumBytesToPop = NumBytes;
3437 
3438  // If we have an inalloca argument, all stack space has already been allocated
3439  // for us and is right at the top of the stack. We don't support multiple
3440  // arguments passed in memory when using inalloca.
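// For illustration: inalloca is used, e.g., by the 32-bit MSVC C++ ABI when
// an argument such as a non-trivially-copyable object must be constructed
// directly in the outgoing argument area; the caller allocates that whole
// area up front, so no additional argument pushes are emitted here.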
3441  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3442  NumBytesToPush = 0;
3443  if (!ArgLocs.back().isMemLoc())
3444  report_fatal_error("cannot use inalloca attribute on a register "
3445  "parameter");
3446  if (ArgLocs.back().getLocMemOffset() != 0)
3447  report_fatal_error("any parameter with the inalloca attribute must be "
3448  "the only memory argument");
3449  }
3450 
3451  if (!IsSibcall)
3452  Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3453  NumBytes - NumBytesToPush, dl);
3454 
3455  SDValue RetAddrFrIdx;
3456  // Load return address for tail calls.
3457  if (isTailCall && FPDiff)
3458  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3459  Is64Bit, FPDiff, dl);
3460 
3461  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3462  SmallVector<SDValue, 8> MemOpChains;
3463  SDValue StackPtr;
3464 
3465  // The next loop assumes that the locations are in the same order as the
3466  // input arguments.
3467  assert(isSortedByValueNo(ArgLocs) &&
3468  "Argument Location list must be sorted before lowering");
3469 
3470  // Walk the register/memloc assignments, inserting copies/loads. In the case
3471  // of tail call optimization, arguments are handled later.
3472  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3473  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3474  ++I, ++OutIndex) {
3475  assert(OutIndex < Outs.size() && "Invalid Out index");
3476  // Skip inalloca arguments, they have already been written.
3477  ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3478  if (Flags.isInAlloca())
3479  continue;
3480 
3481  CCValAssign &VA = ArgLocs[I];
3482  EVT RegVT = VA.getLocVT();
3483  SDValue Arg = OutVals[OutIndex];
3484  bool isByVal = Flags.isByVal();
3485 
3486  // Promote the value if needed.
3487  switch (VA.getLocInfo()) {
3488  default: llvm_unreachable("Unknown loc info!");
3489  case CCValAssign::Full: break;
3490  case CCValAssign::SExt:
3491  Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3492  break;
3493  case CCValAssign::ZExt:
3494  Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3495  break;
3496  case CCValAssign::AExt:
3497  if (Arg.getValueType().isVector() &&
3498  Arg.getValueType().getVectorElementType() == MVT::i1)
3499  Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3500  else if (RegVT.is128BitVector()) {
3501  // Special case: passing MMX values in XMM registers.
3502  Arg = DAG.getBitcast(MVT::i64, Arg);
3503  Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3504  Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3505  } else
3506  Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3507  break;
3508  case CCValAssign::BCvt:
3509  Arg = DAG.getBitcast(RegVT, Arg);
3510  break;
3511  case CCValAssign::Indirect: {
3512  // Store the argument.
3513  SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3514  int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3515  Chain = DAG.getStore(
3516  Chain, dl, Arg, SpillSlot,
3517  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3518  Arg = SpillSlot;
3519  break;
3520  }
3521  }
3522 
3523  if (VA.needsCustom()) {
3524  assert(VA.getValVT() == MVT::v64i1 &&
3525  "Currently the only custom case is when we split v64i1 to 2 regs");
3526  // Split v64i1 value into two registers
3527  Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3528  Subtarget);
3529  } else if (VA.isRegLoc()) {
3530  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3531  if (isVarArg && IsWin64) {
3532  // Win64 ABI requires argument XMM reg to be copied to the corresponding
3533  // shadow reg if callee is a varargs function.
3534  unsigned ShadowReg = 0;
3535  switch (VA.getLocReg()) {
3536  case X86::XMM0: ShadowReg = X86::RCX; break;
3537  case X86::XMM1: ShadowReg = X86::RDX; break;
3538  case X86::XMM2: ShadowReg = X86::R8; break;
3539  case X86::XMM3: ShadowReg = X86::R9; break;
3540  }
3541  if (ShadowReg)
3542  RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3543  }
3544  } else if (!IsSibcall && (!isTailCall || isByVal)) {
3545  assert(VA.isMemLoc());
3546  if (!StackPtr.getNode())
3547  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3548  getPointerTy(DAG.getDataLayout()));
3549  MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3550  dl, DAG, VA, Flags));
3551  }
3552  }
3553 
3554  if (!MemOpChains.empty())
3555  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3556 
3557  if (Subtarget.isPICStyleGOT()) {
3558  // ELF / PIC requires GOT in the EBX register before function calls via PLT
3559  // GOT pointer.
3560  if (!isTailCall) {
3561  RegsToPass.push_back(std::make_pair(
3562  unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3563  getPointerTy(DAG.getDataLayout()))));
3564  } else {
3565  // If we are tail calling and generating PIC/GOT style code load the
3566  // address of the callee into ECX. The value in ecx is used as target of
3567  // the tail jump. This is done to circumvent the ebx/callee-saved problem
3568  // for tail calls on PIC/GOT architectures. Normally we would just put the
3569  // address of GOT into ebx and then call target@PLT. But for tail calls
3570  // ebx would be restored (since ebx is callee saved) before jumping to the
3571  // target@PLT.
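// Concretely (illustrative): a PIC tail call would otherwise need
// "jmp target@PLT", but the PLT stub expects EBX to hold the GOT address,
// and EBX has already been restored as a callee-saved register by the time
// we jump; loading the callee address into a scratch register (ECX) and
// jumping through it avoids that.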
3572 
3573  // Note: The actual moving to ECX is done further down.
3574  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3575  if (G && !G->getGlobal()->hasLocalLinkage() &&
3576  G->getGlobal()->hasDefaultVisibility())
3577  Callee = LowerGlobalAddress(Callee, DAG);
3578  else if (isa<ExternalSymbolSDNode>(Callee))
3579  Callee = LowerExternalSymbol(Callee, DAG);
3580  }
3581  }
3582 
3583  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3584  // From AMD64 ABI document:
3585  // For calls that may call functions that use varargs or stdargs
3586  // (prototype-less calls or calls to functions containing ellipsis (...) in
3587  // the declaration) %al is used as hidden argument to specify the number
3588  // of SSE registers used. The contents of %al do not need to match exactly
3589  // the number of registers, but must be an upper bound on the number of SSE
3590  // registers used and is in the range 0 - 8 inclusive.
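// For example (illustrative): a call like printf("%f %f", x, y) passes the
// two doubles in XMM0 and XMM1, so the caller must set AL to a value >= 2
// (and <= 8); the callee's register-save prologue uses AL to decide how
// many XMM registers to spill.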
3591 
3592  // Count the number of XMM registers allocated.
3593  static const MCPhysReg XMMArgRegs[] = {
3594  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3595  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3596  };
3597  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3598  assert((Subtarget.hasSSE1() || !NumXMMRegs)
3599  && "SSE registers cannot be used when SSE is disabled");
3600 
3601  RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3602  DAG.getConstant(NumXMMRegs, dl,
3603  MVT::i8)));
3604  }
3605 
3606  if (isVarArg && IsMustTail) {
3607  const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3608  for (const auto &F : Forwards) {
3609  SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3610  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3611  }
3612  }
3613 
3614  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3615  // don't need this because the eligibility check rejects calls that require
3616  // shuffling arguments passed in memory.
3617  if (!IsSibcall && isTailCall) {
3618  // Force all the incoming stack arguments to be loaded from the stack
3619  // before any new outgoing arguments are stored to the stack, because the
3620  // outgoing stack slots may alias the incoming argument stack slots, and
3621  // the alias isn't otherwise explicit. This is slightly more conservative
3622  // than necessary, because it means that each store effectively depends
3623  // on every argument instead of just those arguments it would clobber.
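// For example (illustrative): if f(a, b) makes a guaranteed tail call to
// g(b, a), the stores that write b and a into the fixed stack slots below
// would clobber the incoming slots they are swapped with, so both incoming
// values must be loaded (via the token factor below) before either store.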
3624  SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3625 
3626  SmallVector<SDValue, 8> MemOpChains2;
3627  SDValue FIN;
3628  int FI = 0;
3629  for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3630  ++I, ++OutsIndex) {
3631  CCValAssign &VA = ArgLocs[I];
3632 
3633  if (VA.isRegLoc()) {
3634  if (VA.needsCustom()) {
3635  assert((CallConv == CallingConv::X86_RegCall) &&
3636  "Expecting custom case only in regcall calling convention");
3637  // This means that we are in the special case where one argument was
3638  // passed through two register locations - skip the next location.
3639  ++I;
3640  }
3641 
3642  continue;
3643  }
3644 
3645  assert(VA.isMemLoc());
3646  SDValue Arg = OutVals[OutsIndex];
3647  ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3648  // Skip inalloca arguments. They don't require any work.
3649  if (Flags.isInAlloca())
3650  continue;
3651  // Create frame index.
3652  int32_t Offset = VA.getLocMemOffset()+FPDiff;
3653  uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3654  FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3655  FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3656 
3657  if (Flags.isByVal()) {
3658  // Copy relative to framepointer.
3659  SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3660  if (!StackPtr.getNode())
3661  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3662  getPointerTy(DAG.getDataLayout()));
3663  Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3664  StackPtr, Source);
3665 
3666  MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3667  ArgChain,
3668  Flags, DAG, dl));
3669  } else {
3670  // Store relative to f