LLVM 7.0.0svn
X86ISelLowering.cpp
1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
40 #include "llvm/IR/CallSite.h"
41 #include "llvm/IR/CallingConv.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/DiagnosticInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/GlobalAlias.h"
47 #include "llvm/IR/GlobalVariable.h"
48 #include "llvm/IR/Instructions.h"
49 #include "llvm/IR/Intrinsics.h"
50 #include "llvm/MC/MCAsmInfo.h"
51 #include "llvm/MC/MCContext.h"
52 #include "llvm/MC/MCExpr.h"
53 #include "llvm/MC/MCSymbol.h"
55 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/KnownBits.h"
60 #include <algorithm>
61 #include <bitset>
62 #include <cctype>
63 #include <numeric>
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "x86-isel"
67 
68 STATISTIC(NumTailCalls, "Number of tail calls");
69 
71  "x86-experimental-vector-widening-legalization", cl::init(false),
72  cl::desc("Enable an experimental vector type legalization through widening "
73  "rather than promotion."),
74  cl::Hidden);
75 
77  "x86-experimental-pref-loop-alignment", cl::init(4),
78  cl::desc("Sets the preferable loop alignment for experiments "
79  "(the last x86-experimental-pref-loop-alignment bits"
80  " of the loop header PC will be 0)."),
81  cl::Hidden);
82 
84  "mul-constant-optimization", cl::init(true),
85  cl::desc("Replace 'mul x, Const' with more effective instructions like "
86  "SHIFT, LEA, etc."),
87  cl::Hidden);
88 
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
92 /// crashing.
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94  const char *Msg) {
96  DAG.getContext()->diagnose(
98 }
99 
101  const X86Subtarget &STI)
102  : TargetLowering(TM), Subtarget(STI) {
103  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104  X86ScalarSSEf64 = Subtarget.hasSSE2();
105  X86ScalarSSEf32 = Subtarget.hasSSE1();
106  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
107 
108  // Set up the TargetLowering object.
109 
110  // X86 is weird. It always uses i8 for shift amounts and setcc results.
112  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
114 
115  // For 64-bit, since we have so many registers, use the ILP scheduler.
116  // For 32-bit, use the register pressure specific scheduling.
117  // For Atom, always use ILP scheduling.
118  if (Subtarget.isAtom())
120  else if (Subtarget.is64Bit())
122  else
124  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
126 
127  // Bypass expensive divides and use cheaper ones.
128  if (TM.getOptLevel() >= CodeGenOpt::Default) {
129  if (Subtarget.hasSlowDivide32())
130  addBypassSlowDiv(32, 8);
131  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132  addBypassSlowDiv(64, 32);
133  }
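  // Illustrative sketch, not part of the original file: addBypassSlowDiv(32, 8)
  // above lets CodeGenPrepare guard a 32-bit divide with a cheap 8-bit one when
  // both operands happen to fit in 8 bits, conceptually:
  //   if (((X | Y) & 0xffffff00) == 0)
  //     Q = (uint8_t)X / (uint8_t)Y;  // short-latency 8-bit divide
  //   else
  //     Q = X / Y;                    // full 32-bit divide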
134 
135  if (Subtarget.isTargetKnownWindowsMSVC() ||
136  Subtarget.isTargetWindowsItanium()) {
137  // Setup Windows compiler runtime calls.
138  setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139  setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140  setLibcallName(RTLIB::SREM_I64, "_allrem");
141  setLibcallName(RTLIB::UREM_I64, "_aullrem");
142  setLibcallName(RTLIB::MUL_I64, "_allmul");
148  }
149 
150  if (Subtarget.isTargetDarwin()) {
151  // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152  setUseUnderscoreSetJmp(false);
154  } else if (Subtarget.isTargetWindowsGNU()) {
155  // MS runtime is weird: it exports _setjmp, but longjmp!
158  } else {
161  }
162 
163  // Set up the register classes.
164  addRegisterClass(MVT::i8, &X86::GR8RegClass);
165  addRegisterClass(MVT::i16, &X86::GR16RegClass);
166  addRegisterClass(MVT::i32, &X86::GR32RegClass);
167  if (Subtarget.is64Bit())
168  addRegisterClass(MVT::i64, &X86::GR64RegClass);
169 
170  for (MVT VT : MVT::integer_valuetypes())
172 
173  // We don't accept any truncstore of integer registers.
180 
182 
183  // SETOEQ and SETUNE require checking two conditions.
190 
191  // Integer absolute.
192  if (Subtarget.hasCMov()) {
195  if (Subtarget.is64Bit())
197  }
198 
199  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
200  // operation.
204 
205  if (Subtarget.is64Bit()) {
206  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
207  // f32/f64 are legal, f80 is custom.
209  else
212  } else if (!Subtarget.useSoftFloat()) {
213  // We have an algorithm for SSE2->double, and we turn this into a
214  // 64-bit FILD followed by conditional FADD for other targets.
216  // We have an algorithm for SSE2, and we turn this into a 64-bit
217  // FILD or VCVTUSI2SS/SD for other targets.
219  }
220 
221  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
222  // this operation.
225 
226  if (!Subtarget.useSoftFloat()) {
227  // SSE has no i16 to fp conversion, only i32.
228  if (X86ScalarSSEf32) {
230  // f32 and f64 cases are Legal, f80 case is not
232  } else {
235  }
236  } else {
239  }
240 
241  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
242  // this operation.
245 
246  if (!Subtarget.useSoftFloat()) {
247  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
248  // are Legal, f80 is custom lowered.
251 
252  if (X86ScalarSSEf32) {
254  // f32 and f64 cases are Legal, f80 case is not
256  } else {
259  }
260  } else {
264  }
265 
266  // Handle FP_TO_UINT by promoting the destination to a larger signed
267  // conversion.
271 
272  if (Subtarget.is64Bit()) {
273  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
274  // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
277  } else {
280  }
281  } else if (!Subtarget.useSoftFloat()) {
282  // Since AVX is a superset of SSE3, only check for SSE here.
283  if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
284  // Expand FP_TO_UINT into a select.
285  // FIXME: We would like to use a Custom expander here eventually to do
286  // the optimal thing for SSE vs. the default expansion in the legalizer.
288  else
289  // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
290  // With SSE3 we can use fisttpll to convert to a signed i64; without
291  // SSE, we're stuck with a fistpll.
293 
295  }
296 
297  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
298  if (!X86ScalarSSEf64) {
301  if (Subtarget.is64Bit()) {
303  // Without SSE, i64->f64 goes through memory.
305  }
306  } else if (!Subtarget.is64Bit())
308 
309  // Scalar integer divide and remainder are lowered to use operations that
310  // produce two results, to match the available instructions. This exposes
311  // the two-result form to trivial CSE, which is able to combine x/y and x%y
312  // into a single instruction.
313  //
314  // Scalar integer multiply-high is also lowered to use two-result
315  // operations, to match the available instructions. However, plain multiply
316  // (low) operations are left as Legal, as there are single-result
317  // instructions for this in x86. Using the two-result multiply instructions
318  // when both high and low results are needed must be arranged by dagcombine.
319  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
326  }
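  // Illustrative sketch, not part of the original file: because SDIV/SREM and
  // UDIV/UREM are expanded to the two-result SDIVREM/UDIVREM above, source
  // that needs both quotient and remainder keeps a single hardware divide:
  //   void divmod(int X, int Y, int &Q, int &R) {
  //     Q = X / Y;  // both statements CSE into one SDIVREM node,
  //     R = X % Y;  // i.e. a single idiv instruction on x86
  //   }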
327 
330  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
334  }
335  if (Subtarget.is64Bit())
341 
346 
347  // Promote the i8 variants and force them on up to i32 which has a shorter
348  // encoding.
351  if (!Subtarget.hasBMI()) {
356  if (Subtarget.is64Bit()) {
359  }
360  }
361 
362  if (Subtarget.hasLZCNT()) {
363  // When promoting the i8 variants, force them to i32 for a shorter
364  // encoding.
367  } else {
374  if (Subtarget.is64Bit()) {
377  }
378  }
379 
380  // Special handling for half-precision floating point conversions.
381  // If we don't have F16C support, then lower half float conversions
382  // into library calls.
383  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
386  }
387 
388  // There's never any support for operations beyond MVT::f32.
393 
400 
401  if (Subtarget.hasPOPCNT()) {
403  } else {
407  if (Subtarget.is64Bit())
409  }
410 
412 
413  if (!Subtarget.hasMOVBE())
415 
416  // These should be promoted to a larger select which is supported.
418  // X86 wants to expand cmov itself.
419  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
422  }
423  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
424  if (VT == MVT::i64 && !Subtarget.is64Bit())
425  continue;
428  }
429 
430  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
433 
435  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
436  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
441  setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
442 
443  // Darwin ABI issue.
444  for (auto VT : { MVT::i32, MVT::i64 }) {
445  if (VT == MVT::i64 && !Subtarget.is64Bit())
446  continue;
453  }
454 
455  // 64-bit shl, sra, srl (iff 32-bit x86)
456  for (auto VT : { MVT::i32, MVT::i64 }) {
457  if (VT == MVT::i64 && !Subtarget.is64Bit())
458  continue;
462  }
463 
464  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
466 
468 
469  // Expand certain atomics
470  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
478  }
479 
480  if (Subtarget.hasCmpxchg16b()) {
482  }
483 
484  // FIXME - use subtarget debug flags
485  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
486  !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
489  }
490 
493 
496 
499 
500  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
503  bool Is64Bit = Subtarget.is64Bit();
505  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
506 
509 
511 
512  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
515 
516  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
517  // f32 and f64 use SSE.
518  // Set up the FP register classes.
519  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
520  : &X86::FR32RegClass);
521  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
522  : &X86::FR64RegClass);
523 
524  for (auto VT : { MVT::f32, MVT::f64 }) {
525  // Use ANDPD to simulate FABS.
527 
528  // Use XORP to simulate FNEG.
530 
531  // Use ANDPD and ORPD to simulate FCOPYSIGN.
533 
534  // We don't support sin/cos/fmod
535  setOperationAction(ISD::FSIN , VT, Expand);
536  setOperationAction(ISD::FCOS , VT, Expand);
537  setOperationAction(ISD::FSINCOS, VT, Expand);
538  }
539 
540  // Lower this to MOVMSK plus an AND.
543 
544  // Expand FP immediates into loads from the stack, except for the special
545  // cases we handle.
546  addLegalFPImmediate(APFloat(+0.0)); // xorpd
547  addLegalFPImmediate(APFloat(+0.0f)); // xorps
548  } else if (UseX87 && X86ScalarSSEf32) {
549  // Use SSE for f32, x87 for f64.
550  // Set up the FP register classes.
551  addRegisterClass(MVT::f32, &X86::FR32RegClass);
552  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553 
554  // Use ANDPS to simulate FABS.
556 
557  // Use XORP to simulate FNEG.
559 
561 
562  // Use ANDPS and ORPS to simulate FCOPYSIGN.
565 
566  // We don't support sin/cos/fmod
570 
571  // Special cases we handle for FP constants.
572  addLegalFPImmediate(APFloat(+0.0f)); // xorps
573  addLegalFPImmediate(APFloat(+0.0)); // FLD0
574  addLegalFPImmediate(APFloat(+1.0)); // FLD1
575  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
576  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
577 
578  // Always expand sin/cos functions even though x87 has an instruction.
582  } else if (UseX87) {
583  // f32 and f64 in x87.
584  // Set up the FP register classes.
585  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
586  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
587 
588  for (auto VT : { MVT::f32, MVT::f64 }) {
589  setOperationAction(ISD::UNDEF, VT, Expand);
590  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
591 
592  // Always expand sin/cos functions even though x87 has an instruction.
593  setOperationAction(ISD::FSIN , VT, Expand);
594  setOperationAction(ISD::FCOS , VT, Expand);
595  setOperationAction(ISD::FSINCOS, VT, Expand);
596  }
597  addLegalFPImmediate(APFloat(+0.0)); // FLD0
598  addLegalFPImmediate(APFloat(+1.0)); // FLD1
599  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
600  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
601  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
602  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
603  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
604  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
605  }
606 
607  // We don't support FMA.
610 
611  // Long double always uses X87, except f128 in MMX.
612  if (UseX87) {
613  if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
614  addRegisterClass(MVT::f128, &X86::FR128RegClass);
619  }
620 
621  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
624  {
626  addLegalFPImmediate(TmpFlt); // FLD0
627  TmpFlt.changeSign();
628  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
629 
630  bool ignored;
631  APFloat TmpFlt2(+1.0);
633  &ignored);
634  addLegalFPImmediate(TmpFlt2); // FLD1
635  TmpFlt2.changeSign();
636  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
637  }
638 
639  // Always expand sin/cos functions even though x87 has an instruction.
643 
650  }
651 
652  // Always use a library call for pow.
656 
664 
665  // Some FP actions are always expanded for vector types.
666  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
668  setOperationAction(ISD::FSIN, VT, Expand);
669  setOperationAction(ISD::FSINCOS, VT, Expand);
670  setOperationAction(ISD::FCOS, VT, Expand);
671  setOperationAction(ISD::FREM, VT, Expand);
672  setOperationAction(ISD::FCOPYSIGN, VT, Expand);
673  setOperationAction(ISD::FPOW, VT, Expand);
674  setOperationAction(ISD::FLOG, VT, Expand);
675  setOperationAction(ISD::FLOG2, VT, Expand);
676  setOperationAction(ISD::FLOG10, VT, Expand);
677  setOperationAction(ISD::FEXP, VT, Expand);
678  setOperationAction(ISD::FEXP2, VT, Expand);
679  }
680 
681  // First set operation action for all vector types to either promote
682  // (for widening) or expand (for scalarization). Then we will selectively
683  // turn on ones that can be effectively codegen'd.
684  for (MVT VT : MVT::vector_valuetypes()) {
685  setOperationAction(ISD::SDIV, VT, Expand);
686  setOperationAction(ISD::UDIV, VT, Expand);
687  setOperationAction(ISD::SREM, VT, Expand);
688  setOperationAction(ISD::UREM, VT, Expand);
693  setOperationAction(ISD::FMA, VT, Expand);
694  setOperationAction(ISD::FFLOOR, VT, Expand);
695  setOperationAction(ISD::FCEIL, VT, Expand);
696  setOperationAction(ISD::FTRUNC, VT, Expand);
697  setOperationAction(ISD::FRINT, VT, Expand);
698  setOperationAction(ISD::FNEARBYINT, VT, Expand);
699  setOperationAction(ISD::SMUL_LOHI, VT, Expand);
700  setOperationAction(ISD::MULHS, VT, Expand);
701  setOperationAction(ISD::UMUL_LOHI, VT, Expand);
702  setOperationAction(ISD::MULHU, VT, Expand);
703  setOperationAction(ISD::SDIVREM, VT, Expand);
704  setOperationAction(ISD::UDIVREM, VT, Expand);
705  setOperationAction(ISD::CTPOP, VT, Expand);
706  setOperationAction(ISD::CTTZ, VT, Expand);
707  setOperationAction(ISD::CTLZ, VT, Expand);
708  setOperationAction(ISD::ROTL, VT, Expand);
709  setOperationAction(ISD::ROTR, VT, Expand);
710  setOperationAction(ISD::BSWAP, VT, Expand);
711  setOperationAction(ISD::SETCC, VT, Expand);
712  setOperationAction(ISD::FP_TO_UINT, VT, Expand);
713  setOperationAction(ISD::FP_TO_SINT, VT, Expand);
714  setOperationAction(ISD::UINT_TO_FP, VT, Expand);
715  setOperationAction(ISD::SINT_TO_FP, VT, Expand);
717  setOperationAction(ISD::TRUNCATE, VT, Expand);
720  setOperationAction(ISD::ANY_EXTEND, VT, Expand);
721  setOperationAction(ISD::SELECT_CC, VT, Expand);
722  for (MVT InnerVT : MVT::vector_valuetypes()) {
723  setTruncStoreAction(InnerVT, VT, Expand);
724 
725  setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
726  setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
727 
728  // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
729  // types; we have to deal with them whether we ask for Expansion or not.
730  // Setting Expand causes its own optimisation problems though, so leave
731  // them legal.
732  if (VT.getVectorElementType() == MVT::i1)
733  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
734 
735  // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
736  // split/scalarized right now.
737  if (VT.getVectorElementType() == MVT::f16)
738  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
739  }
740  }
741 
742  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
743  // with -msoft-float, disable use of MMX as well.
744  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
745  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
746  // No operations on x86mmx supported, everything uses intrinsics.
747  }
748 
749  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
750  addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
751  : &X86::VR128RegClass);
752 
762  }
763 
764  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
765  addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
766  : &X86::VR128RegClass);
767 
768  // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
769  // registers cannot be used even for integer operations.
770  addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
771  : &X86::VR128RegClass);
772  addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
773  : &X86::VR128RegClass);
774  addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
775  : &X86::VR128RegClass);
776  addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
777  : &X86::VR128RegClass);
778 
792 
797 
801 
802  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
806  }
807 
808  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
814  }
815 
816  // We support custom legalizing of sext and anyext loads for specific
817  // memory vector types which we can load as a scalar (or sequence of
818  // scalars) and extend in-register to a legal 128-bit vector type. For sext
819  // loads these must work with a single scalar load.
820  for (MVT VT : MVT::integer_vector_valuetypes()) {
830  }
831 
832  for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
836 
837  if (VT == MVT::v2i64 && !Subtarget.is64Bit())
838  continue;
839 
842  }
843 
844  // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
845  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
851  }
852 
853  // Custom lower v2i64 and v2f64 selects.
856 
859 
862 
864 
865  // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
867 
870 
871  for (MVT VT : MVT::fp_vector_valuetypes())
873 
877 
881 
882  // In the customized shift lowering, the legal v4i32/v2i64 cases
883  // in AVX2 will be recognized.
884  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
888  }
889  }
890 
891  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
900  }
901 
902  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
903  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
904  setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
905  setOperationAction(ISD::FCEIL, RoundedTy, Legal);
906  setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
907  setOperationAction(ISD::FRINT, RoundedTy, Legal);
909  }
910 
919 
920  // FIXME: Do we need to handle scalar-to-vector here?
922 
923  // We directly match byte blends in the backend as they match the VSELECT
924  // condition form.
926 
927  // SSE41 brings specific instructions for doing vector sign extend even in
928  // cases where we don't have SRA.
929  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
932  }
933 
934  for (MVT VT : MVT::integer_vector_valuetypes()) {
938  }
939 
940  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
941  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
949  }
950 
951  // i8 vectors are custom because the source register and source
952  // memory operand types are not the same width.
954  }
955 
956  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
957  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
960 
961  // XOP can efficiently perform BITREVERSE with VPPERM.
962  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
964 
965  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
968  }
969 
970  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
971  bool HasInt256 = Subtarget.hasInt256();
972 
973  addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
974  : &X86::VR256RegClass);
975  addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
976  : &X86::VR256RegClass);
977  addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
978  : &X86::VR256RegClass);
979  addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
980  : &X86::VR256RegClass);
981  addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
982  : &X86::VR256RegClass);
983  addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
984  : &X86::VR256RegClass);
985 
986  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
995  }
996 
997  // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
998  // even though v8i16 is a legal type.
1002 
1005 
1006  for (MVT VT : MVT::fp_vector_valuetypes())
1008 
1009  // In the customized shift lowering, the legal v8i32/v4i64 cases
1010  // in AVX2 will be recognized.
1011  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1015  }
1016 
1020 
1021  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1025  }
1026 
1031 
1032  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1037  }
1038 
1039  if (Subtarget.hasAnyFMA()) {
1040  for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1043  }
1044 
1045  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1046  setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1047  setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1048  }
1049 
1052  setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1054 
1057 
1058  setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1059  setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1062 
1063  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1064  setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1065  setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1066  setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1067  setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1068  setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1069  }
1070 
1071  if (HasInt256) {
1075 
1076  // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1077  // when we have a 256-bit-wide blend with immediate.
1079 
1080  // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1081  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1088  }
1089  }
1090 
1091  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1095  }
1096 
1097  // Extract subvector is special because the value type
1098  // (result) is 128-bit but the source is 256-bit wide.
1099  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1100  MVT::v4f32, MVT::v2f64 }) {
1102  }
1103 
1104  // Custom lower several nodes for 256-bit types.
1106  MVT::v8f32, MVT::v4f64 }) {
1109  setOperationAction(ISD::VSELECT, VT, Custom);
1115  }
1116 
1117  if (HasInt256)
1119 
1120  // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1121  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1127  }
1128 
1129  if (HasInt256) {
1130  // Custom legalize 2x32 to get a little better code.
1133 
1134  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1136  setOperationAction(ISD::MGATHER, VT, Custom);
1137  }
1138  }
1139 
1140  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1141  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1142  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1143  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1144  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1145 
1146  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1147  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1148  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1149  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1150  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1151 
1155 
1164 
1171  if (Subtarget.hasVLX()) {
1174  }
1175 
1176  // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1177  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1181  }
1182 
1183  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1190 
1195  setOperationAction(ISD::VSELECT, VT, Expand);
1196  }
1197 
1204  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1206 
1207  for (MVT VT : MVT::fp_vector_valuetypes())
1209 
1210  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1216  }
1217 
1221  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1222  setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1223  setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1224  setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1225  setTruncStoreAction(VT, MaskVT, Custom);
1226  }
1227 
1228  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1233  }
1234 
1243 
1249 
1250  if (!Subtarget.hasVLX()) {
1251  // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1252  // to 512-bit rather than use the AVX2 instructions so that we can use
1253  // k-masks.
1254  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1258  }
1259  }
1260 
1269 
1270  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1276  }
1277 
1280 
1281  // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1284 
1289 
1292 
1295 
1299 
1300  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1313  }
1314 
1315  // Need to promote to 64-bit even though we have 32-bit masked instructions
1316  // because the IR optimizers rearrange bitcasts around logic ops leaving
1317  // too many variations to handle if we don't promote them.
1321 
1322  if (Subtarget.hasDQI()) {
1327  }
1328 
1329  if (Subtarget.hasCDI()) {
1330  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1331  for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1334  }
1335  } // Subtarget.hasCDI()
1336 
1337  if (Subtarget.hasVPOPCNTDQ()) {
1338  for (auto VT : { MVT::v16i32, MVT::v8i64 })
1340  }
1341 
1342  // Extract subvector is special because the value type
1343  // (result) is 256-bit but the source is 512-bit wide.
1344  // 128-bit was made Legal under AVX1.
1345  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1348 
1349  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1361  }
1362  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1365  }
1366  }// has AVX-512
1367 
1368  if (!Subtarget.useSoftFloat() &&
1369  (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1370  // These operations are handled on non-VLX by artificially widening in
1371  // isel patterns.
1372  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1373 
1379 
1380  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1386  }
1387 
1388  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1391  }
1392 
1393  // Custom legalize 2x32 to get a little better code.
1396 
1397  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1400 
1401  if (Subtarget.hasDQI()) {
1402  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1407  }
1408  }
1409 
1410  if (Subtarget.hasCDI()) {
1411  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1414  }
1415  } // Subtarget.hasCDI()
1416 
1417  if (Subtarget.hasVPOPCNTDQ()) {
1418  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1420  }
1421  }
1422 
1423  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1424  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1425  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1426 
1427  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1428  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1429 
1430  for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1434  setOperationAction(ISD::VSELECT, VT, Expand);
1435 
1443  }
1444 
1449  for (auto VT : { MVT::v16i1, MVT::v32i1 })
1451 
1452  // Extends from v32i1 masks to 256-bit vectors.
1456  // Extends from v64i1 masks to 512-bit vectors.
1460 
1484 
1486 
1488 
1489  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1505 
1509  }
1510 
1511  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1513  }
1514 
1515  if (Subtarget.hasBITALG()) {
1516  for (auto VT : { MVT::v64i8, MVT::v32i16 })
1518  }
1519  }
1520 
1521  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
1522  (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1523  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1524  setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1525  setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1526  }
1527 
1528  // These operations are handled on non-VLX by artificially widening in
1529  // isel patterns.
1530  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1531 
1532  if (Subtarget.hasBITALG()) {
1533  for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1535  }
1536  }
1537 
1538  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1544 
1550 
1551  if (Subtarget.hasDQI()) {
1552  // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1553  // v2f32 UINT_TO_FP is already custom under SSE2.
1556  "Unexpected operation action!");
1557  // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1560  }
1561 
1562  if (Subtarget.hasBWI()) {
1565  }
1566  }
1567 
1568  // We want to custom lower some of our intrinsics.
1572  if (!Subtarget.is64Bit()) {
1575  }
1576 
1577  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1578  // handle type legalization for these operations here.
1579  //
1580  // FIXME: We really should do custom legalization for addition and
1581  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1582  // than generic legalization for 64-bit multiplication-with-overflow, though.
1583  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1584  if (VT == MVT::i64 && !Subtarget.is64Bit())
1585  continue;
1586  // Add/Sub/Mul with overflow operations are custom lowered.
1593 
1594  // Support carry in as value rather than glue.
1598  }
1599 
1600  if (!Subtarget.is64Bit()) {
1601  // These libcalls are not available in 32-bit.
1602  setLibcallName(RTLIB::SHL_I128, nullptr);
1603  setLibcallName(RTLIB::SRL_I128, nullptr);
1604  setLibcallName(RTLIB::SRA_I128, nullptr);
1605  setLibcallName(RTLIB::MUL_I128, nullptr);
1606  }
1607 
1608  // Combine sin / cos into _sincos_stret if it is available.
1609  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1610  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1613  }
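  // Illustrative only, not part of the original file: for source such as
  //   double S = std::sin(X), C = std::cos(X);
  // the combine enabled above replaces the two libcalls with a single
  // __sincos_stret call that returns both results at once.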
1614 
1615  if (Subtarget.isTargetWin64()) {
1622  }
1623 
1624  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1625  // is. We should promote the value to 64-bits to solve this.
1626  // This is what the CRT headers do - `fmodf` is an inline header
1627  // function casting to f64 and calling `fmod`.
1628  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1629  Subtarget.isTargetWindowsItanium()))
1630  for (ISD::NodeType Op :
1635 
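  // Illustrative sketch, not part of the original file, of the promotion the
  // comment above describes (it mirrors what the MSVC CRT headers do):
  //   static inline float fmodf_like(float X, float Y) {
  //     return (float)fmod((double)X, (double)Y);
  //   }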
1636  // We have target-specific dag combine patterns for the following nodes:
1675 
1677 
1678  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1680  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1682  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1684 
1685  // TODO: These control memcmp expansion in CGP and could be raised higher, but
1686  // that needs to be benchmarked and balanced with the potential use of vector
1687  // load/store types (PR33329, PR33914).
1688  MaxLoadsPerMemcmp = 2;
1690 
1691  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1693 
1694  // An out-of-order CPU can speculatively execute past a predictable branch,
1695  // but a conditional move could be stalled by an expensive earlier operation.
1696  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1697  EnableExtLdPromotion = true;
1698  setPrefFunctionAlignment(4); // 2^4 bytes.
1699 
1701 }
1702 
1703 // This has so far only been implemented for 64-bit MachO.
1705  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1706 }
1707 
1709  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1710  return Subtarget.getTargetTriple().isOSMSVCRT();
1711 }
1712 
1714  const SDLoc &DL) const {
1715  EVT PtrTy = getPointerTy(DAG.getDataLayout());
1716  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1717  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1718  return SDValue(Node, 0);
1719 }
1720 
1724  VT.getVectorNumElements() != 1 &&
1726  return TypeWidenVector;
1727 
1729 }
1730 
1733  EVT VT) const {
1734  if (!VT.isVector())
1735  return MVT::i8;
1736 
1737  if (Subtarget.hasAVX512()) {
1738  const unsigned NumElts = VT.getVectorNumElements();
1739 
1740  // Figure out what this type will be legalized to.
1741  EVT LegalVT = VT;
1742  while (getTypeAction(Context, LegalVT) != TypeLegal)
1743  LegalVT = getTypeToTransformTo(Context, LegalVT);
1744 
1745  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1746  if (LegalVT.getSimpleVT().is512BitVector())
1747  return EVT::getVectorVT(Context, MVT::i1, NumElts);
1748 
1749  if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1750  // If we legalized to less than a 512-bit vector, then we will use a vXi1
1751  // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1752  // vXi16/vXi8.
1753  MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1754  if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1755  return EVT::getVectorVT(Context, MVT::i1, NumElts);
1756  }
1757  }
1758 
1760 }
1761 
1762 /// Helper for getByValTypeAlignment to determine
1763 /// the desired ByVal argument alignment.
1764 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1765  if (MaxAlign == 16)
1766  return;
1767  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1768  if (VTy->getBitWidth() == 128)
1769  MaxAlign = 16;
1770  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1771  unsigned EltAlign = 0;
1772  getMaxByValAlign(ATy->getElementType(), EltAlign);
1773  if (EltAlign > MaxAlign)
1774  MaxAlign = EltAlign;
1775  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1776  for (auto *EltTy : STy->elements()) {
1777  unsigned EltAlign = 0;
1778  getMaxByValAlign(EltTy, EltAlign);
1779  if (EltAlign > MaxAlign)
1780  MaxAlign = EltAlign;
1781  if (MaxAlign == 16)
1782  break;
1783  }
1784  }
1785 }
1786 
1787 /// Return the desired alignment for ByVal aggregate
1788 /// function arguments in the caller parameter area. For X86, aggregates
1789 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1790 /// are at 4-byte boundaries.
1792  const DataLayout &DL) const {
1793  if (Subtarget.is64Bit()) {
1794  // Max of 8 and alignment of type.
1795  unsigned TyAlign = DL.getABITypeAlignment(Ty);
1796  if (TyAlign > 8)
1797  return TyAlign;
1798  return 8;
1799  }
1800 
1801  unsigned Align = 4;
1802  if (Subtarget.hasSSE1())
1803  getMaxByValAlign(Ty, Align);
1804  return Align;
1805 }
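// Illustrative only, not part of the original file: on 32-bit x86 with SSE
// enabled, the walk above gives a byval aggregate such as
//   struct HasVec { __m128 V; int I; };  // placed at a 16-byte boundary
//   struct NoVec  { int A, B; };         // stays at the 4-byte default
// while on x86-64 the result is simply max(8, ABI alignment of the type).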
1806 
1807 /// Returns the target specific optimal type for load
1808 /// and store operations as a result of memset, memcpy, and memmove
1809  /// lowering. If DstAlign is zero, that means it's safe because the destination
1810  /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
1811  /// means there isn't a need to check it against the alignment requirement,
1812 /// probably because the source does not need to be loaded. If 'IsMemset' is
1813 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1814 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1815 /// source is constant so it does not need to be loaded.
1816 /// It returns EVT::Other if the type should be determined using generic
1817 /// target-independent logic.
1818 EVT
1820  unsigned DstAlign, unsigned SrcAlign,
1821  bool IsMemset, bool ZeroMemset,
1822  bool MemcpyStrSrc,
1823  MachineFunction &MF) const {
1824  const Function &F = MF.getFunction();
1825  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1826  if (Size >= 16 &&
1827  (!Subtarget.isUnalignedMem16Slow() ||
1828  ((DstAlign == 0 || DstAlign >= 16) &&
1829  (SrcAlign == 0 || SrcAlign >= 16)))) {
1830  // FIXME: Check if unaligned 32-byte accesses are slow.
1831  if (Size >= 32 && Subtarget.hasAVX()) {
1832  // Although this isn't a well-supported type for AVX1, we'll let
1833  // legalization and shuffle lowering produce the optimal codegen. If we
1834  // choose an optimal type with a vector element larger than a byte,
1835  // getMemsetStores() may create an intermediate splat (using an integer
1836  // multiply) before we splat as a vector.
1837  return MVT::v32i8;
1838  }
1839  if (Subtarget.hasSSE2())
1840  return MVT::v16i8;
1841  // TODO: Can SSE1 handle a byte vector?
1842  if (Subtarget.hasSSE1())
1843  return MVT::v4f32;
1844  } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1845  !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1846  // Do not use f64 to lower memcpy if source is string constant. It's
1847  // better to use i32 to avoid the loads.
1848  // Also, do not use f64 to lower memset unless this is a memset of zeros.
1849  // The gymnastics of splatting a byte value into an XMM register and then
1850  // only using 8-byte stores (because this is a CPU with slow unaligned
1851  // 16-byte accesses) makes that a loser.
1852  return MVT::f64;
1853  }
1854  }
1855  // This is a compromise. If we reach here, unaligned accesses may be slow on
1856  // this target. However, creating smaller, aligned accesses could be even
1857  // slower and would certainly be a lot more code.
1858  if (Subtarget.is64Bit() && Size >= 8)
1859  return MVT::i64;
1860  return MVT::i32;
1861 }
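// Illustrative outcomes of the selection above, not part of the original file
// (assuming the caller is not marked NoImplicitFloat):
//   32+ byte op, AVX, aligned or fast-unaligned accesses   -> MVT::v32i8
//   16+ byte op, SSE2                                       -> MVT::v16i8
//   e.g. 8-byte zero-memset/memcpy, 32-bit target with SSE2 -> MVT::f64
//   otherwise                                                -> MVT::i64 or MVT::i32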
1862 
1864  if (VT == MVT::f32)
1865  return X86ScalarSSEf32;
1866  else if (VT == MVT::f64)
1867  return X86ScalarSSEf64;
1868  return true;
1869 }
1870 
1871 bool
1873  unsigned,
1874  unsigned,
1875  bool *Fast) const {
1876  if (Fast) {
1877  switch (VT.getSizeInBits()) {
1878  default:
1879  // 8-byte and under are always assumed to be fast.
1880  *Fast = true;
1881  break;
1882  case 128:
1883  *Fast = !Subtarget.isUnalignedMem16Slow();
1884  break;
1885  case 256:
1886  *Fast = !Subtarget.isUnalignedMem32Slow();
1887  break;
1888  // TODO: What about AVX-512 (512-bit) accesses?
1889  }
1890  }
1891  // Misaligned accesses of any size are always allowed.
1892  return true;
1893 }
1894 
1895 /// Return the entry encoding for a jump table in the
1896 /// current function. The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1899  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900  // symbol.
1901  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1903 
1904  // Otherwise, use the normal jump table encoding heuristics.
1906 }
1907 
1909  return Subtarget.useSoftFloat();
1910 }
1911 
1913  ArgListTy &Args) const {
1914 
1915  // Only relabel X86-32 for C / Stdcall CCs.
1916  if (Subtarget.is64Bit())
1917  return;
1918  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1919  return;
1920  unsigned ParamRegs = 0;
1921  if (auto *M = MF->getFunction().getParent())
1922  ParamRegs = M->getNumberRegisterParameters();
1923 
1924  // Mark the first N integer arguments as being passed in registers.
1925  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1926  Type *T = Args[Idx].Ty;
1927  if (T->isPointerTy() || T->isIntegerTy())
1928  if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1929  unsigned numRegs = 1;
1930  if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1931  numRegs = 2;
1932  if (ParamRegs < numRegs)
1933  return;
1934  ParamRegs -= numRegs;
1935  Args[Idx].IsInReg = true;
1936  }
1937  }
1938 }
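// Illustrative only, not part of the original file: the module query above
// reads the "NumRegisterParameters" flag that clang emits for -mregparm, e.g.
//   !llvm.module.flags = !{!0}
//   !0 = !{i32 1, !"NumRegisterParameters", i32 3}
// so with -mregparm=3 the first three integer/pointer arguments of a 32-bit
// C or stdcall libcall are tagged IsInReg here.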
1939 
1940 const MCExpr *
1942  const MachineBasicBlock *MBB,
1943  unsigned uid,MCContext &Ctx) const{
1944  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1945  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1946  // entries.
1947  return MCSymbolRefExpr::create(MBB->getSymbol(),
1949 }
1950 
1951 /// Returns relocation base for the given PIC jumptable.
1953  SelectionDAG &DAG) const {
1954  if (!Subtarget.is64Bit())
1955  // This doesn't have SDLoc associated with it, but is not really the
1956  // same as a Register.
1957  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1958  getPointerTy(DAG.getDataLayout()));
1959  return Table;
1960 }
1961 
1962 /// This returns the relocation base for the given PIC jumptable,
1963 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1966  MCContext &Ctx) const {
1967  // X86-64 uses RIP relative addressing based on the jump table label.
1968  if (Subtarget.isPICStyleRIPRel())
1969  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1970 
1971  // Otherwise, the reference is relative to the PIC base.
1972  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1973 }
1974 
1975 std::pair<const TargetRegisterClass *, uint8_t>
1977  MVT VT) const {
1978  const TargetRegisterClass *RRC = nullptr;
1979  uint8_t Cost = 1;
1980  switch (VT.SimpleTy) {
1981  default:
1983  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1984  RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1985  break;
1986  case MVT::x86mmx:
1987  RRC = &X86::VR64RegClass;
1988  break;
1989  case MVT::f32: case MVT::f64:
1990  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1991  case MVT::v4f32: case MVT::v2f64:
1992  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1993  case MVT::v8f32: case MVT::v4f64:
1994  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1995  case MVT::v16f32: case MVT::v8f64:
1996  RRC = &X86::VR128XRegClass;
1997  break;
1998  }
1999  return std::make_pair(RRC, Cost);
2000 }
2001 
2002 unsigned X86TargetLowering::getAddressSpace() const {
2003  if (Subtarget.is64Bit())
2004  return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2005  return 256;
2006 }
2007 
2008 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2009  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2010  (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2011 }
2012 
2014  unsigned Offset, unsigned AddressSpace) {
2017  Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2018 }
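// Illustrative sketch, not part of the original file: this is roughly how the
// helper above forms the glibc x86-64 stack-guard address %fs:0x28, i.e. an
// inttoptr of the offset in the %fs address space (257). The function name is
// hypothetical.
static Value *exampleGlibcGuardAddr(IRBuilder<> &IRB) {
  return IRB.CreateIntToPtr(
      IRB.getInt32(0x28),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(257 /* %fs */));
}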
2019 
2021  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2022  // tcbhead_t; use it instead of the usual global variable (see
2023  // sysdeps/{i386,x86_64}/nptl/tls.h)
2024  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2025  if (Subtarget.isTargetFuchsia()) {
2026  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2027  return SegmentOffset(IRB, 0x10, getAddressSpace());
2028  } else {
2029  // %fs:0x28, unless we're using a Kernel code model, in which case
2030  // it's %gs:0x28. gs:0x14 on i386.
2031  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2032  return SegmentOffset(IRB, Offset, getAddressSpace());
2033  }
2034  }
2035 
2036  return TargetLowering::getIRStackGuard(IRB);
2037 }
2038 
2040  // MSVC CRT provides functionality for stack protection.
2041  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2042  // MSVC CRT has a global variable holding security cookie.
2043  M.getOrInsertGlobal("__security_cookie",
2045 
2046  // MSVC CRT has a function to validate security cookie.
2047  auto *SecurityCheckCookie = cast<Function>(
2048  M.getOrInsertFunction("__security_check_cookie",
2051  SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2052  SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2053  return;
2054  }
2055  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2056  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2057  return;
2059 }
2060 
2062  // MSVC CRT has a global variable holding security cookie.
2063  if (Subtarget.getTargetTriple().isOSMSVCRT())
2064  return M.getGlobalVariable("__security_cookie");
2066 }
2067 
2069  // MSVC CRT has a function to validate security cookie.
2070  if (Subtarget.getTargetTriple().isOSMSVCRT())
2071  return M.getFunction("__security_check_cookie");
2073 }
2074 
2076  if (Subtarget.getTargetTriple().isOSContiki())
2077  return getDefaultSafeStackPointerLocation(IRB, false);
2078 
2079  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2080  // definition of TLS_SLOT_SAFESTACK in
2081  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2082  if (Subtarget.isTargetAndroid()) {
2083  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48;
2084  // %gs:0x24 on i386.
2085  unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2086  return SegmentOffset(IRB, Offset, getAddressSpace());
2087  }
2088 
2089  // Fuchsia is similar.
2090  if (Subtarget.isTargetFuchsia()) {
2091  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2092  return SegmentOffset(IRB, 0x18, getAddressSpace());
2093  }
2094 
2096 }
2097 
2099  unsigned DestAS) const {
2100  assert(SrcAS != DestAS && "Expected different address spaces!");
2101 
2102  return SrcAS < 256 && DestAS < 256;
2103 }
2104 
2105 //===----------------------------------------------------------------------===//
2106 // Return Value Calling Convention Implementation
2107 //===----------------------------------------------------------------------===//
2108 
2109 #include "X86GenCallingConv.inc"
2110 
2111 bool X86TargetLowering::CanLowerReturn(
2112  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2113  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2115  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2116  return CCInfo.CheckReturn(Outs, RetCC_X86);
2117 }
2118 
2119 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2120  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2121  return ScratchRegs;
2122 }
2123 
2124 /// Lowers mask values (v*i1) to the local register values
2125 /// \returns DAG node after lowering to register type
2126 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2127  const SDLoc &Dl, SelectionDAG &DAG) {
2128  EVT ValVT = ValArg.getValueType();
2129 
2130  if (ValVT == MVT::v1i1)
2131  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2132  DAG.getIntPtrConstant(0, Dl));
2133 
2134  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2135  (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2136  // Two stage lowering might be required
2137  // bitcast: v8i1 -> i8 / v16i1 -> i16
2138  // anyextend: i8 -> i32 / i16 -> i32
2139  EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2140  SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2141  if (ValLoc == MVT::i32)
2142  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2143  return ValToCopy;
2144  }
2145 
2146  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2147  (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2148  // One stage lowering is required
2149  // bitcast: v32i1 -> i32 / v64i1 -> i64
2150  return DAG.getBitcast(ValLoc, ValArg);
2151  }
2152 
2153  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2154 }
2155 
2156 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2158  const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2159  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2160  CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2161  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2162  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2163  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2164  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2165  "The value should reside in two registers");
2166 
2167  // Before splitting the value we cast it to i64
2168  Arg = DAG.getBitcast(MVT::i64, Arg);
2169 
2170  // Splitting the value into two i32 types
2171  SDValue Lo, Hi;
2172  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2173  DAG.getConstant(0, Dl, MVT::i32));
2174  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2175  DAG.getConstant(1, Dl, MVT::i32));
2176 
2177  // Attach the two i32 values to the corresponding registers
2178  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2179  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2180 }
2181 
2182 SDValue
2183 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2184  bool isVarArg,
2185  const SmallVectorImpl<ISD::OutputArg> &Outs,
2186  const SmallVectorImpl<SDValue> &OutVals,
2187  const SDLoc &dl, SelectionDAG &DAG) const {
2188  MachineFunction &MF = DAG.getMachineFunction();
2190 
2191  // In some cases we need to disable registers from the default CSR list.
2192  // For example, when they are used for argument passing.
2193  bool ShouldDisableCalleeSavedRegister =
2194  CallConv == CallingConv::X86_RegCall ||
2195  MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2196 
2197  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2198  report_fatal_error("X86 interrupts may not return any value");
2199 
2201  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2202  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2203 
2204  SDValue Flag;
2205  SmallVector<SDValue, 6> RetOps;
2206  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2207  // Operand #1 = Bytes To Pop
2208  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2209  MVT::i32));
2210 
2211  // Copy the result values into the output registers.
2212  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2213  ++I, ++OutsIndex) {
2214  CCValAssign &VA = RVLocs[I];
2215  assert(VA.isRegLoc() && "Can only return in registers!");
2216 
2217  // Add the register to the CalleeSaveDisableRegs list.
2218  if (ShouldDisableCalleeSavedRegister)
2220 
2221  SDValue ValToCopy = OutVals[OutsIndex];
2222  EVT ValVT = ValToCopy.getValueType();
2223 
2224  // Promote values to the appropriate types.
2225  if (VA.getLocInfo() == CCValAssign::SExt)
2226  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2227  else if (VA.getLocInfo() == CCValAssign::ZExt)
2228  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2229  else if (VA.getLocInfo() == CCValAssign::AExt) {
2230  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2231  ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2232  else
2233  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2234  }
2235  else if (VA.getLocInfo() == CCValAssign::BCvt)
2236  ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2237 
2239  "Unexpected FP-extend for return value.");
2240 
2241  // If this is x86-64, and we disabled SSE, we can't return FP values,
2242  // or SSE or MMX vectors.
2243  if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2244  VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2245  (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2246  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2247  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2248  } else if (ValVT == MVT::f64 &&
2249  (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2250  // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2251  // llvm-gcc has never done it right and no one has noticed, so this
2252  // should be OK for now.
2253  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2254  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2255  }
2256 
2257  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2258  // the RET instruction and handled by the FP Stackifier.
2259  if (VA.getLocReg() == X86::FP0 ||
2260  VA.getLocReg() == X86::FP1) {
2261  // If this is a copy from an xmm register to ST(0), use an FPExtend to
2262  // change the value to the FP stack register class.
2263  if (isScalarFPTypeInSSEReg(VA.getValVT()))
2264  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2265  RetOps.push_back(ValToCopy);
2266  // Don't emit a copytoreg.
2267  continue;
2268  }
2269 
2270  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2271  // which is returned in RAX / RDX.
2272  if (Subtarget.is64Bit()) {
2273  if (ValVT == MVT::x86mmx) {
2274  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2275  ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2276  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2277  ValToCopy);
2278  // If we don't have SSE2 available, convert to v4f32 so the generated
2279  // register is legal.
2280  if (!Subtarget.hasSSE2())
2281  ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2282  }
2283  }
2284  }
2285 
2287 
2288  if (VA.needsCustom()) {
2289  assert(VA.getValVT() == MVT::v64i1 &&
2290  "Currently the only custom case is when we split v64i1 to 2 regs");
2291 
2292  Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2293  Subtarget);
2294 
2295  assert(2 == RegsToPass.size() &&
2296  "Expecting two registers after Pass64BitArgInRegs");
2297 
2298  // Add the second register to the CalleeSaveDisableRegs list.
2299  if (ShouldDisableCalleeSavedRegister)
2300  MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2301  } else {
2302  RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2303  }
2304 
2305  // Add nodes to the DAG and add the values into the RetOps list
2306  for (auto &Reg : RegsToPass) {
2307  Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2308  Flag = Chain.getValue(1);
2309  RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2310  }
2311  }
2312 
2313  // Swift calling convention does not require we copy the sret argument
2314  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2315 
2316  // All x86 ABIs require that for returning structs by value we copy
2317  // the sret argument into %rax/%eax (depending on ABI) for the return.
2318  // We saved the argument into a virtual register in the entry block,
2319  // so now we copy the value out and into %rax/%eax.
2320  //
2321  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2322  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2323  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2324  // either case FuncInfo->setSRetReturnReg() will have been called.
2325  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2326  // When we have both sret and another return value, we should use the
2327  // original Chain stored in RetOps[0], instead of the current Chain updated
2328  // in the above loop. If we only have sret, RetOps[0] equals Chain.
2329 
2330  // For the case of sret and another return value, we have
2331  // Chain_0 at the function entry
2332  // Chain_1 = getCopyToReg(Chain_0) in the above loop
2333  // If we use Chain_1 in getCopyFromReg, we will have
2334  // Val = getCopyFromReg(Chain_1)
2335  // Chain_2 = getCopyToReg(Chain_1, Val) from below
2336 
2337  // getCopyToReg(Chain_0) will be glued together with
2338  // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2339  // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2340  // Data dependency from Unit B to Unit A due to usage of Val in
2341  // getCopyToReg(Chain_1, Val)
2342  // Chain dependency from Unit A to Unit B
2343 
2344  // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2345  SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2346  getPointerTy(MF.getDataLayout()));
2347 
2348  unsigned RetValReg
2349  = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2350  X86::RAX : X86::EAX;
2351  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2352  Flag = Chain.getValue(1);
2353 
2354  // RAX/EAX now acts like a return value.
2355  RetOps.push_back(
2356  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2357 
2358  // Add the returned register to the CalleeSaveDisableRegs list.
2359  if (ShouldDisableCalleeSavedRegister)
2360  MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2361  }
2362 
2363  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2364  const MCPhysReg *I =
2365  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2366  if (I) {
2367  for (; *I; ++I) {
2368  if (X86::GR64RegClass.contains(*I))
2369  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2370  else
2371  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2372  }
2373  }
2374 
2375  RetOps[0] = Chain; // Update chain.
2376 
2377  // Add the flag if we have it.
2378  if (Flag.getNode())
2379  RetOps.push_back(Flag);
2380 
2381  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2382  if (CallConv == CallingConv::X86_INTR)
2383  opcode = X86ISD::IRET;
2384  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2385 }
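// Illustrative sketch (hypothetical IR, not taken from this file): the sret
// handling above means that for a definition such as
//   %struct.S = type { [16 x i32] }
//   define void @f(%struct.S* sret %out) { ... ret void }
// the hidden %out pointer is copied back into RAX (EAX on ILP32 targets) and
// that register is appended to the RET operands as a live return value.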
2386 
2387 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2388  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2389  return false;
2390 
2391  SDValue TCChain = Chain;
2392  SDNode *Copy = *N->use_begin();
2393  if (Copy->getOpcode() == ISD::CopyToReg) {
2394  // If the copy has a glue operand, we conservatively assume it isn't safe to
2395  // perform a tail call.
2396  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2397  return false;
2398  TCChain = Copy->getOperand(0);
2399  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2400  return false;
2401 
2402  bool HasRet = false;
2403  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2404  UI != UE; ++UI) {
2405  if (UI->getOpcode() != X86ISD::RET_FLAG)
2406  return false;
2408  // If we are returning more than one value, we can definitely
2409  // not make a tail call; see PR19530.
2409  if (UI->getNumOperands() > 4)
2410  return false;
2411  if (UI->getNumOperands() == 4 &&
2412  UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2413  return false;
2414  HasRet = true;
2415  }
2416 
2417  if (!HasRet)
2418  return false;
2419 
2420  Chain = TCChain;
2421  return true;
2422 }
2423 
2424 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2425  ISD::NodeType ExtendKind) const {
2426  MVT ReturnMVT = MVT::i32;
2427 
2428  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2429  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2430  // The ABI does not require i1, i8 or i16 to be extended.
2431  //
2432  // On Darwin, there is code in the wild relying on Clang's old behaviour of
2433  // always extending i8/i16 return values, so keep doing that for now.
2434  // (PR26665).
2435  ReturnMVT = MVT::i8;
2436  }
2437 
2438  EVT MinVT = getRegisterType(Context, ReturnMVT);
2439  return VT.bitsLT(MinVT) ? MinVT : VT;
2440 }
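// Rough example of the rule above: a callee declared as
//   define zeroext i1 @flag()
// only has to produce a valid i8 on non-Darwin targets, while on Darwin the
// old Clang behaviour of widening i8/i16 returns to i32 is preserved
// (PR26665).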
2441 
2442 /// Reads two 32 bit registers and creates a 64 bit mask value.
2443 /// \param VA The current 32 bit value that needs to be assigned.
2444 /// \param NextVA The next 32 bit value that needs to be assigned.
2445 /// \param Root The parent DAG node.
2446 /// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
2447 /// glue purposes. In case the DAG is already using a
2448 /// physical register instead of a virtual one, we should glue
2449 /// our new SDValue to the InFlag SDValue.
2450 /// \return a new 64 bit SDValue.
2451 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2452  SDValue &Root, SelectionDAG &DAG,
2453  const SDLoc &Dl, const X86Subtarget &Subtarget,
2454  SDValue *InFlag = nullptr) {
2455  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2456  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2457  assert(VA.getValVT() == MVT::v64i1 &&
2458  "Expecting first location of 64 bit width type");
2459  assert(NextVA.getValVT() == VA.getValVT() &&
2460  "The locations should have the same type");
2461  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2462  "The values should reside in two registers");
2463 
2464  SDValue Lo, Hi;
2465  unsigned Reg;
2466  SDValue ArgValueLo, ArgValueHi;
2467 
2468  MachineFunction &MF = DAG.getMachineFunction();
2469  const TargetRegisterClass *RC = &X86::GR32RegClass;
2470 
2471  // Read a 32 bit value from the registers
2472  if (nullptr == InFlag) {
2473  // When no physical register is present,
2474  // create an intermediate virtual register
2475  Reg = MF.addLiveIn(VA.getLocReg(), RC);
2476  ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2477  Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2478  ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2479  } else {
2480  // When a physical register is available read the value from it and glue
2481  // the reads together.
2482  ArgValueLo =
2483  DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2484  *InFlag = ArgValueLo.getValue(2);
2485  ArgValueHi =
2486  DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2487  *InFlag = ArgValueHi.getValue(2);
2488  }
2489 
2490  // Convert the i32 type into v32i1 type
2491  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2492 
2493  // Convert the i32 type into v32i1 type
2494  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2495 
2496  // Concatenate the two values together
2497  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2498 }
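// Sketch of the case handled above, assuming a 32-bit regcall target: a v64i1
// mask argument arrives split across two GR32 locations; each half is read as
// an i32, bitcast to v32i1, and the two halves are concatenated back into a
// single v64i1 value, low half first.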
2499 
2500 /// The function will lower a register of various sizes (8/16/32/64)
2501 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2502 /// \returns a DAG node containing the operand after lowering to the mask type.
2503 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2504  const EVT &ValLoc, const SDLoc &Dl,
2505  SelectionDAG &DAG) {
2506  SDValue ValReturned = ValArg;
2507 
2508  if (ValVT == MVT::v1i1)
2509  return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2510 
2511  if (ValVT == MVT::v64i1) {
2512  // On 32 bit targets this case is handled by getv64i1Argument.
2513  assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2514  // On 64 bit targets there is no need to truncate the value; only a bitcast.
2515  } else {
2516  MVT maskLen;
2517  switch (ValVT.getSimpleVT().SimpleTy) {
2518  case MVT::v8i1:
2519  maskLen = MVT::i8;
2520  break;
2521  case MVT::v16i1:
2522  maskLen = MVT::i16;
2523  break;
2524  case MVT::v32i1:
2525  maskLen = MVT::i32;
2526  break;
2527  default:
2528  llvm_unreachable("Expecting a vector of i1 types");
2529  }
2530 
2531  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2532  }
2533  return DAG.getBitcast(ValVT, ValReturned);
2534 }
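// For example, a v16i1 value that came back in a 32 bit location is lowered
// here as roughly
//   %t = truncate i32 %loc to i16
//   %m = bitcast i16 %t to v16i1
// whereas a v64i1 value in an i64 location only needs the final bitcast.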
2535 
2536 /// Lower the result values of a call into the
2537 /// appropriate copies out of appropriate physical registers.
2538 ///
2539 SDValue X86TargetLowering::LowerCallResult(
2540  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2541  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2542  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2543  uint32_t *RegMask) const {
2544 
2545  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2546  // Assign locations to each value returned by this call.
2547  SmallVector<CCValAssign, 16> RVLocs;
2548  bool Is64Bit = Subtarget.is64Bit();
2549  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2550  *DAG.getContext());
2551  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2552 
2553  // Copy all of the result registers out of their specified physreg.
2554  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2555  ++I, ++InsIndex) {
2556  CCValAssign &VA = RVLocs[I];
2557  EVT CopyVT = VA.getLocVT();
2558 
2559  // In some calling conventions we need to remove the used registers
2560  // from the register mask.
2561  if (RegMask) {
2562  for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2563  SubRegs.isValid(); ++SubRegs)
2564  RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2565  }
2566 
2567  // If this is x86-64, and we disabled SSE, we can't return FP values
2568  if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2569  ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2570  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2571  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2572  }
2573 
2574  // If we prefer to use the value in xmm registers, copy it out as f80 and
2575  // use a truncate to move it from fp stack reg to xmm reg.
2576  bool RoundAfterCopy = false;
2577  if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2578  isScalarFPTypeInSSEReg(VA.getValVT())) {
2579  if (!Subtarget.hasX87())
2580  report_fatal_error("X87 register return with X87 disabled");
2581  CopyVT = MVT::f80;
2582  RoundAfterCopy = (CopyVT != VA.getLocVT());
2583  }
2584 
2585  SDValue Val;
2586  if (VA.needsCustom()) {
2587  assert(VA.getValVT() == MVT::v64i1 &&
2588  "Currently the only custom case is when we split v64i1 to 2 regs");
2589  Val =
2590  getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2591  } else {
2592  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2593  .getValue(1);
2594  Val = Chain.getValue(0);
2595  InFlag = Chain.getValue(2);
2596  }
2597 
2598  if (RoundAfterCopy)
2599  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2600  // This truncation won't change the value.
2601  DAG.getIntPtrConstant(1, dl));
2602 
2603  if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2604  if (VA.getValVT().isVector() &&
2605  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2606  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2607  // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2608  Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2609  } else
2610  Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2611  }
2612 
2613  InVals.push_back(Val);
2614  }
2615 
2616  return Chain;
2617 }
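// Note on the FP0/FP1 path above: a scalar FP value returned on the x87 stack
// but preferred in an SSE register is copied out as f80 and then rounded,
// conceptually
//   %t = copy-from-reg FP0 : f80
//   %r = fp_round %t to f64   (trunc flag = 1, known not to change the value)
// which is why CopyVT is forced to MVT::f80 and RoundAfterCopy is set.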
2618 
2619 //===----------------------------------------------------------------------===//
2620 // C & StdCall & Fast Calling Convention implementation
2621 //===----------------------------------------------------------------------===//
2622 // The StdCall calling convention is standard for many Windows API
2623 // routines. It differs from the C calling convention just a little: the
2624 // callee cleans up the stack instead of the caller, and symbols are
2625 // decorated in a particular way. It doesn't support any vector arguments.
2626 // For info on fast calling convention see Fast Calling Convention (tail call)
2627 // implementation LowerX86_32FastCCCallTo.
2628 
2629 /// CallIsStructReturn - Determines whether a call uses struct return
2630 /// semantics.
2631 enum StructReturnType {
2632  NotStructReturn,
2633  RegStructReturn,
2634  StackStructReturn
2635  };
2636 static StructReturnType
2637 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2638  if (Outs.empty())
2639  return NotStructReturn;
2640 
2641  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2642  if (!Flags.isSRet())
2643  return NotStructReturn;
2644  if (Flags.isInReg() || IsMCU)
2645  return RegStructReturn;
2646  return StackStructReturn;
2647 }
2648 
2649 /// Determines whether a function uses struct return semantics.
2650 static StructReturnType
2651 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2652  if (Ins.empty())
2653  return NotStructReturn;
2654 
2655  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2656  if (!Flags.isSRet())
2657  return NotStructReturn;
2658  if (Flags.isInReg() || IsMCU)
2659  return RegStructReturn;
2660  return StackStructReturn;
2661 }
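// For example, a 32-bit function such as
//   define void @f(%struct.S* sret %agg.result)
// is classified as StackStructReturn, while the same sret argument marked
// inreg, or any sret argument on an MCU target, is RegStructReturn.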
2662 
2663 /// Make a copy of an aggregate at address specified by "Src" to address
2664 /// "Dst" with size and alignment information specified by the specific
2665 /// parameter attribute. The copy will be passed as a byval function parameter.
2666 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2667  SDValue Chain, ISD::ArgFlagsTy Flags,
2668  SelectionDAG &DAG, const SDLoc &dl) {
2669  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2670 
2671  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2672  /*isVolatile*/false, /*AlwaysInline=*/true,
2673  /*isTailCall*/false,
2674  MachinePointerInfo(), MachinePointerInfo());
2675 }
2676 
2677 /// Return true if the calling convention is one that we can guarantee TCO for.
2678 static bool canGuaranteeTCO(CallingConv::ID CC) {
2679  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2680  CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2681  CC == CallingConv::HHVM);
2682 }
2683 
2684 /// Return true if we might ever do TCO for calls with this calling convention.
2685 static bool mayTailCallThisCC(CallingConv::ID CC) {
2686  switch (CC) {
2687  // C calling conventions:
2688  case CallingConv::C:
2689  case CallingConv::Win64:
2690  case CallingConv::X86_64_SysV:
2691  // Callee pop conventions:
2692  case CallingConv::X86_ThisCall:
2693  case CallingConv::X86_StdCall:
2694  case CallingConv::X86_VectorCall:
2695  case CallingConv::X86_FastCall:
2696  return true;
2697  default:
2698  return canGuaranteeTCO(CC);
2699  }
2700 }
2701 
2702 /// Return true if the function is being made into a tailcall target by
2703 /// changing its ABI.
2704 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2705  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2706 }
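// Rough example: with -tailcallopt (Options.GuaranteedTailCallOpt) a fastcc
// caller doing
//   %r = tail call fastcc i32 @callee(i32 1)
//   ret i32 %r
// is guaranteed to be lowered as a tail call; without the flag it becomes a
// sibcall only if the usual eligibility checks succeed.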
2707 
2708 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2709  auto Attr =
2710  CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2711  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2712  return false;
2713 
2714  ImmutableCallSite CS(CI);
2715  CallingConv::ID CalleeCC = CS.getCallingConv();
2716  if (!mayTailCallThisCC(CalleeCC))
2717  return false;
2718 
2719  return true;
2720 }
2721 
2722 SDValue
2723 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2724  const SmallVectorImpl<ISD::InputArg> &Ins,
2725  const SDLoc &dl, SelectionDAG &DAG,
2726  const CCValAssign &VA,
2727  MachineFrameInfo &MFI, unsigned i) const {
2728  // Create the nodes corresponding to a load from this parameter slot.
2729  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2730  bool AlwaysUseMutable = shouldGuaranteeTCO(
2731  CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2732  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2733  EVT ValVT;
2734  MVT PtrVT = getPointerTy(DAG.getDataLayout());
2735 
2736  // If value is passed by pointer we have address passed instead of the value
2737  // itself. No need to extend if the mask value and location share the same
2738  // absolute size.
2739  bool ExtendedInMem =
2740  VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2741  VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2742 
2743  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2744  ValVT = VA.getLocVT();
2745  else
2746  ValVT = VA.getValVT();
2747 
2748  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2749  // taken by a return address.
2750  int Offset = 0;
2751  if (CallConv == CallingConv::X86_INTR) {
2752  // X86 interrupts may take one or two arguments.
2753  // On the stack there will be no return address as in a regular call.
2754  // The offset of the last argument needs to be set to -4/-8 bytes.
2755  // The offset of the first argument (when there are two) should be set to 0 bytes.
2756  Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2757  if (Subtarget.is64Bit() && Ins.size() == 2) {
2758  // The stack pointer needs to be realigned for 64 bit handlers with error
2759  // code, so the argument offset changes by 8 bytes.
2760  Offset += 8;
2761  }
2762  }
2763 
2764  // FIXME: For now, all byval parameter objects are marked mutable. This can be
2765  // changed with more analysis.
2766  // In case of tail call optimization mark all arguments mutable. Since they
2767  // could be overwritten by lowering of arguments in case of a tail call.
2768  if (Flags.isByVal()) {
2769  unsigned Bytes = Flags.getByValSize();
2770  if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2771  int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2772  // Adjust SP offset of interrupt parameter.
2773  if (CallConv == CallingConv::X86_INTR) {
2774  MFI.setObjectOffset(FI, Offset);
2775  }
2776  return DAG.getFrameIndex(FI, PtrVT);
2777  }
2778 
2779  // This is an argument in memory. We might be able to perform copy elision.
2780  if (Flags.isCopyElisionCandidate()) {
2781  EVT ArgVT = Ins[i].ArgVT;
2782  SDValue PartAddr;
2783  if (Ins[i].PartOffset == 0) {
2784  // If this is a one-part value or the first part of a multi-part value,
2785  // create a stack object for the entire argument value type and return a
2786  // load from our portion of it. This assumes that if the first part of an
2787  // argument is in memory, the rest will also be in memory.
2788  int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2789  /*Immutable=*/false);
2790  PartAddr = DAG.getFrameIndex(FI, PtrVT);
2791  return DAG.getLoad(
2792  ValVT, dl, Chain, PartAddr,
2793  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2794  } else {
2795  // This is not the first piece of an argument in memory. See if there is
2796  // already a fixed stack object including this offset. If so, assume it
2797  // was created by the PartOffset == 0 branch above and create a load from
2798  // the appropriate offset into it.
2799  int64_t PartBegin = VA.getLocMemOffset();
2800  int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2801  int FI = MFI.getObjectIndexBegin();
2802  for (; MFI.isFixedObjectIndex(FI); ++FI) {
2803  int64_t ObjBegin = MFI.getObjectOffset(FI);
2804  int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2805  if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2806  break;
2807  }
2808  if (MFI.isFixedObjectIndex(FI)) {
2809  SDValue Addr =
2810  DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2811  DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2812  return DAG.getLoad(
2813  ValVT, dl, Chain, Addr,
2814  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2815  Ins[i].PartOffset));
2816  }
2817  }
2818  }
2819 
2820  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2821  VA.getLocMemOffset(), isImmutable);
2822 
2823  // Set SExt or ZExt flag.
2824  if (VA.getLocInfo() == CCValAssign::ZExt) {
2825  MFI.setObjectZExt(FI, true);
2826  } else if (VA.getLocInfo() == CCValAssign::SExt) {
2827  MFI.setObjectSExt(FI, true);
2828  }
2829 
2830  // Adjust SP offset of interrupt parameter.
2831  if (CallConv == CallingConv::X86_INTR) {
2832  MFI.setObjectOffset(FI, Offset);
2833  }
2834 
2835  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2836  SDValue Val = DAG.getLoad(
2837  ValVT, dl, Chain, FIN,
2838  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2839  return ExtendedInMem
2840  ? (VA.getValVT().isVector()
2841  ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2842  : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2843  : Val;
2844 }
2845 
2846 // FIXME: Get this from tablegen.
2847 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2848  const X86Subtarget &Subtarget) {
2849  assert(Subtarget.is64Bit());
2850 
2851  if (Subtarget.isCallingConvWin64(CallConv)) {
2852  static const MCPhysReg GPR64ArgRegsWin64[] = {
2853  X86::RCX, X86::RDX, X86::R8, X86::R9
2854  };
2855  return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2856  }
2857 
2858  static const MCPhysReg GPR64ArgRegs64Bit[] = {
2859  X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2860  };
2861  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2862 }
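// For reference: under the SysV x86-64 convention the first integer arguments
// of a call such as f(a, b, c) go in RDI, RSI and RDX, while the same call
// under Win64 uses RCX, RDX and R8 (plus the 32 byte shadow area that is
// allocated elsewhere in this file).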
2863 
2864 // FIXME: Get this from tablegen.
2865 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2866  CallingConv::ID CallConv,
2867  const X86Subtarget &Subtarget) {
2868  assert(Subtarget.is64Bit());
2869  if (Subtarget.isCallingConvWin64(CallConv)) {
2870  // The XMM registers which might contain var arg parameters are shadowed
2871  // in their paired GPR. So we only need to save the GPR to their home
2872  // slots.
2873  // TODO: __vectorcall will change this.
2874  return None;
2875  }
2876 
2877  const Function &F = MF.getFunction();
2878  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2879  bool isSoftFloat = Subtarget.useSoftFloat();
2880  assert(!(isSoftFloat && NoImplicitFloatOps) &&
2881  "SSE register cannot be used when SSE is disabled!");
2882  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2883  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2884  // registers.
2885  return None;
2886 
2887  static const MCPhysReg XMMArgRegs64Bit[] = {
2888  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2889  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2890  };
2891  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2892 }
2893 
2894 #ifndef NDEBUG
2895 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2896  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2897  [](const CCValAssign &A, const CCValAssign &B) -> bool {
2898  return A.getValNo() < B.getValNo();
2899  });
2900 }
2901 #endif
2902 
2903 SDValue X86TargetLowering::LowerFormalArguments(
2904  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2905  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2906  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2907  MachineFunction &MF = DAG.getMachineFunction();
2908  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2909  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2910 
2911  const Function &F = MF.getFunction();
2912  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
2913  F.getName() == "main")
2914  FuncInfo->setForceFramePointer(true);
2915 
2916  MachineFrameInfo &MFI = MF.getFrameInfo();
2917  bool Is64Bit = Subtarget.is64Bit();
2918  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2919 
2920  assert(
2921  !(isVarArg && canGuaranteeTCO(CallConv)) &&
2922  "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2923 
2924  if (CallConv == CallingConv::X86_INTR) {
2925  bool isLegal = Ins.size() == 1 ||
2926  (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2927  (!Is64Bit && Ins[1].VT == MVT::i32)));
2928  if (!isLegal)
2929  report_fatal_error("X86 interrupts may take one or two arguments");
2930  }
2931 
2932  // Assign locations to all of the incoming arguments.
2933  SmallVector<CCValAssign, 16> ArgLocs;
2934  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2935 
2936  // Allocate shadow area for Win64.
2937  if (IsWin64)
2938  CCInfo.AllocateStack(32, 8);
2939 
2940  CCInfo.AnalyzeArguments(Ins, CC_X86);
2941 
2942  // In vectorcall calling convention a second pass is required for the HVA
2943  // types.
2944  if (CallingConv::X86_VectorCall == CallConv) {
2945  CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2946  }
2947 
2948  // The next loop assumes that the locations are in the same order as the
2949  // input arguments.
2950  assert(isSortedByValueNo(ArgLocs) &&
2951  "Argument Location list must be sorted before lowering");
2952 
2953  SDValue ArgValue;
2954  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2955  ++I, ++InsIndex) {
2956  assert(InsIndex < Ins.size() && "Invalid Ins index");
2957  CCValAssign &VA = ArgLocs[I];
2958 
2959  if (VA.isRegLoc()) {
2960  EVT RegVT = VA.getLocVT();
2961  if (VA.needsCustom()) {
2962  assert(
2963  VA.getValVT() == MVT::v64i1 &&
2964  "Currently the only custom case is when we split v64i1 to 2 regs");
2965 
2966  // v64i1 values, in regcall calling convention, that are
2967  // compiled to 32 bit arch, are split up into two registers.
2968  ArgValue =
2969  getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2970  } else {
2971  const TargetRegisterClass *RC;
2972  if (RegVT == MVT::i32)
2973  RC = &X86::GR32RegClass;
2974  else if (Is64Bit && RegVT == MVT::i64)
2975  RC = &X86::GR64RegClass;
2976  else if (RegVT == MVT::f32)
2977  RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2978  else if (RegVT == MVT::f64)
2979  RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2980  else if (RegVT == MVT::f80)
2981  RC = &X86::RFP80RegClass;
2982  else if (RegVT == MVT::f128)
2983  RC = &X86::FR128RegClass;
2984  else if (RegVT.is512BitVector())
2985  RC = &X86::VR512RegClass;
2986  else if (RegVT.is256BitVector())
2987  RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2988  else if (RegVT.is128BitVector())
2989  RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2990  else if (RegVT == MVT::x86mmx)
2991  RC = &X86::VR64RegClass;
2992  else if (RegVT == MVT::v1i1)
2993  RC = &X86::VK1RegClass;
2994  else if (RegVT == MVT::v8i1)
2995  RC = &X86::VK8RegClass;
2996  else if (RegVT == MVT::v16i1)
2997  RC = &X86::VK16RegClass;
2998  else if (RegVT == MVT::v32i1)
2999  RC = &X86::VK32RegClass;
3000  else if (RegVT == MVT::v64i1)
3001  RC = &X86::VK64RegClass;
3002  else
3003  llvm_unreachable("Unknown argument type!");
3004 
3005  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3006  ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3007  }
3008 
3009  // If this is an 8 or 16-bit value, it is really passed promoted to 32
3010  // bits. Insert an assert[sz]ext to capture this, then truncate to the
3011  // right size.
3012  if (VA.getLocInfo() == CCValAssign::SExt)
3013  ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3014  DAG.getValueType(VA.getValVT()));
3015  else if (VA.getLocInfo() == CCValAssign::ZExt)
3016  ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3017  DAG.getValueType(VA.getValVT()));
3018  else if (VA.getLocInfo() == CCValAssign::BCvt)
3019  ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3020 
3021  if (VA.isExtInLoc()) {
3022  // Handle MMX values passed in XMM regs.
3023  if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3024  ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3025  else if (VA.getValVT().isVector() &&
3026  VA.getValVT().getScalarType() == MVT::i1 &&
3027  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3028  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3029  // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3030  ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3031  } else
3032  ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3033  }
3034  } else {
3035  assert(VA.isMemLoc());
3036  ArgValue =
3037  LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3038  }
3039 
3040  // If value is passed via pointer - do a load.
3041  if (VA.getLocInfo() == CCValAssign::Indirect)
3042  ArgValue =
3043  DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3044 
3045  InVals.push_back(ArgValue);
3046  }
3047 
3048  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3049  // Swift calling convention does not require we copy the sret argument
3050  // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3051  if (CallConv == CallingConv::Swift)
3052  continue;
3053 
3054  // All x86 ABIs require that for returning structs by value we copy the
3055  // sret argument into %rax/%eax (depending on ABI) for the return. Save
3056  // the argument into a virtual register so that we can access it from the
3057  // return points.
3058  if (Ins[I].Flags.isSRet()) {
3059  unsigned Reg = FuncInfo->getSRetReturnReg();
3060  if (!Reg) {
3061  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3062  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3063  FuncInfo->setSRetReturnReg(Reg);
3064  }
3065  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3066  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3067  break;
3068  }
3069  }
3070 
3071  unsigned StackSize = CCInfo.getNextStackOffset();
3072  // Align stack specially for tail calls.
3073  if (shouldGuaranteeTCO(CallConv,
3074  MF.getTarget().Options.GuaranteedTailCallOpt))
3075  StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3076 
3077  // If the function takes variable number of arguments, make a frame index for
3078  // the start of the first vararg value... for expansion of llvm.va_start. We
3079  // can skip this if there are no va_start calls.
3080  if (MFI.hasVAStart() &&
3081  (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3082  CallConv != CallingConv::X86_ThisCall))) {
3083  FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3084  }
3085 
3086  // Figure out if XMM registers are in use.
3087  assert(!(Subtarget.useSoftFloat() &&
3088  F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3089  "SSE register cannot be used when SSE is disabled!");
3090 
3091  // 64-bit calling conventions support varargs and register parameters, so we
3092  // have to do extra work to spill them in the prologue.
3093  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3094  // Find the first unallocated argument registers.
3095  ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3096  ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3097  unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3098  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3099  assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3100  "SSE register cannot be used when SSE is disabled!");
3101 
3102  // Gather all the live in physical registers.
3103  SmallVector<SDValue, 6> LiveGPRs;
3104  SmallVector<SDValue, 8> LiveXMMRegs;
3105  SDValue ALVal;
3106  for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3107  unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3108  LiveGPRs.push_back(
3109  DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3110  }
3111  if (!ArgXMMs.empty()) {
3112  unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3113  ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3114  for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3115  unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3116  LiveXMMRegs.push_back(
3117  DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3118  }
3119  }
3120 
3121  if (IsWin64) {
3122  // Get to the caller-allocated home save location. Add 8 to account
3123  // for the return address.
3124  int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3125  FuncInfo->setRegSaveFrameIndex(
3126  MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3127  // Fixup to set vararg frame on shadow area (4 x i64).
3128  if (NumIntRegs < 4)
3129  FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3130  } else {
3131  // For X86-64, if there are vararg parameters that are passed via
3132  // registers, then we must store them to their spots on the stack so
3133  // they may be loaded by dereferencing the result of va_next.
3134  FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3135  FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3136  FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3137  ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3138  }
3139 
3140  // Store the integer parameter registers.
3141  SmallVector<SDValue, 8> MemOps;
3142  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3143  getPointerTy(DAG.getDataLayout()));
3144  unsigned Offset = FuncInfo->getVarArgsGPOffset();
3145  for (SDValue Val : LiveGPRs) {
3146  SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3147  RSFIN, DAG.getIntPtrConstant(Offset, dl));
3148  SDValue Store =
3149  DAG.getStore(Val.getValue(1), dl, Val, FIN,
3150  MachinePointerInfo::getFixedStack(
3151  DAG.getMachineFunction(),
3152  FuncInfo->getRegSaveFrameIndex(), Offset));
3153  MemOps.push_back(Store);
3154  Offset += 8;
3155  }
3156 
3157  if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3158  // Now store the XMM (fp + vector) parameter registers.
3159  SmallVector<SDValue, 12> SaveXMMOps;
3160  SaveXMMOps.push_back(Chain);
3161  SaveXMMOps.push_back(ALVal);
3162  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3163  FuncInfo->getRegSaveFrameIndex(), dl));
3164  SaveXMMOps.push_back(DAG.getIntPtrConstant(
3165  FuncInfo->getVarArgsFPOffset(), dl));
3166  SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3167  LiveXMMRegs.end());
3168  MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3169  MVT::Other, SaveXMMOps));
3170  }
3171 
3172  if (!MemOps.empty())
3173  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3174  }
3175 
3176  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3177  // Find the largest legal vector type.
3178  MVT VecVT = MVT::Other;
3179  // FIXME: Only some x86_32 calling conventions support AVX512.
3180  if (Subtarget.hasAVX512() &&
3181  (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3182  CallConv == CallingConv::Intel_OCL_BI)))
3183  VecVT = MVT::v16f32;
3184  else if (Subtarget.hasAVX())
3185  VecVT = MVT::v8f32;
3186  else if (Subtarget.hasSSE2())
3187  VecVT = MVT::v4f32;
3188 
3189  // We forward some GPRs and some vector types.
3190  SmallVector<MVT, 2> RegParmTypes;
3191  MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3192  RegParmTypes.push_back(IntVT);
3193  if (VecVT != MVT::Other)
3194  RegParmTypes.push_back(VecVT);
3195 
3196  // Compute the set of forwarded registers. The rest are scratch.
3197  SmallVectorImpl<ForwardedRegister> &Forwards =
3198  FuncInfo->getForwardedMustTailRegParms();
3199  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3200 
3201  // Conservatively forward AL on x86_64, since it might be used for varargs.
3202  if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3203  unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3204  Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3205  }
3206 
3207  // Copy all forwards from physical to virtual registers.
3208  for (ForwardedRegister &F : Forwards) {
3209  // FIXME: Can we use a less constrained schedule?
3210  SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3211  F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3212  Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3213  }
3214  }
3215 
3216  // Some CCs need callee pop.
3217  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3218  MF.getTarget().Options.GuaranteedTailCallOpt)) {
3219  FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3220  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3221  // X86 interrupts must pop the error code (and the alignment padding) if
3222  // present.
3223  FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3224  } else {
3225  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3226  // If this is an sret function, the return should pop the hidden pointer.
3227  if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3228  !Subtarget.getTargetTriple().isOSMSVCRT() &&
3229  argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3230  FuncInfo->setBytesToPopOnReturn(4);
3231  }
3232 
3233  if (!Is64Bit) {
3234  // RegSaveFrameIndex is X86-64 only.
3235  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3236  if (CallConv == CallingConv::X86_FastCall ||
3237  CallConv == CallingConv::X86_ThisCall)
3238  // fastcc functions can't have varargs.
3239  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3240  }
3241 
3242  FuncInfo->setArgumentStackSize(StackSize);
3243 
3244  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3245  EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3246  if (Personality == EHPersonality::CoreCLR) {
3247  assert(Is64Bit);
3248  // TODO: Add a mechanism to frame lowering that will allow us to indicate
3249  // that we'd prefer this slot be allocated towards the bottom of the frame
3250  // (i.e. near the stack pointer after allocating the frame). Every
3251  // funclet needs a copy of this slot in its (mostly empty) frame, and the
3252  // offset from the bottom of this and each funclet's frame must be the
3253  // same, so the size of funclets' (mostly empty) frames is dictated by
3254  // how far this slot is from the bottom (since they allocate just enough
3255  // space to accommodate holding this slot at the correct offset).
3256  int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3257  EHInfo->PSPSymFrameIdx = PSPSymFI;
3258  }
3259  }
3260 
3261  if (CallConv == CallingConv::X86_RegCall ||
3262  F.hasFnAttribute("no_caller_saved_registers")) {
3263  MachineRegisterInfo &MRI = MF.getRegInfo();
3264  for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3265  MRI.disableCalleeSavedRegister(Pair.first);
3266  }
3267 
3268  return Chain;
3269 }
3270 
3271 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3272  SDValue Arg, const SDLoc &dl,
3273  SelectionDAG &DAG,
3274  const CCValAssign &VA,
3275  ISD::ArgFlagsTy Flags) const {
3276  unsigned LocMemOffset = VA.getLocMemOffset();
3277  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3278  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3279  StackPtr, PtrOff);
3280  if (Flags.isByVal())
3281  return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3282 
3283  return DAG.getStore(
3284  Chain, dl, Arg, PtrOff,
3285  MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3286 }
3287 
3288 /// Emit a load of return address if tail call
3289 /// optimization is performed and it is required.
3290 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3291  SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3292  bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3293  // Adjust the Return address stack slot.
3294  EVT VT = getPointerTy(DAG.getDataLayout());
3295  OutRetAddr = getReturnAddressFrameIndex(DAG);
3296 
3297  // Load the "old" Return address.
3298  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3299  return SDValue(OutRetAddr.getNode(), 1);
3300 }
3301 
3302 /// Emit a store of the return address if tail call
3303 /// optimization is performed and it is required (FPDiff!=0).
3305  SDValue Chain, SDValue RetAddrFrIdx,
3306  EVT PtrVT, unsigned SlotSize,
3307  int FPDiff, const SDLoc &dl) {
3308  // Store the return address to the appropriate stack slot.
3309  if (!FPDiff) return Chain;
3310  // Calculate the new stack slot for the return address.
3311  int NewReturnAddrFI =
3312  MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3313  false);
3314  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3315  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3317  DAG.getMachineFunction(), NewReturnAddrFI));
3318  return Chain;
3319 }
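// Sketch of the FPDiff bookkeeping: if the caller's frame pops 16 bytes of
// arguments but the tail callee needs 32, FPDiff is -16 and the return
// address is re-stored into a new fixed object at FPDiff - SlotSize, i.e. 16
// bytes lower, so it again sits just below the relocated argument area.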
3320 
3321 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3322 /// operation of specified width.
3323 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3324  SDValue V2) {
3325  unsigned NumElems = VT.getVectorNumElements();
3326  SmallVector<int, 8> Mask;
3327  Mask.push_back(NumElems);
3328  for (unsigned i = 1; i != NumElems; ++i)
3329  Mask.push_back(i);
3330  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3331 }
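// For example, for VT = v4f32 the mask built above is <4, 1, 2, 3>: element 0
// of the result is taken from V2 and elements 1-3 from V1, which is exactly
// the MOVSS/MOVSD-style "replace the low element" pattern.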
3332 
3333 SDValue
3334 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3335  SmallVectorImpl<SDValue> &InVals) const {
3336  SelectionDAG &DAG = CLI.DAG;
3337  SDLoc &dl = CLI.DL;
3338  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3339  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3340  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3341  SDValue Chain = CLI.Chain;
3342  SDValue Callee = CLI.Callee;
3343  CallingConv::ID CallConv = CLI.CallConv;
3344  bool &isTailCall = CLI.IsTailCall;
3345  bool isVarArg = CLI.IsVarArg;
3346 
3347  MachineFunction &MF = DAG.getMachineFunction();
3348  bool Is64Bit = Subtarget.is64Bit();
3349  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3350  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3351  bool IsSibcall = false;
3352  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3353  auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3354  const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3355  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3356  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3357  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3358 
3359  if (CallConv == CallingConv::X86_INTR)
3360  report_fatal_error("X86 interrupts may not be called directly");
3361 
3362  if (Attr.getValueAsString() == "true")
3363  isTailCall = false;
3364 
3365  if (Subtarget.isPICStyleGOT() &&
3366  !MF.getTarget().Options.GuaranteedTailCallOpt) {
3367  // If we are using a GOT, disable tail calls to external symbols with
3368  // default visibility. Tail calling such a symbol requires using a GOT
3369  // relocation, which forces early binding of the symbol. This breaks code
3370  // that require lazy function symbol resolution. Using musttail or
3371  // GuaranteedTailCallOpt will override this.
3372  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3373  if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3374  G->getGlobal()->hasDefaultVisibility()))
3375  isTailCall = false;
3376  }
3377 
3378  bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3379  if (IsMustTail) {
3380  // Force this to be a tail call. The verifier rules are enough to ensure
3381  // that we can lower this successfully without moving the return address
3382  // around.
3383  isTailCall = true;
3384  } else if (isTailCall) {
3385  // Check if it's really possible to do a tail call.
3386  isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3387  isVarArg, SR != NotStructReturn,
3388  MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3389  Outs, OutVals, Ins, DAG);
3390 
3391  // Sibcalls are automatically detected tailcalls which do not require
3392  // ABI changes.
3393  if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3394  IsSibcall = true;
3395 
3396  if (isTailCall)
3397  ++NumTailCalls;
3398  }
3399 
3400  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3401  "Var args not supported with calling convention fastcc, ghc or hipe");
3402 
3403  // Analyze operands of the call, assigning locations to each operand.
3404  SmallVector<CCValAssign, 16> ArgLocs;
3405  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3406 
3407  // Allocate shadow area for Win64.
3408  if (IsWin64)
3409  CCInfo.AllocateStack(32, 8);
3410 
3411  CCInfo.AnalyzeArguments(Outs, CC_X86);
3412 
3413  // In vectorcall calling convention a second pass is required for the HVA
3414  // types.
3415  if (CallingConv::X86_VectorCall == CallConv) {
3416  CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3417  }
3418 
3419  // Get a count of how many bytes are to be pushed on the stack.
3420  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3421  if (IsSibcall)
3422  // This is a sibcall. The memory operands are available in caller's
3423  // own caller's stack.
3424  NumBytes = 0;
3425  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3426  canGuaranteeTCO(CallConv))
3427  NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3428 
3429  int FPDiff = 0;
3430  if (isTailCall && !IsSibcall && !IsMustTail) {
3431  // Lower arguments at fp - stackoffset + fpdiff.
3432  unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3433 
3434  FPDiff = NumBytesCallerPushed - NumBytes;
3435 
3436  // Set the delta of movement of the returnaddr stackslot.
3437  // But only set if delta is greater than previous delta.
3438  if (FPDiff < X86Info->getTCReturnAddrDelta())
3439  X86Info->setTCReturnAddrDelta(FPDiff);
3440  }
3441 
3442  unsigned NumBytesToPush = NumBytes;
3443  unsigned NumBytesToPop = NumBytes;
3444 
3445  // If we have an inalloca argument, all stack space has already been allocated
3446  // for us and is right at the top of the stack. We don't support multiple
3447  // arguments passed in memory when using inalloca.
3448  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3449  NumBytesToPush = 0;
3450  if (!ArgLocs.back().isMemLoc())
3451  report_fatal_error("cannot use inalloca attribute on a register "
3452  "parameter");
3453  if (ArgLocs.back().getLocMemOffset() != 0)
3454  report_fatal_error("any parameter with the inalloca attribute must be "
3455  "the only memory argument");
3456  }
3457 
3458  if (!IsSibcall)
3459  Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3460  NumBytes - NumBytesToPush, dl);
3461 
3462  SDValue RetAddrFrIdx;
3463  // Load return address for tail calls.
3464  if (isTailCall && FPDiff)
3465  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3466  Is64Bit, FPDiff, dl);
3467 
3468  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3469  SmallVector<SDValue, 8> MemOpChains;
3470  SDValue StackPtr;
3471 
3472  // The next loop assumes that the locations are in the same order as the
3473  // input arguments.
3474  assert(isSortedByValueNo(ArgLocs) &&
3475  "Argument Location list must be sorted before lowering");
3476 
3477  // Walk the register/memloc assignments, inserting copies/loads. In the case
3478  // of tail call optimization, arguments are handled later.
3479  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3480  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3481  ++I, ++OutIndex) {
3482  assert(OutIndex < Outs.size() && "Invalid Out index");
3483  // Skip inalloca arguments, they have already been written.
3484  ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3485  if (Flags.isInAlloca())
3486  continue;
3487 
3488  CCValAssign &VA = ArgLocs[I];
3489  EVT RegVT = VA.getLocVT();
3490  SDValue Arg = OutVals[OutIndex];
3491  bool isByVal = Flags.isByVal();
3492 
3493  // Promote the value if needed.
3494  switch (VA.getLocInfo()) {
3495  default: llvm_unreachable("Unknown loc info!");
3496  case CCValAssign::Full: break;
3497  case CCValAssign::SExt:
3498  Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3499  break;
3500  case CCValAssign::ZExt:
3501  Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3502  break;
3503  case CCValAssign::AExt:
3504  if (Arg.getValueType().isVector() &&
3505  Arg.getValueType().getVectorElementType() == MVT::i1)
3506  Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3507  else if (RegVT.is128BitVector()) {
3508  // Special case: passing MMX values in XMM registers.
3509  Arg = DAG.getBitcast(MVT::i64, Arg);
3510  Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3511  Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3512  } else
3513  Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3514  break;
3515  case CCValAssign::BCvt:
3516  Arg = DAG.getBitcast(RegVT, Arg);
3517  break;
3518  case CCValAssign::Indirect: {
3519  // Store the argument.
3520  SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3521  int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3522  Chain = DAG.getStore(
3523  Chain, dl, Arg, SpillSlot,
3524  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3525  Arg = SpillSlot;
3526  break;
3527  }
3528  }
3529 
3530  if (VA.needsCustom()) {
3531  assert(VA.getValVT() == MVT::v64i1 &&
3532  "Currently the only custom case is when we split v64i1 to 2 regs");
3533  // Split v64i1 value into two registers
3534  Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3535  Subtarget);
3536  } else if (VA.isRegLoc()) {
3537  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3538  if (isVarArg && IsWin64) {
3539  // Win64 ABI requires argument XMM reg to be copied to the corresponding
3540  // shadow reg if callee is a varargs function.
3541  unsigned ShadowReg = 0;
3542  switch (VA.getLocReg()) {
3543  case X86::XMM0: ShadowReg = X86::RCX; break;
3544  case X86::XMM1: ShadowReg = X86::RDX; break;
3545  case X86::XMM2: ShadowReg = X86::R8; break;
3546  case X86::XMM3: ShadowReg = X86::R9; break;
3547  }
3548  if (ShadowReg)
3549  RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3550  }
3551  } else if (!IsSibcall && (!isTailCall || isByVal)) {
3552  assert(VA.isMemLoc());
3553  if (!StackPtr.getNode())
3554  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3555  getPointerTy(DAG.getDataLayout()));
3556  MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3557  dl, DAG, VA, Flags));
3558  }
3559  }
3560 
3561  if (!MemOpChains.empty())
3562  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3563 
3564  if (Subtarget.isPICStyleGOT()) {
3565  // ELF / PIC requires GOT in the EBX register before function calls via PLT
3566  // GOT pointer.
3567  if (!isTailCall) {
3568  RegsToPass.push_back(std::make_pair(
3569  unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3570  getPointerTy(DAG.getDataLayout()))));
3571  } else {
3572  // If we are tail calling and generating PIC/GOT style code load the
3573  // address of the callee into ECX. The value in ecx is used as target of
3574  // the tail jump. This is done to circumvent the ebx/callee-saved problem
3575  // for tail calls on PIC/GOT architectures. Normally we would just put the
3576  // address of GOT into ebx and then call target@PLT. But for tail calls
3577  // ebx would be restored (since ebx is callee saved) before jumping to the
3578  // target@PLT.
3579 
3580  // Note: The actual moving to ECX is done further down.
3581  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3582  if (G && !G->getGlobal()->hasLocalLinkage() &&
3583  G->getGlobal()->hasDefaultVisibility())
3584  Callee = LowerGlobalAddress(Callee, DAG);
3585  else if (isa<ExternalSymbolSDNode>(Callee))
3586  Callee = LowerExternalSymbol(Callee, DAG);
3587  }
3588  }
3589 
3590  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3591  // From AMD64 ABI document:
3592  // For calls that may call functions that use varargs or stdargs
3593  // (prototype-less calls or calls to functions containing ellipsis (...) in
3594  // the declaration) %al is used as hidden argument to specify the number
3595  // of SSE registers used. The contents of %al do not need to match exactly
3596  // the number of registers, but must be an upper bound on the number of SSE
3597  // registers used and must be in the range 0 - 8 inclusive.
3598 
3599  // Count the number of XMM registers allocated.
3600  static const MCPhysReg XMMArgRegs[] = {
3601  X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3602  X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3603  };
3604  unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3605  assert((Subtarget.hasSSE1() || !NumXMMRegs)
3606  && "SSE registers cannot be used when SSE is disabled");
3607 
3608  RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3609  DAG.getConstant(NumXMMRegs, dl,
3610  MVT::i8)));
3611  }
3612 
3613  if (isVarArg && IsMustTail) {
3614  const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3615  for (const auto &F : Forwards) {
3616  SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3617  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3618  }
3619  }
3620 
3621  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3622  // don't need this because the eligibility check rejects calls that require
3623  // shuffling arguments passed in memory.
3624  if (!IsSibcall && isTailCall) {
3625  // Force all the incoming stack arguments to be loaded from the stack
3626  // before any new outgoing arguments are stored to the stack, because the
3627  // outgoing stack slots may alias the incoming argument stack slots, and
3628  // the alias isn't otherwise explicit. This is slightly more conservative
3629  // than necessary, because it means that each store effectively depends
3630  // on every argument instead of just those arguments it would clobber.
3631  SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3632 
3633  SmallVector<SDValue, 8> MemOpChains2;
3634  SDValue FIN;
3635  int FI = 0;
3636  for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3637  ++I, ++OutsIndex) {
3638  CCValAssign &VA = ArgLocs[I];
3639 
3640  if (VA.isRegLoc()) {
3641  if (VA.needsCustom()) {
3642  assert((CallConv == CallingConv::X86_RegCall) &&
3643  "Expecting custom case only in regcall calling convention");
3644  // This means that we are in special case where one argument was
3645  // passed through two register locations - Skip the next location
3646  ++I;
3647  }
3648 
3649  continue;
3650  }
3651 
3652  assert(VA.isMemLoc());
3653  SDValue Arg = OutVals[OutsIndex];
3654  ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3655  // Skip inalloca arguments. They don't require any work.
3656  if (Flags.isInAlloca())
3657  continue;
3658  // Create frame index.
3659  int32_t Offset = VA.getLocMemOffset()+FPDiff;
3660  uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3661  FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3662  FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3663 
3664  if (Flags.isByVal()) {
3665  // Copy relative to framepointer.
3666  SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3667  if (!StackPtr.getNode())
3668  StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3669  getPointerTy(DAG.getDataLayout()));
3670  Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3671  StackPtr, Source);
3672 
3673  MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3674  ArgChain,
3675  Flags, DAG, dl));
3676  } else {
3677  // Store relative to framepointer.
3678  MemOpChains2.push_back(DAG.getStore(
3679  ArgChain, dl, Arg, FIN,
3680  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3681  }
3682  }
3683 
3684  if (!MemOpChains2.empty())
3685  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3686 
3687  // Store the return address to the appropriate stack slot.
3688  Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3689  getPointerTy(DAG.getDataLayout()),
3690  RegInfo->getSlotSize(), FPDiff, dl);
3691  }
3692 
3693  // Build a sequence of copy-to-reg nodes chained together with token chain
3694  // and flag operands which copy the outgoing args into registers.
3695  SDValue InFlag;
3696  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3697  Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3698  RegsToPass[i].second, InFlag);
3699  InFlag = Chain.getValue(1);
3700  }
3701 
3702  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3703  assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3704  // In the 64-bit large code model, we have to make all calls
3705  // through a register, since the call instruction's 32-bit
3706  // pc-relative offset may not be large enough to hold the whole
3707  // address.
3708  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3709  // If the callee is a GlobalAddress node (quite common, every direct call
3710  // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3711  // it.
3712  GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3713 
3714  // We should use extra load for direct calls to dllimported functions in
3715  // non-JIT mode.
3716  const GlobalValue *GV = G->getGlobal();
3717  if (!GV->hasDLLImportStorageClass()) {
3718  unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3719 
3720  Callee = DAG.getTargetGlobalAddress(
3721  GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3722 
3723  if (OpFlags == X86II::MO_GOTPCREL) {
3724  // Add a wrapper.
3725  Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3726  getPointerTy(DAG.getDataLayout()), Callee);
3727  // Add extra indirection
3728  Callee = DAG.getLoad(
3729  getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3730  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3731  }
3732  }
3733  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3734  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3735  unsigned char OpFlags =
3736  Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3737 
3738  Callee = DAG.getTargetExternalSymbol(
3739  S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);