AArch64ISelLowering.cpp (LLVM 7.0.0svn)
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the AArch64TargetLowering class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/Intrinsics.h"
59 #include "llvm/IR/Module.h"
60 #include "llvm/IR/OperandTraits.h"
61 #include "llvm/IR/Type.h"
62 #include "llvm/IR/Use.h"
63 #include "llvm/IR/Value.h"
64 #include "llvm/MC/MCRegisterInfo.h"
65 #include "llvm/Support/Casting.h"
66 #include "llvm/Support/CodeGen.h"
68 #include "llvm/Support/Compiler.h"
69 #include "llvm/Support/Debug.h"
71 #include "llvm/Support/KnownBits.h"
77 #include <algorithm>
78 #include <bitset>
79 #include <cassert>
80 #include <cctype>
81 #include <cstdint>
82 #include <cstdlib>
83 #include <iterator>
84 #include <limits>
85 #include <tuple>
86 #include <utility>
87 #include <vector>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "aarch64-lower"
92 
93 STATISTIC(NumTailCalls, "Number of tail calls");
94 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
95 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
96 
97 static cl::opt<bool>
98 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
99  cl::desc("Allow AArch64 SLI/SRI formation"),
100  cl::init(false));
101 
102 // FIXME: The necessary dtprel relocations don't seem to be supported
103 // well in the GNU bfd and gold linkers at the moment. Therefore, by
104 // default, for now, fall back to GeneralDynamic code generation.
106  "aarch64-elf-ldtls-generation", cl::Hidden,
107  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
108  cl::init(false));
109 
110 static cl::opt<bool>
111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
112  cl::desc("Enable AArch64 logical imm instruction "
113  "optimization"),
114  cl::init(true));
115 
116 /// Value type used for condition codes.
117 static const MVT MVT_CC = MVT::i32;
118 
119 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
120                                              const AArch64Subtarget &STI)
121  : TargetLowering(TM), Subtarget(&STI) {
122  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
123  // we have to make something up. Arbitrarily, choose ZeroOrOne.
125  // When comparing vectors the result sets the different elements in the
126  // vector to all-one or all-zero.
128 
129  // Set up the register classes.
130  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
131  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
132 
133  if (Subtarget->hasFPARMv8()) {
134  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
135  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
136  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
137  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
138  }
139 
140  if (Subtarget->hasNEON()) {
141  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
142  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
143  // Someone set us up the NEON.
144  addDRTypeForNEON(MVT::v2f32);
145  addDRTypeForNEON(MVT::v8i8);
146  addDRTypeForNEON(MVT::v4i16);
147  addDRTypeForNEON(MVT::v2i32);
148  addDRTypeForNEON(MVT::v1i64);
149  addDRTypeForNEON(MVT::v1f64);
150  addDRTypeForNEON(MVT::v4f16);
151 
152  addQRTypeForNEON(MVT::v4f32);
153  addQRTypeForNEON(MVT::v2f64);
154  addQRTypeForNEON(MVT::v16i8);
155  addQRTypeForNEON(MVT::v8i16);
156  addQRTypeForNEON(MVT::v4i32);
157  addQRTypeForNEON(MVT::v2i64);
158  addQRTypeForNEON(MVT::v8f16);
159  }
160 
161  // Compute derived properties from the register classes
163 
164  // Provide all sorts of operation actions
192 
196 
200 
202 
203  // Custom lowering hooks are needed for XOR
204  // to fold it into CSINC/CSINV.
207 
208  // Virtually no operation on f128 is legal, but LLVM can't expand them when
209  // there's a valid register class, so we need custom operations in most cases.
231 
232  // Lowering for many of the conversions is actually specified by the non-f128
233  // type. The LowerXXX function will be trivial when f128 isn't involved.
248 
249  // Variable arguments.
254 
255  // Variable-sized objects.
258 
259  if (Subtarget->isTargetWindows())
261  else
263 
264  // Constant pool entries
266 
267  // BlockAddress
269 
270  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
279 
280  // AArch64 lacks both left-rotate and popcount instructions.
283  for (MVT VT : MVT::vector_valuetypes()) {
286  }
287 
288  // AArch64 doesn't have {U|S}MUL_LOHI.
291 
294 
297  for (MVT VT : MVT::vector_valuetypes()) {
300  }
307 
308  // Custom lower Add/Sub/Mul with overflow.
321 
330  if (Subtarget->hasFullFP16())
332  else
334 
366 
367  if (!Subtarget->hasFullFP16()) {
390 
391  // promote v4f16 to v4f32 when that is known to be safe.
404 
420 
441  }
442 
443  // AArch64 has implementations of a lot of rounding-like FP operations.
444  for (MVT Ty : {MVT::f32, MVT::f64}) {
455  }
456 
457  if (Subtarget->hasFullFP16()) {
468  }
469 
471 
477 
478  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
479  // This requires the Performance Monitors extension.
480  if (Subtarget->hasPerfMon())
482 
483  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
484  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
485  // Issue __sincos_stret if available.
488  } else {
491  }
492 
493  // Make floating-point constants legal for the large code model, so they don't
494  // become loads from the constant pool.
495  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
498  }
499 
500  // AArch64 does not have floating-point extending loads, i1 sign-extending
501  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
502  for (MVT VT : MVT::fp_valuetypes()) {
507  }
508  for (MVT VT : MVT::integer_valuetypes())
510 
518 
521 
522  // Indexed loads and stores are supported.
523  for (unsigned im = (unsigned)ISD::PRE_INC;
539  }
540 
541  // Trap.
543 
544  // We combine OR nodes for bitfield operations.
546 
547  // Vector add and sub nodes may conceal a high-half opportunity.
548  // Also, try to fold ADD into CSINC/CSINV.
555 
559 
561 
568  if (Subtarget->supportsAddressTopByteIgnored())
570 
572 
575 
579 
581 
582  // In case of strict alignment, avoid an excessive number of byte wide stores.
586 
591 
593 
595 
597 
598  EnableExtLdPromotion = true;
599 
600  // Set required alignment.
602  // Set preferred alignments.
605 
606  // Only change the limit for entries in a jump table if specified by
607  // the subtarget, but not at the command line.
608  unsigned MaxJT = STI.getMaximumJumpTableSize();
609  if (MaxJT && getMaximumJumpTableSize() == 0)
611 
612  setHasExtractBitsInsn(true);
613 
615 
616  if (Subtarget->hasNEON()) {
617  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
618  // silliness like this:
644 
650 
652 
653  // AArch64 doesn't have a direct vector ->f32 conversion instruction for
654  // elements smaller than i32, so promote the input to i32 first.
659  // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
660  // -> v8f16 conversions.
665  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
670  // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
671  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
674 
677 
686 
687  // AArch64 doesn't have MUL.2d:
689  // Custom handling for some quad-vector types to detect MULL.
693 
694  // Vector reductions
695  for (MVT VT : MVT::integer_valuetypes()) {
701  }
702  for (MVT VT : MVT::fp_valuetypes()) {
705  }
706 
709  // Likewise, narrowing and extending vector loads/stores aren't handled
710  // directly.
711  for (MVT VT : MVT::vector_valuetypes()) {
713 
714  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
717  } else {
720  }
723 
725 
726  for (MVT InnerVT : MVT::vector_valuetypes()) {
727  setTruncStoreAction(VT, InnerVT, Expand);
728  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
729  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
730  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
731  }
732  }
733 
734  // AArch64 has implementations of a lot of rounding-like FP operations.
735  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
742  }
743  }
744 
746 }
747 
748 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
749  assert(VT.isVector() && "VT should be a vector type");
750 
751  if (VT.isFloatingPoint()) {
753  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
754  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
755  }
756 
757  // Mark vector float intrinsics as expand.
758  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
767 
768  // But we do support custom-lowering for FCOPYSIGN.
770  }
771 
784 
788  for (MVT InnerVT : MVT::all_valuetypes())
789  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
790 
791  // CNT supports only B element sizes.
792  if (VT != MVT::v8i8 && VT != MVT::v16i8)
794 
800 
803 
804  if (!VT.isFloatingPoint())
806 
807  // [SU][MIN|MAX] are available for all NEON types apart from i64.
808  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
809  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
810  setOperationAction(Opcode, VT, Legal);
811 
812  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
813  if (VT.isFloatingPoint() &&
814  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
815  for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
817  setOperationAction(Opcode, VT, Legal);
818 
819  if (Subtarget->isLittleEndian()) {
820  for (unsigned im = (unsigned)ISD::PRE_INC;
824  }
825  }
826 }
827 
828 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
829  addRegisterClass(VT, &AArch64::FPR64RegClass);
830  addTypeForNEON(VT, MVT::v2i32);
831 }
832 
833 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
834  addRegisterClass(VT, &AArch64::FPR128RegClass);
835  addTypeForNEON(VT, MVT::v4i32);
836 }
837 
839  EVT VT) const {
840  if (!VT.isVector())
841  return MVT::i32;
842   return VT.changeVectorElementTypeToInteger();
843 }
844 
845 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
846  const APInt &Demanded,
848  unsigned NewOpc) {
849  uint64_t OldImm = Imm, NewImm, Enc;
850  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
851 
852  // Return if the immediate is already all zeros, all ones, a bimm32 or a
853  // bimm64.
854  if (Imm == 0 || Imm == Mask ||
855  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
856  return false;
857 
858  unsigned EltSize = Size;
859  uint64_t DemandedBits = Demanded.getZExtValue();
860 
861  // Clear bits that are not demanded.
862  Imm &= DemandedBits;
863 
864  while (true) {
865  // The goal here is to set the non-demanded bits in a way that minimizes
866  // the number of switching between 0 and 1. In order to achieve this goal,
867  // we set the non-demanded bits to the value of the preceding demanded bits.
868  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
869  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
870  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
871  // The final result is 0b11000011.
872  uint64_t NonDemandedBits = ~DemandedBits;
873  uint64_t InvertedImm = ~Imm & DemandedBits;
874  uint64_t RotatedImm =
875  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
876  NonDemandedBits;
877  uint64_t Sum = RotatedImm + NonDemandedBits;
878  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
879  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
880  NewImm = (Imm | Ones) & Mask;
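    // Illustrating with the 8-bit example above (EltSize = 8,
    // DemandedBits = 0b01100101, Imm = 0b01000001 after masking):
    // InvertedImm = 0b00100100, RotatedImm = 0b00001000, Sum = 0b10100010,
    // Ones = 0b10000010, and NewImm = 0b11000011 as described.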
881 
882  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
883  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
884  // we halve the element size and continue the search.
885  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
886  break;
887 
888  // We cannot shrink the element size any further if it is 2-bits.
889  if (EltSize == 2)
890  return false;
891 
892  EltSize /= 2;
893  Mask >>= EltSize;
894  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
895 
896  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
897  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
898  return false;
899 
900  // Merge the upper and lower halves of Imm and DemandedBits.
901  Imm |= Hi;
902  DemandedBits |= DemandedBitsHi;
903  }
904 
905  ++NumOptimizedImms;
906 
907  // Replicate the element across the register width.
908  while (EltSize < Size) {
909  NewImm |= NewImm << EltSize;
910  EltSize *= 2;
911  }
912 
913  (void)OldImm;
914  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
915  "demanded bits should never be altered");
916  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
917 
918  // Create the new constant immediate node.
919  EVT VT = Op.getValueType();
920  SDLoc DL(Op);
921  SDValue New;
922 
923  // If the new constant immediate is all-zeros or all-ones, let the target
924  // independent DAG combine optimize this node.
925  if (NewImm == 0 || NewImm == OrigMask) {
926  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
927  TLO.DAG.getConstant(NewImm, DL, VT));
928  // Otherwise, create a machine node so that target independent DAG combine
929  // doesn't undo this optimization.
930  } else {
931  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
932  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
933  New = SDValue(
934  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
935  }
936 
937  return TLO.CombineTo(Op, New);
938 }
939 
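// For example, an AND mask that is not encodable as a logical immediate would
// otherwise be materialized with a MOV/MOVK sequence followed by a
// register-register AND; if the bits actually demanded by the user can be
// covered by an encodable immediate, a single ANDWri/ANDXri suffices.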
940 bool AArch64TargetLowering::targetShrinkDemandedConstant(
941     SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
942  // Delay this optimization to as late as possible.
943  if (!TLO.LegalOps)
944  return false;
945 
946   if (!EnableOptimizeLogicalImm)
947     return false;
948 
949  EVT VT = Op.getValueType();
950  if (VT.isVector())
951  return false;
952 
953  unsigned Size = VT.getSizeInBits();
954  assert((Size == 32 || Size == 64) &&
955  "i32 or i64 is expected after legalization.");
956 
957  // Exit early if we demand all bits.
958  if (Demanded.countPopulation() == Size)
959  return false;
960 
961  unsigned NewOpc;
962  switch (Op.getOpcode()) {
963  default:
964  return false;
965  case ISD::AND:
966  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
967  break;
968  case ISD::OR:
969  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
970  break;
971  case ISD::XOR:
972  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
973  break;
974  }
975   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
976   if (!C)
977  return false;
978  uint64_t Imm = C->getZExtValue();
979  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
980 }
981 
982 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
983 /// Mask are known to be either zero or one and return them Known.
984 void AArch64TargetLowering::computeKnownBitsForTargetNode(
985     const SDValue Op, KnownBits &Known,
986  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
987  switch (Op.getOpcode()) {
988  default:
989  break;
990  case AArch64ISD::CSEL: {
991  KnownBits Known2;
992  DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
993  DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
994  Known.Zero &= Known2.Zero;
995  Known.One &= Known2.One;
996  break;
997  }
998  case ISD::INTRINSIC_W_CHAIN: {
999  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1000  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1001  switch (IntID) {
1002  default: return;
1003  case Intrinsic::aarch64_ldaxr:
1004  case Intrinsic::aarch64_ldxr: {
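      // An exclusive load only loads MemoryVT bits; e.g. an @llvm.aarch64.ldxr
      // of an i8 yields an i64 result whose bits 8..63 are known to be zero,
      // so a later (and x, 255) can be folded away.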
1005  unsigned BitWidth = Known.getBitWidth();
1006  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1007  unsigned MemBits = VT.getScalarSizeInBits();
1008  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1009  return;
1010  }
1011  }
1012  break;
1013  }
1014   case ISD::INTRINSIC_WO_CHAIN:
1015   case ISD::INTRINSIC_VOID: {
1016  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1017  switch (IntNo) {
1018  default:
1019  break;
1020  case Intrinsic::aarch64_neon_umaxv:
1021  case Intrinsic::aarch64_neon_uminv: {
1022  // Figure out the datatype of the vector operand. The UMINV instruction
1023  // will zero extend the result, so we can mark as known zero all the
1024  // bits larger than the element datatype. 32-bit or larger doesn't need
1025  // this as those are legal types and will be handled by isel directly.
1026  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1027  unsigned BitWidth = Known.getBitWidth();
1028  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1029  assert(BitWidth >= 8 && "Unexpected width!");
1030  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1031  Known.Zero |= Mask;
1032  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1033  assert(BitWidth >= 16 && "Unexpected width!");
1034  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1035  Known.Zero |= Mask;
1036  }
1037  break;
1038  } break;
1039  }
1040  }
1041  }
1042 }
1043 
1044 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1045                                                   EVT) const {
1046  return MVT::i64;
1047 }
1048 
1049 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1050                                                            unsigned AddrSpace,
1051  unsigned Align,
1052  bool *Fast) const {
1053  if (Subtarget->requiresStrictAlign())
1054  return false;
1055 
1056  if (Fast) {
1057  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1058  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1059  // See comments in performSTORECombine() for more details about
1060  // these conditions.
1061 
1062  // Code that uses clang vector extensions can mark that it
1063  // wants unaligned accesses to be treated as fast by
1064  // underspecifying alignment to be 1 or 2.
1065  Align <= 2 ||
1066 
1067  // Disregard v2i64. Memcpy lowering produces those and splitting
1068  // them regresses performance on micro-benchmarks and olden/bh.
1069  VT == MVT::v2i64;
1070  }
1071  return true;
1072 }
1073 
1074 FastISel *
1075 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1076                                       const TargetLibraryInfo *libInfo) const {
1077  return AArch64::createFastISel(funcInfo, libInfo);
1078 }
1079 
1080 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1081  switch ((AArch64ISD::NodeType)Opcode) {
1082  case AArch64ISD::FIRST_NUMBER: break;
1083  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1084  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1085  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1086  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1087  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1088  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1089  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1090  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1091  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1092  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1093  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1094  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1095  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1096  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1097  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1098  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1099  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1100  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1101  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1102  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1103  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1104  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1105  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1106  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1107  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1108  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1109  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1110  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1111  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1112  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1113  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1114  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1115  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1116  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1117  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1118  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1119  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1120  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1121  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1122  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1123  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1124  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1125  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1126  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1127  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1128  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1129  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1130  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1131  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1132  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1133  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1134  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1135  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1136  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1137  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1138  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1139  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1140  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1141  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1142  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1143  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1144  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1145  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1146  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1147  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1148  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1149  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1150  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1151  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1152  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1153  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1154  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1155  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1156  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1157  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1158  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1159  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1160  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1161  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1162  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1163  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1164  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1165  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1166  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1167  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1168  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1169  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1170  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1171  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1172  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1173  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1174  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1175  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1176  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1177  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1178  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1179  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1180  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1181  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1182  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1183  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1184  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1185  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1186  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1187  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1188  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1189  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1190  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1191  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1192  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1193  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1194  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1195  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1196  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1197  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1198  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1199  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1200  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1201  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1202  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1203  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1204  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1205  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1206  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1207  }
1208  return nullptr;
1209 }
1210 
1211 MachineBasicBlock *
1212 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1213                                     MachineBasicBlock *MBB) const {
1214  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1215  // phi node:
1216 
1217  // OrigBB:
1218  // [... previous instrs leading to comparison ...]
1219  // b.ne TrueBB
1220  // b EndBB
1221  // TrueBB:
1222  // ; Fallthrough
1223  // EndBB:
1224  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1225 
1226  MachineFunction *MF = MBB->getParent();
1227  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1228  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1229  DebugLoc DL = MI.getDebugLoc();
1230  MachineFunction::iterator It = ++MBB->getIterator();
1231 
1232  unsigned DestReg = MI.getOperand(0).getReg();
1233  unsigned IfTrueReg = MI.getOperand(1).getReg();
1234  unsigned IfFalseReg = MI.getOperand(2).getReg();
1235  unsigned CondCode = MI.getOperand(3).getImm();
1236  bool NZCVKilled = MI.getOperand(4).isKill();
1237 
1238  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1239  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1240  MF->insert(It, TrueBB);
1241  MF->insert(It, EndBB);
1242 
1243  // Transfer rest of current basic-block to EndBB
1244  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1245  MBB->end());
1246  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1247 
1248  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1249  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1250  MBB->addSuccessor(TrueBB);
1251  MBB->addSuccessor(EndBB);
1252 
1253  // TrueBB falls through to the end.
1254  TrueBB->addSuccessor(EndBB);
1255 
1256  if (!NZCVKilled) {
1257  TrueBB->addLiveIn(AArch64::NZCV);
1258  EndBB->addLiveIn(AArch64::NZCV);
1259  }
1260 
1261  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1262  .addReg(IfTrueReg)
1263  .addMBB(TrueBB)
1264  .addReg(IfFalseReg)
1265  .addMBB(MBB);
1266 
1267  MI.eraseFromParent();
1268  return EndBB;
1269 }
1270 
1271 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1272     MachineInstr &MI, MachineBasicBlock *BB) const {
1273  switch (MI.getOpcode()) {
1274  default:
1275 #ifndef NDEBUG
1276  MI.dump();
1277 #endif
1278  llvm_unreachable("Unexpected instruction for custom inserter!");
1279 
1280  case AArch64::F128CSEL:
1281  return EmitF128CSEL(MI, BB);
1282 
1283  case TargetOpcode::STACKMAP:
1284  case TargetOpcode::PATCHPOINT:
1285  return emitPatchPoint(MI, BB);
1286  }
1287 }
1288 
1289 //===----------------------------------------------------------------------===//
1290 // AArch64 Lowering private implementation.
1291 //===----------------------------------------------------------------------===//
1292 
1293 //===----------------------------------------------------------------------===//
1294 // Lowering Code
1295 //===----------------------------------------------------------------------===//
1296 
1297 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1298 /// CC
1299 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1300   switch (CC) {
1301  default:
1302  llvm_unreachable("Unknown condition code!");
1303  case ISD::SETNE:
1304  return AArch64CC::NE;
1305  case ISD::SETEQ:
1306  return AArch64CC::EQ;
1307  case ISD::SETGT:
1308  return AArch64CC::GT;
1309  case ISD::SETGE:
1310  return AArch64CC::GE;
1311  case ISD::SETLT:
1312  return AArch64CC::LT;
1313  case ISD::SETLE:
1314  return AArch64CC::LE;
1315  case ISD::SETUGT:
1316  return AArch64CC::HI;
1317  case ISD::SETUGE:
1318  return AArch64CC::HS;
1319  case ISD::SETULT:
1320  return AArch64CC::LO;
1321  case ISD::SETULE:
1322  return AArch64CC::LS;
1323  }
1324 }
1325 
1326 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1327 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1328                                   AArch64CC::CondCode &CondCode,
1329                                   AArch64CC::CondCode &CondCode2) {
1330  CondCode2 = AArch64CC::AL;
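  // Only two FP predicates (SETONE and SETUEQ) need a second condition code;
  // when CondCode2 != AL the caller must test both conditions, e.g. with a
  // second CSEL or conditional branch.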
1331  switch (CC) {
1332  default:
1333  llvm_unreachable("Unknown FP condition!");
1334  case ISD::SETEQ:
1335  case ISD::SETOEQ:
1336  CondCode = AArch64CC::EQ;
1337  break;
1338  case ISD::SETGT:
1339  case ISD::SETOGT:
1340  CondCode = AArch64CC::GT;
1341  break;
1342  case ISD::SETGE:
1343  case ISD::SETOGE:
1344  CondCode = AArch64CC::GE;
1345  break;
1346  case ISD::SETOLT:
1347  CondCode = AArch64CC::MI;
1348  break;
1349  case ISD::SETOLE:
1350  CondCode = AArch64CC::LS;
1351  break;
1352  case ISD::SETONE:
1353  CondCode = AArch64CC::MI;
1354  CondCode2 = AArch64CC::GT;
1355  break;
1356  case ISD::SETO:
1357  CondCode = AArch64CC::VC;
1358  break;
1359  case ISD::SETUO:
1360  CondCode = AArch64CC::VS;
1361  break;
1362  case ISD::SETUEQ:
1363  CondCode = AArch64CC::EQ;
1364  CondCode2 = AArch64CC::VS;
1365  break;
1366  case ISD::SETUGT:
1367  CondCode = AArch64CC::HI;
1368  break;
1369  case ISD::SETUGE:
1370  CondCode = AArch64CC::PL;
1371  break;
1372  case ISD::SETLT:
1373  case ISD::SETULT:
1374  CondCode = AArch64CC::LT;
1375  break;
1376  case ISD::SETLE:
1377  case ISD::SETULE:
1378  CondCode = AArch64CC::LE;
1379  break;
1380  case ISD::SETNE:
1381  case ISD::SETUNE:
1382  CondCode = AArch64CC::NE;
1383  break;
1384  }
1385 }
1386 
1387 /// Convert a DAG fp condition code to an AArch64 CC.
1388 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1389 /// should be AND'ed instead of OR'ed.
1390 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1391                                      AArch64CC::CondCode &CondCode,
1392                                      AArch64CC::CondCode &CondCode2) {
1393  CondCode2 = AArch64CC::AL;
1394  switch (CC) {
1395  default:
1396  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1397  assert(CondCode2 == AArch64CC::AL);
1398  break;
1399  case ISD::SETONE:
1400  // (a one b)
1401  // == ((a olt b) || (a ogt b))
1402  // == ((a ord b) && (a une b))
1403  CondCode = AArch64CC::VC;
1404  CondCode2 = AArch64CC::NE;
1405  break;
1406  case ISD::SETUEQ:
1407  // (a ueq b)
1408  // == ((a uno b) || (a oeq b))
1409  // == ((a ule b) && (a uge b))
1410  CondCode = AArch64CC::PL;
1411  CondCode2 = AArch64CC::LE;
1412  break;
1413  }
1414 }
1415 
1416 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1417 /// CC usable with the vector instructions. Fewer operations are available
1418 /// without a real NZCV register, so we have to use less efficient combinations
1419 /// to get the same effect.
1420 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1421                                         AArch64CC::CondCode &CondCode,
1422                                         AArch64CC::CondCode &CondCode2,
1423  bool &Invert) {
1424  Invert = false;
1425  switch (CC) {
1426  default:
1427  // Mostly the scalar mappings work fine.
1428  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1429  break;
1430  case ISD::SETUO:
1431  Invert = true;
1432     LLVM_FALLTHROUGH;
1433   case ISD::SETO:
1434  CondCode = AArch64CC::MI;
1435  CondCode2 = AArch64CC::GE;
1436  break;
1437  case ISD::SETUEQ:
1438  case ISD::SETULT:
1439  case ISD::SETULE:
1440  case ISD::SETUGT:
1441  case ISD::SETUGE:
1442  // All of the compare-mask comparisons are ordered, but we can switch
1443  // between the two by a double inversion. E.g. ULE == !OGT.
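  // For example, a v4f32 SETULE compare is emitted as an ordered FCMGT and the
  // caller then inverts (NOTs) the resulting mask.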
1444  Invert = true;
1445  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1446  break;
1447  }
1448 }
1449 
1450 static bool isLegalArithImmed(uint64_t C) {
1451  // Matches AArch64DAGToDAGISel::SelectArithImmed().
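  // That is, a 12-bit unsigned immediate, optionally shifted left by 12:
  // e.g. 0xfff and 0xfff000 are legal, 0x1001 is not.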
1452  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1453  LLVM_DEBUG(dbgs() << "Is imm " << C
1454  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1455  return IsLegal;
1456 }
1457 
1458 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1459                               const SDLoc &dl, SelectionDAG &DAG) {
1460  EVT VT = LHS.getValueType();
1461  const bool FullFP16 =
1462  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1463 
1464  if (VT.isFloatingPoint()) {
1465  assert(VT != MVT::f128);
1466  if (VT == MVT::f16 && !FullFP16) {
1467  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1468  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1469  VT = MVT::f32;
1470  }
1471  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1472  }
1473 
1474  // The CMP instruction is just an alias for SUBS, and representing it as
1475  // SUBS means that it's possible to get CSE with subtract operations.
1476  // A later phase can perform the optimization of setting the destination
1477  // register to WZR/XZR if it ends up being unused.
1478  unsigned Opcode = AArch64ISD::SUBS;
1479 
1480  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
1481  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1482  // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
1483  // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
1484  // can be set differently by this operation. It comes down to whether
1485  // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1486  // everything is fine. If not then the optimization is wrong. Thus general
1487  // comparisons are only valid if op2 != 0.
1488 
1489  // So, finally, the only LLVM-native comparisons that don't mention C and V
1490  // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1491  // the absence of information about op2.
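  // For example, (seteq x, (sub 0, y)) is emitted as "cmn x, y" (an ADDS whose
  // result is discarded) rather than materializing the negated operand.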
1492  Opcode = AArch64ISD::ADDS;
1493  RHS = RHS.getOperand(1);
1494  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1495  !isUnsignedIntSetCC(CC)) {
1496  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1497  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1498  // of the signed comparisons.
1499  Opcode = AArch64ISD::ANDS;
1500  RHS = LHS.getOperand(1);
1501  LHS = LHS.getOperand(0);
1502  }
1503 
1504  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1505  .getValue(1);
1506 }
1507 
1508 /// \defgroup AArch64CCMP CMP;CCMP matching
1509 ///
1510 /// These functions deal with the formation of CMP;CCMP;... sequences.
1511 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1512 /// a comparison. They set the NZCV flags to a predefined value if their
1513 /// predicate is false. This allows expressing arbitrary conjunctions, for
1514 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1515 /// expressed as:
1516 /// cmp A
1517 /// ccmp B, inv(CB), CA
1518 /// check for CB flags
1519 ///
1520 /// In general we can create code for arbitrary "... (and (and A B) C)"
1521 /// sequences. We can also implement some "or" expressions, because "(or A B)"
1522 /// is equivalent to "not (and (not A) (not B))" and we can implement some
1523 /// negation operations:
1524 /// We can negate the results of a single comparison by inverting the flags
1525 /// used when the predicate fails and inverting the flags tested in the next
1526 /// instruction; We can also negate the results of the whole previous
1527 /// conditional compare sequence by inverting the flags tested in the next
1528 /// instruction. However there is no way to negate the result of a partial
1529 /// sequence.
1530 ///
1531 /// Therefore on encountering an "or" expression we can negate the subtree on
1532 /// one side and have to be able to push the negate to the leafs of the subtree
1533 /// on the other side (see also the comments in code). As complete example:
1534 /// "or (or (setCA (cmp A)) (setCB (cmp B)))
1535 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1536 /// is transformed to
1537 /// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
1538 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1539 /// and implemented as:
1540 /// cmp C
1541 /// ccmp D, inv(CD), CC
1542 /// ccmp A, CA, inv(CD)
1543 /// ccmp B, CB, inv(CA)
1544 /// check for CB flags
1545 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented
1546 /// by conditional compare sequences.
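/// As a concrete example, "a == 0 && b == 7" can be lowered to:
///     cmp  w0, #0
///     ccmp w1, #7, #0, eq  ; if "eq" fails, NZCV is set to 0000, so the
///     cset w0, eq          ; final "eq" test fails as well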
1547 /// @{
1548 
1549 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1550 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1551                                          ISD::CondCode CC, SDValue CCOp,
1552                                          AArch64CC::CondCode Predicate,
1553                                          AArch64CC::CondCode OutCC,
1554                                          const SDLoc &DL, SelectionDAG &DAG) {
1555  unsigned Opcode = 0;
1556  const bool FullFP16 =
1557  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1558 
1559  if (LHS.getValueType().isFloatingPoint()) {
1560  assert(LHS.getValueType() != MVT::f128);
1561  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1562  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1563  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1564  }
1565  Opcode = AArch64ISD::FCCMP;
1566  } else if (RHS.getOpcode() == ISD::SUB) {
1567  SDValue SubOp0 = RHS.getOperand(0);
1568  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1569  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1570  Opcode = AArch64ISD::CCMN;
1571  RHS = RHS.getOperand(1);
1572  }
1573  }
1574  if (Opcode == 0)
1575  Opcode = AArch64ISD::CCMP;
1576 
1577  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1578   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1579   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1580  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1581  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1582 }
1583 
1584 /// Returns true if @p Val is a tree of AND/OR/SETCC operations.
1585 /// CanPushNegate is set to true if we can push a negate operation through
1586 /// the tree in a way that we are left with AND operations and negate operations
1587 /// at the leaves only. i.e. "not (or (or x y) z)" can be changed to
1588 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
1589 /// brought into such a form.
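/// For example, "(or (setcc a) (and (setcc b) (setcc c)))" is accepted (with
/// CanNegate = false, because the AND side cannot be negated), whereas
/// "(and (or (setcc a) (setcc b)) (or (setcc c) (setcc d)))" is rejected since
/// both operands would need their output flags negated.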
1590 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
1591  unsigned Depth = 0) {
1592  if (!Val.hasOneUse())
1593  return false;
1594  unsigned Opcode = Val->getOpcode();
1595  if (Opcode == ISD::SETCC) {
1596  if (Val->getOperand(0).getValueType() == MVT::f128)
1597  return false;
1598  CanNegate = true;
1599  return true;
1600  }
1601  // Protect against exponential runtime and stack overflow.
1602  if (Depth > 6)
1603  return false;
1604  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1605  SDValue O0 = Val->getOperand(0);
1606  SDValue O1 = Val->getOperand(1);
1607  bool CanNegateL;
1608  if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
1609  return false;
1610  bool CanNegateR;
1611  if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
1612  return false;
1613 
1614  if (Opcode == ISD::OR) {
1615  // For an OR expression we need to be able to negate at least one side or
1616  // we cannot do the transformation at all.
1617  if (!CanNegateL && !CanNegateR)
1618  return false;
1619  // We can however change a (not (or x y)) to (and (not x) (not y)) if we
1620  // can negate the x and y subtrees.
1621  CanNegate = CanNegateL && CanNegateR;
1622  } else {
1623  // If the operands are OR expressions then we finally need to negate their
1624  // outputs, we can only do that for the operand emitted last, by
1625  // negating OutCC, not for both operands.
1626  bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
1627  bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
1628  if (NeedsNegOutL && NeedsNegOutR)
1629  return false;
1630  // We cannot negate an AND operation (it would become an OR),
1631  CanNegate = false;
1632  }
1633  return true;
1634  }
1635  return false;
1636 }
1637 
1638 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1639 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1640 /// Tries to transform the given i1 producing node @p Val to a series compare
1641 /// and conditional compare operations. @returns an NZCV flags producing node
1642 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1643 /// transformation was not possible.
1644 /// On recursive invocations @p PushNegate may be set to true to have negation
1645 /// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
1646 /// for the comparisons in the current subtree; @p Depth limits the search
1647 /// depth to avoid stack overflow.
1648 static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
1649     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1650     AArch64CC::CondCode Predicate) {
1651  // We're at a tree leaf, produce a conditional comparison operation.
1652  unsigned Opcode = Val->getOpcode();
1653  if (Opcode == ISD::SETCC) {
1654  SDValue LHS = Val->getOperand(0);
1655  SDValue RHS = Val->getOperand(1);
1656  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1657  bool isInteger = LHS.getValueType().isInteger();
1658  if (Negate)
1659  CC = getSetCCInverse(CC, isInteger);
1660  SDLoc DL(Val);
1661  // Determine OutCC and handle FP special case.
1662  if (isInteger) {
1663  OutCC = changeIntCCToAArch64CC(CC);
1664  } else {
1666  AArch64CC::CondCode ExtraCC;
1667  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1668  // Some floating point conditions can't be tested with a single condition
1669  // code. Construct an additional comparison in this case.
1670  if (ExtraCC != AArch64CC::AL) {
1671  SDValue ExtraCmp;
1672  if (!CCOp.getNode())
1673  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1674  else
1675  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1676  ExtraCC, DL, DAG);
1677  CCOp = ExtraCmp;
1678  Predicate = ExtraCC;
1679  }
1680  }
1681 
1682  // Produce a normal comparison if we are first in the chain
1683  if (!CCOp)
1684  return emitComparison(LHS, RHS, CC, DL, DAG);
1685  // Otherwise produce a ccmp.
1686  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1687  DAG);
1688  }
1689  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
1690  "Valid conjunction/disjunction tree");
1691 
1692  // Check if both sides can be transformed.
1693  SDValue LHS = Val->getOperand(0);
1694  SDValue RHS = Val->getOperand(1);
1695 
1696  // In case of an OR we need to negate our operands and the result.
1697  // (A v B) <=> not(not(A) ^ not(B))
1698  bool NegateOpsAndResult = Opcode == ISD::OR;
1699  // We can negate the results of all previous operations by inverting the
1700  // predicate flags giving us a free negation for one side. The other side
1701  // must be negatable by itself.
1702  if (NegateOpsAndResult) {
1703  // See which side we can negate.
1704  bool CanNegateL;
1705  bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
1706  assert(isValidL && "Valid conjunction/disjunction tree");
1707  (void)isValidL;
1708 
1709 #ifndef NDEBUG
1710  bool CanNegateR;
1711  bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
1712  assert(isValidR && "Valid conjunction/disjunction tree");
1713  assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
1714 #endif
1715 
1716  // Order the side which we cannot negate to RHS so we can emit it first.
1717  if (!CanNegateL)
1718  std::swap(LHS, RHS);
1719  } else {
1720  bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
1721  assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
1722  "Valid conjunction/disjunction tree");
1723  // Order the side where we need to negate the output flags to RHS so it
1724  // gets emitted first.
1725  if (NeedsNegOutL)
1726  std::swap(LHS, RHS);
1727  }
1728 
1729  // Emit RHS. If we want to negate the tree we only need to push a negate
1730  // through if we are already in a PushNegate case, otherwise we can negate
1731  // the "flags to test" afterwards.
1732  AArch64CC::CondCode RHSCC;
1733  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
1734  CCOp, Predicate);
1735  if (NegateOpsAndResult && !Negate)
1736  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1737  // Emit LHS. We may need to negate it.
1738  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
1739  NegateOpsAndResult, CmpR,
1740  RHSCC);
1741  // If we transformed an OR to an AND then we have to negate the result
1742  // (or absorb the Negate parameter).
1743  if (NegateOpsAndResult && !Negate)
1744  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1745  return CmpL;
1746 }
1747 
1748 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1749 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1750 /// \see emitConjunctionDisjunctionTreeRec().
1751 static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
1752                                               AArch64CC::CondCode &OutCC) {
1753  bool CanNegate;
1754  if (!isConjunctionDisjunctionTree(Val, CanNegate))
1755  return SDValue();
1756 
1757  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
1758  AArch64CC::AL);
1759 }
1760 
1761 /// @}
1762 
1763 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1764                              SDValue &AArch64cc, SelectionDAG &DAG,
1765  const SDLoc &dl) {
1766  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1767  EVT VT = RHS.getValueType();
1768  uint64_t C = RHSC->getZExtValue();
1769  if (!isLegalArithImmed(C)) {
1770  // Constant does not fit, try adjusting it by one?
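      // For example, "x < 0x1001" (SETLT) cannot encode 0x1001, but it is
      // equivalent to "x <= 0x1000" (SETLE), and 0x1000 is a legal arithmetic
      // immediate.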
1771  switch (CC) {
1772  default:
1773  break;
1774  case ISD::SETLT:
1775  case ISD::SETGE:
1776  if ((VT == MVT::i32 && C != 0x80000000 &&
1777  isLegalArithImmed((uint32_t)(C - 1))) ||
1778  (VT == MVT::i64 && C != 0x80000000ULL &&
1779  isLegalArithImmed(C - 1ULL))) {
1780  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1781  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1782  RHS = DAG.getConstant(C, dl, VT);
1783  }
1784  break;
1785  case ISD::SETULT:
1786  case ISD::SETUGE:
1787  if ((VT == MVT::i32 && C != 0 &&
1788  isLegalArithImmed((uint32_t)(C - 1))) ||
1789  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1790  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1791  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1792  RHS = DAG.getConstant(C, dl, VT);
1793  }
1794  break;
1795  case ISD::SETLE:
1796  case ISD::SETGT:
1797  if ((VT == MVT::i32 && C != INT32_MAX &&
1798  isLegalArithImmed((uint32_t)(C + 1))) ||
1799  (VT == MVT::i64 && C != INT64_MAX &&
1800  isLegalArithImmed(C + 1ULL))) {
1801  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1802  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1803  RHS = DAG.getConstant(C, dl, VT);
1804  }
1805  break;
1806  case ISD::SETULE:
1807  case ISD::SETUGT:
1808  if ((VT == MVT::i32 && C != UINT32_MAX &&
1809  isLegalArithImmed((uint32_t)(C + 1))) ||
1810  (VT == MVT::i64 && C != UINT64_MAX &&
1811  isLegalArithImmed(C + 1ULL))) {
1812  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1813  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1814  RHS = DAG.getConstant(C, dl, VT);
1815  }
1816  break;
1817  }
1818  }
1819  }
1820  SDValue Cmp;
1821  AArch64CC::CondCode AArch64CC;
1822  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1823  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1824 
1825  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1826  // For the i8 operand, the largest immediate is 255, so this can be easily
1827  // encoded in the compare instruction. For the i16 operand, however, the
1828  // largest immediate cannot be encoded in the compare.
1829  // Therefore, use a sign extending load and cmn to avoid materializing the
1830  // -1 constant. For example,
1831  // movz w1, #65535
1832  // ldrh w0, [x0, #0]
1833  // cmp w0, w1
1834  // >
1835  // ldrsh w0, [x0, #0]
1836  // cmn w0, #1
1837  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1838  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1839  // ensure both the LHS and RHS are truly zero extended and to make sure the
1840  // transformation is profitable.
1841  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1842  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1843  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1844  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1845  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1846  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1847  SDValue SExt =
1848  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1849  DAG.getValueType(MVT::i16));
1850  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1851  RHS.getValueType()),
1852  CC, dl, DAG);
1853  AArch64CC = changeIntCCToAArch64CC(CC);
1854  }
1855  }
1856 
1857  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
1858  if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
1859  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
1860  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
1861  }
1862  }
1863  }
1864 
1865  if (!Cmp) {
1866  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1867  AArch64CC = changeIntCCToAArch64CC(CC);
1868  }
1869  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
1870  return Cmp;
1871 }
1872 
1873 static std::pair<SDValue, SDValue>
1874 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
1875   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1876  "Unsupported value type");
1877  SDValue Value, Overflow;
1878  SDLoc DL(Op);
1879  SDValue LHS = Op.getOperand(0);
1880  SDValue RHS = Op.getOperand(1);
1881  unsigned Opc = 0;
1882  switch (Op.getOpcode()) {
1883  default:
1884  llvm_unreachable("Unknown overflow instruction!");
1885  case ISD::SADDO:
1886  Opc = AArch64ISD::ADDS;
1887  CC = AArch64CC::VS;
1888  break;
1889  case ISD::UADDO:
1890  Opc = AArch64ISD::ADDS;
1891  CC = AArch64CC::HS;
1892  break;
1893  case ISD::SSUBO:
1894  Opc = AArch64ISD::SUBS;
1895  CC = AArch64CC::VS;
1896  break;
1897  case ISD::USUBO:
1898  Opc = AArch64ISD::SUBS;
1899  CC = AArch64CC::LO;
1900  break;
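  // For the add/sub cases above a single flag-setting instruction suffices;
  // e.g. an i32 uaddo becomes "adds" with overflow tested via HS (carry set),
  // while usubo tests LO (borrow).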
1901  // Multiply needs a little bit extra work.
1902  case ISD::SMULO:
1903  case ISD::UMULO: {
1904  CC = AArch64CC::NE;
1905  bool IsSigned = Op.getOpcode() == ISD::SMULO;
1906  if (Op.getValueType() == MVT::i32) {
1907  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1908  // For a 32 bit multiply with overflow check we want the instruction
1909  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1910  // need to generate the following pattern:
1911  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
1912  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1913  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1914  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1915  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1916  DAG.getConstant(0, DL, MVT::i64));
1917  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1918  // operation. We need to clear out the upper 32 bits, because we used a
1919  // widening multiply that wrote all 64 bits. In the end this should be a
1920  // noop.
1921  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1922  if (IsSigned) {
1923  // The signed overflow check requires more than just a simple check for
1924  // any bit set in the upper 32 bits of the result. These bits could be
1925  // just the sign bits of a negative number. To perform the overflow
1926  // check we arithmetically shift the lower 32 bits of the result right by
1927  // 31 (replicating the sign bit) and then compare that with the upper 32 bits.
1928  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1929  DAG.getConstant(32, DL, MVT::i64));
1930  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1931  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1932  DAG.getConstant(31, DL, MVT::i64));
1933  // It is important that LowerBits is last, otherwise the arithmetic
1934  // shift will not be folded into the compare (SUBS).
1935  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1936  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1937  .getValue(1);
1938  } else {
1939  // The overflow check for unsigned multiply is easy. We only need to
1940  // check if any of the upper 32 bits are set. This can be done with a
1941  // CMP (shifted register). For that we need to generate the following
1942  // pattern:
1943  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
1944  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1945  DAG.getConstant(32, DL, MVT::i64));
1946  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1947  Overflow =
1948  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1949  DAG.getConstant(0, DL, MVT::i64),
1950  UpperBits).getValue(1);
1951  }
1952  break;
1953  }
1954  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1955  // For the 64 bit multiply
1956  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1957  if (IsSigned) {
1958  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1959  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1960  DAG.getConstant(63, DL, MVT::i64));
1961  // It is important that LowerBits is last, otherwise the arithmetic
1962  // shift will not be folded into the compare (SUBS).
1963  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1964  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1965  .getValue(1);
1966  } else {
1967  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1968  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1969  Overflow =
1970  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1971  DAG.getConstant(0, DL, MVT::i64),
1972  UpperBits).getValue(1);
1973  }
1974  break;
1975  }
1976  } // switch (...)
1977 
1978  if (Opc) {
1979  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1980 
1981  // Emit the AArch64 operation with overflow check.
1982  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1983  Overflow = Value.getValue(1);
1984  }
1985  return std::make_pair(Value, Overflow);
1986 }
1987 
1988 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1989  RTLIB::Libcall Call) const {
1990  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1991  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
1992 }
1993 
1994 // Returns true if the given Op is the overflow flag result of an overflow
1995 // intrinsic operation.
1996 static bool isOverflowIntrOpRes(SDValue Op) {
1997  unsigned Opc = Op.getOpcode();
1998  return (Op.getResNo() == 1 &&
1999  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2000  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2001 }
2002 
2003 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2004   SDValue Sel = Op.getOperand(0);
2005  SDValue Other = Op.getOperand(1);
2006  SDLoc dl(Sel);
2007 
2008  // If the operand is an overflow checking operation, invert the condition
2009  // code and kill the Not operation. I.e., transform:
2010  // (xor (overflow_op_bool, 1))
2011  // -->
2012  // (csel 1, 0, invert(cc), overflow_op_bool)
2013  // ... which later gets transformed to just a cset instruction with an
2014  // inverted condition code, rather than a cset + eor sequence.
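 // For example, (xor (uaddo(a, b).overflow), 1) becomes
 // (csel 1, 0, invert(HS), flags), which selects to a single CSET with the
 // inverted condition instead of a CSET followed by an EOR.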
2015  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2016  // Only lower legal XALUO ops.
2017  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2018  return SDValue();
2019 
2020  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2021  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2022  AArch64CC::CondCode CC;
2023  SDValue Value, Overflow;
2024  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2025  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2026  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2027  CCVal, Overflow);
2028  }
2029  // If neither operand is a SELECT_CC, give up.
2030  if (Sel.getOpcode() != ISD::SELECT_CC)
2031  std::swap(Sel, Other);
2032  if (Sel.getOpcode() != ISD::SELECT_CC)
2033  return Op;
2034 
2035  // The folding we want to perform is:
2036  // (xor x, (select_cc a, b, cc, 0, -1) )
2037  // -->
2038  // (csel x, (xor x, -1), cc ...)
2039  //
2040  // The latter will get matched to a CSINV instruction.
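 // In other words the result is either x or ~x depending on the comparison,
 // which maps directly onto CSINV Xd, Xn, Xn, cc.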
2041 
2042  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2043  SDValue LHS = Sel.getOperand(0);
2044  SDValue RHS = Sel.getOperand(1);
2045  SDValue TVal = Sel.getOperand(2);
2046  SDValue FVal = Sel.getOperand(3);
2047 
2048  // FIXME: This could be generalized to non-integer comparisons.
2049  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2050  return Op;
2051 
2052  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2053  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2054 
2055  // The values aren't constants, this isn't the pattern we're looking for.
2056  if (!CFVal || !CTVal)
2057  return Op;
2058 
2059  // We can commute the SELECT_CC by inverting the condition. This
2060  // might be needed to make this fit into a CSINV pattern.
2061  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2062  std::swap(TVal, FVal);
2063  std::swap(CTVal, CFVal);
2064  CC = ISD::getSetCCInverse(CC, true);
2065  }
2066 
2067  // If the constants line up, perform the transform!
2068  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2069  SDValue CCVal;
2070  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2071 
2072  FVal = Other;
2073  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2074  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2075 
2076  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2077  CCVal, Cmp);
2078  }
2079 
2080  return Op;
2081 }
2082 
2083 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2084  EVT VT = Op.getValueType();
2085 
2086  // Let legalize expand this if it isn't a legal type yet.
2087  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2088  return SDValue();
2089 
2090  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2091 
2092  unsigned Opc;
2093  bool ExtraOp = false;
2094  switch (Op.getOpcode()) {
2095  default:
2096  llvm_unreachable("Invalid code");
2097  case ISD::ADDC:
2098  Opc = AArch64ISD::ADDS;
2099  break;
2100  case ISD::SUBC:
2101  Opc = AArch64ISD::SUBS;
2102  break;
2103  case ISD::ADDE:
2104  Opc = AArch64ISD::ADCS;
2105  ExtraOp = true;
2106  break;
2107  case ISD::SUBE:
2108  Opc = AArch64ISD::SBCS;
2109  ExtraOp = true;
2110  break;
2111  }
2112 
2113  if (!ExtraOp)
2114  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2115  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2116  Op.getOperand(2));
2117 }
2118 
2119 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2120  // Let legalize expand this if it isn't a legal type yet.
2121  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2122  return SDValue();
2123 
2124  SDLoc dl(Op);
2125  AArch64CC::CondCode CC;
2126  // The actual operation that sets the overflow or carry flag.
2127  SDValue Value, Overflow;
2128  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2129 
2130  // We use 0 and 1 as false and true values.
2131  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2132  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2133 
2134  // We use an inverted condition, because the conditional select is inverted
2135  // too. This will allow it to be selected to a single instruction:
2136  // CSINC Wd, WZR, WZR, invert(cond).
2137  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2138  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2139  CCVal, Overflow);
2140 
2141  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2142  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2143 }
2144 
2145 // Prefetch operands are:
2146 // 1: Address to prefetch
2147 // 2: bool isWrite
2148 // 3: int locality (0 = no locality ... 3 = extreme locality)
2149 // 4: bool isDataCache
2150 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2151  SDLoc DL(Op);
2152  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2153  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2154  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2155 
2156  bool IsStream = !Locality;
2157  // When the locality number is set
2158  if (Locality) {
2159  // The front-end should have filtered out the out-of-range values
2160  assert(Locality <= 3 && "Prefetch locality out-of-range");
2161  // The locality degree is the opposite of the cache speed.
2162  // Put the number the other way around.
2163  // The encoding starts at 0 for level 1
2164  Locality = 3 - Locality;
2165  }
2166 
2167  // Build the mask value encoding the expected behavior.
2168  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2169  (!IsData << 3) | // IsDataCache bit
2170  (Locality << 1) | // Cache level bits
2171  (unsigned)IsStream; // Stream bit
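 // For example, a read prefetch of data with maximum locality (IsWrite = 0,
 // IsData = 1, input Locality = 3 so Locality becomes 0, IsStream = 0) encodes
 // as PrfOp = 0b00000, i.e. PLDL1KEEP; locality 0 selects the streaming form
 // PLDL1STRM instead.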
2172  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2173  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2174 }
2175 
2176 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2177  SelectionDAG &DAG) const {
2178  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2179 
2180  RTLIB::Libcall LC;
2181  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), MVT::f128);
2182 
2183  return LowerF128Call(Op, DAG, LC);
2184 }
2185 
2186 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2187  SelectionDAG &DAG) const {
2188  if (Op.getOperand(0).getValueType() != MVT::f128) {
2189  // It's legal except when f128 is involved
2190  return Op;
2191  }
2192 
2193  RTLIB::Libcall LC;
2194  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2195 
2196  // FP_ROUND node has a second operand indicating whether it is known to be
2197  // precise. That doesn't take part in the LibCall so we can't directly use
2198  // LowerF128Call.
2199  SDValue SrcVal = Op.getOperand(0);
2200  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2201  SDLoc(Op)).first;
2202 }
2203 
2204 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
2205  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2206  // Any additional optimization in this function should be recorded
2207  // in the cost tables.
2208  EVT InVT = Op.getOperand(0).getValueType();
2209  EVT VT = Op.getValueType();
2210  unsigned NumElts = InVT.getVectorNumElements();
2211 
2212  // f16 vectors are promoted to f32 before a conversion.
2213  if (InVT.getVectorElementType() == MVT::f16) {
2214  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2215  SDLoc dl(Op);
2216  return DAG.getNode(
2217  Op.getOpcode(), dl, Op.getValueType(),
2218  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2219  }
2220 
2221  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2222  SDLoc dl(Op);
2223  SDValue Cv =
2224  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2225  Op.getOperand(0));
2226  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2227  }
2228 
2229  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2230  SDLoc dl(Op);
2231  MVT ExtVT =
2232  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2233  VT.getVectorNumElements());
2234  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2235  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2236  }
2237 
2238  // Type changing conversions are illegal.
2239  return Op;
2240 }
2241 
2242 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2243  SelectionDAG &DAG) const {
2244  if (Op.getOperand(0).getValueType().isVector())
2245  return LowerVectorFP_TO_INT(Op, DAG);
2246 
2247  // f16 conversions are promoted to f32 when full fp16 is not supported.
2248  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2249  !Subtarget->hasFullFP16()) {
2250  SDLoc dl(Op);
2251  return DAG.getNode(
2252  Op.getOpcode(), dl, Op.getValueType(),
2253  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2254  }
2255 
2256  if (Op.getOperand(0).getValueType() != MVT::f128) {
2257  // It's legal except when f128 is involved
2258  return Op;
2259  }
2260 
2261  RTLIB::Libcall LC;
2262  if (Op.getOpcode() == ISD::FP_TO_SINT)
2263  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2264  else
2265  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2266 
2267  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2268  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2269 }
2270 
2271 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2272  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2273  // Any additional optimization in this function should be recorded
2274  // in the cost tables.
2275  EVT VT = Op.getValueType();
2276  SDLoc dl(Op);
2277  SDValue In = Op.getOperand(0);
2278  EVT InVT = In.getValueType();
2279 
2280  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2281  MVT CastVT =
2282  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2283  InVT.getVectorNumElements());
2284  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2285  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2286  }
2287 
2288  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2289  unsigned CastOpc =
2290  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2291  EVT CastVT = VT.changeVectorElementTypeToInteger();
2292  In = DAG.getNode(CastOpc, dl, CastVT, In);
2293  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2294  }
2295 
2296  return Op;
2297 }
2298 
2299 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2300  SelectionDAG &DAG) const {
2301  if (Op.getValueType().isVector())
2302  return LowerVectorINT_TO_FP(Op, DAG);
2303 
2304  // f16 conversions are promoted to f32 when full fp16 is not supported.
2305  if (Op.getValueType() == MVT::f16 &&
2306  !Subtarget->hasFullFP16()) {
2307  SDLoc dl(Op);
2308  return DAG.getNode(
2309  ISD::FP_ROUND, dl, MVT::f16,
2310  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2311  DAG.getIntPtrConstant(0, dl));
2312  }
2313 
2314  // i128 conversions are libcalls.
2315  if (Op.getOperand(0).getValueType() == MVT::i128)
2316  return SDValue();
2317 
2318  // Other conversions are legal, unless it's to the completely software-based
2319  // fp128.
2320  if (Op.getValueType() != MVT::f128)
2321  return Op;
2322 
2323  RTLIB::Libcall LC;
2324  if (Op.getOpcode() == ISD::SINT_TO_FP)
2325  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2326  else
2327  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2328 
2329  return LowerF128Call(Op, DAG, LC);
2330 }
2331 
2332 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2333  SelectionDAG &DAG) const {
2334  // For iOS, we want to call an alternative entry point: __sincos_stret,
2335  // which returns the values in two S / D registers.
2336  SDLoc dl(Op);
2337  SDValue Arg = Op.getOperand(0);
2338  EVT ArgVT = Arg.getValueType();
2339  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2340 
2341  ArgListTy Args;
2342  ArgListEntry Entry;
2343 
2344  Entry.Node = Arg;
2345  Entry.Ty = ArgTy;
2346  Entry.IsSExt = false;
2347  Entry.IsZExt = false;
2348  Args.push_back(Entry);
2349 
2350  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2351  : RTLIB::SINCOS_STRET_F32;
2352  const char *LibcallName = getLibcallName(LC);
2353  SDValue Callee =
2354  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2355 
2356  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2357  TargetLowering::CallLoweringInfo CLI(DAG);
2358  CLI.setDebugLoc(dl)
2359  .setChain(DAG.getEntryNode())
2360  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2361 
2362  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2363  return CallResult.first;
2364 }
2365 
2366 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2367  if (Op.getValueType() != MVT::f16)
2368  return SDValue();
2369 
2370  assert(Op.getOperand(0).getValueType() == MVT::i16);
2371  SDLoc DL(Op);
2372 
2373  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2374  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2375  return SDValue(
2376  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2377  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2378  0);
2379 }
2380 
2381 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2382  if (OrigVT.getSizeInBits() >= 64)
2383  return OrigVT;
2384 
2385  assert(OrigVT.isSimple() && "Expecting a simple value type");
2386 
2387  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2388  switch (OrigSimpleTy) {
2389  default: llvm_unreachable("Unexpected Vector Type");
2390  case MVT::v2i8:
2391  case MVT::v2i16:
2392  return MVT::v2i32;
2393  case MVT::v4i8:
2394  return MVT::v4i16;
2395  }
2396 }
2397 
2398 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2399  const EVT &OrigTy,
2400  const EVT &ExtTy,
2401  unsigned ExtOpcode) {
2402  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2403  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2404  // 64-bits we need to insert a new extension so that it will be 64-bits.
2405  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2406  if (OrigTy.getSizeInBits() >= 64)
2407  return N;
2408 
2409  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2410  EVT NewVT = getExtensionTo64Bits(OrigTy);
2411 
2412  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2413 }
2414 
2415 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2416  bool isSigned) {
2417  EVT VT = N->getValueType(0);
2418 
2419  if (N->getOpcode() != ISD::BUILD_VECTOR)
2420  return false;
2421 
2422  for (const SDValue &Elt : N->op_values()) {
2423  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2424  unsigned EltSize = VT.getScalarSizeInBits();
2425  unsigned HalfSize = EltSize / 2;
2426  if (isSigned) {
2427  if (!isIntN(HalfSize, C->getSExtValue()))
2428  return false;
2429  } else {
2430  if (!isUIntN(HalfSize, C->getZExtValue()))
2431  return false;
2432  }
2433  continue;
2434  }
2435  return false;
2436  }
2437 
2438  return true;
2439 }
2440 
2441 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2442  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2443  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2444  N->getOperand(0)->getValueType(0),
2445  N->getValueType(0),
2446  N->getOpcode());
2447 
2448  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2449  EVT VT = N->getValueType(0);
2450  SDLoc dl(N);
2451  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2452  unsigned NumElts = VT.getVectorNumElements();
2453  MVT TruncVT = MVT::getIntegerVT(EltSize);
2454  SmallVector<SDValue, 8> Ops;
2455  for (unsigned i = 0; i != NumElts; ++i) {
2456  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2457  const APInt &CInt = C->getAPIntValue();
2458  // Element types smaller than 32 bits are not legal, so use i32 elements.
2459  // The values are implicitly truncated so sext vs. zext doesn't matter.
2460  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2461  }
2462  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2463 }
2464 
2465 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2466  return N->getOpcode() == ISD::SIGN_EXTEND ||
2467  isExtendedBUILD_VECTOR(N, DAG, true);
2468 }
2469 
2470 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2471  return N->getOpcode() == ISD::ZERO_EXTEND ||
2472  isExtendedBUILD_VECTOR(N, DAG, false);
2473 }
2474 
2475 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2476  unsigned Opcode = N->getOpcode();
2477  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2478  SDNode *N0 = N->getOperand(0).getNode();
2479  SDNode *N1 = N->getOperand(1).getNode();
2480  return N0->hasOneUse() && N1->hasOneUse() &&
2481  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2482  }
2483  return false;
2484 }
2485 
2486 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2487  unsigned Opcode = N->getOpcode();
2488  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2489  SDNode *N0 = N->getOperand(0).getNode();
2490  SDNode *N1 = N->getOperand(1).getNode();
2491  return N0->hasOneUse() && N1->hasOneUse() &&
2492  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2493  }
2494  return false;
2495 }
2496 
2497 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2498  // Multiplications are only custom-lowered for 128-bit vectors so that
2499  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2500  EVT VT = Op.getValueType();
2501  assert(VT.is128BitVector() && VT.isInteger() &&
2502  "unexpected type for custom-lowering ISD::MUL");
2503  SDNode *N0 = Op.getOperand(0).getNode();
2504  SDNode *N1 = Op.getOperand(1).getNode();
2505  unsigned NewOpc = 0;
2506  bool isMLA = false;
2507  bool isN0SExt = isSignExtended(N0, DAG);
2508  bool isN1SExt = isSignExtended(N1, DAG);
2509  if (isN0SExt && isN1SExt)
2510  NewOpc = AArch64ISD::SMULL;
2511  else {
2512  bool isN0ZExt = isZeroExtended(N0, DAG);
2513  bool isN1ZExt = isZeroExtended(N1, DAG);
2514  if (isN0ZExt && isN1ZExt)
2515  NewOpc = AArch64ISD::UMULL;
2516  else if (isN1SExt || isN1ZExt) {
2517  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2518  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2519  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2520  NewOpc = AArch64ISD::SMULL;
2521  isMLA = true;
2522  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2523  NewOpc = AArch64ISD::UMULL;
2524  isMLA = true;
2525  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2526  std::swap(N0, N1);
2527  NewOpc = AArch64ISD::UMULL;
2528  isMLA = true;
2529  }
2530  }
2531 
2532  if (!NewOpc) {
2533  if (VT == MVT::v2i64)
2534  // Fall through to expand this. It is not legal.
2535  return SDValue();
2536  else
2537  // Other vector multiplications are legal.
2538  return Op;
2539  }
2540  }
2541 
2542  // Legalize to a S/UMULL instruction
2543  SDLoc DL(Op);
2544  SDValue Op0;
2545  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2546  if (!isMLA) {
2547  Op0 = skipExtensionForVectorMULL(N0, DAG);
2548  assert(Op0.getValueType().is64BitVector() &&
2549  Op1.getValueType().is64BitVector() &&
2550  "unexpected types for extended operands to VMULL");
2551  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2552  }
2553  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2554  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2555  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2556  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2557  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2558  EVT Op1VT = Op1.getValueType();
2559  return DAG.getNode(N0->getOpcode(), DL, VT,
2560  DAG.getNode(NewOpc, DL, VT,
2561  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2562  DAG.getNode(NewOpc, DL, VT,
2563  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2564 }
2565 
2566 // Lower vector multiply high (ISD::MULHS and ISD::MULHU).
2567 static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
2568  // Multiplications are only custom-lowered for 128-bit vectors so that
2569  // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
2570  // legal.
2571  EVT VT = Op.getValueType();
2572  assert(VT.is128BitVector() && VT.isInteger() &&
2573  "unexpected type for custom-lowering ISD::MULH{U,S}");
2574 
2575  SDValue V0 = Op.getOperand(0);
2576  SDValue V1 = Op.getOperand(1);
2577 
2578  SDLoc DL(Op);
2579 
2580  EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
2581 
2582  // We turn (V0 mulhs/mulhu V1) to:
2583  //
2584  // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
2585  // (extract_subvector (ExtractVT V128:V1, (i64 0))))),
2586  // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
2587  // (extract_subvector (ExtractVT V128:V1, (i64 VMull2Idx))))))
2588  //
2589  // Where ExtractVT is a subvector with half the number of elements, and
2590  // VMull2Idx is the index of the middle element (the start of the high half).
2591  //
2592  // The extract-and-multiply of the vector high part is matched against
2593  // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64}, which in turn is selected
2594  // to a {s,u}mull2 instruction.
2595  //
2596  // In short: multiply the lower subvector with '{s,u}mull', the high
2597  // subvector with '{s,u}mull2', and gather the high half of each result into
2598  // the final vector.
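 // For example, a v4i32 mulhs extracts lanes [0,1] and [2,3] of each operand
 // as v2i32, widens them with smull/smull2 into v2i64 products, and uzp2 then
 // keeps the odd 32-bit lanes of the bitcast results, i.e. the high half of
 // each product.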
2599  unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2;
2600  SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
2601  SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
2602 
2603  SDValue VMullV0 =
2604  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
2605  SDValue VMullV1 =
2606  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
2607 
2608  SDValue VMull2V0 =
2609  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
2610  SDValue VMull2V1 =
2611  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
2612 
2613  unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
2614  : AArch64ISD::UMULL;
2615 
2616  EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
2617  SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
2618  SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
2619 
2620  Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
2621  Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
2622 
2623  return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
2624 }
2625 
2626 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2627  SelectionDAG &DAG) const {
2628  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2629  SDLoc dl(Op);
2630  switch (IntNo) {
2631  default: return SDValue(); // Don't custom lower most intrinsics.
2632  case Intrinsic::thread_pointer: {
2633  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2634  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2635  }
2636  case Intrinsic::aarch64_neon_abs:
2637  return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
2638  Op.getOperand(1));
2639  case Intrinsic::aarch64_neon_smax:
2640  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2641  Op.getOperand(1), Op.getOperand(2));
2642  case Intrinsic::aarch64_neon_umax:
2643  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2644  Op.getOperand(1), Op.getOperand(2));
2645  case Intrinsic::aarch64_neon_smin:
2646  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2647  Op.getOperand(1), Op.getOperand(2));
2648  case Intrinsic::aarch64_neon_umin:
2649  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2650  Op.getOperand(1), Op.getOperand(2));
2651  }
2652 }
2653 
2654 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2655  SelectionDAG &DAG) const {
2656  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2657  LLVM_DEBUG(Op.dump());
2658 
2659  switch (Op.getOpcode()) {
2660  default:
2661  llvm_unreachable("unimplemented operand");
2662  return SDValue();
2663  case ISD::BITCAST:
2664  return LowerBITCAST(Op, DAG);
2665  case ISD::GlobalAddress:
2666  return LowerGlobalAddress(Op, DAG);
2667  case ISD::GlobalTLSAddress:
2668  return LowerGlobalTLSAddress(Op, DAG);
2669  case ISD::SETCC:
2670  return LowerSETCC(Op, DAG);
2671  case ISD::BR_CC:
2672  return LowerBR_CC(Op, DAG);
2673  case ISD::SELECT:
2674  return LowerSELECT(Op, DAG);
2675  case ISD::SELECT_CC:
2676  return LowerSELECT_CC(Op, DAG);
2677  case ISD::JumpTable:
2678  return LowerJumpTable(Op, DAG);
2679  case ISD::ConstantPool:
2680  return LowerConstantPool(Op, DAG);
2681  case ISD::BlockAddress:
2682  return LowerBlockAddress(Op, DAG);
2683  case ISD::VASTART:
2684  return LowerVASTART(Op, DAG);
2685  case ISD::VACOPY:
2686  return LowerVACOPY(Op, DAG);
2687  case ISD::VAARG:
2688  return LowerVAARG(Op, DAG);
2689  case ISD::ADDC:
2690  case ISD::ADDE:
2691  case ISD::SUBC:
2692  case ISD::SUBE:
2693  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2694  case ISD::SADDO:
2695  case ISD::UADDO:
2696  case ISD::SSUBO:
2697  case ISD::USUBO:
2698  case ISD::SMULO:
2699  case ISD::UMULO:
2700  return LowerXALUO(Op, DAG);
2701  case ISD::FADD:
2702  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2703  case ISD::FSUB:
2704  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2705  case ISD::FMUL:
2706  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2707  case ISD::FDIV:
2708  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2709  case ISD::FP_ROUND:
2710  return LowerFP_ROUND(Op, DAG);
2711  case ISD::FP_EXTEND:
2712  return LowerFP_EXTEND(Op, DAG);
2713  case ISD::FRAMEADDR:
2714  return LowerFRAMEADDR(Op, DAG);
2715  case ISD::RETURNADDR:
2716  return LowerRETURNADDR(Op, DAG);
2717  case ISD::INSERT_VECTOR_ELT:
2718  return LowerINSERT_VECTOR_ELT(Op, DAG);
2719  case ISD::EXTRACT_VECTOR_ELT:
2720  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2721  case ISD::BUILD_VECTOR:
2722  return LowerBUILD_VECTOR(Op, DAG);
2723  case ISD::VECTOR_SHUFFLE:
2724  return LowerVECTOR_SHUFFLE(Op, DAG);
2725  case ISD::EXTRACT_SUBVECTOR:
2726  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2727  case ISD::SRA:
2728  case ISD::SRL:
2729  case ISD::SHL:
2730  return LowerVectorSRA_SRL_SHL(Op, DAG);
2731  case ISD::SHL_PARTS:
2732  return LowerShiftLeftParts(Op, DAG);
2733  case ISD::SRL_PARTS:
2734  case ISD::SRA_PARTS:
2735  return LowerShiftRightParts(Op, DAG);
2736  case ISD::CTPOP:
2737  return LowerCTPOP(Op, DAG);
2738  case ISD::FCOPYSIGN:
2739  return LowerFCOPYSIGN(Op, DAG);
2740  case ISD::AND:
2741  return LowerVectorAND(Op, DAG);
2742  case ISD::OR:
2743  return LowerVectorOR(Op, DAG);
2744  case ISD::XOR:
2745  return LowerXOR(Op, DAG);
2746  case ISD::PREFETCH:
2747  return LowerPREFETCH(Op, DAG);
2748  case ISD::SINT_TO_FP:
2749  case ISD::UINT_TO_FP:
2750  return LowerINT_TO_FP(Op, DAG);
2751  case ISD::FP_TO_SINT:
2752  case ISD::FP_TO_UINT:
2753  return LowerFP_TO_INT(Op, DAG);
2754  case ISD::FSINCOS:
2755  return LowerFSINCOS(Op, DAG);
2756  case ISD::MUL:
2757  return LowerMUL(Op, DAG);
2758  case ISD::MULHS:
2759  case ISD::MULHU:
2760  return LowerMULH(Op, DAG);
2761  case ISD::INTRINSIC_WO_CHAIN:
2762  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2763  case ISD::VECREDUCE_ADD:
2764  case ISD::VECREDUCE_SMAX:
2765  case ISD::VECREDUCE_SMIN:
2766  case ISD::VECREDUCE_UMAX:
2767  case ISD::VECREDUCE_UMIN:
2768  case ISD::VECREDUCE_FMAX:
2769  case ISD::VECREDUCE_FMIN:
2770  return LowerVECREDUCE(Op, DAG);
2771  case ISD::ATOMIC_LOAD_SUB:
2772  return LowerATOMIC_LOAD_SUB(Op, DAG);
2773  case ISD::ATOMIC_LOAD_AND:
2774  return LowerATOMIC_LOAD_AND(Op, DAG);
2775  case ISD::DYNAMIC_STACKALLOC:
2776  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2777  }
2778 }
2779 
2780 //===----------------------------------------------------------------------===//
2781 // Calling Convention Implementation
2782 //===----------------------------------------------------------------------===//
2783 
2784 #include "AArch64GenCallingConv.inc"
2785 
2786 /// Selects the correct CCAssignFn for a given CallingConvention value.
2787 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2788  bool IsVarArg) const {
2789  switch (CC) {
2790  default:
2791  report_fatal_error("Unsupported calling convention.");
2792  case CallingConv::WebKit_JS:
2793  return CC_AArch64_WebKit_JS;
2794  case CallingConv::GHC:
2795  return CC_AArch64_GHC;
2796  case CallingConv::C:
2797  case CallingConv::Fast:
2798  case CallingConv::PreserveMost:
2799  case CallingConv::CXX_FAST_TLS:
2800  case CallingConv::Swift:
2801  if (Subtarget->isTargetWindows() && IsVarArg)
2802  return CC_AArch64_Win64_VarArg;
2803  if (!Subtarget->isTargetDarwin())
2804  return CC_AArch64_AAPCS;
2805  return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
2806  case CallingConv::Win64:
2807  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
2808  }
2809 }
2810 
2811 CCAssignFn *
2812 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
2813  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
2814  : RetCC_AArch64_AAPCS;
2815 }
2816 
2817 SDValue AArch64TargetLowering::LowerFormalArguments(
2818  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2819  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2820  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2821  MachineFunction &MF = DAG.getMachineFunction();
2822  MachineFrameInfo &MFI = MF.getFrameInfo();
2823  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2824 
2825  // Assign locations to all of the incoming arguments.
2826  SmallVector<CCValAssign, 16> ArgLocs;
2827  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2828  *DAG.getContext());
2829 
2830  // At this point, Ins[].VT may already be promoted to i32. To correctly
2831  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2832  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2833  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
2834  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
2835  // LocVT.
2836  unsigned NumArgs = Ins.size();
2837  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
2838  unsigned CurArgIdx = 0;
2839  for (unsigned i = 0; i != NumArgs; ++i) {
2840  MVT ValVT = Ins[i].VT;
2841  if (Ins[i].isOrigArg()) {
2842  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
2843  CurArgIdx = Ins[i].getOrigArgIndex();
2844 
2845  // Get type of the original argument.
2846  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
2847  /*AllowUnknown*/ true);
2848  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
2849  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2850  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2851  ValVT = MVT::i8;
2852  else if (ActualMVT == MVT::i16)
2853  ValVT = MVT::i16;
2854  }
2855  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2856  bool Res =
2857  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
2858  assert(!Res && "Call operand has unhandled type");
2859  (void)Res;
2860  }
2861  assert(ArgLocs.size() == Ins.size());
2862  SmallVector<SDValue, 16> ArgValues;
2863  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2864  CCValAssign &VA = ArgLocs[i];
2865 
2866  if (Ins[i].Flags.isByVal()) {
2867  // Byval is used for HFAs in the PCS, but the system should work in a
2868  // non-compliant manner for larger structs.
2869  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2870  int Size = Ins[i].Flags.getByValSize();
2871  unsigned NumRegs = (Size + 7) / 8;
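 // For example, a 20-byte byval argument gives NumRegs = 3, so a 24-byte
 // fixed object is created at the argument's stack offset below.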
2872 
2873  // FIXME: This works on big-endian for composite byvals, which are the common
2874  // case. It should also work for fundamental types too.
2875  unsigned FrameIdx =
2876  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
2877  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
2878  InVals.push_back(FrameIdxN);
2879 
2880  continue;
2881  }
2882 
2883  if (VA.isRegLoc()) {
2884  // Arguments stored in registers.
2885  EVT RegVT = VA.getLocVT();
2886 
2887  SDValue ArgValue;
2888  const TargetRegisterClass *RC;
2889 
2890  if (RegVT == MVT::i32)
2891  RC = &AArch64::GPR32RegClass;
2892  else if (RegVT == MVT::i64)
2893  RC = &AArch64::GPR64RegClass;
2894  else if (RegVT == MVT::f16)
2895  RC = &AArch64::FPR16RegClass;
2896  else if (RegVT == MVT::f32)
2897  RC = &AArch64::FPR32RegClass;
2898  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
2899  RC = &AArch64::FPR64RegClass;
2900  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
2901  RC = &AArch64::FPR128RegClass;
2902  else
2903  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2904 
2905  // Transform the arguments in physical registers into virtual ones.
2906  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2907  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
2908 
2909  // If this is an 8, 16 or 32-bit value, it is really passed promoted
2910  // to 64 bits. Insert an assert[sz]ext to capture this, then
2911  // truncate to the right size.
2912  switch (VA.getLocInfo()) {
2913  default:
2914  llvm_unreachable("Unknown loc info!");
2915  case CCValAssign::Full:
2916  break;
2917  case CCValAssign::BCvt:
2918  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
2919  break;
2920  case CCValAssign::AExt:
2921  case CCValAssign::SExt:
2922  case CCValAssign::ZExt:
2923  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
2924  // nodes after our lowering.
2925  assert(RegVT == Ins[i].VT && "incorrect register location selected");
2926  break;
2927  }
2928 
2929  InVals.push_back(ArgValue);
2930 
2931  } else { // VA.isRegLoc()
2932  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
2933  unsigned ArgOffset = VA.getLocMemOffset();
2934  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
2935 
2936  uint32_t BEAlign = 0;
2937  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
2938  !Ins[i].Flags.isInConsecutiveRegs())
2939  BEAlign = 8 - ArgSize;
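 // For example, an i32 argument (ArgSize = 4) on a big-endian target gets
 // BEAlign = 4, so it is loaded from the high half of its 8-byte slot.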
2940 
2941  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
2942 
2943  // Create load nodes to retrieve arguments from the stack.
2944  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2945  SDValue ArgValue;
2946 
2947  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2948  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2949  MVT MemVT = VA.getValVT();
2950 
2951  switch (VA.getLocInfo()) {
2952  default:
2953  break;
2954  case CCValAssign::BCvt:
2955  MemVT = VA.getLocVT();
2956  break;
2957  case CCValAssign::SExt:
2958  ExtType = ISD::SEXTLOAD;
2959  break;
2960  case CCValAssign::ZExt:
2961  ExtType = ISD::ZEXTLOAD;
2962  break;
2963  case CCValAssign::AExt:
2964  ExtType = ISD::EXTLOAD;
2965  break;
2966  }
2967 
2968  ArgValue = DAG.getExtLoad(
2969  ExtType, DL, VA.getLocVT(), Chain, FIN,
2970  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2971  MemVT);
2972 
2973  InVals.push_back(ArgValue);
2974  }
2975  }
2976 
2977  // varargs
2978  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2979  if (isVarArg) {
2980  if (!Subtarget->isTargetDarwin() || IsWin64) {
2981  // The AAPCS variadic function ABI is identical to the non-variadic
2982  // one. As a result there may be more arguments in registers and we should
2983  // save them for future reference.
2984  // Win64 variadic functions also pass arguments in registers, but all float
2985  // arguments are passed in integer registers.
2986  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
2987  }
2988 
2989  // This will point to the next argument passed via stack.
2990  unsigned StackOffset = CCInfo.getNextStackOffset();
2991  // We currently pass all varargs at 8-byte alignment.
2992  StackOffset = ((StackOffset + 7) & ~7);
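 // For example, a next stack offset of 20 bytes is rounded up to 24 before
 // the varargs stack index is recorded.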
2993  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
2994  }
2995 
2996  unsigned StackArgSize = CCInfo.getNextStackOffset();
2997  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2998  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
2999  // This is a non-standard ABI so by fiat I say we're allowed to make full
3000  // use of the stack area to be popped, which must be aligned to 16 bytes in
3001  // any case:
3002  StackArgSize = alignTo(StackArgSize, 16);
3003 
3004  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3005  // a multiple of 16.
3006  FuncInfo->setArgumentStackToRestore(StackArgSize);
3007 
3008  // This realignment carries over to the available bytes below. Our own
3009  // callers will guarantee the space is free by giving an aligned value to
3010  // CALLSEQ_START.
3011  }
3012  // Even if we're not expected to free up the space, it's useful to know how
3013  // much is there while considering tail calls (because we can reuse it).
3014  FuncInfo->setBytesInStackArgArea(StackArgSize);
3015 
3016  return Chain;
3017 }
3018 
3019 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3020  SelectionDAG &DAG,
3021  const SDLoc &DL,
3022  SDValue &Chain) const {
3023  MachineFunction &MF = DAG.getMachineFunction();
3024  MachineFrameInfo &MFI = MF.getFrameInfo();
3025  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3026  auto PtrVT = getPointerTy(DAG.getDataLayout());
3027  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3028 
3029  SmallVector<SDValue, 8> MemOps;
3030 
3031  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3032  AArch64::X3, AArch64::X4, AArch64::X5,
3033  AArch64::X6, AArch64::X7 };
3034  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3035  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3036 
3037  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
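 // For example, if the fixed arguments consumed X0-X2, FirstVariadicGPR is 3
 // and GPRSaveSize is 8 * 5 = 40 bytes, covering X3-X7.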
3038  int GPRIdx = 0;
3039  if (GPRSaveSize != 0) {
3040  if (IsWin64) {
3041  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3042  if (GPRSaveSize & 15)
3043  // The extra size here, if triggered, will always be 8.
3044  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3045  } else
3046  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3047 
3048  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3049 
3050  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3051  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3052  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3053  SDValue Store = DAG.getStore(
3054  Val.getValue(1), DL, Val, FIN,
3055  IsWin64
3056  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3057  GPRIdx,
3058  (i - FirstVariadicGPR) * 8)
3059  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3060  MemOps.push_back(Store);
3061  FIN =
3062  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3063  }
3064  }
3065  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3066  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3067 
3068  if (Subtarget->hasFPARMv8() && !IsWin64) {
3069  static const MCPhysReg FPRArgRegs[] = {
3070  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3071  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3072  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3073  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3074 
3075  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3076  int FPRIdx = 0;
3077  if (FPRSaveSize != 0) {
3078  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3079 
3080  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3081 
3082  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3083  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3084  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3085 
3086  SDValue Store = DAG.getStore(
3087  Val.getValue(1), DL, Val, FIN,
3088  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3089  MemOps.push_back(Store);
3090  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3091  DAG.getConstant(16, DL, PtrVT));
3092  }
3093  }
3094  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3095  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3096  }
3097 
3098  if (!MemOps.empty()) {
3099  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3100  }
3101 }
3102 
3103 /// LowerCallResult - Lower the result values of a call into the
3104 /// appropriate copies out of appropriate physical registers.
3105 SDValue AArch64TargetLowering::LowerCallResult(
3106  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3107  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3108  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3109  SDValue ThisVal) const {
3110  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3111  ? RetCC_AArch64_WebKit_JS
3112  : RetCC_AArch64_AAPCS;
3113  // Assign locations to each value returned by this call.
3114  SmallVector<CCValAssign, 16> RVLocs;
3115  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3116  *DAG.getContext());
3117  CCInfo.AnalyzeCallResult(Ins, RetCC);
3118 
3119  // Copy all of the result registers out of their specified physreg.
3120  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3121  CCValAssign VA = RVLocs[i];
3122 
3123  // Pass 'this' value directly from the argument to return value, to avoid
3124  // reg unit interference
3125  if (i == 0 && isThisReturn) {
3126  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3127  "unexpected return calling convention register assignment");
3128  InVals.push_back(ThisVal);
3129  continue;
3130  }
3131 
3132  SDValue Val =
3133  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3134  Chain = Val.getValue(1);
3135  InFlag = Val.getValue(2);
3136 
3137  switch (VA.getLocInfo()) {
3138  default:
3139  llvm_unreachable("Unknown loc info!");
3140  case CCValAssign::Full:
3141  break;
3142  case CCValAssign::BCvt:
3143  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3144  break;
3145  }
3146 
3147  InVals.push_back(Val);
3148  }
3149 
3150  return Chain;
3151 }
3152 
3153 /// Return true if the calling convention is one that we can guarantee TCO for.
3154 static bool canGuaranteeTCO(CallingConv::ID CC) {
3155  return CC == CallingConv::Fast;
3156 }
3157 
3158 /// Return true if we might ever do TCO for calls with this calling convention.
3159 static bool mayTailCallThisCC(CallingConv::ID CC) {
3160  switch (CC) {
3161  case CallingConv::C:
3162  case CallingConv::PreserveMost:
3163  case CallingConv::Swift:
3164  return true;
3165  default:
3166  return canGuaranteeTCO(CC);
3167  }
3168 }
3169 
3170 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3171  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3172  const SmallVectorImpl<ISD::OutputArg> &Outs,
3173  const SmallVectorImpl<SDValue> &OutVals,
3174  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3175  if (!mayTailCallThisCC(CalleeCC))
3176  return false;
3177 
3178  MachineFunction &MF = DAG.getMachineFunction();
3179  const Function &CallerF = MF.getFunction();
3180  CallingConv::ID CallerCC = CallerF.getCallingConv();
3181  bool CCMatch = CallerCC == CalleeCC;
3182 
3183  // Byval parameters hand the function a pointer directly into the stack area
3184  // we want to reuse during a tail call. Working around this *is* possible (see
3185  // X86) but less efficient and uglier in LowerCall.
3186  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3187  e = CallerF.arg_end();
3188  i != e; ++i)
3189  if (i->hasByValAttr())
3190  return false;
3191 
3192  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3193  return canGuaranteeTCO(CalleeCC) && CCMatch;
3194 
3195  // Externally-defined functions with weak linkage should not be
3196  // tail-called on AArch64 when the OS does not support dynamic
3197  // pre-emption of symbols, as the AAELF spec requires normal calls
3198  // to undefined weak functions to be replaced with a NOP or jump to the
3199  // next instruction. The behaviour of branch instructions in this
3200  // situation (as used for tail calls) is implementation-defined, so we
3201  // cannot rely on the linker replacing the tail call with a return.
3202  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3203  const GlobalValue *GV = G->getGlobal();
3204  const Triple &TT = getTargetMachine().getTargetTriple();
3205  if (GV->hasExternalWeakLinkage() &&
3206  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3207  return false;
3208  }
3209 
3210  // Now we search for cases where we can use a tail call without changing the
3211  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3212  // concept.
3213 
3214  // I want anyone implementing a new calling convention to think long and hard
3215  // about this assert.
3216  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3217  "Unexpected variadic calling convention");
3218 
3219  LLVMContext &C = *DAG.getContext();
3220  if (isVarArg && !Outs.empty()) {
3221  // At least two cases here: if caller is fastcc then we can't have any
3222  // memory arguments (we'd be expected to clean up the stack afterwards). If
3223  // caller is C then we could potentially use its argument area.
3224 
3225  // FIXME: for now we take the most conservative of these in both cases:
3226  // disallow all variadic memory operands.
3227  SmallVector<CCValAssign, 16> ArgLocs;
3228  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3229 
3230  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3231  for (const CCValAssign &ArgLoc : ArgLocs)
3232  if (!ArgLoc.isRegLoc())
3233  return false;
3234  }
3235 
3236  // Check that the call results are passed in the same way.
3237  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3238  CCAssignFnForCall(CalleeCC, isVarArg),
3239  CCAssignFnForCall(CallerCC, isVarArg)))
3240  return false;
3241  // The callee has to preserve all registers the caller needs to preserve.
3242  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3243  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3244  if (!CCMatch) {
3245  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3246  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3247  return false;
3248  }
3249 
3250  // Nothing more to check if the callee is taking no arguments
3251  if (Outs.empty())
3252  return true;
3253 
3254  SmallVector<CCValAssign, 16> ArgLocs;
3255  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3256 
3257  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3258 
3259  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3260 
3261  // If the stack arguments for this call do not fit into our own save area then
3262  // the call cannot be made tail.
3263  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3264  return false;
3265 
3266  const MachineRegisterInfo &MRI = MF.getRegInfo();
3267  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3268  return false;
3269 
3270  return true;
3271 }
3272 
3273 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3274  SelectionDAG &DAG,
3275  MachineFrameInfo &MFI,
3276  int ClobberedFI) const {
3277  SmallVector<SDValue, 8> ArgChains;
3278  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3279  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
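 // For example, if the clobbered slot covers bytes [16, 23] of the argument
 // area, any load from a fixed stack object overlapping that range is chained
 // in below so it completes before the slot is overwritten.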
3280 
3281  // Include the original chain at the beginning of the list. When this is
3282  // used by target LowerCall hooks, this helps legalize find the
3283  // CALLSEQ_BEGIN node.
3284  ArgChains.push_back(Chain);
3285 
3286  // Add a chain value for each stack-argument load that overlaps the
3287  // clobbered frame object.
3287  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3288  UE = DAG.getEntryNode().getNode()->use_end();
3289  U != UE; ++U)
3290  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3291  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3292  if (FI->getIndex() < 0) {
3293  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3294  int64_t InLastByte = InFirstByte;
3295  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3296 
3297  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3298  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3299  ArgChains.push_back(SDValue(L, 1));
3300  }
3301 
3302  // Build a tokenfactor for all the chains.
3303  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3304 }
3305 
3306 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3307  bool TailCallOpt) const {
3308  return CallCC == CallingConv::Fast && TailCallOpt;
3309 }
3310 
3311 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3312 /// and add input and output parameter nodes.
3313 SDValue
3314 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3315  SmallVectorImpl<SDValue> &InVals) const {
3316  SelectionDAG &DAG = CLI.DAG;
3317  SDLoc &DL = CLI.DL;
3318  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3319  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3320  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3321  SDValue Chain = CLI.Chain;
3322  SDValue Callee = CLI.Callee;
3323  bool &IsTailCall = CLI.IsTailCall;
3324  CallingConv::ID CallConv = CLI.CallConv;
3325  bool IsVarArg = CLI.IsVarArg;
3326 
3327  MachineFunction &MF = DAG.getMachineFunction();
3328  bool IsThisReturn = false;
3329 
3330  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3331  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3332  bool IsSibCall = false;
3333 
3334  if (IsTailCall) {
3335  // Check if it's really possible to do a tail call.
3336  IsTailCall = isEligibleForTailCallOptimization(
3337  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3338  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3339  report_fatal_error("failed to perform tail call elimination on a call "
3340  "site marked musttail");
3341 
3342  // A sibling call is one where we're under the usual C ABI and not planning
3343  // to change that but can still do a tail call:
3344  if (!TailCallOpt && IsTailCall)
3345  IsSibCall = true;
3346 
3347  if (IsTailCall)
3348  ++NumTailCalls;
3349  }
3350 
3351  // Analyze operands of the call, assigning locations to each operand.
3352  SmallVector<CCValAssign, 16> ArgLocs;
3353  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3354  *DAG.getContext());
3355 
3356  if (IsVarArg) {
3357  // Handle fixed and variable vector arguments differently.
3358  // Variable vector arguments always go into memory.
3359  unsigned NumArgs = Outs.size();
3360 
3361  for (unsigned i = 0; i != NumArgs; ++i) {
3362  MVT ArgVT = Outs[i].VT;
3363  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3364  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3365  /*IsVarArg=*/ !Outs[i].IsFixed);
3366  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3367  assert(!Res && "Call operand has unhandled type");
3368  (void)Res;
3369  }
3370  } else {
3371  // At this point, Outs[].VT may already be promoted to i32. To correctly
3372  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3373  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3374  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3375  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3376  // LocVT.
3377  unsigned NumArgs = Outs.size();
3378  for (unsigned i = 0; i != NumArgs; ++i) {
3379  MVT ValVT = Outs[i].VT;
3380  // Get type of the original argument.
3381  EVT ActualVT = getValueType(DAG.getDataLayout(),
3382  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3383  /*AllowUnknown*/ true);
3384  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3385  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3386  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3387  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3388  ValVT = MVT::i8;
3389  else if (ActualMVT == MVT::i16)
3390  ValVT = MVT::i16;
3391 
3392  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3393  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3394  assert(!Res && "Call operand has unhandled type");
3395  (void)Res;
3396  }
3397  }
3398 
3399  // Get a count of how many bytes are to be pushed on the stack.
3400  unsigned NumBytes = CCInfo.getNextStackOffset();
3401 
3402  if (IsSibCall) {
3403  // Since we're not changing the ABI to make this a tail call, the memory
3404  // operands are already available in the caller's incoming argument space.
3405  NumBytes = 0;
3406  }
3407 
3408  // FPDiff is the byte offset of the call's argument area from the callee's.
3409  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3410  // by this amount for a tail call. In a sibling call it must be 0 because the
3411  // caller will deallocate the entire stack and the callee still expects its
3412  // arguments to begin at SP+0. Completely unused for non-tail calls.
3413  int FPDiff = 0;
3414 
3415  if (IsTailCall && !IsSibCall) {
3416  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3417 
3418  // Since callee will pop argument stack as a tail call, we must keep the
3419  // popped size 16-byte aligned.
3420  NumBytes = alignTo(NumBytes, 16);
3421 
3422  // FPDiff will be negative if this tail call requires more space than we
3423  // would automatically have in our incoming argument space. Positive if we
3424  // can actually shrink the stack.
3425  FPDiff = NumReusableBytes - NumBytes;
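 // For example, with a 32-byte incoming argument area and a tail call needing
 // 48 bytes of arguments, FPDiff is -16: the outgoing arguments extend 16
 // bytes past the space this function received from its own caller.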
3426 
3427  // The stack pointer must be 16-byte aligned at all times it's used for a
3428  // memory operation, which in practice means at *all* times and in
3429  // particular across call boundaries. Therefore our own arguments started at
3430  // a 16-byte aligned SP and the delta applied for the tail call should
3431  // satisfy the same constraint.
3432  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3433  }
3434 
3435  // Adjust the stack pointer for the new arguments...
3436  // These operations are automatically eliminated by the prolog/epilog pass
3437  if (!IsSibCall)
3438  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3439 
3440  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3441  getPointerTy(DAG.getDataLayout()));
3442 
3443  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3444  SmallVector<SDValue, 8> MemOpChains;
3445  auto PtrVT = getPointerTy(DAG.getDataLayout());
3446 
3447  // Walk the register/memloc assignments, inserting copies/loads.
3448  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3449  ++i, ++realArgIdx) {
3450  CCValAssign &VA = ArgLocs[i];
3451  SDValue Arg = OutVals[realArgIdx];
3452  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3453 
3454  // Promote the value if needed.
3455  switch (VA.getLocInfo()) {
3456  default:
3457  llvm_unreachable("Unknown loc info!");
3458  case CCValAssign::Full:
3459  break;
3460  case CCValAssign::SExt:
3461  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3462  break;
3463  case CCValAssign::ZExt:
3464  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3465  break;
3466  case CCValAssign::AExt:
3467  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3468  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3469  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3470  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3471  }
3472  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3473  break;
3474  case CCValAssign::BCvt:
3475  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3476  break;
3477  case CCValAssign::FPExt:
3478  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3479  break;
3480  }
3481 
3482  if (VA.isRegLoc()) {
3483  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3484  Outs[0].VT == MVT::i64) {
3485  assert(VA.getLocVT() == MVT::i64 &&
3486  "unexpected calling convention register assignment");
3487  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3488  "unexpected use of 'returned'");
3489  IsThisReturn = true;
3490  }
3491  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3492  } else {
3493  assert(VA.isMemLoc());
3494 
3495  SDValue DstAddr;
3496  MachinePointerInfo DstInfo;
3497 
3498  // FIXME: This works on big-endian for composite byvals, which are the
3499  // common case. It should also work for fundamental types too.
3500  uint32_t BEAlign = 0;
3501  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3502  : VA.getValVT().getSizeInBits();
3503  OpSize = (OpSize + 7) / 8;
3504  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3505  !Flags.isInConsecutiveRegs()) {
3506  if (OpSize < 8)
3507  BEAlign = 8 - OpSize;
3508  }
3509  unsigned LocMemOffset = VA.getLocMemOffset();
3510  int32_t Offset = LocMemOffset + BEAlign;
3511  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3512  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3513 
3514  if (IsTailCall) {
3515  Offset = Offset + FPDiff;
3516  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3517 
3518  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3519  DstInfo =
3520  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3521 
3522  // Make sure any stack arguments overlapping with where we're storing
3523  // are loaded before this eventual operation. Otherwise they'll be
3524  // clobbered.
3525  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3526  } else {
3527  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3528 
3529  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3530  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3531  LocMemOffset);
3532  }
3533 
3534  if (Outs[i].Flags.isByVal()) {
3535  SDValue SizeNode =
3536  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3537  SDValue Cpy = DAG.getMemcpy(
3538  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3539  /*isVol = */ false, /*AlwaysInline = */ false,
3540  /*isTailCall = */ false,
3541  DstInfo, MachinePointerInfo());
3542 
3543  MemOpChains.push_back(Cpy);
3544  } else {
3545  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3546  // promoted to a legal register type i32, we should truncate Arg back to
3547  // i1/i8/i16.
3548  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3549  VA.getValVT() == MVT::i16)
3550  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3551 
3552  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3553  MemOpChains.push_back(Store);
3554  }
3555  }
3556  }
3557 
3558  if (!MemOpChains.empty())
3559  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3560 
3561  // Build a sequence of copy-to-reg nodes chained together with token chain
3562  // and flag operands which copy the outgoing args into the appropriate regs.
3563  SDValue InFlag;
3564  for (auto &RegToPass : RegsToPass) {
3565  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3566  RegToPass.second, InFlag);
3567  InFlag = Chain.getValue(1);
3568  }
3569 
3570  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3571  // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
3572  // node so that legalize doesn't hack it.
3573  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3574  auto GV = G->getGlobal();
3575  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3576  AArch64II::MO_GOT) {
3577  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3578  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3579  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3580  assert(Subtarget->isTargetWindows() &&
3581  "Windows is the only supported COFF target");
3582  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3583  } else {
3584  const GlobalValue *GV = G->getGlobal();
3585  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3586  }
3587  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3588  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3589  Subtarget->isTargetMachO()) {
3590  const char *Sym = S->getSymbol();
3591  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3592  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3593  } else {
3594  const char *Sym = S->getSymbol();
3595  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3596  }
3597  }
3598 
3599  // We don't usually want to end the call-sequence here because we would tidy
3600  // the frame up *after* the call; however, in the ABI-changing tail-call case
3601  // we've carefully laid out the parameters so that when sp is reset they'll be
3602  // in the correct location.
3603  if (IsTailCall && !IsSibCall) {
3604  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3605  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3606  InFlag = Chain.getValue(1);
3607  }
3608 
3609  std::vector<SDValue> Ops;
3610  Ops.push_back(Chain);
3611  Ops.push_back(Callee);
3612 
3613  if (IsTailCall) {
3614  // Each tail call may have to adjust the stack by a different amount, so
3615  // this information must travel along with the operation for eventual
3616  // consumption by emitEpilogue.
3617  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3618  }
3619 
3620  // Add argument registers to the end of the list so that they are known live
3621  // into the call.
3622  for (auto &RegToPass : RegsToPass)
3623  Ops.push_back(DAG.getRegister(RegToPass.first,
3624  RegToPass.second.getValueType()));
3625 
3626  // Add a register mask operand representing the call-preserved registers.
3627  const uint32_t *Mask;
3628  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3629  if (IsThisReturn) {
3630  // For 'this' returns, use the X0-preserving mask if applicable
3631  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3632  if (!Mask) {
3633  IsThisReturn = false;
3634  Mask = TRI->getCallPreservedMask(MF, CallConv);
3635  }
3636  } else
3637  Mask = TRI->getCallPreservedMask(MF, CallConv);
3638 
3639  assert(Mask && "Missing call preserved mask for calling convention");
3640  Ops.push_back(DAG.getRegisterMask(Mask));
3641 
3642  if (InFlag.getNode())
3643  Ops.push_back(InFlag);
3644 
3645  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3646 
3647  // If we're doing a tail call, use a TC_RETURN here rather than an
3648  // actual call instruction.
3649  if (IsTailCall) {
3650  MF.getFrameInfo().setHasTailCall();
3651  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3652  }
3653 
3654  // Returns a chain and a flag for retval copy to use.
3655  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3656  InFlag = Chain.getValue(1);
3657 
3658  uint64_t CalleePopBytes =
3659  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3660 
3661  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3662  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3663  InFlag, DL);
3664  if (!Ins.empty())
3665  InFlag = Chain.getValue(1);
3666 
3667  // Handle result values, copying them out of physregs into vregs that we
3668  // return.
3669  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3670  InVals, IsThisReturn,
3671  IsThisReturn ? OutVals[0] : SDValue());
3672 }
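// Illustrative note (not from the original source, hypothetical IR names): a
// call whose first i64 argument carries the 'returned' attribute, e.g.
//   %r = tail call i8* @setup(i8* returned %obj)
// takes the IsThisReturn path above, so the X0-preserving register mask is
// used and the caller can keep relying on the value in x0 after the call; a
// qualifying tail call is emitted as AArch64ISD::TC_RETURN rather than
// AArch64ISD::CALL.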
3673 
3674 bool AArch64TargetLowering::CanLowerReturn(
3675  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3676  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3677  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3678  ? RetCC_AArch64_WebKit_JS
3679  : RetCC_AArch64_AAPCS;
3680  SmallVector<CCValAssign, 16> RVLocs;
3681  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3682  return CCInfo.CheckReturn(Outs, RetCC);
3683 }
3684 
3685 SDValue
3686 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3687  bool isVarArg,
3688  const SmallVectorImpl<ISD::OutputArg> &Outs,
3689  const SmallVectorImpl<SDValue> &OutVals,
3690  const SDLoc &DL, SelectionDAG &DAG) const {
3691  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3692  ? RetCC_AArch64_WebKit_JS
3693  : RetCC_AArch64_AAPCS;
3694  SmallVector<CCValAssign, 16> RVLocs;
3695  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3696  *DAG.getContext());
3697  CCInfo.AnalyzeReturn(Outs, RetCC);
3698 
3699  // Copy the result values into the output registers.
3700  SDValue Flag;
3701  SmallVector<SDValue, 4> RetOps(1, Chain);
3702  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3703  ++i, ++realRVLocIdx) {
3704  CCValAssign &VA = RVLocs[i];
3705  assert(VA.isRegLoc() && "Can only return in registers!");
3706  SDValue Arg = OutVals[realRVLocIdx];
3707 
3708  switch (VA.getLocInfo()) {
3709  default:
3710  llvm_unreachable("Unknown loc info!");
3711  case CCValAssign::Full:
3712  if (Outs[i].ArgVT == MVT::i1) {
3713  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3714  // value. This is strictly redundant on Darwin (which uses "zeroext
3715  // i1"), but will be optimised out before ISel.
3716  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3717  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3718  }
3719  break;
3720  case CCValAssign::BCvt:
3721  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3722  break;
3723  }
3724 
3725  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3726  Flag = Chain.getValue(1);
3727  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3728  }
3729  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3730  const MCPhysReg *I =
3731  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3732  if (I) {
3733  for (; *I; ++I) {
3734  if (AArch64::GPR64RegClass.contains(*I))
3735  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3736  else if (AArch64::FPR64RegClass.contains(*I))
3737  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3738  else
3739  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3740  }
3741  }
3742 
3743  RetOps[0] = Chain; // Update chain.
3744 
3745  // Add the flag if we have it.
3746  if (Flag.getNode())
3747  RetOps.push_back(Flag);
3748 
3749  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3750 }
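// Illustrative note (not from the original source): for a function returning
// i1, e.g. the hypothetical IR
//   define i1 @flag() { ret i1 true }
// the CCValAssign::Full case above truncates the value to i1 and then
// zero-extends it to the 32-bit location register, so the producer hands back
// a properly zero-extended value as AAPCS requires.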
3751 
3752 //===----------------------------------------------------------------------===//
3753 // Other Lowering Code
3754 //===----------------------------------------------------------------------===//
3755 
3756 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3757  SelectionDAG &DAG,
3758  unsigned Flag) const {
3759  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
3760  N->getOffset(), Flag);
3761 }
3762 
3763 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
3764  SelectionDAG &DAG,
3765  unsigned Flag) const {
3766  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
3767 }
3768 
3769 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
3770  SelectionDAG &DAG,
3771  unsigned Flag) const {
3772  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
3773  N->getOffset(), Flag);
3774 }
3775 
3776 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
3777  SelectionDAG &DAG,
3778  unsigned Flag) const {
3779  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
3780 }
3781 
3782 // (loadGOT sym)
3783 template <class NodeTy>
3784 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
3785  unsigned Flags) const {
3786  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
3787  SDLoc DL(N);
3788  EVT Ty = getPointerTy(DAG.getDataLayout());
3789  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
3790  // FIXME: Once remat is capable of dealing with instructions with register
3791  // operands, expand this into two nodes instead of using a wrapper node.
3792  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
3793 }
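// Illustrative note (not from the original source): the LOADgot wrapper built
// above corresponds to the usual GOT access of the form
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]
// where 'sym' stands for the referenced symbol.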
3794 
3795 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
3796 template <class NodeTy>
3797 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
3798  unsigned Flags) const {
3799  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
3800  SDLoc DL(N);
3801  EVT Ty = getPointerTy(DAG.getDataLayout());
3802  const unsigned char MO_NC = AArch64II::MO_NC;
3803  return DAG.getNode(
3804  AArch64ISD::WrapperLarge, DL, Ty,
3805  getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
3806  getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
3807  getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
3808  getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
3809 }
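// Illustrative note (not from the original source): under the large code model
// the WrapperLarge node built above carries the :abs_g3:, :abs_g2_nc:,
// :abs_g1_nc: and :abs_g0_nc: pieces of the symbol, which are materialised
// into a single register with one MOVZ and three MOVK instructions.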
3810 
3811 // (addlow (adrp %hi(sym)) %lo(sym))
3812 template <class NodeTy>
3813 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
3814  unsigned Flags) const {
3815  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
3816  SDLoc DL(N);
3817  EVT Ty = getPointerTy(DAG.getDataLayout());
3818  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
3819  SDValue Lo = getTargetNode(N, Ty, DAG,
3820  AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
3821  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
3822  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
3823 }
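// Illustrative note (not from the original source): the ADRP/ADDlow pair built
// above corresponds to the usual small-code-model sequence
//   adrp x0, sym
//   add  x0, x0, :lo12:sym
// where 'sym' stands for the referenced symbol.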
3824 
3825 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3826  SelectionDAG &DAG) const {
3827  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3828  const GlobalValue *GV = GN->getGlobal();
3829  const AArch64II::TOF TargetFlags =
3830  (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
3831  : AArch64II::MO_NO_FLAG);
3832  unsigned char OpFlags =
3833  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3834 
3835  if (OpFlags != AArch64II::MO_NO_FLAG)
3836  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3837  "unexpected offset in global node");
3838 
3839  // This also catches the large code model case for Darwin.
3840  if ((OpFlags & AArch64II::MO_GOT) != 0) {
3841  return getGOT(GN, DAG, TargetFlags);
3842  }
3843 
3844  SDValue Result;
3845  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3846  Result = getAddrLarge(GN, DAG, TargetFlags);
3847  } else {
3848  Result = getAddr(GN, DAG, TargetFlags);
3849  }
3850  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3851  SDLoc DL(GN);
3852  if (GV->hasDLLImportStorageClass())
3853  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3854  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3855  return Result;
3856 }
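// Illustrative note (not from the original source): for a dllimport'ed global
// on COFF targets the address computed above refers to the import table entry
// (an __imp_-prefixed symbol), so the extra load emitted here produces the
// real address of the variable.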
3857 
3858 /// Convert a TLS address reference into the correct sequence of loads
3859 /// and calls to compute the variable's address (for Darwin, currently) and
3860 /// return an SDValue containing the final node.
3861 
3862 /// Darwin only has one TLS scheme which must be capable of dealing with the
3863 /// fully general situation, in the worst case. This means:
3864 /// + "extern __thread" declaration.
3865 /// + Defined in a possibly unknown dynamic library.
3866 ///
3867 /// The general system is that each __thread variable has a [3 x i64] descriptor
3868 /// which contains information used by the runtime to calculate the address. The
3869  /// only part of this that the compiler needs to know about is the first xword,
3870  /// which contains a function pointer that must be called with the address of the
3871  /// entire descriptor in "x0".
3872 ///
3873 /// Since this descriptor may be in a different unit, in general even the
3874 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
3875 /// is:
3876 /// adrp x0, _var@TLVPPAGE
3877 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
3878 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
3879 /// ; the function pointer
3880 /// blr x1 ; Uses descriptor address in x0
3881 /// ; Address of _var is now in x0.
3882 ///
3883 /// If the address of _var's descriptor *is* known to the linker, then it can
3884 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
3885 /// a slight efficiency gain.
3886 SDValue
3887 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
3888  SelectionDAG &DAG) const {
3889  assert(Subtarget->isTargetDarwin() &&
3890  "This function expects a Darwin target");
3891 
3892  SDLoc DL(Op);
3893  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3894  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3895 
3896  SDValue TLVPAddr =
3897  DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3898  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
3899 
3900  // The first entry in the descriptor is a function pointer that we must call
3901  // to obtain the address of the variable.
3902  SDValue Chain = DAG.getEntryNode();
3903  SDValue FuncTLVGet = DAG.getLoad(
3904  MVT::i64, DL, Chain, DescAddr,
3905  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3906  /* Alignment = */ 8,
3907  MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
3908  MachineMemOperand::MODereferenceable);
3909  Chain = FuncTLVGet.getValue(1);
3910 
3911  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
3912  MFI.setAdjustsStack(true);
3913 
3914  // TLS calls preserve all registers except those that absolutely must be
3915  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3916  // silly).
3917  const uint32_t *Mask =
3918  Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
3919 
3920  // Finally, we can make the call. This is just a degenerate version of a
3921  // normal AArch64 call node: x0 takes the address of the descriptor, and
3922  // returns the address of the variable in this thread.
3923  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
3924  Chain =
3925  DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3926  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
3927  DAG.getRegisterMask(Mask), Chain.getValue(1));
3928  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
3929 }
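// Illustrative note (not from the original source): for a C-level access such
// as the hypothetical
//   __thread int x; int get(void) { return x; }
// the sequence built above materialises the descriptor address, calls the
// resolver function stored in its first entry with x0 pointing at the
// descriptor, and receives the address of 'x' back in x0; the access itself is
// then an ordinary load or store through that pointer.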
3930 
3931 /// When accessing thread-local variables under either the general-dynamic or
3932 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
3933 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
3934 /// is a function pointer to carry out the resolution.
3935 ///
3936 /// The sequence is:
3937 /// adrp x0, :tlsdesc:var
3938 /// ldr x1, [x0, #:tlsdesc_lo12:var]
3939 /// add x0, x0, #:tlsdesc_lo12:var
3940 /// .tlsdesccall var
3941 /// blr x1
3942 /// (TPIDR_EL0 offset now in x0)
3943 ///
3944 /// The above sequence must be produced unscheduled, to enable the linker to
3945 /// optimize/relax this sequence.
3946 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
3947 /// above sequence, and expanded really late in the compilation flow, to ensure
3948 /// the sequence is produced as per above.
3949 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
3950  const SDLoc &DL,
3951  SelectionDAG &DAG) const {
3952  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3953 
3954  SDValue Chain = DAG.getEntryNode();
3955  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3956 
3957  Chain =
3958  DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
3959  SDValue Glue = Chain.getValue(1);
3960 
3961  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
3962 }
3963 
3964 SDValue
3965 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
3966  SelectionDAG &DAG) const {
3967  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
3968  assert(Subtarget->useSmallAddressing() &&
3969  "ELF TLS only supported in small memory model");
3970  // Different choices can be made for the maximum size of the TLS area for a
3971  // module. For the small address model, the default TLS size is 16MiB and the
3972  // maximum TLS size is 4GiB.
3973  // FIXME: add -mtls-size command line option and make it control the 16MiB
3974  // vs. 4GiB code sequence generation.
3975  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3976 
3977  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
3978 
3979  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
3980  if (Model == TLSModel::LocalDynamic)
3981  Model = TLSModel::GeneralDynamic;
3982  }
3983 
3984  SDValue TPOff;
3985  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3986  SDLoc DL(Op);
3987  const GlobalValue *GV = GA->getGlobal();
3988 
3989  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
3990 
3991  if (Model == TLSModel::LocalExec) {
3992  SDValue HiVar = DAG.getTargetGlobalAddress(
3993  GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3994  SDValue LoVar = DAG.getTargetGlobalAddress(
3995  GV, DL, PtrVT, 0,
3996  AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3997 
3998  SDValue TPWithOff_lo =
3999  SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
4000  HiVar,
4001  DAG.getTargetConstant(0, DL, MVT::i32)),
4002  0);
4003  SDValue TPWithOff =
4004  SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
4005  LoVar,
4006  DAG.getTargetConstant(0, DL, MVT::i32)),
4007  0);
4008  return TPWithOff;
4009  } else if (Model == TLSModel::InitialExec) {
4010  TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
4011  TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
4012  } else if (Model == TLSModel::LocalDynamic) {
4013  // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
4014  // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
4015  // the beginning of the module's TLS region, followed by a DTPREL offset
4016  // calculation.
4017 
4018  // These accesses will need deduplicating if there's more than one.
4019  AArch64FunctionInfo *MFI =
4020  DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
4021  MFI->incNumLocalDynamicTLSAccesses();
4022 
4023  // The call needs a relocation too for linker relaxation. It doesn't make
4024  // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
4025  // the address.
4026  SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
4027  AArch64II::MO_TLS);
4028 
4029  // Now we can calculate the offset from TPIDR_EL0 to this module's
4030  // thread-local area.
4031  TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
4032 
4033  // Now use :dtprel_whatever: operations to calculate this variable's offset
4034  // in its thread-storage area.
4035  SDValue HiVar = DAG.getTargetGlobalAddress(
4036  GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
4037  SDValue LoVar = DAG.getTargetGlobalAddress(
4038  GV, DL, MVT::i64, 0,
4039  AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4040 
4041  TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
4042  DAG.getTargetConstant(0, DL, MVT::i32)),
4043  0);
4044  TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
4045  DAG.getTargetConstant(0, DL, MVT::i32)),
4046  0);
4047  } else if (Model == TLSModel::GeneralDynamic) {
4048  // The call needs a relocation too for linker relaxation. It doesn't make
4049  // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
4050  // the address.
4051  SDValue SymAddr =
4052  DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
4053 
4054  // Finally we can make a call to calculate the offset from tpidr_el0.
4055  TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
4056  } else
4057  llvm_unreachable("Unsupported ELF TLS access model");
4058 
4059  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
4060 }
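// Illustrative note (not from the original source): the local-exec path above
// adds the MO_HI12 and MO_PAGEOFF|MO_NC TLS offsets directly to the thread
// pointer, roughly
//   mrs  x8, TPIDR_EL0
//   add  x8, x8, :tprel_hi12:var
//   add  x8, x8, :tprel_lo12_nc:var
// while initial-exec loads the precomputed tpidr_el0 offset through a
// GOT-style :gottprel: entry, and the dynamic models go through
// LowerELFTLSDescCallSeq as documented earlier.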
4061 
4062 SDValue
4063 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
4064  SelectionDAG &DAG) const {
4065  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
4066 
4067  SDValue Chain = DAG.getEntryNode();
4068  EVT PtrVT = getPointerTy(DAG.getDataLayout());
4069  SDLoc DL(Op);
4070 
4071  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
4072 
4073  // Load the ThreadLocalStoragePointer from the TEB
4074  // A pointer to the TLS array is located at offset 0x58 from the TEB.
4075  SDValue TLSArray =
4076  DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
4077  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
4078  Chain = TLSArray.getValue(1);
4079 
4080  // Load the TLS index from the C runtime.
4081  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
4082  // This also does the same as LOADgot, but using a generic i32 load,
4083  // while LOADgot only loads i64.
4084  SDValue TLSIndexHi =
4085  DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
4086  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
4087  "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4088  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
4089  SDValue TLSIndex =
4090  DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
4091  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
4092  Chain = TLSIndex.getValue(1);
4093 
4094  // The pointer to the thread's TLS data area is found at the TLS index,
4095  // scaled by 8, as an offset into the TLSArray.
4096  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
4097  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
4098  DAG.getConstant(3, DL, PtrVT));
4099  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
4100  DAG.