1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the AArch64TargetLowering class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/Intrinsics.h"
59 #include "llvm/IR/Module.h"
60 #include "llvm/IR/OperandTraits.h"
61 #include "llvm/IR/Type.h"
62 #include "llvm/IR/Use.h"
63 #include "llvm/IR/Value.h"
64 #include "llvm/MC/MCRegisterInfo.h"
65 #include "llvm/Support/Casting.h"
66 #include "llvm/Support/CodeGen.h"
68 #include "llvm/Support/Compiler.h"
69 #include "llvm/Support/Debug.h"
71 #include "llvm/Support/KnownBits.h"
77 #include <algorithm>
78 #include <bitset>
79 #include <cassert>
80 #include <cctype>
81 #include <cstdint>
82 #include <cstdlib>
83 #include <iterator>
84 #include <limits>
85 #include <tuple>
86 #include <utility>
87 #include <vector>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "aarch64-lower"
92 
93 STATISTIC(NumTailCalls, "Number of tail calls");
94 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
95 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
96 
97 static cl::opt<bool>
98 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
99  cl::desc("Allow AArch64 SLI/SRI formation"),
100  cl::init(false));
101 
102 // FIXME: The necessary dtprel relocations don't seem to be supported
103 // well in the GNU bfd and gold linkers at the moment. Therefore, by
104 // default, for now, fall back to GeneralDynamic code generation.
105 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
106  "aarch64-elf-ldtls-generation", cl::Hidden,
107  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
108  cl::init(false));
109 
110 static cl::opt<bool>
111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
112  cl::desc("Enable AArch64 logical imm instruction "
113  "optimization"),
114  cl::init(true));
115 
116 /// Value type used for condition codes.
117 static const MVT MVT_CC = MVT::i32;
118 
119 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
120  const AArch64Subtarget &STI)
121  : TargetLowering(TM), Subtarget(&STI) {
122  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
123  // we have to make something up. Arbitrarily, choose ZeroOrOne.
125  // When comparing vectors the result sets the different elements in the
126  // vector to all-one or all-zero.
128 
129  // Set up the register classes.
130  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
131  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
132 
133  if (Subtarget->hasFPARMv8()) {
134  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
135  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
136  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
137  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
138  }
139 
140  if (Subtarget->hasNEON()) {
141  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
142  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
143  // Someone set us up the NEON.
144  addDRTypeForNEON(MVT::v2f32);
145  addDRTypeForNEON(MVT::v8i8);
146  addDRTypeForNEON(MVT::v4i16);
147  addDRTypeForNEON(MVT::v2i32);
148  addDRTypeForNEON(MVT::v1i64);
149  addDRTypeForNEON(MVT::v1f64);
150  addDRTypeForNEON(MVT::v4f16);
151 
152  addQRTypeForNEON(MVT::v4f32);
153  addQRTypeForNEON(MVT::v2f64);
154  addQRTypeForNEON(MVT::v16i8);
155  addQRTypeForNEON(MVT::v8i16);
156  addQRTypeForNEON(MVT::v4i32);
157  addQRTypeForNEON(MVT::v2i64);
158  addQRTypeForNEON(MVT::v8f16);
159  }
160 
161  // Compute derived properties from the register classes
163 
164  // Provide all sorts of operation actions
192 
196 
200 
202 
203  // Custom lowering hooks are needed for XOR
204  // to fold it into CSINC/CSINV.
207 
208  // Virtually no operation on f128 is legal, but LLVM can't expand them when
209  // there's a valid register class, so we need custom operations in most cases.
231 
232  // Lowering for many of the conversions is actually specified by the non-f128
233  // type. The LowerXXX function will be trivial when f128 isn't involved.
248 
249  // Variable arguments.
254 
255  // Variable-sized objects.
258 
259  if (Subtarget->isTargetWindows())
261  else
263 
264  // Constant pool entries
266 
267  // BlockAddress
269 
270  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
279 
280  // AArch64 lacks both left-rotate and popcount instructions.
283  for (MVT VT : MVT::vector_valuetypes()) {
286  }
287 
288  // AArch64 doesn't have {U|S}MUL_LOHI.
291 
294 
297  for (MVT VT : MVT::vector_valuetypes()) {
300  }
307 
308  // Custom lower Add/Sub/Mul with overflow.
321 
330  if (Subtarget->hasFullFP16())
332  else
334 
366 
367  if (!Subtarget->hasFullFP16()) {
390 
391  // promote v4f16 to v4f32 when that is known to be safe.
404 
420 
441  }
442 
443  // AArch64 has implementations of a lot of rounding-like FP operations.
444  for (MVT Ty : {MVT::f32, MVT::f64}) {
455  }
456 
457  if (Subtarget->hasFullFP16()) {
468  }
469 
471 
473 
479 
480  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
481  // This requires the Performance Monitors extension.
482  if (Subtarget->hasPerfMon())
484 
485  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
486  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
487  // Issue __sincos_stret if available.
490  } else {
493  }
494 
495  // Make floating-point constants legal for the large code model, so they don't
496  // become loads from the constant pool.
497  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
500  }
501 
502  // AArch64 does not have floating-point extending loads, i1 sign-extending
503  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
504  for (MVT VT : MVT::fp_valuetypes()) {
509  }
510  for (MVT VT : MVT::integer_valuetypes())
512 
520 
523 
524  // Indexed loads and stores are supported.
525  for (unsigned im = (unsigned)ISD::PRE_INC;
541  }
542 
543  // Trap.
545 
546  // We combine OR nodes for bitfield operations.
548 
549  // Vector add and sub nodes may conceal a high-half opportunity.
550  // Also, try to fold ADD into CSINC/CSINV.
557 
561 
563 
570  if (Subtarget->supportsAddressTopByteIgnored())
572 
574 
577 
581 
583 
584  // In case of strict alignment, avoid an excessive number of byte wide stores.
588 
593 
595 
597 
599 
600  EnableExtLdPromotion = true;
601 
602  // Set required alignment.
604  // Set preferred alignments.
607 
608  // Only change the limit for entries in a jump table if specified by
609  // the subtarget, but not at the command line.
610  unsigned MaxJT = STI.getMaximumJumpTableSize();
611  if (MaxJT && getMaximumJumpTableSize() == 0)
613 
614  setHasExtractBitsInsn(true);
615 
617 
618  if (Subtarget->hasNEON()) {
619  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
620  // silliness like this:
646 
652 
654 
655  // AArch64 doesn't have direct vector ->f32 conversion instructions for
656  // elements smaller than i32, so promote the input to i32 first.
661  // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
662  // -> v8f16 conversions.
667  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
672  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
673  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
676 
679 
688 
689  // AArch64 doesn't have MUL.2d:
691  // Custom handling for some quad-vector types to detect MULL.
695 
696  // Vector reductions
697  for (MVT VT : MVT::integer_valuetypes()) {
703  }
704  for (MVT VT : MVT::fp_valuetypes()) {
707  }
708 
711  // Likewise, narrowing and extending vector loads/stores aren't handled
712  // directly.
713  for (MVT VT : MVT::vector_valuetypes()) {
715 
716  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
719  } else {
722  }
725 
727 
728  for (MVT InnerVT : MVT::vector_valuetypes()) {
729  setTruncStoreAction(VT, InnerVT, Expand);
730  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
731  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
732  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
733  }
734  }
735 
736  // AArch64 has implementations of a lot of rounding-like FP operations.
737  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
744  }
745 
747  }
748 
750 }
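// [Editor's note, not part of the original file.] The calls elided above are
// setOperationAction()/setLoadExtAction()-style hooks that tell SelectionDAG
// legalization what to do with each (opcode, type) pair: Legal keeps the node,
// Expand rewrites it in terms of other operations, Custom routes it through
// LowerOperation(), and Promote widens the type first. As a rough sketch of
// the "promote v4f16 to v4f32" comment above (illustrative IR, not from this
// file), without FullFP16
//
//   %q = fdiv <4 x half> %a, %b
//
// is legalized as two fpexts to <4 x float>, an f32 fdiv, and an fptrunc of
// the result back to <4 x half>.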
751 
752 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
753  assert(VT.isVector() && "VT should be a vector type");
754 
755  if (VT.isFloatingPoint()) {
757  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
758  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
759  }
760 
761  // Mark vector float intrinsics as expand.
762  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
771 
772  // But we do support custom-lowering for FCOPYSIGN.
774  }
775 
788 
792  for (MVT InnerVT : MVT::all_valuetypes())
793  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
794 
795  // CNT supports only B element sizes.
796  if (VT != MVT::v8i8 && VT != MVT::v16i8)
798 
804 
807 
808  if (!VT.isFloatingPoint())
810 
811  // [SU][MIN|MAX] are available for all NEON types apart from i64.
812  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
813  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
814  setOperationAction(Opcode, VT, Legal);
815 
816  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
817  if (VT.isFloatingPoint() &&
818  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
819  for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
821  setOperationAction(Opcode, VT, Legal);
822 
823  if (Subtarget->isLittleEndian()) {
824  for (unsigned im = (unsigned)ISD::PRE_INC;
828  }
829  }
830 }
831 
832 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
833  addRegisterClass(VT, &AArch64::FPR64RegClass);
834  addTypeForNEON(VT, MVT::v2i32);
835 }
836 
837 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
838  addRegisterClass(VT, &AArch64::FPR128RegClass);
839  addTypeForNEON(VT, MVT::v4i32);
840 }
841 
842 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
843  EVT VT) const {
844  if (!VT.isVector())
845  return MVT::i32;
846  return VT.changeVectorElementTypeToInteger();
847 }
848 
849 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
850  const APInt &Demanded,
852  unsigned NewOpc) {
853  uint64_t OldImm = Imm, NewImm, Enc;
854  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
855 
856  // Return if the immediate is already all zeros, all ones, a bimm32 or a
857  // bimm64.
858  if (Imm == 0 || Imm == Mask ||
859  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
860  return false;
861 
862  unsigned EltSize = Size;
863  uint64_t DemandedBits = Demanded.getZExtValue();
864 
865  // Clear bits that are not demanded.
866  Imm &= DemandedBits;
867 
868  while (true) {
869  // The goal here is to set the non-demanded bits in a way that minimizes
870  // the number of switching between 0 and 1. In order to achieve this goal,
871  // we set the non-demanded bits to the value of the preceding demanded bits.
872  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
873  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
874  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
875  // The final result is 0b11000011.
876  uint64_t NonDemandedBits = ~DemandedBits;
877  uint64_t InvertedImm = ~Imm & DemandedBits;
878  uint64_t RotatedImm =
879  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
880  NonDemandedBits;
881  uint64_t Sum = RotatedImm + NonDemandedBits;
882  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
883  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
884  NewImm = (Imm | Ones) & Mask;
885 
886  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
887  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
888  // we halve the element size and continue the search.
889  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
890  break;
891 
892  // We cannot shrink the element size any further if it is 2-bits.
893  if (EltSize == 2)
894  return false;
895 
896  EltSize /= 2;
897  Mask >>= EltSize;
898  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
899 
900  // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
901  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
902  return false;
903 
904  // Merge the upper and lower halves of Imm and DemandedBits.
905  Imm |= Hi;
906  DemandedBits |= DemandedBitsHi;
907  }
908 
909  ++NumOptimizedImms;
910 
911  // Replicate the element across the register width.
912  while (EltSize < Size) {
913  NewImm |= NewImm << EltSize;
914  EltSize *= 2;
915  }
916 
917  (void)OldImm;
918  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
919  "demanded bits should never be altered");
920  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
921 
922  // Create the new constant immediate node.
923  EVT VT = Op.getValueType();
924  SDLoc DL(Op);
925  SDValue New;
926 
927  // If the new constant immediate is all-zeros or all-ones, let the target
928  // independent DAG combine optimize this node.
929  if (NewImm == 0 || NewImm == OrigMask) {
930  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
931  TLO.DAG.getConstant(NewImm, DL, VT));
932  // Otherwise, create a machine node so that target independent DAG combine
933  // doesn't undo this optimization.
934  } else {
935  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
936  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
937  New = SDValue(
938  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
939  }
940 
941  return TLO.CombineTo(Op, New);
942 }
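// [Editor's illustrative sketch; not part of the original file.] A minimal,
// standalone model (C++14) of the non-demanded-bit filling step above for a
// single element of EltSize bits. The helper name is invented and only mirrors
// one iteration of the loop in optimizeLogicalImm().
static constexpr uint64_t fillNonDemandedBitsSketch(uint64_t Imm,
                                                    uint64_t DemandedBits,
                                                    unsigned EltSize) {
  const uint64_t Mask = EltSize == 64 ? ~0ULL : (1ULL << EltSize) - 1;
  const uint64_t NonDemanded = ~DemandedBits & Mask;
  const uint64_t Inverted = ~Imm & DemandedBits & Mask;
  // Rotate the inverted demanded bits left by one and let the addition ripple
  // each demanded value into the run of non-demanded bits that follows it.
  const uint64_t Rotated =
      ((Inverted << 1) | ((Inverted >> (EltSize - 1)) & 1)) & NonDemanded;
  const uint64_t Sum = (Rotated + NonDemanded) & Mask;
  const uint64_t Carry =
      (NonDemanded & ~Sum & (1ULL << (EltSize - 1))) ? 1 : 0;
  const uint64_t Ones = (Sum + Carry) & NonDemanded;
  return (Imm | Ones) & Mask;
}
// The worked example from the comment above: 0bx10xx0x1, i.e. Imm = 0x41 with
// demanded mask 0x65, becomes 0b11000011 = 0xC3.
static_assert(fillNonDemandedBitsSketch(0x41, 0x65, 8) == 0xC3,
              "example from optimizeLogicalImm()'s comment");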
943 
944 bool AArch64TargetLowering::targetShrinkDemandedConstant(
945  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
946  // Delay this optimization to as late as possible.
947  if (!TLO.LegalOps)
948  return false;
949 
950  if (!EnableOptimizeLogicalImm)
951  return false;
952 
953  EVT VT = Op.getValueType();
954  if (VT.isVector())
955  return false;
956 
957  unsigned Size = VT.getSizeInBits();
958  assert((Size == 32 || Size == 64) &&
959  "i32 or i64 is expected after legalization.");
960 
961  // Exit early if we demand all bits.
962  if (Demanded.countPopulation() == Size)
963  return false;
964 
965  unsigned NewOpc;
966  switch (Op.getOpcode()) {
967  default:
968  return false;
969  case ISD::AND:
970  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
971  break;
972  case ISD::OR:
973  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
974  break;
975  case ISD::XOR:
976  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
977  break;
978  }
979  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
980  if (!C)
981  return false;
982  uint64_t Imm = C->getZExtValue();
983  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
984 }
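// [Editor's worked example; not in the original file.] Suppose the DAG holds
//   %t = and i32 %x, 0xFFF1        ; 0xFFF1 is not a valid bimm32
// and later users demand only bit 0. Filling every non-demanded bit from the
// preceding demanded bit widens the constant to all-ones, so the node becomes
// "and %x, -1" and the generic combiner deletes it. Had the widened value been
// a proper bitmask immediate instead, an ANDWri machine node would have been
// created directly so that the generic combiner cannot undo the rewrite.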
985 
986 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
987 /// Mask are known to be either zero or one and return them in Known.
988 void AArch64TargetLowering::computeKnownBitsForTargetNode(
989  const SDValue Op, KnownBits &Known,
990  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
991  switch (Op.getOpcode()) {
992  default:
993  break;
994  case AArch64ISD::CSEL: {
995  KnownBits Known2;
996  DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
997  DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
998  Known.Zero &= Known2.Zero;
999  Known.One &= Known2.One;
1000  break;
1001  }
1002  case ISD::INTRINSIC_W_CHAIN: {
1003  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1004  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1005  switch (IntID) {
1006  default: return;
1007  case Intrinsic::aarch64_ldaxr:
1008  case Intrinsic::aarch64_ldxr: {
1009  unsigned BitWidth = Known.getBitWidth();
1010  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1011  unsigned MemBits = VT.getScalarSizeInBits();
1012  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1013  return;
1014  }
1015  }
1016  break;
1017  }
1019  case ISD::INTRINSIC_VOID: {
1020  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1021  switch (IntNo) {
1022  default:
1023  break;
1024  case Intrinsic::aarch64_neon_umaxv:
1025  case Intrinsic::aarch64_neon_uminv: {
1026  // Figure out the datatype of the vector operand. The UMINV instruction
1027  // will zero extend the result, so we can mark as known zero all the
1028  // bits larger than the element datatype. 32-bit or larger elements don't
1029  // need this, as those are legal types and will be handled by isel directly.
1030  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1031  unsigned BitWidth = Known.getBitWidth();
1032  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1033  assert(BitWidth >= 8 && "Unexpected width!");
1034  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1035  Known.Zero |= Mask;
1036  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1037  assert(BitWidth >= 16 && "Unexpected width!");
1038  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1039  Known.Zero |= Mask;
1040  }
1041  break;
1042  }
1043  }
1044  }
1045  }
1046 }
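// [Editor's sketch; not part of the original file, uses only <cstdint>, which
// is already included above.] The ldxr/ldaxr case above in miniature: an
// extending load that only wrote MemBits bits lets us report every higher bit
// of the result as known zero, which is what
// APInt::getHighBitsSet(BitWidth, BitWidth - MemBits) encodes.
static constexpr uint64_t knownZeroForExtLoadSketch(unsigned BitWidth,
                                                    unsigned MemBits) {
  return (BitWidth == MemBits)
             ? 0
             : ((~0ULL >> (64 - BitWidth)) & ~((1ULL << MemBits) - 1));
}
// e.g. an i8 ldxr into an i64 register leaves the top 56 bits known zero, so a
// following "and x0, x0, #0xff" can be folded away.
static_assert(knownZeroForExtLoadSketch(64, 8) == 0xFFFFFFFFFFFFFF00ULL,
              "i8 ldxr: bits 8..63 are known zero");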
1047 
1048 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1049  EVT) const {
1050  return MVT::i64;
1051 }
1052 
1053 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1054  unsigned AddrSpace,
1055  unsigned Align,
1056  bool *Fast) const {
1057  if (Subtarget->requiresStrictAlign())
1058  return false;
1059 
1060  if (Fast) {
1061  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1062  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1063  // See comments in performSTORECombine() for more details about
1064  // these conditions.
1065 
1066  // Code that uses clang vector extensions can mark that it
1067  // wants unaligned accesses to be treated as fast by
1068  // underspecifying alignment to be 1 or 2.
1069  Align <= 2 ||
1070 
1071  // Disregard v2i64. Memcpy lowering produces those and splitting
1072  // them regresses performance on micro-benchmarks and olden/bh.
1073  VT == MVT::v2i64;
1074  }
1075  return true;
1076 }
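// [Editor's usage note; the user code below is a hypothetical illustration,
// not from this file.] The "underspecified alignment" case above can come from
// clang's vector extensions, e.g. a type such as
//   typedef int32_t vec4 __attribute__((vector_size(16), aligned(1)));
// whose loads and stores carry Align == 1 and are therefore still reported as
// fast here, even on subtargets where truly misaligned 128-bit stores are slow.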
1077 
1078 FastISel *
1079 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1080  const TargetLibraryInfo *libInfo) const {
1081  return AArch64::createFastISel(funcInfo, libInfo);
1082 }
1083 
1084 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1085  switch ((AArch64ISD::NodeType)Opcode) {
1086  case AArch64ISD::FIRST_NUMBER: break;
1087  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1088  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1089  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1090  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1091  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1092  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1093  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1094  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1095  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1096  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1097  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1098  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1099  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1100  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1101  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1102  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1103  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1104  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1105  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1106  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1107  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1108  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1109  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1110  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1111  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1112  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1113  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1114  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1115  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1116  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1117  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1118  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1119  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1120  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1121  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1122  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1123  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1124  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1125  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1126  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1127  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1128  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1129  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1130  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1131  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1132  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1133  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1134  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1135  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1136  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1137  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1138  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1139  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1140  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1141  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1142  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1143  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1144  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1145  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1146  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1147  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1148  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1149  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1150  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1151  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1152  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1153  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1154  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1155  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1156  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1157  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1158  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1159  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1160  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1161  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1162  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1163  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1164  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1165  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1166  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1167  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1168  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1169  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1170  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1171  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1172  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1173  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1174  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1175  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1176  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1177  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1178  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1179  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1180  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1181  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1182  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1183  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1184  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1185  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1186  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1187  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1188  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1189  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1190  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1191  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1192  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1193  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1194  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1195  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1196  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1197  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1198  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1199  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1200  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1201  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1202  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1203  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1204  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1205  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1206  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1207  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1208  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1209  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1210  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1211  }
1212  return nullptr;
1213 }
1214 
1215 MachineBasicBlock *
1216 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1217  MachineBasicBlock *MBB) const {
1218  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1219  // phi node:
1220 
1221  // OrigBB:
1222  // [... previous instrs leading to comparison ...]
1223  // b.ne TrueBB
1224  // b EndBB
1225  // TrueBB:
1226  // ; Fallthrough
1227  // EndBB:
1228  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1229 
1230  MachineFunction *MF = MBB->getParent();
1231  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1232  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1233  DebugLoc DL = MI.getDebugLoc();
1234  MachineFunction::iterator It = ++MBB->getIterator();
1235 
1236  unsigned DestReg = MI.getOperand(0).getReg();
1237  unsigned IfTrueReg = MI.getOperand(1).getReg();
1238  unsigned IfFalseReg = MI.getOperand(2).getReg();
1239  unsigned CondCode = MI.getOperand(3).getImm();
1240  bool NZCVKilled = MI.getOperand(4).isKill();
1241 
1242  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1243  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1244  MF->insert(It, TrueBB);
1245  MF->insert(It, EndBB);
1246 
1247  // Transfer rest of current basic-block to EndBB
1248  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1249  MBB->end());
1250  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1251 
1252  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1253  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1254  MBB->addSuccessor(TrueBB);
1255  MBB->addSuccessor(EndBB);
1256 
1257  // TrueBB falls through to the end.
1258  TrueBB->addSuccessor(EndBB);
1259 
1260  if (!NZCVKilled) {
1261  TrueBB->addLiveIn(AArch64::NZCV);
1262  EndBB->addLiveIn(AArch64::NZCV);
1263  }
1264 
1265  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1266  .addReg(IfTrueReg)
1267  .addMBB(TrueBB)
1268  .addReg(IfFalseReg)
1269  .addMBB(MBB);
1270 
1271  MI.eraseFromParent();
1272  return EndBB;
1273 }
1274 
1275 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1276  MachineInstr &MI, MachineBasicBlock *BB) const {
1277  switch (MI.getOpcode()) {
1278  default:
1279 #ifndef NDEBUG
1280  MI.dump();
1281 #endif
1282  llvm_unreachable("Unexpected instruction for custom inserter!");
1283 
1284  case AArch64::F128CSEL:
1285  return EmitF128CSEL(MI, BB);
1286 
1287  case TargetOpcode::STACKMAP:
1288  case TargetOpcode::PATCHPOINT:
1289  return emitPatchPoint(MI, BB);
1290  }
1291 }
1292 
1293 //===----------------------------------------------------------------------===//
1294 // AArch64 Lowering private implementation.
1295 //===----------------------------------------------------------------------===//
1296 
1297 //===----------------------------------------------------------------------===//
1298 // Lowering Code
1299 //===----------------------------------------------------------------------===//
1300 
1301 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1302 /// CC
1303 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1304  switch (CC) {
1305  default:
1306  llvm_unreachable("Unknown condition code!");
1307  case ISD::SETNE:
1308  return AArch64CC::NE;
1309  case ISD::SETEQ:
1310  return AArch64CC::EQ;
1311  case ISD::SETGT:
1312  return AArch64CC::GT;
1313  case ISD::SETGE:
1314  return AArch64CC::GE;
1315  case ISD::SETLT:
1316  return AArch64CC::LT;
1317  case ISD::SETLE:
1318  return AArch64CC::LE;
1319  case ISD::SETUGT:
1320  return AArch64CC::HI;
1321  case ISD::SETUGE:
1322  return AArch64CC::HS;
1323  case ISD::SETULT:
1324  return AArch64CC::LO;
1325  case ISD::SETULE:
1326  return AArch64CC::LS;
1327  }
1328 }
1329 
1330 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1331 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1332  AArch64CC::CondCode &CondCode,
1333  AArch64CC::CondCode &CondCode2) {
1334  CondCode2 = AArch64CC::AL;
1335  switch (CC) {
1336  default:
1337  llvm_unreachable("Unknown FP condition!");
1338  case ISD::SETEQ:
1339  case ISD::SETOEQ:
1340  CondCode = AArch64CC::EQ;
1341  break;
1342  case ISD::SETGT:
1343  case ISD::SETOGT:
1344  CondCode = AArch64CC::GT;
1345  break;
1346  case ISD::SETGE:
1347  case ISD::SETOGE:
1348  CondCode = AArch64CC::GE;
1349  break;
1350  case ISD::SETOLT:
1351  CondCode = AArch64CC::MI;
1352  break;
1353  case ISD::SETOLE:
1354  CondCode = AArch64CC::LS;
1355  break;
1356  case ISD::SETONE:
1357  CondCode = AArch64CC::MI;
1358  CondCode2 = AArch64CC::GT;
1359  break;
1360  case ISD::SETO:
1361  CondCode = AArch64CC::VC;
1362  break;
1363  case ISD::SETUO:
1364  CondCode = AArch64CC::VS;
1365  break;
1366  case ISD::SETUEQ:
1367  CondCode = AArch64CC::EQ;
1368  CondCode2 = AArch64CC::VS;
1369  break;
1370  case ISD::SETUGT:
1371  CondCode = AArch64CC::HI;
1372  break;
1373  case ISD::SETUGE:
1374  CondCode = AArch64CC::PL;
1375  break;
1376  case ISD::SETLT:
1377  case ISD::SETULT:
1378  CondCode = AArch64CC::LT;
1379  break;
1380  case ISD::SETLE:
1381  case ISD::SETULE:
1382  CondCode = AArch64CC::LE;
1383  break;
1384  case ISD::SETNE:
1385  case ISD::SETUNE:
1386  CondCode = AArch64CC::NE;
1387  break;
1388  }
1389 }
1390 
1391 /// Convert a DAG fp condition code to an AArch64 CC.
1392 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1393 /// should be AND'ed instead of OR'ed.
1394 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1395  AArch64CC::CondCode &CondCode,
1396  AArch64CC::CondCode &CondCode2) {
1397  CondCode2 = AArch64CC::AL;
1398  switch (CC) {
1399  default:
1400  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1401  assert(CondCode2 == AArch64CC::AL);
1402  break;
1403  case ISD::SETONE:
1404  // (a one b)
1405  // == ((a olt b) || (a ogt b))
1406  // == ((a ord b) && (a une b))
1407  CondCode = AArch64CC::VC;
1408  CondCode2 = AArch64CC::NE;
1409  break;
1410  case ISD::SETUEQ:
1411  // (a ueq b)
1412  // == ((a uno b) || (a oeq b))
1413  // == ((a ule b) && (a uge b))
1414  CondCode = AArch64CC::PL;
1415  CondCode2 = AArch64CC::LE;
1416  break;
1417  }
1418 }
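// [Editor's sketch; not part of the original file, uses only <limits>, which
// is already included above.] The two identities quoted above, spot-checked on
// plain doubles (a NaN operand makes the comparison unordered). The helper
// names are invented for this check only.
static constexpr double SketchNaN = std::numeric_limits<double>::quiet_NaN();
static constexpr bool sketchUno(double A, double B) { return A != A || B != B; }
static constexpr bool sketchOne(double A, double B) { return A < B || A > B; }
static constexpr bool sketchUeq(double A, double B) {
  return sketchUno(A, B) || A == B;
}
// (a one b) == ((a ord b) && (a une b))
static_assert(sketchOne(1.0, 2.0) == (!sketchUno(1.0, 2.0) && !(1.0 == 2.0)),
              "ordered operands");
static_assert(sketchOne(SketchNaN, 1.0) ==
                  (!sketchUno(SketchNaN, 1.0) && !(SketchNaN == 1.0)),
              "unordered operands");
// (a ueq b) == ((a ule b) && (a uge b))
static_assert(sketchUeq(1.0, 2.0) == ((sketchUno(1.0, 2.0) || 1.0 <= 2.0) &&
                                      (sketchUno(1.0, 2.0) || 1.0 >= 2.0)),
              "ordered operands");
static_assert(sketchUeq(SketchNaN, 1.0) ==
                  ((sketchUno(SketchNaN, 1.0) || SketchNaN <= 1.0) &&
                   (sketchUno(SketchNaN, 1.0) || SketchNaN >= 1.0)),
              "unordered operands");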
1419 
1420 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1421 /// CC usable with the vector instructions. Fewer operations are available
1422 /// without a real NZCV register, so we have to use less efficient combinations
1423 /// to get the same effect.
1424 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1425  AArch64CC::CondCode &CondCode,
1426  AArch64CC::CondCode &CondCode2,
1427  bool &Invert) {
1428  Invert = false;
1429  switch (CC) {
1430  default:
1431  // Mostly the scalar mappings work fine.
1432  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1433  break;
1434  case ISD::SETUO:
1435  Invert = true;
1436  LLVM_FALLTHROUGH;
1437  case ISD::SETO:
1438  CondCode = AArch64CC::MI;
1439  CondCode2 = AArch64CC::GE;
1440  break;
1441  case ISD::SETUEQ:
1442  case ISD::SETULT:
1443  case ISD::SETULE:
1444  case ISD::SETUGT:
1445  case ISD::SETUGE:
1446  // All of the compare-mask comparisons are ordered, but we can switch
1447  // between the two by a double inversion. E.g. ULE == !OGT.
1448  Invert = true;
1449  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1450  break;
1451  }
1452 }
1453 
1454 static bool isLegalArithImmed(uint64_t C) {
1455  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1456  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1457  LLVM_DEBUG(dbgs() << "Is imm " << C
1458  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1459  return IsLegal;
1460 }
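// [Editor's examples; not part of the original file.] An AArch64 add/sub
// immediate is a 12-bit value optionally shifted left by 12, which is exactly
// what the predicate above accepts. A constexpr copy for illustration:
static constexpr bool isLegalArithImmedSketch(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
static_assert(isLegalArithImmedSketch(0xFFF), "any 12-bit value");
static_assert(isLegalArithImmedSketch(0xABC000), "a 12-bit value shifted by 12");
static_assert(!isLegalArithImmedSketch(0x1001), "needs two instructions");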
1461 
1462 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1463  const SDLoc &dl, SelectionDAG &DAG) {
1464  EVT VT = LHS.getValueType();
1465  const bool FullFP16 =
1466  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1467 
1468  if (VT.isFloatingPoint()) {
1469  assert(VT != MVT::f128);
1470  if (VT == MVT::f16 && !FullFP16) {
1471  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1472  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1473  VT = MVT::f32;
1474  }
1475  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1476  }
1477 
1478  // The CMP instruction is just an alias for SUBS, and representing it as
1479  // SUBS means that it's possible to get CSE with subtract operations.
1480  // A later phase can perform the optimization of setting the destination
1481  // register to WZR/XZR if it ends up being unused.
1482  unsigned Opcode = AArch64ISD::SUBS;
1483 
1484  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
1485  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1486  // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
1487  // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
1488  // can be set differently by this operation. It comes down to whether
1489  // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1490  // everything is fine. If not then the optimization is wrong. Thus general
1491  // comparisons are only valid if op2 != 0.
1492 
1493  // So, finally, the only LLVM-native comparisons that don't mention C and V
1494  // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1495  // the absence of information about op2.
1496  Opcode = AArch64ISD::ADDS;
1497  RHS = RHS.getOperand(1);
1498  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1499  !isUnsignedIntSetCC(CC)) {
1500  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1501  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1502  // of the signed comparisons.
1503  Opcode = AArch64ISD::ANDS;
1504  RHS = LHS.getOperand(1);
1505  LHS = LHS.getOperand(0);
1506  }
1507 
1508  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1509  .getValue(1);
1510 }
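// [Editor's worked example; not in the original file.] For "a == -b" the DAG
// first contains SUBS(a, (sub 0, b)); the fold above rewrites it to
// ADDS(a, b), i.e. "cmn w0, w1", which is safe because only the Z flag is
// consumed for SETEQ/SETNE. Likewise "(x & Mask) signed-compare 0" becomes
// ANDS, i.e. a "tst" with the mask folded in, with no separate AND.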
1511 
1512 /// \defgroup AArch64CCMP CMP;CCMP matching
1513 ///
1514 /// These functions deal with the formation of CMP;CCMP;... sequences.
1515 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1516 /// a comparison. They set the NZCV flags to a predefined value if their
1517 /// predicate is false. This allows us to express arbitrary conjunctions, for
1518 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1519 /// expressed as:
1520 /// cmp A
1521 /// ccmp B, inv(CB), CA
1522 /// check for CB flags
1523 ///
1524 /// In general we can create code for arbitrary "... (and (and A B) C)"
1525 /// sequences. We can also implement some "or" expressions, because "(or A B)"
1526 /// is equivalent to "not (and (not A) (not B))" and we can implement some
1527 /// negation operations:
1528 /// We can negate the results of a single comparison by inverting the flags
1529 /// used when the predicate fails and inverting the flags tested in the next
1530 /// instruction; We can also negate the results of the whole previous
1531 /// conditional compare sequence by inverting the flags tested in the next
1532 /// instruction. However there is no way to negate the result of a partial
1533 /// sequence.
1534 ///
1535 /// Therefore on encountering an "or" expression we can negate the subtree on
1536 /// one side and have to be able to push the negate to the leaves of the subtree
1537 /// on the other side (see also the comments in the code). As a complete example:
1538 /// "or (or (setCA (cmp A)) (setCB (cmp B)))
1539 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1540 /// is transformed to
1541 /// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1542 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
1543 /// and implemented as:
1544 /// cmp C
1545 /// ccmp D, inv(CD), CC
1546 /// ccmp A, CA, inv(CD)
1547 /// ccmp B, CB, inv(CA)
1548 /// check for CB flags
1549 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented
1550 /// by conditional compare sequences.
1551 /// @{
1552 
1553 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1554 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1555  ISD::CondCode CC, SDValue CCOp,
1556  AArch64CC::CondCode Predicate,
1557  AArch64CC::CondCode OutCC,
1558  const SDLoc &DL, SelectionDAG &DAG) {
1559  unsigned Opcode = 0;
1560  const bool FullFP16 =
1561  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1562 
1563  if (LHS.getValueType().isFloatingPoint()) {
1564  assert(LHS.getValueType() != MVT::f128);
1565  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1566  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1567  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1568  }
1569  Opcode = AArch64ISD::FCCMP;
1570  } else if (RHS.getOpcode() == ISD::SUB) {
1571  SDValue SubOp0 = RHS.getOperand(0);
1572  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1573  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1574  Opcode = AArch64ISD::CCMN;
1575  RHS = RHS.getOperand(1);
1576  }
1577  }
1578  if (Opcode == 0)
1579  Opcode = AArch64ISD::CCMP;
1580 
1581  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1582  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1583  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1584  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1585  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1586 }
1587 
1588 /// Returns true if @p Val is a tree of AND/OR/SETCC operations.
1589 /// CanNegate is set to true if we can push a negate operation through
1590 /// the tree in a way that we are left with AND operations and negate operations
1591 /// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
1592 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
1593 /// brought into such a form.
1594 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
1595  unsigned Depth = 0) {
1596  if (!Val.hasOneUse())
1597  return false;
1598  unsigned Opcode = Val->getOpcode();
1599  if (Opcode == ISD::SETCC) {
1600  if (Val->getOperand(0).getValueType() == MVT::f128)
1601  return false;
1602  CanNegate = true;
1603  return true;
1604  }
1605  // Protect against exponential runtime and stack overflow.
1606  if (Depth > 6)
1607  return false;
1608  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1609  SDValue O0 = Val->getOperand(0);
1610  SDValue O1 = Val->getOperand(1);
1611  bool CanNegateL;
1612  if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
1613  return false;
1614  bool CanNegateR;
1615  if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
1616  return false;
1617 
1618  if (Opcode == ISD::OR) {
1619  // For an OR expression we need to be able to negate at least one side or
1620  // we cannot do the transformation at all.
1621  if (!CanNegateL && !CanNegateR)
1622  return false;
1623  // We can however change a (not (or x y)) to (and (not x) (not y)) if we
1624  // can negate the x and y subtrees.
1625  CanNegate = CanNegateL && CanNegateR;
1626  } else {
1627  // If the operands are OR expressions then we finally need to negate their
1628  // outputs; we can only do that for the operand emitted last, by
1629  // negating OutCC, not for both operands.
1630  bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
1631  bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
1632  if (NeedsNegOutL && NeedsNegOutR)
1633  return false;
1634  // We cannot negate an AND operation (it would become an OR),
1635  CanNegate = false;
1636  }
1637  return true;
1638  }
1639  return false;
1640 }
1641 
1642 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1643 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1644 /// Tries to transform the given i1 producing node @p Val to a series compare
1645 /// and conditional compare operations. @returns an NZCV flags producing node
1646 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1647 /// transformation was not possible.
1648 /// On recursive invocations @p Negate may be set to true to have negation
1649 /// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
1650 /// for the comparisons in the current subtree; @p Depth limits the search
1651 /// depth to avoid stack overflow.
1652 static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
1653  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1654  AArch64CC::CondCode Predicate) {
1655  // We're at a tree leaf, produce a conditional comparison operation.
1656  unsigned Opcode = Val->getOpcode();
1657  if (Opcode == ISD::SETCC) {
1658  SDValue LHS = Val->getOperand(0);
1659  SDValue RHS = Val->getOperand(1);
1660  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1661  bool isInteger = LHS.getValueType().isInteger();
1662  if (Negate)
1663  CC = getSetCCInverse(CC, isInteger);
1664  SDLoc DL(Val);
1665  // Determine OutCC and handle FP special case.
1666  if (isInteger) {
1667  OutCC = changeIntCCToAArch64CC(CC);
1668  } else {
1670  AArch64CC::CondCode ExtraCC;
1671  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1672  // Some floating point conditions can't be tested with a single condition
1673  // code. Construct an additional comparison in this case.
1674  if (ExtraCC != AArch64CC::AL) {
1675  SDValue ExtraCmp;
1676  if (!CCOp.getNode())
1677  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1678  else
1679  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1680  ExtraCC, DL, DAG);
1681  CCOp = ExtraCmp;
1682  Predicate = ExtraCC;
1683  }
1684  }
1685 
1686  // Produce a normal comparison if we are first in the chain
1687  if (!CCOp)
1688  return emitComparison(LHS, RHS, CC, DL, DAG);
1689  // Otherwise produce a ccmp.
1690  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1691  DAG);
1692  }
1693  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
1694  "Valid conjunction/disjunction tree");
1695 
1696  // Check if both sides can be transformed.
1697  SDValue LHS = Val->getOperand(0);
1698  SDValue RHS = Val->getOperand(1);
1699 
1700  // In case of an OR we need to negate our operands and the result.
1701  // (A v B) <=> not(not(A) ^ not(B))
1702  bool NegateOpsAndResult = Opcode == ISD::OR;
1703  // We can negate the results of all previous operations by inverting the
1704  // predicate flags giving us a free negation for one side. The other side
1705  // must be negatable by itself.
1706  if (NegateOpsAndResult) {
1707  // See which side we can negate.
1708  bool CanNegateL;
1709  bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
1710  assert(isValidL && "Valid conjunction/disjunction tree");
1711  (void)isValidL;
1712 
1713 #ifndef NDEBUG
1714  bool CanNegateR;
1715  bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
1716  assert(isValidR && "Valid conjunction/disjunction tree");
1717  assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
1718 #endif
1719 
1720  // Order the side which we cannot negate to RHS so we can emit it first.
1721  if (!CanNegateL)
1722  std::swap(LHS, RHS);
1723  } else {
1724  bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
1725  assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
1726  "Valid conjunction/disjunction tree");
1727  // Order the side where we need to negate the output flags to RHS so it
1728  // gets emitted first.
1729  if (NeedsNegOutL)
1730  std::swap(LHS, RHS);
1731  }
1732 
1733  // Emit RHS. If we want to negate the tree we only need to push a negate
1734  // through if we are already in a Negate case; otherwise we can negate
1735  // the "flags to test" afterwards.
1736  AArch64CC::CondCode RHSCC;
1737  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
1738  CCOp, Predicate);
1739  if (NegateOpsAndResult && !Negate)
1740  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1741  // Emit LHS. We may need to negate it.
1742  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
1743  NegateOpsAndResult, CmpR,
1744  RHSCC);
1745  // If we transformed an OR to an AND then we have to negate the result
1746  // (or absorb the Negate parameter).
1747  if (NegateOpsAndResult && !Negate)
1748  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1749  return CmpL;
1750 }
1751 
1752 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1753 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1754 /// \see emitConjunctionDisjunctionTreeRec().
1755 static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
1756  AArch64CC::CondCode &OutCC) {
1757  bool CanNegate;
1758  if (!isConjunctionDisjunctionTree(Val, CanNegate))
1759  return SDValue();
1760 
1761  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
1762  AArch64CC::AL);
1763 }
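// [Editor's illustrative example; assumed codegen, not from the original
// file.] For an i1 expression such as "(a == 0) & (b > 5)" with i32 a and b,
// the tree above is emitted roughly as
//   cmp  w0, #0
//   ccmp w1, #5, #4, eq   ; if a != 0, force NZCV so that "gt" fails (Z = 1)
//   cset w0, gt
// i.e. one CMP followed by one CCMP per extra leaf, with the final flags
// tested once.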
1764 
1765 /// @}
1766 
1767 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1768  SDValue &AArch64cc, SelectionDAG &DAG,
1769  const SDLoc &dl) {
1770  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1771  EVT VT = RHS.getValueType();
1772  uint64_t C = RHSC->getZExtValue();
1773  if (!isLegalArithImmed(C)) {
1774  // Constant does not fit, try adjusting it by one?
1775  switch (CC) {
1776  default:
1777  break;
1778  case ISD::SETLT:
1779  case ISD::SETGE:
1780  if ((VT == MVT::i32 && C != 0x80000000 &&
1781  isLegalArithImmed((uint32_t)(C - 1))) ||
1782  (VT == MVT::i64 && C != 0x80000000ULL &&
1783  isLegalArithImmed(C - 1ULL))) {
1784  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1785  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1786  RHS = DAG.getConstant(C, dl, VT);
1787  }
1788  break;
1789  case ISD::SETULT:
1790  case ISD::SETUGE:
1791  if ((VT == MVT::i32 && C != 0 &&
1792  isLegalArithImmed((uint32_t)(C - 1))) ||
1793  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1794  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1795  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1796  RHS = DAG.getConstant(C, dl, VT);
1797  }
1798  break;
1799  case ISD::SETLE:
1800  case ISD::SETGT:
1801  if ((VT == MVT::i32 && C != INT32_MAX &&
1802  isLegalArithImmed((uint32_t)(C + 1))) ||
1803  (VT == MVT::i64 && C != INT64_MAX &&
1804  isLegalArithImmed(C + 1ULL))) {
1805  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1806  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1807  RHS = DAG.getConstant(C, dl, VT);
1808  }
1809  break;
1810  case ISD::SETULE:
1811  case ISD::SETUGT:
1812  if ((VT == MVT::i32 && C != UINT32_MAX &&
1813  isLegalArithImmed((uint32_t)(C + 1))) ||
1814  (VT == MVT::i64 && C != UINT64_MAX &&
1815  isLegalArithImmed(C + 1ULL))) {
1816  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1817  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1818  RHS = DAG.getConstant(C, dl, VT);
1819  }
1820  break;
1821  }
1822  }
1823  }
1824  SDValue Cmp;
1825  AArch64CC::CondCode AArch64CC;
1826  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1827  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1828 
1829  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1830  // For the i8 operand, the largest immediate is 255, so this can be easily
1831  // encoded in the compare instruction. For the i16 operand, however, the
1832  // largest immediate cannot be encoded in the compare.
1833  // Therefore, use a sign extending load and cmn to avoid materializing the
1834  // -1 constant. For example,
1835  // movz w1, #65535
1836  // ldrh w0, [x0, #0]
1837  // cmp w0, w1
1838  // >
1839  // ldrsh w0, [x0, #0]
1840  // cmn w0, #1
1841  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1842  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1843  // ensure both the LHS and RHS are truly zero extended and to make sure the
1844  // transformation is profitable.
1845  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1846  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1847  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1848  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1849  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1850  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1851  SDValue SExt =
1852  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1853  DAG.getValueType(MVT::i16));
1854  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1855  RHS.getValueType()),
1856  CC, dl, DAG);
1857  AArch64CC = changeIntCCToAArch64CC(CC);
1858  }
1859  }
1860 
1861  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
1862  if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
1863  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
1864  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
1865  }
1866  }
1867  }
1868 
1869  if (!Cmp) {
1870  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1871  AArch64CC = changeIntCCToAArch64CC(CC);
1872  }
1873  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
1874  return Cmp;
1875 }
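// [Editor's worked examples; not in the original file.] Two of the rewrites in
// getAArch64Cmp() above on concrete values:
//  * Constant adjustment: "x s< 0x1001" has an RHS that is not a legal
//    arithmetic immediate, but the equivalent "x s<= 0x1000" is, since
//    0x1000 == 0x1 << 12 is encodable (see isLegalArithImmed()).
//  * CMN for i16 loads: comparing a zero-extended i16 load against 65535 would
//    need "movz w1, #65535"; using a sign-extending load and "cmn w0, #1"
//    compares against -1 instead and needs no materialized constant, exactly
//    as the comment above shows.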
1876 
1877 static std::pair<SDValue, SDValue>
1878 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
1879  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1880  "Unsupported value type");
1881  SDValue Value, Overflow;
1882  SDLoc DL(Op);
1883  SDValue LHS = Op.getOperand(0);
1884  SDValue RHS = Op.getOperand(1);
1885  unsigned Opc = 0;
1886  switch (Op.getOpcode()) {
1887  default:
1888  llvm_unreachable("Unknown overflow instruction!");
1889  case ISD::SADDO:
1890  Opc = AArch64ISD::ADDS;
1891  CC = AArch64CC::VS;
1892  break;
1893  case ISD::UADDO:
1894  Opc = AArch64ISD::ADDS;
1895  CC = AArch64CC::HS;
1896  break;
1897  case ISD::SSUBO:
1898  Opc = AArch64ISD::SUBS;
1899  CC = AArch64CC::VS;
1900  break;
1901  case ISD::USUBO:
1902  Opc = AArch64ISD::SUBS;
1903  CC = AArch64CC::LO;
1904  break;
1905  // Multiply needs a little bit of extra work.
1906  case ISD::SMULO:
1907  case ISD::UMULO: {
1908  CC = AArch64CC::NE;
1909  bool IsSigned = Op.getOpcode() == ISD::SMULO;
1910  if (Op.getValueType() == MVT::i32) {
1911  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1912  // For a 32 bit multiply with overflow check we want the instruction
1913  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1914  // need to generate the following pattern:
1915  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
1916  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1917  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1918  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1919  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1920  DAG.getConstant(0, DL, MVT::i64));
1921  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1922  // operation. We need to clear out the upper 32 bits, because we used a
1923  // widening multiply that wrote all 64 bits. In the end this should be a
1924  // noop.
1925  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1926  if (IsSigned) {
1927  // The signed overflow check requires more than just a simple check for
1928  // any bit set in the upper 32 bits of the result. These bits could be
1929  // just the sign bits of a negative number. To perform the overflow
1930  // check, we arithmetically shift the low 32 bits of the result right by
1931  // 31 (replicating the sign bit) and compare that with the upper 32 bits.
1932  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1933  DAG.getConstant(32, DL, MVT::i64));
1934  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1935  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1936  DAG.getConstant(31, DL, MVT::i64));
1937  // It is important that LowerBits is last, otherwise the arithmetic
1938  // shift will not be folded into the compare (SUBS).
1939  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1940  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1941  .getValue(1);
1942  } else {
1943  // The overflow check for unsigned multiply is easy. We only need to
1944  // check if any of the upper 32 bits are set. This can be done with a
1945  // CMP (shifted register). For that we need to generate the following
1946  // pattern:
1947  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
1948  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1949  DAG.getConstant(32, DL, MVT::i64));
1950  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1951  Overflow =
1952  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1953  DAG.getConstant(0, DL, MVT::i64),
1954  UpperBits).getValue(1);
1955  }
1956  break;
1957  }
1958  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1959  // For the 64-bit multiply, check the high half of the full product below.
1960  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1961  if (IsSigned) {
1962  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1963  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1964  DAG.getConstant(63, DL, MVT::i64));
1965  // It is important that LowerBits is last, otherwise the arithmetic
1966  // shift will not be folded into the compare (SUBS).
1967  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1968  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1969  .getValue(1);
1970  } else {
1971  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1972  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1973  Overflow =
1974  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1975  DAG.getConstant(0, DL, MVT::i64),
1976  UpperBits).getValue(1);
1977  }
1978  break;
1979  }
1980  } // switch (...)
1981 
1982  if (Opc) {
1983  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1984 
1985  // Emit the AArch64 operation with overflow check.
1986  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1987  Overflow = Value.getValue(1);
1988  }
1989  return std::make_pair(Value, Overflow);
1990 }
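// [Editor's sketch; not part of the original file.] The signed 32-bit SMULO
// check above, modelled in plain C++: widen, multiply, and report overflow
// when the product no longer fits in 32 bits, which is what comparing the
// upper 32 bits against the sign-replicated low half detects.
static constexpr bool smulo32OverflowsSketch(int32_t A, int32_t B) {
  return (int64_t)A * (int64_t)B > INT32_MAX ||
         (int64_t)A * (int64_t)B < INT32_MIN;
}
static_assert(!smulo32OverflowsSketch(46341, 46340), "still fits in 32 bits");
static_assert(smulo32OverflowsSketch(46341, 46341), "46341 * 46341 > INT32_MAX");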
1991 
1992 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1993  RTLIB::Libcall Call) const {
1994  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1995  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
1996 }
1997 
1998 // Returns true if the given Op is the overflow flag result of an overflow
1999 // intrinsic operation.
2000 static bool isOverflowIntrOpRes(SDValue Op) {
2001  unsigned Opc = Op.getOpcode();
2002  return (Op.getResNo() == 1 &&
2003  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2004  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2005 }
2006 
2007 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2008  SDValue Sel = Op.getOperand(0);
2009  SDValue Other = Op.getOperand(1);
2010  SDLoc dl(Sel);
2011 
2012  // If the operand is an overflow checking operation, invert the condition
2013  // code and kill the Not operation. I.e., transform:
2014  // (xor (overflow_op_bool, 1))
2015  // -->
2016  // (csel 1, 0, invert(cc), overflow_op_bool)
2017  // ... which later gets transformed to just a cset instruction with an
2018  // inverted condition code, rather than a cset + eor sequence.
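  // Roughly, for something like '!__builtin_uadd_overflow(a, b, &r)' this
  // lets us emit 'adds w8, w0, w1; cset w9, lo' (carry clear means no
  // overflow) instead of materialising the flag and flipping it with an eor.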
2019  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2020  // Only lower legal XALUO ops.
2021  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2022  return SDValue();
2023 
2024  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2025  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2026  AArch64CC::CondCode CC;
2027  SDValue Value, Overflow;
2028  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2029  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2030  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2031  CCVal, Overflow);
2032  }
2033  // If neither operand is a SELECT_CC, give up.
2034  if (Sel.getOpcode() != ISD::SELECT_CC)
2035  std::swap(Sel, Other);
2036  if (Sel.getOpcode() != ISD::SELECT_CC)
2037  return Op;
2038 
2039  // The folding we want to perform is:
2040  // (xor x, (select_cc a, b, cc, 0, -1) )
2041  // -->
2042  // (csel x, (xor x, -1), cc ...)
2043  //
2044  // The latter will get matched to a CSINV instruction.
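  // For example, '(xor w1, (select_cc w2, w3, eq, 0, -1))' can come out as
  //   cmp   w2, w3
  //   csinv w0, w1, w1, eq
  // i.e. w0 = (w2 == w3) ? w1 : ~w1, with no explicit xor left over.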
2045 
2046  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2047  SDValue LHS = Sel.getOperand(0);
2048  SDValue RHS = Sel.getOperand(1);
2049  SDValue TVal = Sel.getOperand(2);
2050  SDValue FVal = Sel.getOperand(3);
2051 
2052  // FIXME: This could be generalized to non-integer comparisons.
2053  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2054  return Op;
2055 
2056  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2057  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2058 
2059  // The values aren't constants, this isn't the pattern we're looking for.
2060  if (!CFVal || !CTVal)
2061  return Op;
2062 
2063  // We can commute the SELECT_CC by inverting the condition. This
2064  // might be needed to make this fit into a CSINV pattern.
2065  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2066  std::swap(TVal, FVal);
2067  std::swap(CTVal, CFVal);
2068  CC = ISD::getSetCCInverse(CC, true);
2069  }
2070 
2071  // If the constants line up, perform the transform!
2072  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2073  SDValue CCVal;
2074  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2075 
2076  FVal = Other;
2077  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2078  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2079 
2080  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2081  CCVal, Cmp);
2082  }
2083 
2084  return Op;
2085 }
2086 
2087 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2088  EVT VT = Op.getValueType();
2089 
2090  // Let legalize expand this if it isn't a legal type yet.
2091  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2092  return SDValue();
2093 
2094  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2095 
2096  unsigned Opc;
2097  bool ExtraOp = false;
2098  switch (Op.getOpcode()) {
2099  default:
2100  llvm_unreachable("Invalid code");
2101  case ISD::ADDC:
2102  Opc = AArch64ISD::ADDS;
2103  break;
2104  case ISD::SUBC:
2105  Opc = AArch64ISD::SUBS;
2106  break;
2107  case ISD::ADDE:
2108  Opc = AArch64ISD::ADCS;
2109  ExtraOp = true;
2110  break;
2111  case ISD::SUBE:
2112  Opc = AArch64ISD::SBCS;
2113  ExtraOp = true;
2114  break;
2115  }
2116 
2117  if (!ExtraOp)
2118  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2119  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2120  Op.getOperand(2));
2121 }
2122 
2123 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2124  // Let legalize expand this if it isn't a legal type yet.
2125  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2126  return SDValue();
2127 
2128  SDLoc dl(Op);
2129  AArch64CC::CondCode CC;
2130  // The actual operation that sets the overflow or carry flag.
2131  SDValue Value, Overflow;
2132  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2133 
2134  // We use 0 and 1 as false and true values.
2135  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2136  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2137 
2138  // We use an inverted condition, because the conditional select is inverted
2139  // too. This will allow it to be selected to a single instruction:
2140  // CSINC Wd, WZR, WZR, invert(cond).
2141  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2142  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2143  CCVal, Overflow);
2144 
2145  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2146  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2147 }
2148 
2149 // Prefetch operands are:
2150 // 1: Address to prefetch
2151 // 2: bool isWrite
2152 // 3: int locality (0 = no locality ... 3 = extreme locality)
2153 // 4: bool isDataCache
2154 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2155  SDLoc DL(Op);
2156  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2157  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2158  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2159 
2160  bool IsStream = !Locality;
2161  // When the locality number is set
2162  if (Locality) {
2163  // The front-end should have filtered out the out-of-range values
2164  assert(Locality <= 3 && "Prefetch locality out-of-range");
2165  // The locality degree is the inverse of the cache level, so flip
2166  // the number around.
2167  // The encoding starts at 0 for the closest cache level (L1).
2168  Locality = 3 - Locality;
2169  }
2170 
2171  // Build the mask value encoding the expected behavior.
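  // As a rough illustration of the mask built below: a data read with no
  // locality (e.g. __builtin_prefetch(p, 0, 0)) ends up as PLDL1STRM, while
  // a data write with maximum locality (__builtin_prefetch(p, 1, 3)) ends up
  // as PSTL1KEEP.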
2172  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2173  (!IsData << 3) | // IsDataCache bit
2174  (Locality << 1) | // Cache level bits
2175  (unsigned)IsStream; // Stream bit
2176  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2177  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2178 }
2179 
2180 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2181  SelectionDAG &DAG) const {
2182  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2183 
2184  RTLIB::Libcall LC;
2185  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2186 
2187  return LowerF128Call(Op, DAG, LC);
2188 }
2189 
2190 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2191  SelectionDAG &DAG) const {
2192  if (Op.getOperand(0).getValueType() != MVT::f128) {
2193  // It's legal except when f128 is involved
2194  return Op;
2195  }
2196 
2197  RTLIB::Libcall LC;
2198  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2199 
2200  // FP_ROUND node has a second operand indicating whether it is known to be
2201  // precise. That doesn't take part in the LibCall so we can't directly use
2202  // LowerF128Call.
2203  SDValue SrcVal = Op.getOperand(0);
2204  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2205  SDLoc(Op)).first;
2206 }
2207 
2209  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2210  // Any additional optimization in this function should be recorded
2211  // in the cost tables.
2212  EVT InVT = Op.getOperand(0).getValueType();
2213  EVT VT = Op.getValueType();
2214  unsigned NumElts = InVT.getVectorNumElements();
2215 
2216  // f16 vectors are promoted to f32 before a conversion.
2217  if (InVT.getVectorElementType() == MVT::f16) {
2218  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2219  SDLoc dl(Op);
2220  return DAG.getNode(
2221  Op.getOpcode(), dl, Op.getValueType(),
2222  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2223  }
2224 
2225  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2226  SDLoc dl(Op);
2227  SDValue Cv =
2228  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2229  Op.getOperand(0));
2230  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2231  }
2232 
2233  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2234  SDLoc dl(Op);
2235  MVT ExtVT =
2237  VT.getVectorNumElements());
2238  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2239  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2240  }
2241 
2242  // Type changing conversions are illegal.
2243  return Op;
2244 }
2245 
2246 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2247  SelectionDAG &DAG) const {
2248  if (Op.getOperand(0).getValueType().isVector())
2249  return LowerVectorFP_TO_INT(Op, DAG);
2250 
2251  // f16 conversions are promoted to f32 when full fp16 is not supported.
2252  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2253  !Subtarget->hasFullFP16()) {
2254  SDLoc dl(Op);
2255  return DAG.getNode(
2256  Op.getOpcode(), dl, Op.getValueType(),
2257  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2258  }
2259 
2260  if (Op.getOperand(0).getValueType() != MVT::f128) {
2261  // It's legal except when f128 is involved
2262  return Op;
2263  }
2264 
2265  RTLIB::Libcall LC;
2266  if (Op.getOpcode() == ISD::FP_TO_SINT)
2268  else
2270 
2271  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2272  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2273 }
2274 
2276  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2277  // Any additional optimization in this function should be recorded
2278  // in the cost tables.
2279  EVT VT = Op.getValueType();
2280  SDLoc dl(Op);
2281  SDValue In = Op.getOperand(0);
2282  EVT InVT = In.getValueType();
2283 
2284  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2285  MVT CastVT =
2287  InVT.getVectorNumElements());
2288  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2289  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2290  }
2291 
2292  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2293  unsigned CastOpc =
2294  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2295  EVT CastVT = VT.changeVectorElementTypeToInteger();
2296  In = DAG.getNode(CastOpc, dl, CastVT, In);
2297  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2298  }
2299 
2300  return Op;
2301 }
2302 
2303 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2304  SelectionDAG &DAG) const {
2305  if (Op.getValueType().isVector())
2306  return LowerVectorINT_TO_FP(Op, DAG);
2307 
2308  // f16 conversions are promoted to f32 when full fp16 is not supported.
2309  if (Op.getValueType() == MVT::f16 &&
2310  !Subtarget->hasFullFP16()) {
2311  SDLoc dl(Op);
2312  return DAG.getNode(
2313  ISD::FP_ROUND, dl, MVT::f16,
2314  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2315  DAG.getIntPtrConstant(0, dl));
2316  }
2317 
2318  // i128 conversions are libcalls.
2319  if (Op.getOperand(0).getValueType() == MVT::i128)
2320  return SDValue();
2321 
2322  // Other conversions are legal, unless it's to the completely software-based
2323  // fp128.
2324  if (Op.getValueType() != MVT::f128)
2325  return Op;
2326 
2327  RTLIB::Libcall LC;
2328  if (Op.getOpcode() == ISD::SINT_TO_FP)
2330  else
2332 
2333  return LowerF128Call(Op, DAG, LC);
2334 }
2335 
2336 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2337  SelectionDAG &DAG) const {
2338  // For iOS, we want to call an alternative entry point: __sincos_stret,
2339  // which returns the values in two S / D registers.
2340  SDLoc dl(Op);
2341  SDValue Arg = Op.getOperand(0);
2342  EVT ArgVT = Arg.getValueType();
2343  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2344 
2345  ArgListTy Args;
2346  ArgListEntry Entry;
2347 
2348  Entry.Node = Arg;
2349  Entry.Ty = ArgTy;
2350  Entry.IsSExt = false;
2351  Entry.IsZExt = false;
2352  Args.push_back(Entry);
2353 
2354  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2355  : RTLIB::SINCOS_STRET_F32;
2356  const char *LibcallName = getLibcallName(LC);
2357  SDValue Callee =
2358  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2359 
2360  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2361  TargetLowering::CallLoweringInfo CLI(DAG);
2362  CLI.setDebugLoc(dl)
2363  .setChain(DAG.getEntryNode())
2364  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2365 
2366  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2367  return CallResult.first;
2368 }
2369 
2370 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2371  if (Op.getValueType() != MVT::f16)
2372  return SDValue();
2373 
2374  assert(Op.getOperand(0).getValueType() == MVT::i16);
2375  SDLoc DL(Op);
2376 
2377  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2378  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2379  return SDValue(
2380  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2381  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2382  0);
2383 }
2384 
2385 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2386  if (OrigVT.getSizeInBits() >= 64)
2387  return OrigVT;
2388 
2389  assert(OrigVT.isSimple() && "Expecting a simple value type");
2390 
2391  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2392  switch (OrigSimpleTy) {
2393  default: llvm_unreachable("Unexpected Vector Type");
2394  case MVT::v2i8:
2395  case MVT::v2i16:
2396  return MVT::v2i32;
2397  case MVT::v4i8:
2398  return MVT::v4i16;
2399  }
2400 }
2401 
2402 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2403  const EVT &OrigTy,
2404  const EVT &ExtTy,
2405  unsigned ExtOpcode) {
2406  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2407  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2408  // 64-bits we need to insert a new extension so that it will be 64-bits.
2409  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2410  if (OrigTy.getSizeInBits() >= 64)
2411  return N;
2412 
2413  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2414  EVT NewVT = getExtensionTo64Bits(OrigTy);
2415 
2416  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2417 }
2418 
2419 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2420  bool isSigned) {
2421  EVT VT = N->getValueType(0);
2422 
2423  if (N->getOpcode() != ISD::BUILD_VECTOR)
2424  return false;
2425 
2426  for (const SDValue &Elt : N->op_values()) {
2427  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2428  unsigned EltSize = VT.getScalarSizeInBits();
2429  unsigned HalfSize = EltSize / 2;
2430  if (isSigned) {
2431  if (!isIntN(HalfSize, C->getSExtValue()))
2432  return false;
2433  } else {
2434  if (!isUIntN(HalfSize, C->getZExtValue()))
2435  return false;
2436  }
2437  continue;
2438  }
2439  return false;
2440  }
2441 
2442  return true;
2443 }
2444 
2445 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2446  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2447  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2448  N->getOperand(0)->getValueType(0),
2449  N->getValueType(0),
2450  N->getOpcode());
2451 
2452  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2453  EVT VT = N->getValueType(0);
2454  SDLoc dl(N);
2455  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2456  unsigned NumElts = VT.getVectorNumElements();
2457  MVT TruncVT = MVT::getIntegerVT(EltSize);
2458  SmallVector<SDValue, 16> Ops;
2459  for (unsigned i = 0; i != NumElts; ++i) {
2460  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2461  const APInt &CInt = C->getAPIntValue();
2462  // Element types smaller than 32 bits are not legal, so use i32 elements.
2463  // The values are implicitly truncated so sext vs. zext doesn't matter.
2464  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2465  }
2466  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2467 }
2468 
2469 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2470  return N->getOpcode() == ISD::SIGN_EXTEND ||
2471  isExtendedBUILD_VECTOR(N, DAG, true);
2472 }
2473 
2474 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2475  return N->getOpcode() == ISD::ZERO_EXTEND ||
2476  isExtendedBUILD_VECTOR(N, DAG, false);
2477 }
2478 
2479 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2480  unsigned Opcode = N->getOpcode();
2481  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2482  SDNode *N0 = N->getOperand(0).getNode();
2483  SDNode *N1 = N->getOperand(1).getNode();
2484  return N0->hasOneUse() && N1->hasOneUse() &&
2485  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2486  }
2487  return false;
2488 }
2489 
2490 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2491  unsigned Opcode = N->getOpcode();
2492  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2493  SDNode *N0 = N->getOperand(0).getNode();
2494  SDNode *N1 = N->getOperand(1).getNode();
2495  return N0->hasOneUse() && N1->hasOneUse() &&
2496  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2497  }
2498  return false;
2499 }
2500 
2501 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2502  SelectionDAG &DAG) const {
2503  // The rounding mode is in bits 23:22 of the FPCR.
2504  // The rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
2505  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
2506  // so that the shift + and get folded into a bitfield extract.
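  // For example, with FPCR.RMode == 0b10 (round toward minus infinity) the
  // computation is (((2 << 22) + (1 << 22)) >> 22) & 3 == 3, which is the
  // FLT_ROUNDS value for "toward negative infinity".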
2507  SDLoc dl(Op);
2508 
2509  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2510  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2511  MVT::i64));
2512  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2513  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2514  DAG.getConstant(1U << 22, dl, MVT::i32));
2515  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2516  DAG.getConstant(22, dl, MVT::i32));
2517  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2518  DAG.getConstant(3, dl, MVT::i32));
2519 }
2520 
2522  // Multiplications are only custom-lowered for 128-bit vectors so that
2523  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
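  // As an illustrative example, '(mul (zext <8 x i8> %a to <8 x i16>),
  //                                   (zext <8 x i8> %b to <8 x i16>))'
  // can be selected as a single 'umull v0.8h, v0.8b, v1.8b' rather than two
  // vector extends followed by a v8i16 multiply.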
2524  EVT VT = Op.getValueType();
2525  assert(VT.is128BitVector() && VT.isInteger() &&
2526  "unexpected type for custom-lowering ISD::MUL");
2527  SDNode *N0 = Op.getOperand(0).getNode();
2528  SDNode *N1 = Op.getOperand(1).getNode();
2529  unsigned NewOpc = 0;
2530  bool isMLA = false;
2531  bool isN0SExt = isSignExtended(N0, DAG);
2532  bool isN1SExt = isSignExtended(N1, DAG);
2533  if (isN0SExt && isN1SExt)
2534  NewOpc = AArch64ISD::SMULL;
2535  else {
2536  bool isN0ZExt = isZeroExtended(N0, DAG);
2537  bool isN1ZExt = isZeroExtended(N1, DAG);
2538  if (isN0ZExt && isN1ZExt)
2539  NewOpc = AArch64ISD::UMULL;
2540  else if (isN1SExt || isN1ZExt) {
2541  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2542  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2543  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2544  NewOpc = AArch64ISD::SMULL;
2545  isMLA = true;
2546  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2547  NewOpc = AArch64ISD::UMULL;
2548  isMLA = true;
2549  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2550  std::swap(N0, N1);
2551  NewOpc = AArch64ISD::UMULL;
2552  isMLA = true;
2553  }
2554  }
2555 
2556  if (!NewOpc) {
2557  if (VT == MVT::v2i64)
2558  // Fall through to expand this. It is not legal.
2559  return SDValue();
2560  else
2561  // Other vector multiplications are legal.
2562  return Op;
2563  }
2564  }
2565 
2566  // Legalize to a S/UMULL instruction
2567  SDLoc DL(Op);
2568  SDValue Op0;
2569  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2570  if (!isMLA) {
2571  Op0 = skipExtensionForVectorMULL(N0, DAG);
2572  assert(Op0.getValueType().is64BitVector() &&
2573  Op1.getValueType().is64BitVector() &&
2574  "unexpected types for extended operands to VMULL");
2575  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2576  }
2577  // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
2578  // isel lowering takes advantage of no-stall back-to-back s/umul + s/umla.
2579  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
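  // e.g. (zext %a + zext %b) * (zext %c) can come out as something like
  //   umull v3.8h, v0.8b, v2.8b
  //   umlal v3.8h, v1.8b, v2.8b
  // rather than an extend-heavy v8i16 multiply of the widened sum.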
2580  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2581  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2582  EVT Op1VT = Op1.getValueType();
2583  return DAG.getNode(N0->getOpcode(), DL, VT,
2584  DAG.getNode(NewOpc, DL, VT,
2585  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2586  DAG.getNode(NewOpc, DL, VT,
2587  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2588 }
2589 
2590 // Lower vector multiply high (ISD::MULHS and ISD::MULHU).
2592  // Multiplications are only custom-lowered for 128-bit vectors so that
2593  // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not
2594  // legal.
2595  EVT VT = Op.getValueType();
2596  assert(VT.is128BitVector() && VT.isInteger() &&
2597  "unexpected type for custom-lowering ISD::MULH{U,S}");
2598 
2599  SDValue V0 = Op.getOperand(0);
2600  SDValue V1 = Op.getOperand(1);
2601 
2602  SDLoc DL(Op);
2603 
2604  EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
2605 
2606  // We turn (V0 mulhs/mulhu V1) to:
2607  //
2608  // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)),
2609  // (extract_subvector (ExtractVT V128:V1, (i64 0))))),
2610  // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)),
2611  // (extract_subvector (ExtractVT V128:V1, (i64 VMull2Idx))))))
2612  //
2613  // Where ExtractVT is a subvector with half the number of elements, and
2614  // VMull2Idx is the index of the middle element (the high part).
2615  //
2616  // The vector high part extract and multiply will be matched against
2617  // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64} which in turn will
2618  // issue a {s,u}mull2 instruction.
2619  //
2620  // This basically multiplies the lower subvector with '{s,u}mull', the
2621  // high subvector with '{s,u}mull2', and shuffles the high parts of both
2622  // results into the resulting vector.
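  // For v16i8 this comes out roughly as:
  //   umull  v2.8h,  v0.8b,  v1.8b     // products of the low halves
  //   umull2 v3.8h,  v0.16b, v1.16b    // products of the high halves
  //   uzp2   v0.16b, v2.16b, v3.16b    // keep the high byte of each product
  // (smull/smull2 for the signed ISD::MULHS case).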
2623  unsigned Mull2VectorIdx = VT.getVectorNumElements() / 2;
2624  SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
2625  SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
2626 
2627  SDValue VMullV0 =
2628  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
2629  SDValue VMullV1 =
2630  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
2631 
2632  SDValue VMull2V0 =
2633  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
2634  SDValue VMull2V1 =
2635  DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
2636 
2637  unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
2638  : AArch64ISD::UMULL;
2639 
2640  EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
2641  SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
2642  SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
2643 
2644  Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
2645  Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
2646 
2647  return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
2648 }
2649 
2650 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2651  SelectionDAG &DAG) const {
2652  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2653  SDLoc dl(Op);
2654  switch (IntNo) {
2655  default: return SDValue(); // Don't custom lower most intrinsics.
2656  case Intrinsic::thread_pointer: {
2657  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2658  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2659  }
2660  case Intrinsic::aarch64_neon_abs:
2661  return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
2662  Op.getOperand(1));
2663  case Intrinsic::aarch64_neon_smax:
2664  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2665  Op.getOperand(1), Op.getOperand(2));
2666  case Intrinsic::aarch64_neon_umax:
2667  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2668  Op.getOperand(1), Op.getOperand(2));
2669  case Intrinsic::aarch64_neon_smin:
2670  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2671  Op.getOperand(1), Op.getOperand(2));
2672  case Intrinsic::aarch64_neon_umin:
2673  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2674  Op.getOperand(1), Op.getOperand(2));
2675  }
2676 }
2677 
2678 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2679 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2680  EVT VT, EVT MemVT,
2681  SelectionDAG &DAG) {
2682  assert(VT.isVector() && "VT should be a vector type");
2683  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2684 
2685  SDValue Value = ST->getValue();
2686 
2687  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
2688  // extracts the word lane which represents the v4i8 subvector. It optimizes
2689  // the store to:
2690  //
2691  // xtn v0.8b, v0.8h
2692  // str s0, [x0]
2693 
2694  SDValue Undef = DAG.getUNDEF(MVT::i16);
2695  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2696  {Undef, Undef, Undef, Undef});
2697 
2698  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2699  Value, UndefVec);
2700  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2701 
2702  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2703  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2704  Trunc, DAG.getConstant(0, DL, MVT::i64));
2705 
2706  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2707  ST->getBasePtr(), ST->getMemOperand());
2708 }
2709 
2710 // Custom lowering for any store, vector or scalar, normal or truncating.
2711 // Currently we only custom lower the truncating store from vector v4i16
2712 // to v4i8.
2713 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2714  SelectionDAG &DAG) const {
2715  SDLoc Dl(Op);
2716  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2717  assert (StoreNode && "Can only custom lower store nodes");
2718 
2719  SDValue Value = StoreNode->getValue();
2720 
2721  EVT VT = Value.getValueType();
2722  EVT MemVT = StoreNode->getMemoryVT();
2723 
2724  assert (VT.isVector() && "Can only custom lower vector store types");
2725 
2726  unsigned AS = StoreNode->getAddressSpace();
2727  unsigned Align = StoreNode->getAlignment();
2728  if (Align < MemVT.getStoreSize() &&
2729  !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
2730  return scalarizeVectorStore(StoreNode, DAG);
2731  }
2732 
2733  if (StoreNode->isTruncatingStore()) {
2734  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2735  }
2736 
2737  return SDValue();
2738 }
2739 
2740 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2741  SelectionDAG &DAG) const {
2742  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2743  LLVM_DEBUG(Op.dump());
2744 
2745  switch (Op.getOpcode()) {
2746  default:
2747  llvm_unreachable("unimplemented operand");
2748  return SDValue();
2749  case ISD::BITCAST:
2750  return LowerBITCAST(Op, DAG);
2751  case ISD::GlobalAddress:
2752  return LowerGlobalAddress(Op, DAG);
2753  case ISD::GlobalTLSAddress:
2754  return LowerGlobalTLSAddress(Op, DAG);
2755  case ISD::SETCC:
2756  return LowerSETCC(Op, DAG);
2757  case ISD::BR_CC:
2758  return LowerBR_CC(Op, DAG);
2759  case ISD::SELECT:
2760  return LowerSELECT(Op, DAG);
2761  case ISD::SELECT_CC:
2762  return LowerSELECT_CC(Op, DAG);
2763  case ISD::JumpTable:
2764  return LowerJumpTable(Op, DAG);
2765  case ISD::ConstantPool:
2766  return LowerConstantPool(Op, DAG);
2767  case ISD::BlockAddress:
2768  return LowerBlockAddress(Op, DAG);
2769  case ISD::VASTART:
2770  return LowerVASTART(Op, DAG);
2771  case ISD::VACOPY:
2772  return LowerVACOPY(Op, DAG);
2773  case ISD::VAARG:
2774  return LowerVAARG(Op, DAG);
2775  case ISD::ADDC:
2776  case ISD::ADDE:
2777  case ISD::SUBC:
2778  case ISD::SUBE:
2779  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2780  case ISD::SADDO:
2781  case ISD::UADDO:
2782  case ISD::SSUBO:
2783  case ISD::USUBO:
2784  case ISD::SMULO:
2785  case ISD::UMULO:
2786  return LowerXALUO(Op, DAG);
2787  case ISD::FADD:
2788  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2789  case ISD::FSUB:
2790  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2791  case ISD::FMUL:
2792  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2793  case ISD::FDIV:
2794  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2795  case ISD::FP_ROUND:
2796  return LowerFP_ROUND(Op, DAG);
2797  case ISD::FP_EXTEND:
2798  return LowerFP_EXTEND(Op, DAG);
2799  case ISD::FRAMEADDR:
2800  return LowerFRAMEADDR(Op, DAG);
2801  case ISD::RETURNADDR:
2802  return LowerRETURNADDR(Op, DAG);
2803  case ISD::INSERT_VECTOR_ELT:
2804  return LowerINSERT_VECTOR_ELT(Op, DAG);
2805  case ISD::EXTRACT_VECTOR_ELT:
2806  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2807  case ISD::BUILD_VECTOR:
2808  return LowerBUILD_VECTOR(Op, DAG);
2809  case ISD::VECTOR_SHUFFLE:
2810  return LowerVECTOR_SHUFFLE(Op, DAG);
2811  case ISD::EXTRACT_SUBVECTOR:
2812  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2813  case ISD::SRA:
2814  case ISD::SRL:
2815  case ISD::SHL:
2816  return LowerVectorSRA_SRL_SHL(Op, DAG);
2817  case ISD::SHL_PARTS:
2818  return LowerShiftLeftParts(Op, DAG);
2819  case ISD::SRL_PARTS:
2820  case ISD::SRA_PARTS:
2821  return LowerShiftRightParts(Op, DAG);
2822  case ISD::CTPOP:
2823  return LowerCTPOP(Op, DAG);
2824  case ISD::FCOPYSIGN:
2825  return LowerFCOPYSIGN(Op, DAG);
2826  case ISD::AND:
2827  return LowerVectorAND(Op, DAG);
2828  case ISD::OR:
2829  return LowerVectorOR(Op, DAG);
2830  case ISD::XOR:
2831  return LowerXOR(Op, DAG);
2832  case ISD::PREFETCH:
2833  return LowerPREFETCH(Op, DAG);
2834  case ISD::SINT_TO_FP:
2835  case ISD::UINT_TO_FP:
2836  return LowerINT_TO_FP(Op, DAG);
2837  case ISD::FP_TO_SINT:
2838  case ISD::FP_TO_UINT:
2839  return LowerFP_TO_INT(Op, DAG);
2840  case ISD::FSINCOS:
2841  return LowerFSINCOS(Op, DAG);
2842  case ISD::FLT_ROUNDS_:
2843  return LowerFLT_ROUNDS_(Op, DAG);
2844  case ISD::MUL:
2845  return LowerMUL(Op, DAG);
2846  case ISD::MULHS:
2847  case ISD::MULHU:
2848  return LowerMULH(Op, DAG);
2849  case ISD::INTRINSIC_WO_CHAIN:
2850  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2851  case ISD::STORE:
2852  return LowerSTORE(Op, DAG);
2853  case ISD::VECREDUCE_ADD:
2854  case ISD::VECREDUCE_SMAX:
2855  case ISD::VECREDUCE_SMIN:
2856  case ISD::VECREDUCE_UMAX:
2857  case ISD::VECREDUCE_UMIN:
2858  case ISD::VECREDUCE_FMAX:
2859  case ISD::VECREDUCE_FMIN:
2860  return LowerVECREDUCE(Op, DAG);
2861  case ISD::ATOMIC_LOAD_SUB:
2862  return LowerATOMIC_LOAD_SUB(Op, DAG);
2863  case ISD::ATOMIC_LOAD_AND:
2864  return LowerATOMIC_LOAD_AND(Op, DAG);
2865  case ISD::DYNAMIC_STACKALLOC:
2866  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2867  }
2868 }
2869 
2870 //===----------------------------------------------------------------------===//
2871 // Calling Convention Implementation
2872 //===----------------------------------------------------------------------===//
2873 
2874 #include "AArch64GenCallingConv.inc"
2875 
2876 /// Selects the correct CCAssignFn for a given CallingConvention value.
2877 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2878  bool IsVarArg) const {
2879  switch (CC) {
2880  default:
2881  report_fatal_error("Unsupported calling convention.");
2882  case CallingConv::WebKit_JS:
2883  return CC_AArch64_WebKit_JS;
2884  case CallingConv::GHC:
2885  return CC_AArch64_GHC;
2886  case CallingConv::C:
2887  case CallingConv::Fast:
2890  case CallingConv::Swift:
2891  if (Subtarget->isTargetWindows() && IsVarArg)
2892  return CC_AArch64_Win64_VarArg;
2893  if (!Subtarget->isTargetDarwin())
2894  return CC_AArch64_AAPCS;
2895  return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
2896  case CallingConv::Win64:
2897  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
2898  }
2899 }
2900 
2901 CCAssignFn *
2902 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
2903  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
2904  : RetCC_AArch64_AAPCS;
2905 }
2906 
2907 SDValue AArch64TargetLowering::LowerFormalArguments(
2908  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2909  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2910  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2911  MachineFunction &MF = DAG.getMachineFunction();
2912  MachineFrameInfo &MFI = MF.getFrameInfo();
2913  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2914 
2915  // Assign locations to all of the incoming arguments.
2916  SmallVector<CCValAssign, 16> ArgLocs;
2917  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2918  *DAG.getContext());
2919 
2920  // At this point, Ins[].VT may already be promoted to i32. To correctly
2921  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2922  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2923  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
2924  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
2925  // LocVT.
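  // For instance, an i8 argument typically arrives here with Ins[i].VT
  // promoted to MVT::i32; the loop below re-derives the original IR type and
  // hands MVT::i8 to the assignment function so the argument can be given a
  // byte-sized stack slot rather than a full i32 one.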
2926  unsigned NumArgs = Ins.size();
2927  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
2928  unsigned CurArgIdx = 0;
2929  for (unsigned i = 0; i != NumArgs; ++i) {
2930  MVT ValVT = Ins[i].VT;
2931  if (Ins[i].isOrigArg()) {
2932  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
2933  CurArgIdx = Ins[i].getOrigArgIndex();
2934 
2935  // Get type of the original argument.
2936  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
2937  /*AllowUnknown*/ true);
2938  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
2939  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2940  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2941  ValVT = MVT::i8;
2942  else if (ActualMVT == MVT::i16)
2943  ValVT = MVT::i16;
2944  }
2945  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2946  bool Res =
2947  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
2948  assert(!Res && "Call operand has unhandled type");
2949  (void)Res;
2950  }
2951  assert(ArgLocs.size() == Ins.size());
2952  SmallVector<SDValue, 16> ArgValues;
2953  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2954  CCValAssign &VA = ArgLocs[i];
2955 
2956  if (Ins[i].Flags.isByVal()) {
2957  // Byval is used for HFAs in the PCS, but the system should work in a
2958  // non-compliant manner for larger structs.
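  // For example, a 24-byte struct passed byval occupies NumRegs == 3
  // eight-byte units below and gets a 24-byte fixed frame object at the
  // argument's stack offset.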
2959  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2960  int Size = Ins[i].Flags.getByValSize();
2961  unsigned NumRegs = (Size + 7) / 8;
2962 
2963  // FIXME: This works on big-endian for composite byvals, which are the common
2964  // case. It should also work for fundamental types too.
2965  unsigned FrameIdx =
2966  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
2967  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
2968  InVals.push_back(FrameIdxN);
2969 
2970  continue;
2971  }
2972 
2973  if (VA.isRegLoc()) {
2974  // Arguments stored in registers.
2975  EVT RegVT = VA.getLocVT();
2976 
2977  SDValue ArgValue;
2978  const TargetRegisterClass *RC;
2979 
2980  if (RegVT == MVT::i32)
2981  RC = &AArch64::GPR32RegClass;
2982  else if (RegVT == MVT::i64)
2983  RC = &AArch64::GPR64RegClass;
2984  else if (RegVT == MVT::f16)
2985  RC = &AArch64::FPR16RegClass;
2986  else if (RegVT == MVT::f32)
2987  RC = &AArch64::FPR32RegClass;
2988  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
2989  RC = &AArch64::FPR64RegClass;
2990  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
2991  RC = &AArch64::FPR128RegClass;
2992  else
2993  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2994 
2995  // Transform the arguments in physical registers into virtual ones.
2996  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2997  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
2998 
2999  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3000  // to 64 bits. Insert an assert[sz]ext to capture this, then
3001  // truncate to the right size.
3002  switch (VA.getLocInfo()) {
3003  default:
3004  llvm_unreachable("Unknown loc info!");
3005  case CCValAssign::Full:
3006  break;
3007  case CCValAssign::BCvt:
3008  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3009  break;
3010  case CCValAssign::AExt:
3011  case CCValAssign::SExt:
3012  case CCValAssign::ZExt:
3013  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3014  // nodes after our lowering.
3015  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3016  break;
3017  }
3018 
3019  InVals.push_back(ArgValue);
3020 
3021  } else { // VA.isRegLoc()
3022  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3023  unsigned ArgOffset = VA.getLocMemOffset();
3024  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3025 
3026  uint32_t BEAlign = 0;
3027  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3028  !Ins[i].Flags.isInConsecutiveRegs())
3029  BEAlign = 8 - ArgSize;
3030 
3031  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3032 
3033  // Create load nodes to retrieve arguments from the stack.
3034  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3035  SDValue ArgValue;
3036 
3037  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
3038  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3039  MVT MemVT = VA.getValVT();
3040 
3041  switch (VA.getLocInfo()) {
3042  default:
3043  break;
3044  case CCValAssign::BCvt:
3045  MemVT = VA.getLocVT();
3046  break;
3047  case CCValAssign::SExt:
3048  ExtType = ISD::SEXTLOAD;
3049  break;
3050  case CCValAssign::ZExt:
3051  ExtType = ISD::ZEXTLOAD;
3052  break;
3053  case CCValAssign::AExt:
3054  ExtType = ISD::EXTLOAD;
3055  break;
3056  }
3057 
3058  ArgValue = DAG.getExtLoad(
3059  ExtType, DL, VA.getLocVT(), Chain, FIN,
3060  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3061  MemVT);
3062 
3063  InVals.push_back(ArgValue);
3064  }
3065  }
3066 
3067  // varargs
3068  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3069  if (isVarArg) {
3070  if (!Subtarget->isTargetDarwin() || IsWin64) {
3071  // The AAPCS variadic function ABI is identical to the non-variadic
3072  // one. As a result there may be more arguments in registers and we should
3073  // save them for future reference.
3074  // Win64 variadic functions also pass arguments in registers, but all float
3075  // arguments are passed in integer registers.
3076  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3077  }
3078 
3079  // This will point to the next argument passed via stack.
3080  unsigned StackOffset = CCInfo.getNextStackOffset();
3081  // We currently pass all varargs at 8-byte alignment.
3082  StackOffset = ((StackOffset + 7) & ~7);
3083  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3084  }
3085 
3086  unsigned StackArgSize = CCInfo.getNextStackOffset();
3087  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3088  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3089  // This is a non-standard ABI so by fiat I say we're allowed to make full
3090  // use of the stack area to be popped, which must be aligned to 16 bytes in
3091  // any case:
3092  StackArgSize = alignTo(StackArgSize, 16);
3093 
3094  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3095  // a multiple of 16.
3096  FuncInfo->setArgumentStackToRestore(StackArgSize);
3097 
3098  // This realignment carries over to the available bytes below. Our own
3099  // callers will guarantee the space is free by giving an aligned value to
3100  // CALLSEQ_START.
3101  }
3102  // Even if we're not expected to free up the space, it's useful to know how
3103  // much is there while considering tail calls (because we can reuse it).
3104  FuncInfo->setBytesInStackArgArea(StackArgSize);
3105 
3106  return Chain;
3107 }
3108 
3109 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3110  SelectionDAG &DAG,
3111  const SDLoc &DL,
3112  SDValue &Chain) const {
3113  MachineFunction &MF = DAG.getMachineFunction();
3114  MachineFrameInfo &MFI = MF.getFrameInfo();
3115  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3116  auto PtrVT = getPointerTy(DAG.getDataLayout());
3117  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3118 
3119  SmallVector<SDValue, 8> MemOps;
3120 
3121  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3122  AArch64::X3, AArch64::X4, AArch64::X5,
3123  AArch64::X6, AArch64::X7 };
3124  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3125  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3126 
3127  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3128  int GPRIdx = 0;
3129  if (GPRSaveSize != 0) {
3130  if (IsWin64) {
3131  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3132  if (GPRSaveSize & 15)
3133  // The extra size here, if triggered, will always be 8.
3134  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3135  } else
3136  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3137 
3138  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3139 
3140  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3141  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3142  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3143  SDValue Store = DAG.getStore(
3144  Val.getValue(1), DL, Val, FIN,
3145  IsWin64
3147  GPRIdx,
3148  (i - FirstVariadicGPR) * 8)
3150  MemOps.push_back(Store);
3151  FIN =
3152  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3153  }
3154  }
3155  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3156  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3157 
3158  if (Subtarget->hasFPARMv8() && !IsWin64) {
3159  static const MCPhysReg FPRArgRegs[] = {
3160  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3161  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3162  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3163  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3164 
3165  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3166  int FPRIdx = 0;
3167  if (FPRSaveSize != 0) {
3168  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3169 
3170  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3171 
3172  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3173  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3174  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3175 
3176  SDValue Store = DAG.getStore(
3177  Val.getValue(1), DL, Val, FIN,
3179  MemOps.push_back(Store);
3180  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3181  DAG.getConstant(16, DL, PtrVT));
3182  }
3183  }
3184  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3185  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3186  }
3187 
3188  if (!MemOps.empty()) {
3189  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3190  }
3191 }
3192 
3193 /// LowerCallResult - Lower the result values of a call into the
3194 /// appropriate copies out of appropriate physical registers.
3195 SDValue AArch64TargetLowering::LowerCallResult(
3196  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3197  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3198  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3199  SDValue ThisVal) const {
3200  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3201  ? RetCC_AArch64_WebKit_JS
3202  : RetCC_AArch64_AAPCS;
3203  // Assign locations to each value returned by this call.
3204  SmallVector<CCValAssign, 16> RVLocs;
3205  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3206  *DAG.getContext());
3207  CCInfo.AnalyzeCallResult(Ins, RetCC);
3208 
3209  // Copy all of the result registers out of their specified physreg.
3210  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3211  CCValAssign VA = RVLocs[i];
3212 
3213  // Pass 'this' value directly from the argument to return value, to avoid
3214  // reg unit interference
3215  if (i == 0 && isThisReturn) {
3216  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3217  "unexpected return calling convention register assignment");
3218  InVals.push_back(ThisVal);
3219  continue;
3220  }
3221 
3222  SDValue Val =
3223  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3224  Chain = Val.getValue(1);
3225  InFlag = Val.getValue(2);
3226 
3227  switch (VA.getLocInfo()) {
3228  default:
3229  llvm_unreachable("Unknown loc info!");
3230  case CCValAssign::Full:
3231  break;
3232  case CCValAssign::BCvt:
3233  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3234  break;
3235  }
3236 
3237  InVals.push_back(Val);
3238  }
3239 
3240  return Chain;
3241 }
3242 
3243 /// Return true if the calling convention is one that we can guarantee TCO for.
3244 static bool canGuaranteeTCO(CallingConv::ID CC) {
3245  return CC == CallingConv::Fast;
3246 }
3247 
3248 /// Return true if we might ever do TCO for calls with this calling convention.
3249 static bool mayTailCallThisCC(CallingConv::ID CC) {
3250  switch (CC) {
3251  case CallingConv::C:
3252  case CallingConv::PreserveMost:
3253  case CallingConv::Swift:
3254  return true;
3255  default:
3256  return canGuaranteeTCO(CC);
3257  }
3258 }
3259 
3260 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3261  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3262  const SmallVectorImpl<ISD::OutputArg> &Outs,
3263  const SmallVectorImpl<SDValue> &OutVals,
3264  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3265  if (!mayTailCallThisCC(CalleeCC))
3266  return false;
3267 
3268  MachineFunction &MF = DAG.getMachineFunction();
3269  const Function &CallerF = MF.getFunction();
3270  CallingConv::ID CallerCC = CallerF.getCallingConv();
3271  bool CCMatch = CallerCC == CalleeCC;
3272 
3273  // Byval parameters hand the function a pointer directly into the stack area
3274  // we want to reuse during a tail call. Working around this *is* possible (see
3275  // X86) but less efficient and uglier in LowerCall.
3276  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3277  e = CallerF.arg_end();
3278  i != e; ++i)
3279  if (i->hasByValAttr())
3280  return false;
3281 
3282  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3283  return canGuaranteeTCO(CalleeCC) && CCMatch;
3284 
3285  // Externally-defined functions with weak linkage should not be
3286  // tail-called on AArch64 when the OS does not support dynamic
3287  // pre-emption of symbols, as the AAELF spec requires normal calls
3288  // to undefined weak functions to be replaced with a NOP or jump to the
3289  // next instruction. The behaviour of branch instructions in this
3290  // situation (as used for tail calls) is implementation-defined, so we
3291  // cannot rely on the linker replacing the tail call with a return.
3292  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3293  const GlobalValue *GV = G->getGlobal();
3294  const Triple &TT = getTargetMachine().getTargetTriple();
3295  if (GV->hasExternalWeakLinkage() &&
3296  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3297  return false;
3298  }
3299 
3300  // Now we search for cases where we can use a tail call without changing the
3301  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3302  // concept.
3303 
3304  // I want anyone implementing a new calling convention to think long and hard
3305  // about this assert.
3306  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3307  "Unexpected variadic calling convention");
3308 
3309  LLVMContext &C = *DAG.getContext();
3310  if (isVarArg && !Outs.empty()) {
3311  // At least two cases here: if caller is fastcc then we can't have any
3312  // memory arguments (we'd be expected to clean up the stack afterwards). If
3313  // caller is C then we could potentially use its argument area.
3314 
3315  // FIXME: for now we take the most conservative of these in both cases:
3316  // disallow all variadic memory operands.
3317  SmallVector<CCValAssign, 16> ArgLocs;
3318  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3319 
3320  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3321  for (const CCValAssign &ArgLoc : ArgLocs)
3322  if (!ArgLoc.isRegLoc())
3323  return false;
3324  }
3325 
3326  // Check that the call results are passed in the same way.
3327  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3328  CCAssignFnForCall(CalleeCC, isVarArg),
3329  CCAssignFnForCall(CallerCC, isVarArg)))
3330  return false;
3331  // The callee has to preserve all registers the caller needs to preserve.
3332  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3333  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3334  if (!CCMatch) {
3335  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3336  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3337  return false;
3338  }
3339 
3340  // Nothing more to check if the callee is taking no arguments
3341  if (Outs.empty())
3342  return true;
3343 
3344  SmallVector<CCValAssign, 16> ArgLocs;
3345  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3346 
3347  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3348 
3349  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3350 
3351  // If the stack arguments for this call do not fit into our own save area then
3352  // the call cannot be made tail.
3353  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3354  return false;
3355 
3356  const MachineRegisterInfo &MRI = MF.getRegInfo();
3357  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3358  return false;
3359 
3360  return true;
3361 }
3362 
3363 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3364  SelectionDAG &DAG,
3365  MachineFrameInfo &MFI,
3366  int ClobberedFI) const {
3367  SmallVector<SDValue, 8> ArgChains;
3368  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3369  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3370 
3371  // Include the original chain at the beginning of the list. When this is
3372  // used by target LowerCall hooks, this helps legalize find the
3373  // CALLSEQ_BEGIN node.
3374  ArgChains.push_back(Chain);
3375 
3376  // Add a chain value for each stack argument that overlaps the clobbered frame index.
3377  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3378  UE = DAG.getEntryNode().getNode()->use_end();
3379  U != UE; ++U)
3380  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3381  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3382  if (FI->getIndex() < 0) {
3383  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3384  int64_t InLastByte = InFirstByte;
3385  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3386 
3387  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3388  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3389  ArgChains.push_back(SDValue(L, 1));
3390  }
3391 
3392  // Build a tokenfactor for all the chains.
3393  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3394 }
3395 
3396 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3397  bool TailCallOpt) const {
3398  return CallCC == CallingConv::Fast && TailCallOpt;
3399 }
3400 
3401 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3402 /// and add input and output parameter nodes.
3403 SDValue
3404 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3405  SmallVectorImpl<SDValue> &InVals) const {
3406  SelectionDAG &DAG = CLI.DAG;
3407  SDLoc &DL = CLI.DL;
3408  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3409  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3410  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3411  SDValue Chain = CLI.Chain;
3412  SDValue Callee = CLI.Callee;
3413  bool &IsTailCall = CLI.IsTailCall;
3414  CallingConv::ID CallConv = CLI.CallConv;
3415  bool IsVarArg = CLI.IsVarArg;
3416 
3417  MachineFunction &MF = DAG.getMachineFunction();
3418  bool IsThisReturn = false;
3419 
3420  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3421  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3422  bool IsSibCall = false;
3423 
3424  if (IsTailCall) {
3425  // Check if it's really possible to do a tail call.
3426  IsTailCall = isEligibleForTailCallOptimization(
3427  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3428  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3429  report_fatal_error("failed to perform tail call elimination on a call "
3430  "site marked musttail");
3431 
3432  // A sibling call is one where we're under the usual C ABI and not planning
3433  // to change that but can still do a tail call:
3434  if (!TailCallOpt && IsTailCall)
3435  IsSibCall = true;
3436 
3437  if (IsTailCall)
3438  ++NumTailCalls;
3439  }
3440 
3441  // Analyze operands of the call, assigning locations to each operand.
3442  SmallVector<CCValAssign, 16> ArgLocs;
3443  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3444  *DAG.getContext());
3445 
3446  if (IsVarArg) {
3447  // Handle fixed and variable vector arguments differently.
3448  // Variable vector arguments always go into memory.
3449  unsigned NumArgs = Outs.size();
3450 
3451  for (unsigned i = 0; i != NumArgs; ++i) {
3452  MVT ArgVT = Outs[i].VT;
3453  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3454  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3455  /*IsVarArg=*/ !Outs[i].IsFixed);
3456  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3457  assert(!Res && "Call operand has unhandled type");
3458  (void)Res;
3459  }
3460  } else {
3461  // At this point, Outs[].VT may already be promoted to i32. To correctly
3462  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3463  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3464  // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
3465  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3466  // LocVT.
3467  unsigned NumArgs = Outs.size();
3468  for (unsigned i = 0; i != NumArgs; ++i) {
3469  MVT ValVT = Outs[i].VT;
3470  // Get type of the original argument.
3471  EVT ActualVT = getValueType(DAG.getDataLayout(),
3472  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3473  /*AllowUnknown*/ true);
3474  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3475  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3476  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3477  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3478  ValVT = MVT::i8;
3479  else if (ActualMVT == MVT::i16)
3480  ValVT = MVT::i16;
3481 
3482  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3483  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3484  assert(!Res && "Call operand has unhandled type");
3485  (void)Res;
3486  }
3487  }
3488 
3489  // Get a count of how many bytes are to be pushed on the stack.
3490  unsigned NumBytes = CCInfo.getNextStackOffset();
3491 
3492  if (IsSibCall) {
3493  // Since we're not changing the ABI to make this a tail call, the memory
3494  // operands are already available in the caller's incoming argument space.
3495  NumBytes = 0;
3496  }
3497 
3498  // FPDiff is the byte offset of the call's argument area from the callee's.
3499  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3500  // by this amount for a tail call. In a sibling call it must be 0 because the
3501  // caller will deallocate the entire stack and the callee still expects its
3502  // arguments to begin at SP+0. Completely unused for non-tail calls.
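  // For example, if our own incoming argument area holds 32 bytes but this
  // tail call needs 48 bytes of outgoing arguments, FPDiff is 32 - 48 = -16,
  // i.e. the call needs 16 bytes more than we can simply reuse.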
3503  int FPDiff = 0;
3504 
3505  if (IsTailCall && !IsSibCall) {
3506  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3507 
3508  // Since callee will pop argument stack as a tail call, we must keep the
3509  // popped size 16-byte aligned.
3510  NumBytes = alignTo(NumBytes, 16);
3511 
3512  // FPDiff will be negative if this tail call requires more space than we
3513  // would automatically have in our incoming argument space. Positive if we
3514  // can actually shrink the stack.
3515  FPDiff = NumReusableBytes - NumBytes;
3516 
3517  // The stack pointer must be 16-byte aligned at all times it's used for a
3518  // memory operation, which in practice means at *all* times and in
3519  // particular across call boundaries. Therefore our own arguments started at
3520  // a 16-byte aligned SP and the delta applied for the tail call should
3521  // satisfy the same constraint.
3522  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3523  }
3524 
3525  // Adjust the stack pointer for the new arguments...
3526  // These operations are automatically eliminated by the prolog/epilog pass
3527  if (!IsSibCall)
3528  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3529 
3530  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3531  getPointerTy(DAG.getDataLayout()));
3532 
3533  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3534  SmallVector<SDValue, 8> MemOpChains;
3535  auto PtrVT = getPointerTy(DAG.getDataLayout());
3536 
3537  // Walk the register/memloc assignments, inserting copies/loads.
3538  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3539  ++i, ++realArgIdx) {
3540  CCValAssign &VA = ArgLocs[i];
3541  SDValue Arg = OutVals[realArgIdx];
3542  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3543 
3544  // Promote the value if needed.
3545  switch (VA.getLocInfo()) {
3546  default:
3547  llvm_unreachable("Unknown loc info!");
3548  case CCValAssign::Full:
3549  break;
3550  case CCValAssign::SExt:
3551  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3552  break;
3553  case CCValAssign::ZExt:
3554  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3555  break;
3556  case CCValAssign::AExt:
3557  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3558  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3559  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3560  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3561  }
3562  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3563  break;
3564  case CCValAssign::BCvt:
3565  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3566  break;
3567  case CCValAssign::FPExt:
3568  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3569  break;
3570  }
3571 
3572  if (VA.isRegLoc()) {
3573  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3574  Outs[0].VT == MVT::i64) {
3575  assert(VA.getLocVT() == MVT::i64 &&
3576  "unexpected calling convention register assignment");
3577  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3578  "unexpected use of 'returned'");
3579  IsThisReturn = true;
3580  }
3581  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3582  } else {
3583  assert(VA.isMemLoc());
3584 
3585  SDValue DstAddr;
3586  MachinePointerInfo DstInfo;
3587 
3588  // FIXME: This works on big-endian for composite byvals, which are the
3589  // common case. It should work for fundamental types as well.
3590  uint32_t BEAlign = 0;
3591  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3592  : VA.getValVT().getSizeInBits();
3593  OpSize = (OpSize + 7) / 8;
3594  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3595  !Flags.isInConsecutiveRegs()) {
3596  if (OpSize < 8)
3597  BEAlign = 8 - OpSize;
3598  }
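// Worked example (assumed operand): a non-byval i16 stack argument on a
// big-endian subtarget has OpSize == (16 + 7) / 8 == 2, so BEAlign == 8 - 2
// == 6 and the store below is shifted 6 bytes into the 8-byte slot, placing
// the halfword where a big-endian callee expects to read it.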
3599  unsigned LocMemOffset = VA.getLocMemOffset();
3600  int32_t Offset = LocMemOffset + BEAlign;
3601  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3602  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3603 
3604  if (IsTailCall) {
3605  Offset = Offset + FPDiff;
3606  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3607 
3608  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3609  DstInfo =
3610  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3611 
3612  // Make sure any stack arguments overlapping with where we're storing
3613  // are loaded before this eventual operation. Otherwise they'll be
3614  // clobbered.
3615  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3616  } else {
3617  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3618 
3619  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3620  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3621  LocMemOffset);
3622  }
3623 
3624  if (Outs[i].Flags.isByVal()) {
3625  SDValue SizeNode =
3626  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3627  SDValue Cpy = DAG.getMemcpy(
3628  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3629  /*isVol = */ false, /*AlwaysInline = */ false,
3630  /*isTailCall = */ false,
3631  DstInfo, MachinePointerInfo());
3632 
3633  MemOpChains.push_back(Cpy);
3634  } else {
3635  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3636  // promoted to a legal register type i32, we should truncate Arg back to
3637  // i1/i8/i16.
3638  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3639  VA.getValVT() == MVT::i16)
3640  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3641 
3642  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3643  MemOpChains.push_back(Store);
3644  }
3645  }
3646  }
3647 
3648  if (!MemOpChains.empty())
3649  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3650 
3651  // Build a sequence of copy-to-reg nodes chained together with token chain
3652  // and flag operands which copy the outgoing args into the appropriate regs.
3653  SDValue InFlag;
3654  for (auto &RegToPass : RegsToPass) {
3655  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3656  RegToPass.second, InFlag);
3657  InFlag = Chain.getValue(1);
3658  }
3659 
3660  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3661  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3662  // node so that legalize doesn't hack it.
3663  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3664  auto GV = G->getGlobal();
3665  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3666  AArch64II::MO_GOT) {
3667  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3668  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3669  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3670  assert(Subtarget->isTargetWindows() &&
3671  "Windows is the only supported COFF target");
3672  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3673  } else {
3674  const GlobalValue *GV = G->getGlobal();
3675  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3676  }
3677  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3678  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3679  Subtarget->isTargetMachO()) {
3680  const char *Sym = S->getSymbol();
3681  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3682  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3683  } else {
3684  const char *Sym = S->getSymbol();
3685  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3686  }
3687  }
3688 
3689  // We don't usually want to end the call-sequence here because we would tidy
3690  // the frame up *after* the call; however, in the ABI-changing tail-call case
3691  // we've carefully laid out the parameters so that when sp is reset they'll be
3692  // in the correct location.
3693  if (IsTailCall && !IsSibCall) {
3694  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3695  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3696  InFlag = Chain.getValue(1);
3697  }
3698 
3699  std::vector<SDValue> Ops;
3700  Ops.push_back(Chain);
3701  Ops.push_back(Callee);
3702 
3703  if (IsTailCall) {
3704  // Each tail call may have to adjust the stack by a different amount, so
3705  // this information must travel along with the operation for eventual
3706  // consumption by emitEpilogue.
3707  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3708  }
3709 
3710  // Add argument registers to the end of the list so that they are known live
3711  // into the call.
3712  for (auto &RegToPass : RegsToPass)
3713  Ops.push_back(DAG.getRegister(RegToPass.first,
3714  RegToPass.second.getValueType()));
3715 
3716  // Add a register mask operand representing the call-preserved registers.
3717  const uint32_t *Mask;
3718  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3719  if (IsThisReturn) {
3720  // For 'this' returns, use the X0-preserving mask if applicable
3721  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3722  if (!Mask) {
3723  IsThisReturn = false;
3724  Mask = TRI->getCallPreservedMask(MF, CallConv);
3725  }
3726  } else
3727  Mask = TRI->getCallPreservedMask(MF, CallConv);
3728 
3729  assert(Mask && "Missing call preserved mask for calling convention");
3730  Ops.push_back(DAG.getRegisterMask(Mask));
3731 
3732  if (InFlag.getNode())
3733  Ops.push_back(InFlag);
3734 
3735  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3736 
3737  // If we're doing a tail call, use a TC_RETURN here rather than an
3738  // actual call instruction.
3739  if (IsTailCall) {
3740  MF.getFrameInfo().setHasTailCall();
3741  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3742  }
3743 
3744  // Returns a chain and a flag for retval copy to use.
3745  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3746  InFlag = Chain.getValue(1);
3747 
3748  uint64_t CalleePopBytes =
3749  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3750 
3751  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3752  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3753  InFlag, DL);
3754  if (!Ins.empty())
3755  InFlag = Chain.getValue(1);
3756 
3757  // Handle result values, copying them out of physregs into vregs that we
3758  // return.
3759  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3760  InVals, IsThisReturn,
3761  IsThisReturn ? OutVals[0] : SDValue());
3762 }
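// Illustration of the IsThisReturn handling above (a sketch; the declaration
// is an assumed example rather than anything defined in this file):
//   declare i8* @memset(i8* returned, i32, i64)
// Because such a callee is documented to hand its first argument back in the
// return register, the X0-preserving register mask chosen above lets later
// uses of that pointer simply read x0 after the call instead of keeping a
// separate copy live across it.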
3763 
3764 bool AArch64TargetLowering::CanLowerReturn(
3765  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3766  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3767  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3768  ? RetCC_AArch64_WebKit_JS
3769  : RetCC_AArch64_AAPCS;
3770  SmallVector<CCValAssign, 16> RVLocs;
3771  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3772  return CCInfo.CheckReturn(Outs, RetCC);
3773 }
3774 
3775 SDValue
3776 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3777  bool isVarArg,
3778  const SmallVectorImpl<ISD::OutputArg> &Outs,
3779  const SmallVectorImpl<SDValue> &OutVals,
3780  const SDLoc &DL, SelectionDAG &DAG) const {
3781  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3782  ? RetCC_AArch64_WebKit_JS
3783  : RetCC_AArch64_AAPCS;
3784  SmallVector<CCValAssign, 16> RVLocs;
3785  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3786  *DAG.getContext());
3787  CCInfo.AnalyzeReturn(Outs, RetCC);
3788 
3789  // Copy the result values into the output registers.
3790  SDValue Flag;
3791  SmallVector<SDValue, 4> RetOps(1, Chain);
3792  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3793  ++i, ++realRVLocIdx) {
3794  CCValAssign &VA = RVLocs[i];
3795  assert(VA.isRegLoc() && "Can only return in registers!");
3796  SDValue Arg = OutVals[realRVLocIdx];
3797 
3798  switch (VA.getLocInfo()) {
3799  default:
3800  llvm_unreachable("Unknown loc info!");
3801  case CCValAssign::Full:
3802  if (Outs[i].ArgVT == MVT::i1) {
3803  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3804  // value. This is strictly redundant on Darwin (which uses "zeroext
3805  // i1"), but will be optimised out before ISel.
3806  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3807  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3808  }
3809  break;
3810  case CCValAssign::BCvt:
3811  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3812  break;
3813  }
3814 
3815  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3816  Flag = Chain.getValue(1);
3817  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3818  }
3819  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3820  const MCPhysReg *I =
3821  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3822  if (I) {
3823  for (; *I; ++I) {
3824  if (AArch64::GPR64RegClass.contains(*I))
3825  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3826  else if (AArch64::FPR64RegClass.contains(*I))
3827  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3828  else
3829  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3830  }
3831  }
3832 
3833  RetOps[0] = Chain; // Update chain.
3834 
3835  // Add the flag if we have it.
3836  if (Flag.getNode())
3837  RetOps.push_back(Flag);
3838 
3839  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3840 }
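// Illustration of the i1 return handling above (sketch, assumed IR):
//   define i1 @flag(i32 %x) { ... ret i1 %b }
// The CCValAssign::Full case truncates %b to i1 and zero-extends it to the
// location type, so w0 leaves the function holding 0 or 1 with the bits
// above bit 0 cleared, which is what AAPCS requires of the producer of an
// i1 value.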
3841 
3842 //===----------------------------------------------------------------------===//
3843 // Other Lowering Code
3844 //===----------------------------------------------------------------------===//
3845 
3846 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3847  SelectionDAG &DAG,
3848  unsigned Flag) const {
3849  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
3850  N->getOffset(), Flag);
3851 }
3852 
3853 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
3854  SelectionDAG &DAG,
3855  unsigned Flag) const {
3856  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
3857 }
3858 
3859 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
3860  SelectionDAG &DAG,
3861  unsigned Flag) const {
3862  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
3863  N->getOffset(), Flag);
3864 }
3865 
3866 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
3867  SelectionDAG &DAG,
3868  unsigned Flag) const {
3869  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
3870 }
3871 
3872 // (loadGOT sym)
3873 template <class NodeTy>
3874 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
3875  unsigned Flags) const {
3876  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
3877  SDLoc DL(N);
3878  EVT Ty = getPointerTy(DAG.getDataLayout());
3879  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
3880  // FIXME: Once remat is capable of dealing with instructions with register
3881  // operands, expand this into two nodes instead of using a wrapper node.
3882  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
3883 }
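// A sketch of the code the LOADgot wrapper typically becomes on an ELF
// small-code-model target (illustrative only; the exact relocations depend
// on the target and flags):
//   adrp x0, :got:var
//   ldr  x0, [x0, :got_lo12:var]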
3884 
3885 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
3886 template <class NodeTy>
3887 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
3888  unsigned Flags) const {
3889  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
3890  SDLoc DL(N);
3891  EVT Ty = getPointerTy(DAG.getDataLayout());
3892  const unsigned char MO_NC = AArch64II::MO_NC;
3893  return DAG.getNode(
3894  AArch64ISD::WrapperLarge, DL, Ty,
3895  getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
3896  getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
3897  getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
3898  getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
3899 }
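// A sketch of the selection result for WrapperLarge under the large code
// model (illustrative; register choice is arbitrary):
//   movz x0, #:abs_g3:sym
//   movk x0, #:abs_g2_nc:sym
//   movk x0, #:abs_g1_nc:sym
//   movk x0, #:abs_g0_nc:sym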
3900 
3901 // (addlow (adrp %hi(sym)) %lo(sym))
3902 template <class NodeTy>
3903 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
3904  unsigned Flags) const {
3905  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
3906  SDLoc DL(N);
3907  EVT Ty = getPointerTy(DAG.getDataLayout());
3908  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
3909  SDValue Lo = getTargetNode(N, Ty, DAG,
3910  AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
3911  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
3912  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
3913 }
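// A sketch of the small-code-model result of the ADRP/ADDlow pair above
// (illustrative; register choice is arbitrary):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym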
3914 
3915 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3916  SelectionDAG &DAG) const {
3917  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3918  const GlobalValue *GV = GN->getGlobal();
3919  const AArch64II::TOF TargetFlags =
3920  (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
3921  : AArch64II::MO_NO_FLAG);
3922  unsigned char OpFlags =
3923  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3924 
3925  if (OpFlags != AArch64II::MO_NO_FLAG)
3926  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3927  "unexpected offset in global node");
3928 
3929  // This also catches the large code model case for Darwin.
3930  if ((OpFlags & AArch64II::MO_GOT) != 0) {
3931  return getGOT(GN, DAG, TargetFlags);
3932  }
3933 
3934  SDValue Result;
3935  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3936  Result = getAddrLarge(GN, DAG, TargetFlags);
3937  } else {
3938  Result = getAddr(GN, DAG, TargetFlags);
3939  }
3940  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3941  SDLoc DL(GN);
3942  if (GV->hasDLLImportStorageClass())
3943  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3944  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3945  return Result;
3946 }
3947 
3948 /// Convert a TLS address reference into the correct sequence of loads
3949 /// and calls to compute the variable's address (for Darwin, currently) and
3950 /// return an SDValue containing the final node.
3951 
3952 /// Darwin only has one TLS scheme which must be capable of dealing with the
3953 /// fully general situation, in the worst case. This means:
3954 /// + "extern __thread" declaration.
3955 /// + Defined in a possibly unknown dynamic library.
3956 ///
3957 /// The general system is that each __thread variable has a [3 x i64] descriptor
3958 /// which contains information used by the runtime to calculate the address. The
3959 /// only part of this the compiler needs to know about is the first xword, which
3960 /// contains a function pointer that must be called with the address of the
3961 /// entire descriptor in "x0".
3962 ///
3963 /// Since this descriptor may be in a different unit, in general even the
3964 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
3965 /// is:
3966 /// adrp x0, _var@TLVPPAGE
3967 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
3968 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
3969 /// ; the function pointer
3970 /// blr x1 ; Uses descriptor address in x0
3971 /// ; Address of _var is now in x0.
3972 ///
3973 /// If the address of _var's descriptor *is* known to the linker, then it can
3974 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
3975 /// a slight efficiency gain.
3976 SDValue
3977 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
3978  SelectionDAG &DAG) const {
3979  assert(Subtarget->isTargetDarwin() &&
3980  "This function expects a Darwin target");
3981 
3982  SDLoc DL(Op);
3983  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3984  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3985 
3986  SDValue TLVPAddr =
3987  DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3988  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
3989 
3990  // The first entry in the descriptor is a function pointer that we must call
3991  // to obtain the address of the variable.
3992  SDValue Chain = DAG.getEntryNode();
3993  SDValue FuncTLVGet = DAG.getLoad(
3994  MVT::i64, DL, Chain, DescAddr,
3995  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3996  /* Alignment = */ 8,
3997  MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
3998  MachineMemOperand::MODereferenceable);
3999  Chain = FuncTLVGet.getValue(1);
4000 
4001  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
4002  MFI.setAdjustsStack(true);
4003 
4004  // TLS calls preserve all registers except those that absolutely must be
4005  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
4006  // silly).
4007  const uint32_t *Mask =
4008  Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
4009 
4010  // Finally, we can make the call. This is just a degenerate version of a
4011  // normal AArch64 call node: x0 takes the address of the descriptor, and
4012  // returns the address of the variable in this thread.
4013  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
4014  Chain =
4015  DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
4016  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
4017  DAG.getRegisterMask(Mask), Chain.getValue(1));
4018  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
4019 }
4020 
4021 /// When accessing thread-local variables under either the general-dynamic or
4022 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
4023 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
4024 /// is a function pointer to carry out the resolution.
4025 ///
4026 /// The sequence is:
4027 /// adrp x0, :tlsdesc:var
4028 /// ldr x1, [x0, #:tlsdesc_lo12:var]
4029 /// add x0, x0, #:tlsdesc_lo12:var
4030 /// .tlsdesccall var
4031 /// blr x1
4032 /// (TPIDR_EL0 offset now in x0)
4033 ///
4034 /// The above sequence must be produced unscheduled, to enable the linker to
4035 /// optimize/relax this sequence.
4036  /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
4037  /// whole sequence and is expanded very late in the compilation flow, ensuring
4038  /// the sequence is emitted exactly as shown above.
4039 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
4040  const SDLoc &DL,
4041  SelectionDAG &DAG) const {
4042  EVT PtrVT = getPointerTy(DAG.getDataLayout());
4043 
4044  SDValue Chain = DAG.getEntryNode();
4045  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4046 
4047  Chain =
4048  DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
4049  SDValue Glue = Chain.getValue(1);
4050 
4051  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
4052 }
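// The value copied out of x0 here is the variable's offset from TPIDR_EL0;
// LowerELFGlobalTLSAddress below adds it to the thread pointer to form the
// final address.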
4053 
4054 SDValue
4055 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
4056  SelectionDAG &DAG) const {
4057  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
4058  assert(Subtarget->useSmallAddressing() &&
4059  "ELF TLS only supported in small memory model");
4060  // Different choices can be made for the maximum size of the TLS area for a
4061  // module. For the small address model, the default TLS size is 16MiB and the
4062  // maximum TLS size is 4GiB.
4063  // FIXME: add -mtls-size command line option and make it control the 16MiB
4064  // vs. 4GiB code sequence generation.
4065  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
4066 
4067  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
4068 
4069  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
4070  if (Model == TLSModel::LocalDynamic)
4071  Model = TLSModel::GeneralDynamic;
4072  }
4073 
4074  SDValue TPOff;
4075  EVT PtrVT = getPointerTy(DAG.getDataLayout());
4076  SDLoc DL(Op);
4077  const GlobalValue *GV = GA->getGlobal();
4078 
4079  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
4080 
4081  if (Model == TLSModel::LocalExec) {
4082  SDValue HiVar = DAG.getTargetGlobalAddress(
4083  GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
4084  SDValue LoVar = DAG.getTargetGlobalAddress(
4085  GV, DL, PtrVT, 0,
4086  AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4087 
4088  SDValue TPWithOff_lo =
4089  SDValue(DAG.getMachineNode(