1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/PatternMatch.h"
63 #include "llvm/IR/Type.h"
64 #include "llvm/IR/Use.h"
65 #include "llvm/IR/Value.h"
66 #include "llvm/MC/MCRegisterInfo.h"
67 #include "llvm/Support/Casting.h"
68 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/Compiler.h"
71 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/KnownBits.h"
79 #include <algorithm>
80 #include <bitset>
81 #include <cassert>
82 #include <cctype>
83 #include <cstdint>
84 #include <cstdlib>
85 #include <iterator>
86 #include <limits>
87 #include <tuple>
88 #include <utility>
89 #include <vector>
90 
91 using namespace llvm;
92 using namespace llvm::PatternMatch;
93 
94 #define DEBUG_TYPE "aarch64-lower"
95 
96 STATISTIC(NumTailCalls, "Number of tail calls");
97 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
99 
100 static cl::opt<bool>
101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
102  cl::desc("Allow AArch64 SLI/SRI formation"),
103  cl::init(false));
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
108 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 /// Value type used for condition codes.
120 static const MVT MVT_CC = MVT::i32;
121 
122 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
123  const AArch64Subtarget &STI)
124  : TargetLowering(TM), Subtarget(&STI) {
125  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
126  // we have to make something up. Arbitrarily, choose ZeroOrOne.
128  // When comparing vectors the result sets the different elements in the
129  // vector to all-one or all-zero.
131 
132  // Set up the register classes.
133  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
134  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
135 
136  if (Subtarget->hasFPARMv8()) {
137  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
138  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
139  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
140  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
141  }
142 
143  if (Subtarget->hasNEON()) {
144  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
145  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
146  // Someone set us up the NEON.
147  addDRTypeForNEON(MVT::v2f32);
148  addDRTypeForNEON(MVT::v8i8);
149  addDRTypeForNEON(MVT::v4i16);
150  addDRTypeForNEON(MVT::v2i32);
151  addDRTypeForNEON(MVT::v1i64);
152  addDRTypeForNEON(MVT::v1f64);
153  addDRTypeForNEON(MVT::v4f16);
154 
155  addQRTypeForNEON(MVT::v4f32);
156  addQRTypeForNEON(MVT::v2f64);
157  addQRTypeForNEON(MVT::v16i8);
158  addQRTypeForNEON(MVT::v8i16);
159  addQRTypeForNEON(MVT::v4i32);
160  addQRTypeForNEON(MVT::v2i64);
161  addQRTypeForNEON(MVT::v8f16);
162  }
163 
164  // Compute derived properties from the register classes
166 
167  // Provide all sorts of operation actions
195 
199 
203 
205 
206  // Custom lowering hooks are needed for XOR
207  // to fold it into CSINC/CSINV.
210 
211  // Virtually no operation on f128 is legal, but LLVM can't expand them when
212  // there's a valid register class, so we need custom operations in most cases.
234 
235  // Lowering for many of the conversions is actually specified by the non-f128
236  // type. The LowerXXX function will be trivial when f128 isn't involved.
251 
252  // Variable arguments.
257 
258  // Variable-sized objects.
261 
262  if (Subtarget->isTargetWindows())
264  else
266 
267  // Constant pool entries
269 
270  // BlockAddress
272 
273  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
282 
283  // AArch64 lacks both left-rotate and popcount instructions.
286  for (MVT VT : MVT::vector_valuetypes()) {
289  }
290 
291  // AArch64 doesn't have {U|S}MUL_LOHI.
294 
297 
300  for (MVT VT : MVT::vector_valuetypes()) {
303  }
310 
311  // Custom lower Add/Sub/Mul with overflow.
324 
333  if (Subtarget->hasFullFP16())
335  else
337 
371 
372  if (!Subtarget->hasFullFP16()) {
395 
396  // promote v4f16 to v4f32 when that is known to be safe.
409 
425 
446  }
447 
448  // AArch64 has implementations of a lot of rounding-like FP operations.
449  for (MVT Ty : {MVT::f32, MVT::f64}) {
460  }
461 
462  if (Subtarget->hasFullFP16()) {
473  }
474 
476 
478 
484 
485  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
486  // This requires the Performance Monitors extension.
487  if (Subtarget->hasPerfMon())
489 
490  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
491  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
492  // Issue __sincos_stret if available.
495  } else {
498  }
499 
500  // Make floating-point constants legal for the large code model, so they don't
501  // become loads from the constant pool.
502  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
505  }
506 
507  // AArch64 does not have floating-point extending loads, i1 sign-extending
508  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
509  for (MVT VT : MVT::fp_valuetypes()) {
514  }
515  for (MVT VT : MVT::integer_valuetypes())
517 
525 
528 
529  // Indexed loads and stores are supported.
530  for (unsigned im = (unsigned)ISD::PRE_INC;
546  }
547 
548  // Trap.
550 
551  // We combine OR nodes for bitfield operations.
553  // Try to create BICs for vector ANDs.
555 
556  // Vector add and sub nodes may conceal a high-half opportunity.
557  // Also, try to fold ADD into CSINC/CSINV..
564 
568 
570 
577  if (Subtarget->supportsAddressTopByteIgnored())
579 
581 
584 
588 
590 
591  // In case of strict alignment, avoid an excessive number of byte wide stores.
595 
600 
602 
604 
606 
607  EnableExtLdPromotion = true;
608 
609  // Set required alignment.
611  // Set preferred alignments.
614 
615  // Only change the limit for entries in a jump table if specified by
616  // the sub target, but not at the command line.
617  unsigned MaxJT = STI.getMaximumJumpTableSize();
618  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
620 
621  setHasExtractBitsInsn(true);
622 
624 
625  if (Subtarget->hasNEON()) {
626  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
627  // silliness like this:
653 
659 
661 
662  // AArch64 doesn't have a direct vector ->f32 conversion instructions for
663  // elements smaller than i32, so promote the input to i32 first.
666  // i8 vector elements also need promotion to i32 for v8i8
669  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
674  // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
675  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
678 
679  if (Subtarget->hasFullFP16()) {
684  } else {
685  // when AArch64 doesn't have fullfp16 support, promote the input
686  // to i32 first.
691  }
692 
695 
696  // AArch64 doesn't have MUL.2d:
698  // Custom handling for some quad-vector types to detect MULL.
702 
703  // Vector reductions
704  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
711  }
712  for (MVT VT : { MVT::v4f16, MVT::v2f32,
716  }
717 
720  // Likewise, narrowing and extending vector loads/stores aren't handled
721  // directly.
722  for (MVT VT : MVT::vector_valuetypes()) {
724 
725  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
728  } else {
731  }
734 
737 
738  for (MVT InnerVT : MVT::vector_valuetypes()) {
739  setTruncStoreAction(VT, InnerVT, Expand);
740  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
741  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
742  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
743  }
744  }
745 
746  // AArch64 has implementations of a lot of rounding-like FP operations.
747  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
754  }
755 
756  if (Subtarget->hasFullFP16()) {
757  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
764  }
765  }
766 
768  }
769 
771 }
772 
773 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
774  assert(VT.isVector() && "VT should be a vector type");
775 
776  if (VT.isFloatingPoint()) {
778  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
779  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
780  }
781 
782  // Mark vector float intrinsics as expand.
783  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
792 
793  // But we do support custom-lowering for FCOPYSIGN.
795  }
796 
808 
812  for (MVT InnerVT : MVT::all_valuetypes())
813  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
814 
815  // CNT supports only B element sizes, then use UADDLP to widen.
816  if (VT != MVT::v8i8 && VT != MVT::v16i8)
818 
824 
827 
828  if (!VT.isFloatingPoint())
830 
831  // [SU][MIN|MAX] are available for all NEON types apart from i64.
832  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
833  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
834  setOperationAction(Opcode, VT, Legal);
835 
836  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
837  if (VT.isFloatingPoint() &&
838  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
839  for (unsigned Opcode :
841  setOperationAction(Opcode, VT, Legal);
842 
843  if (Subtarget->isLittleEndian()) {
844  for (unsigned im = (unsigned)ISD::PRE_INC;
848  }
849  }
850 }
851 
852 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
853  addRegisterClass(VT, &AArch64::FPR64RegClass);
854  addTypeForNEON(VT, MVT::v2i32);
855 }
856 
857 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
858  addRegisterClass(VT, &AArch64::FPR128RegClass);
859  addTypeForNEON(VT, MVT::v4i32);
860 }
861 
862 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
863  EVT VT) const {
864  if (!VT.isVector())
865  return MVT::i32;
866  return VT.changeVectorElementTypeToInteger();
867 }
868 
869 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
870  const APInt &Demanded,
871  TargetLowering::TargetLoweringOpt &TLO,
872  unsigned NewOpc) {
873  uint64_t OldImm = Imm, NewImm, Enc;
874  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
875 
876  // Return if the immediate is already all zeros, all ones, a bimm32 or a
877  // bimm64.
878  if (Imm == 0 || Imm == Mask ||
879  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
880  return false;
881 
882  unsigned EltSize = Size;
883  uint64_t DemandedBits = Demanded.getZExtValue();
884 
885  // Clear bits that are not demanded.
886  Imm &= DemandedBits;
887 
888  while (true) {
889  // The goal here is to set the non-demanded bits in a way that minimizes
890  // the number of switching between 0 and 1. In order to achieve this goal,
891  // we set the non-demanded bits to the value of the preceding demanded bits.
892  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
893  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
894  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
895  // The final result is 0b11000011.
896  uint64_t NonDemandedBits = ~DemandedBits;
897  uint64_t InvertedImm = ~Imm & DemandedBits;
898  uint64_t RotatedImm =
899  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
900  NonDemandedBits;
901  uint64_t Sum = RotatedImm + NonDemandedBits;
902  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
903  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
904  NewImm = (Imm | Ones) & Mask;
905 
906  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
907  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
908  // we halve the element size and continue the search.
909  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
910  break;
911 
912  // We cannot shrink the element size any further if it is 2-bits.
913  if (EltSize == 2)
914  return false;
915 
916  EltSize /= 2;
917  Mask >>= EltSize;
918  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
919 
920  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
921  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
922  return false;
923 
924  // Merge the upper and lower halves of Imm and DemandedBits.
925  Imm |= Hi;
926  DemandedBits |= DemandedBitsHi;
927  }
928 
929  ++NumOptimizedImms;
930 
931  // Replicate the element across the register width.
932  while (EltSize < Size) {
933  NewImm |= NewImm << EltSize;
934  EltSize *= 2;
935  }
936 
937  (void)OldImm;
938  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
939  "demanded bits should never be altered");
940  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
941 
942  // Create the new constant immediate node.
943  EVT VT = Op.getValueType();
944  SDLoc DL(Op);
945  SDValue New;
946 
947  // If the new constant immediate is all-zeros or all-ones, let the target
948  // independent DAG combine optimize this node.
949  if (NewImm == 0 || NewImm == OrigMask) {
950  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
951  TLO.DAG.getConstant(NewImm, DL, VT));
952  // Otherwise, create a machine node so that target independent DAG combine
953  // doesn't undo this optimization.
954  } else {
955  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
956  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
957  New = SDValue(
958  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
959  }
960 
961  return TLO.CombineTo(Op, New);
962 }
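// A standalone sketch of the non-demanded-bit fill used above, worked on the
// 0bx10xx0x1 example from the loop comment (hypothetical helper, not taken
// from the LLVM sources; a single element of EltSize bits, no recursion into
// smaller element sizes).
static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                    unsigned EltSize) {
  uint64_t Mask = (EltSize == 64) ? ~0ULL : ((1ULL << EltSize) - 1);
  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  uint64_t InvertedImm = ~Imm & DemandedBits;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  return (Imm | Ones) & Mask;
}
// For example, fillNonDemandedBits(0x41 /*0b01000001*/, 0x65 /*0b01100101*/, 8)
// yields 0xC3 (0b11000011), matching the worked example in the comment above.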
963 
964 bool AArch64TargetLowering::targetShrinkDemandedConstant(
965  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
966  // Delay this optimization to as late as possible.
967  if (!TLO.LegalOps)
968  return false;
969 
970  if (!EnableOptimizeLogicalImm)
971  return false;
972 
973  EVT VT = Op.getValueType();
974  if (VT.isVector())
975  return false;
976 
977  unsigned Size = VT.getSizeInBits();
978  assert((Size == 32 || Size == 64) &&
979  "i32 or i64 is expected after legalization.");
980 
981  // Exit early if we demand all bits.
982  if (Demanded.countPopulation() == Size)
983  return false;
984 
985  unsigned NewOpc;
986  switch (Op.getOpcode()) {
987  default:
988  return false;
989  case ISD::AND:
990  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
991  break;
992  case ISD::OR:
993  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
994  break;
995  case ISD::XOR:
996  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
997  break;
998  }
999  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1000  if (!C)
1001  return false;
1002  uint64_t Imm = C->getZExtValue();
1003  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1004 }
1005 
1006 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1007 /// Mask are known to be either zero or one and return them in Known.
1008 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1009  const SDValue Op, KnownBits &Known,
1010  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1011  switch (Op.getOpcode()) {
1012  default:
1013  break;
1014  case AArch64ISD::CSEL: {
1015  KnownBits Known2;
1016  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1017  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1018  Known.Zero &= Known2.Zero;
1019  Known.One &= Known2.One;
1020  break;
1021  }
1022  case ISD::INTRINSIC_W_CHAIN: {
1023  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1024  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1025  switch (IntID) {
1026  default: return;
1027  case Intrinsic::aarch64_ldaxr:
1028  case Intrinsic::aarch64_ldxr: {
1029  unsigned BitWidth = Known.getBitWidth();
1030  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1031  unsigned MemBits = VT.getScalarSizeInBits();
1032  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1033  return;
1034  }
1035  }
1036  break;
1037  }
1039  case ISD::INTRINSIC_VOID: {
1040  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1041  switch (IntNo) {
1042  default:
1043  break;
1044  case Intrinsic::aarch64_neon_umaxv:
1045  case Intrinsic::aarch64_neon_uminv: {
1046  // Figure out the datatype of the vector operand. The UMINV instruction
1047  // will zero extend the result, so we can mark as known zero all the
1048  // bits larger than the element datatype. 32-bit or larger doesn't need
1049  // this as those are legal types and will be handled by isel directly.
1050  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1051  unsigned BitWidth = Known.getBitWidth();
1052  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1053  assert(BitWidth >= 8 && "Unexpected width!");
1054  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1055  Known.Zero |= Mask;
1056  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1057  assert(BitWidth >= 16 && "Unexpected width!");
1058  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1059  Known.Zero |= Mask;
1060  }
1061  break;
1062  } break;
1063  }
1064  }
1065  }
1066 }
1067 
1068 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1069  EVT) const {
1070  return MVT::i64;
1071 }
1072 
1073 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1074  unsigned AddrSpace,
1075  unsigned Align,
1076  bool *Fast) const {
1077  if (Subtarget->requiresStrictAlign())
1078  return false;
1079 
1080  if (Fast) {
1081  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1082  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1083  // See comments in performSTORECombine() for more details about
1084  // these conditions.
1085 
1086  // Code that uses clang vector extensions can mark that it
1087  // wants unaligned accesses to be treated as fast by
1088  // underspecifying alignment to be 1 or 2.
1089  Align <= 2 ||
1090 
1091  // Disregard v2i64. Memcpy lowering produces those and splitting
1092  // them regresses performance on micro-benchmarks and olden/bh.
1093  VT == MVT::v2i64;
1094  }
1095  return true;
1096 }
1097 
1098 FastISel *
1099 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1100  const TargetLibraryInfo *libInfo) const {
1101  return AArch64::createFastISel(funcInfo, libInfo);
1102 }
1103 
1104 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1105  switch ((AArch64ISD::NodeType)Opcode) {
1106  case AArch64ISD::FIRST_NUMBER: break;
1107  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1108  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1109  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1110  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1111  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1112  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1113  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1114  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1115  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1116  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1117  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1118  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1119  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1120  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1121  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1122  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1123  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1124  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1125  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1126  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1127  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1128  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1129  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1130  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1131  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1132  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1133  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1134  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1135  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1136  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1137  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1138  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1139  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1140  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1141  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1142  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1143  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1144  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1145  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1146  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1147  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1148  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1149  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1150  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1151  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1152  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1153  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1154  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1155  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1156  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1157  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1158  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1159  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1160  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1161  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1162  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1163  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1164  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1165  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1166  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1167  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1168  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1169  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1170  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1171  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1172  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1173  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1174  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1175  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1176  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1177  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1178  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1179  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1180  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1181  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1182  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1183  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1184  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1185  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1186  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1187  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1188  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1189  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1190  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1191  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1192  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1193  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1194  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1195  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1196  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1197  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1198  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1199  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1200  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1201  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1202  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1203  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1204  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1205  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1206  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1207  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1208  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1209  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1210  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1211  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1212  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1213  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1214  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1215  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1216  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1217  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1218  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1219  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1220  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1221  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1222  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1223  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1224  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1225  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1226  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1227  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1228  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1229  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1230  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1231  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1232  }
1233  return nullptr;
1234 }
1235 
1236 MachineBasicBlock *
1237 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1238  MachineBasicBlock *MBB) const {
1239  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1240  // phi node:
1241 
1242  // OrigBB:
1243  // [... previous instrs leading to comparison ...]
1244  // b.ne TrueBB
1245  // b EndBB
1246  // TrueBB:
1247  // ; Fallthrough
1248  // EndBB:
1249  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1250 
1251  MachineFunction *MF = MBB->getParent();
1252  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1253  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1254  DebugLoc DL = MI.getDebugLoc();
1255  MachineFunction::iterator It = ++MBB->getIterator();
1256 
1257  unsigned DestReg = MI.getOperand(0).getReg();
1258  unsigned IfTrueReg = MI.getOperand(1).getReg();
1259  unsigned IfFalseReg = MI.getOperand(2).getReg();
1260  unsigned CondCode = MI.getOperand(3).getImm();
1261  bool NZCVKilled = MI.getOperand(4).isKill();
1262 
1263  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1264  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1265  MF->insert(It, TrueBB);
1266  MF->insert(It, EndBB);
1267 
1268  // Transfer rest of current basic-block to EndBB
1269  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1270  MBB->end());
1271  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1272 
1273  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1274  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1275  MBB->addSuccessor(TrueBB);
1276  MBB->addSuccessor(EndBB);
1277 
1278  // TrueBB falls through to the end.
1279  TrueBB->addSuccessor(EndBB);
1280 
1281  if (!NZCVKilled) {
1282  TrueBB->addLiveIn(AArch64::NZCV);
1283  EndBB->addLiveIn(AArch64::NZCV);
1284  }
1285 
1286  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1287  .addReg(IfTrueReg)
1288  .addMBB(TrueBB)
1289  .addReg(IfFalseReg)
1290  .addMBB(MBB);
1291 
1292  MI.eraseFromParent();
1293  return EndBB;
1294 }
1295 
1296 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
1297  MachineInstr &MI, MachineBasicBlock *BB) const {
1298  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
1299  BB->getParent()->getFunction().getPersonalityFn())) &&
1300  "SEH does not use catchret!");
1301  return BB;
1302 }
1303 
1304 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
1305  MachineInstr &MI, MachineBasicBlock *BB) const {
1306  MI.eraseFromParent();
1307  return BB;
1308 }
1309 
1310 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1311  MachineInstr &MI, MachineBasicBlock *BB) const {
1312  switch (MI.getOpcode()) {
1313  default:
1314 #ifndef NDEBUG
1315  MI.dump();
1316 #endif
1317  llvm_unreachable("Unexpected instruction for custom inserter!");
1318 
1319  case AArch64::F128CSEL:
1320  return EmitF128CSEL(MI, BB);
1321 
1322  case TargetOpcode::STACKMAP:
1323  case TargetOpcode::PATCHPOINT:
1324  return emitPatchPoint(MI, BB);
1325 
1326  case AArch64::CATCHRET:
1327  return EmitLoweredCatchRet(MI, BB);
1328  case AArch64::CATCHPAD:
1329  return EmitLoweredCatchPad(MI, BB);
1330  }
1331 }
1332 
1333 //===----------------------------------------------------------------------===//
1334 // AArch64 Lowering private implementation.
1335 //===----------------------------------------------------------------------===//
1336 
1337 //===----------------------------------------------------------------------===//
1338 // Lowering Code
1339 //===----------------------------------------------------------------------===//
1340 
1341 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1342 /// CC
1343 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1344  switch (CC) {
1345  default:
1346  llvm_unreachable("Unknown condition code!");
1347  case ISD::SETNE:
1348  return AArch64CC::NE;
1349  case ISD::SETEQ:
1350  return AArch64CC::EQ;
1351  case ISD::SETGT:
1352  return AArch64CC::GT;
1353  case ISD::SETGE:
1354  return AArch64CC::GE;
1355  case ISD::SETLT:
1356  return AArch64CC::LT;
1357  case ISD::SETLE:
1358  return AArch64CC::LE;
1359  case ISD::SETUGT:
1360  return AArch64CC::HI;
1361  case ISD::SETUGE:
1362  return AArch64CC::HS;
1363  case ISD::SETULT:
1364  return AArch64CC::LO;
1365  case ISD::SETULE:
1366  return AArch64CC::LS;
1367  }
1368 }
1369 
1370 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1371 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1372  AArch64CC::CondCode &CondCode,
1373  AArch64CC::CondCode &CondCode2) {
1374  CondCode2 = AArch64CC::AL;
1375  switch (CC) {
1376  default:
1377  llvm_unreachable("Unknown FP condition!");
1378  case ISD::SETEQ:
1379  case ISD::SETOEQ:
1380  CondCode = AArch64CC::EQ;
1381  break;
1382  case ISD::SETGT:
1383  case ISD::SETOGT:
1384  CondCode = AArch64CC::GT;
1385  break;
1386  case ISD::SETGE:
1387  case ISD::SETOGE:
1388  CondCode = AArch64CC::GE;
1389  break;
1390  case ISD::SETOLT:
1391  CondCode = AArch64CC::MI;
1392  break;
1393  case ISD::SETOLE:
1394  CondCode = AArch64CC::LS;
1395  break;
1396  case ISD::SETONE:
1397  CondCode = AArch64CC::MI;
1398  CondCode2 = AArch64CC::GT;
1399  break;
1400  case ISD::SETO:
1401  CondCode = AArch64CC::VC;
1402  break;
1403  case ISD::SETUO:
1404  CondCode = AArch64CC::VS;
1405  break;
1406  case ISD::SETUEQ:
1407  CondCode = AArch64CC::EQ;
1408  CondCode2 = AArch64CC::VS;
1409  break;
1410  case ISD::SETUGT:
1411  CondCode = AArch64CC::HI;
1412  break;
1413  case ISD::SETUGE:
1414  CondCode = AArch64CC::PL;
1415  break;
1416  case ISD::SETLT:
1417  case ISD::SETULT:
1418  CondCode = AArch64CC::LT;
1419  break;
1420  case ISD::SETLE:
1421  case ISD::SETULE:
1422  CondCode = AArch64CC::LE;
1423  break;
1424  case ISD::SETNE:
1425  case ISD::SETUNE:
1426  CondCode = AArch64CC::NE;
1427  break;
1428  }
1429 }
1430 
1431 /// Convert a DAG fp condition code to an AArch64 CC.
1432 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1433 /// should be AND'ed instead of OR'ed.
1434 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1435  AArch64CC::CondCode &CondCode,
1436  AArch64CC::CondCode &CondCode2) {
1437  CondCode2 = AArch64CC::AL;
1438  switch (CC) {
1439  default:
1440  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1441  assert(CondCode2 == AArch64CC::AL);
1442  break;
1443  case ISD::SETONE:
1444  // (a one b)
1445  // == ((a olt b) || (a ogt b))
1446  // == ((a ord b) && (a une b))
1447  CondCode = AArch64CC::VC;
1448  CondCode2 = AArch64CC::NE;
1449  break;
1450  case ISD::SETUEQ:
1451  // (a ueq b)
1452  // == ((a uno b) || (a oeq b))
1453  // == ((a ule b) && (a uge b))
1454  CondCode = AArch64CC::PL;
1455  CondCode2 = AArch64CC::LE;
1456  break;
1457  }
1458 }
1459 
1460 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1461 /// CC usable with the vector instructions. Fewer operations are available
1462 /// without a real NZCV register, so we have to use less efficient combinations
1463 /// to get the same effect.
1464 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1465  AArch64CC::CondCode &CondCode,
1466  AArch64CC::CondCode &CondCode2,
1467  bool &Invert) {
1468  Invert = false;
1469  switch (CC) {
1470  default:
1471  // Mostly the scalar mappings work fine.
1472  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1473  break;
1474  case ISD::SETUO:
1475  Invert = true;
1476  LLVM_FALLTHROUGH;
1477  case ISD::SETO:
1478  CondCode = AArch64CC::MI;
1479  CondCode2 = AArch64CC::GE;
1480  break;
1481  case ISD::SETUEQ:
1482  case ISD::SETULT:
1483  case ISD::SETULE:
1484  case ISD::SETUGT:
1485  case ISD::SETUGE:
1486  // All of the compare-mask comparisons are ordered, but we can switch
1487  // between the two by a double inversion. E.g. ULE == !OGT.
1488  Invert = true;
1489  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1490  break;
1491  }
1492 }
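// For example, (a ult b) is handled as NOT (a oge b): the ordered compare mask
// is built with condition GE and then inverted via the Invert flag above,
// since there is no unordered compare-mask instruction.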
1493 
1494 static bool isLegalArithImmed(uint64_t C) {
1495  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1496  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1497  LLVM_DEBUG(dbgs() << "Is imm " << C
1498  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1499  return IsLegal;
1500 }
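// A few concrete data points for the check above (illustrative values):
//   isLegalArithImmed(0xFFF)    == true   // plain 12-bit immediate
//   isLegalArithImmed(0xABC000) == true   // 12-bit immediate, LSL #12
//   isLegalArithImmed(0x1001)   == false  // needs 13 bits and is not shifted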
1501 
1502 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1503 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1504 // can be set differently by this operation. It comes down to whether
1505 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1506 // everything is fine. If not then the optimization is wrong. Thus general
1507 // comparisons are only valid if op2 != 0.
1508 //
1509 // So, finally, the only LLVM-native comparisons that don't mention C and V
1510 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1511 // the absence of information about op2.
1512 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1513  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1514  (CC == ISD::SETEQ || CC == ISD::SETNE);
1515 }
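// Concretely: "cmp w0, #0" always sets the carry flag, while "cmn w0, #0"
// (w0 + 0) never does, so CMN can only stand in for CMP when just the Z flag
// (SETEQ/SETNE) is consumed, which is exactly what the check above enforces.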
1516 
1517 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1518  const SDLoc &dl, SelectionDAG &DAG) {
1519  EVT VT = LHS.getValueType();
1520  const bool FullFP16 =
1521  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1522 
1523  if (VT.isFloatingPoint()) {
1524  assert(VT != MVT::f128);
1525  if (VT == MVT::f16 && !FullFP16) {
1526  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1527  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1528  VT = MVT::f32;
1529  }
1530  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1531  }
1532 
1533  // The CMP instruction is just an alias for SUBS, and representing it as
1534  // SUBS means that it's possible to get CSE with subtract operations.
1535  // A later phase can perform the optimization of setting the destination
1536  // register to WZR/XZR if it ends up being unused.
1537  unsigned Opcode = AArch64ISD::SUBS;
1538 
1539  if (isCMN(RHS, CC)) {
1540  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1541  Opcode = AArch64ISD::ADDS;
1542  RHS = RHS.getOperand(1);
1543  } else if (isCMN(LHS, CC)) {
1544  // As we are looking for EQ/NE compares, the operands can be commuted; can
1545  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1546  Opcode = AArch64ISD::ADDS;
1547  LHS = LHS.getOperand(1);
1548  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1549  !isUnsignedIntSetCC(CC)) {
1550  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1551  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1552  // of the signed comparisons.
1553  Opcode = AArch64ISD::ANDS;
1554  RHS = LHS.getOperand(1);
1555  LHS = LHS.getOperand(0);
1556  }
1557 
1558  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1559  .getValue(1);
1560 }
1561 
1562 /// \defgroup AArch64CCMP CMP;CCMP matching
1563 ///
1564 /// These functions deal with the formation of CMP;CCMP;... sequences.
1565 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1566 /// a comparison. They set the NZCV flags to a predefined value if their
1567 /// predicate is false. This allows us to express arbitrary conjunctions, for
1568 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1569 /// expressed as:
1570 /// cmp A
1571 /// ccmp B, inv(CB), CA
1572 /// check for CB flags
1573 ///
1574 /// This naturally lets us implement chains of AND operations with SETCC
1575 /// operands. And we can even implement some other situations by transforming
1576 /// them:
1577 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1578 /// negating the flags used in a CCMP/FCCMP operations.
1579 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1580 /// by negating the flags we test for afterwards. i.e.
1581 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1582 /// - Note that we can only ever negate all previously processed results.
1583 /// What we can not implement by flipping the flags to test is a negation
1584 /// of two sub-trees (because the negation affects all sub-trees emitted so
1585 /// far, so the 2nd sub-tree we emit would also affect the first).
1586 /// With those tools we can implement some OR operations:
1587 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1588 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1589 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1590 /// elimination rules from earlier to implement the whole thing as a
1591 /// CCMP/FCCMP chain.
1592 ///
1593 /// As complete example:
1594 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1595 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1596 /// can be reassociated to:
1597 /// or (and (setCC (cmp C)) setCD (cmp D))
1598 // (or (setCA (cmp A)) (setCB (cmp B)))
1599 /// can be transformed to:
1600 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1601 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1602 /// which can be implemented as:
1603 /// cmp C
1604 /// ccmp D, inv(CD), CC
1605 /// ccmp A, CA, inv(CD)
1606 /// ccmp B, CB, inv(CA)
1607 /// check for CB flags
1608 ///
1609 /// A counterexample is "or (and A B) (and C D)" which translates to
1610 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1611 /// can only implement 1 of the inner (not) operations, but not both!
1612 /// @{
1613 
1614 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1615 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1616  ISD::CondCode CC, SDValue CCOp,
1617  AArch64CC::CondCode Predicate,
1618  AArch64CC::CondCode OutCC,
1619  const SDLoc &DL, SelectionDAG &DAG) {
1620  unsigned Opcode = 0;
1621  const bool FullFP16 =
1622  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1623 
1624  if (LHS.getValueType().isFloatingPoint()) {
1625  assert(LHS.getValueType() != MVT::f128);
1626  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1627  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1628  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1629  }
1630  Opcode = AArch64ISD::FCCMP;
1631  } else if (RHS.getOpcode() == ISD::SUB) {
1632  SDValue SubOp0 = RHS.getOperand(0);
1633  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1634  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1635  Opcode = AArch64ISD::CCMN;
1636  RHS = RHS.getOperand(1);
1637  }
1638  }
1639  if (Opcode == 0)
1640  Opcode = AArch64ISD::CCMP;
1641 
1642  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1643  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1644  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1645  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1646  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1647 }
1648 
1649 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1650 /// expressed as a conjunction. See \ref AArch64CCMP.
1651 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1652 /// changing the conditions on the SETCC tests.
1653 /// (this means we can call emitConjunctionRec() with
1654 /// Negate==true on this sub-tree)
1655 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1656 /// cannot do the negation naturally. We are required to
1657 /// emit the subtree first in this case.
1658 /// \param WillNegate Is true if we are called when the result of this
1659 /// subexpression must be negated. This happens when the
1660 /// outer expression is an OR. We can use this fact to know
1661 /// that we have a double negation (or (or ...) ...) that
1662 /// can be implemented for free.
1663 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1664  bool &MustBeFirst, bool WillNegate,
1665  unsigned Depth = 0) {
1666  if (!Val.hasOneUse())
1667  return false;
1668  unsigned Opcode = Val->getOpcode();
1669  if (Opcode == ISD::SETCC) {
1670  if (Val->getOperand(0).getValueType() == MVT::f128)
1671  return false;
1672  CanNegate = true;
1673  MustBeFirst = false;
1674  return true;
1675  }
1676  // Protect against exponential runtime and stack overflow.
1677  if (Depth > 6)
1678  return false;
1679  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1680  bool IsOR = Opcode == ISD::OR;
1681  SDValue O0 = Val->getOperand(0);
1682  SDValue O1 = Val->getOperand(1);
1683  bool CanNegateL;
1684  bool MustBeFirstL;
1685  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1686  return false;
1687  bool CanNegateR;
1688  bool MustBeFirstR;
1689  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1690  return false;
1691 
1692  if (MustBeFirstL && MustBeFirstR)
1693  return false;
1694 
1695  if (IsOR) {
1696  // For an OR expression we need to be able to naturally negate at least
1697  // one side or we cannot do the transformation at all.
1698  if (!CanNegateL && !CanNegateR)
1699  return false;
1700  // If the result of the OR will be negated and we can naturally negate
1701  // the leaves, then this sub-tree as a whole negates naturally.
1702  CanNegate = WillNegate && CanNegateL && CanNegateR;
1703  // If we cannot naturally negate the whole sub-tree, then this must be
1704  // emitted first.
1705  MustBeFirst = !CanNegate;
1706  } else {
1707  assert(Opcode == ISD::AND && "Must be OR or AND");
1708  // We cannot naturally negate an AND operation.
1709  CanNegate = false;
1710  MustBeFirst = MustBeFirstL || MustBeFirstR;
1711  }
1712  return true;
1713  }
1714  return false;
1715 }
1716 
1717 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1718 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
1719 /// Tries to transform the given i1 producing node @p Val to a series of
1720 /// compare and conditional compare operations. @returns an NZCV flags
1721 /// producing node and sets @p OutCC to the flags that should be tested, or
1722 /// returns SDValue() if the transformation was not possible.
1723 /// \p Negate is true if we want this sub-tree to be negated just by changing
1724 /// SETCC conditions.
1725 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
1726  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1727  AArch64CC::CondCode Predicate) {
1728  // We're at a tree leaf, produce a conditional comparison operation.
1729  unsigned Opcode = Val->getOpcode();
1730  if (Opcode == ISD::SETCC) {
1731  SDValue LHS = Val->getOperand(0);
1732  SDValue RHS = Val->getOperand(1);
1733  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1734  bool isInteger = LHS.getValueType().isInteger();
1735  if (Negate)
1736  CC = getSetCCInverse(CC, isInteger);
1737  SDLoc DL(Val);
1738  // Determine OutCC and handle FP special case.
1739  if (isInteger) {
1740  OutCC = changeIntCCToAArch64CC(CC);
1741  } else {
1743  AArch64CC::CondCode ExtraCC;
1744  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1745  // Some floating point conditions can't be tested with a single condition
1746  // code. Construct an additional comparison in this case.
1747  if (ExtraCC != AArch64CC::AL) {
1748  SDValue ExtraCmp;
1749  if (!CCOp.getNode())
1750  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1751  else
1752  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1753  ExtraCC, DL, DAG);
1754  CCOp = ExtraCmp;
1755  Predicate = ExtraCC;
1756  }
1757  }
1758 
1759  // Produce a normal comparison if we are first in the chain
1760  if (!CCOp)
1761  return emitComparison(LHS, RHS, CC, DL, DAG);
1762  // Otherwise produce a ccmp.
1763  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1764  DAG);
1765  }
1766  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1767 
1768  bool IsOR = Opcode == ISD::OR;
1769 
1770  SDValue LHS = Val->getOperand(0);
1771  bool CanNegateL;
1772  bool MustBeFirstL;
1773  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1774  assert(ValidL && "Valid conjunction/disjunction tree");
1775  (void)ValidL;
1776 
1777  SDValue RHS = Val->getOperand(1);
1778  bool CanNegateR;
1779  bool MustBeFirstR;
1780  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1781  assert(ValidR && "Valid conjunction/disjunction tree");
1782  (void)ValidR;
1783 
1784  // Swap sub-tree that must come first to the right side.
1785  if (MustBeFirstL) {
1786  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1787  std::swap(LHS, RHS);
1788  std::swap(CanNegateL, CanNegateR);
1789  std::swap(MustBeFirstL, MustBeFirstR);
1790  }
1791 
1792  bool NegateR;
1793  bool NegateAfterR;
1794  bool NegateL;
1795  bool NegateAfterAll;
1796  if (Opcode == ISD::OR) {
1797  // Swap the sub-tree that we can negate naturally to the left.
1798  if (!CanNegateL) {
1799  assert(CanNegateR && "at least one side must be negatable");
1800  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1801  assert(!Negate);
1802  std::swap(LHS, RHS);
1803  NegateR = false;
1804  NegateAfterR = true;
1805  } else {
1806  // Negate the left sub-tree if possible, otherwise negate the result.
1807  NegateR = CanNegateR;
1808  NegateAfterR = !CanNegateR;
1809  }
1810  NegateL = true;
1811  NegateAfterAll = !Negate;
1812  } else {
1813  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1814  assert(!Negate && "Valid conjunction/disjunction tree");
1815 
1816  NegateL = false;
1817  NegateR = false;
1818  NegateAfterR = false;
1819  NegateAfterAll = false;
1820  }
1821 
1822  // Emit sub-trees.
1823  AArch64CC::CondCode RHSCC;
1824  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1825  if (NegateAfterR)
1826  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1827  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1828  if (NegateAfterAll)
1829  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1830  return CmpL;
1831 }
1832 
1833 /// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
1834 /// In some cases this is even possible with OR operations in the expression.
1835 /// See \ref AArch64CCMP.
1836 /// \see emitConjunctionRec().
1837 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
1838  AArch64CC::CondCode &OutCC) {
1839  bool DummyCanNegate;
1840  bool DummyMustBeFirst;
1841  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1842  return SDValue();
1843 
1844  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1845 }
1846 
1847 /// @}
1848 
1849 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1850 /// extension operations.
1851 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
1852  auto isSupportedExtend = [&](SDValue V) {
1853  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1854  return true;
1855 
1856  if (V.getOpcode() == ISD::AND)
1857  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1858  uint64_t Mask = MaskCst->getZExtValue();
1859  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1860  }
1861 
1862  return false;
1863  };
1864 
1865  if (!Op.hasOneUse())
1866  return 0;
1867 
1868  if (isSupportedExtend(Op))
1869  return 1;
1870 
1871  unsigned Opc = Op.getOpcode();
1872  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1873  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1874  uint64_t Shift = ShiftCst->getZExtValue();
1875  if (isSupportedExtend(Op.getOperand(0)))
1876  return (Shift <= 4) ? 2 : 1;
1877  EVT VT = Op.getValueType();
1878  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1879  return 1;
1880  }
1881 
1882  return 0;
1883 }
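// For example (illustrative): (and x, 0xFF) scores 1, since the mask folds
// into the compare as a UXTB extend; (shl (and x, 0xFFFF), 3) scores 2, since
// both the UXTH extend and the left shift by 3 (<= 4) fold into the compare
// operand.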
1884 
1885 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1886  SDValue &AArch64cc, SelectionDAG &DAG,
1887  const SDLoc &dl) {
1888  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1889  EVT VT = RHS.getValueType();
1890  uint64_t C = RHSC->getZExtValue();
1891  if (!isLegalArithImmed(C)) {
1892  // Constant does not fit, try adjusting it by one?
1893  switch (CC) {
1894  default:
1895  break;
1896  case ISD::SETLT:
1897  case ISD::SETGE:
1898  if ((VT == MVT::i32 && C != 0x80000000 &&
1899  isLegalArithImmed((uint32_t)(C - 1))) ||
1900  (VT == MVT::i64 && C != 0x80000000ULL &&
1901  isLegalArithImmed(C - 1ULL))) {
1902  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1903  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1904  RHS = DAG.getConstant(C, dl, VT);
1905  }
1906  break;
1907  case ISD::SETULT:
1908  case ISD::SETUGE:
1909  if ((VT == MVT::i32 && C != 0 &&
1910  isLegalArithImmed((uint32_t)(C - 1))) ||
1911  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1912  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1913  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1914  RHS = DAG.getConstant(C, dl, VT);
1915  }
1916  break;
1917  case ISD::SETLE:
1918  case ISD::SETGT:
1919  if ((VT == MVT::i32 && C != INT32_MAX &&
1920  isLegalArithImmed((uint32_t)(C + 1))) ||
1921  (VT == MVT::i64 && C != INT64_MAX &&
1922  isLegalArithImmed(C + 1ULL))) {
1923  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1924  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1925  RHS = DAG.getConstant(C, dl, VT);
1926  }
1927  break;
1928  case ISD::SETULE:
1929  case ISD::SETUGT:
1930  if ((VT == MVT::i32 && C != UINT32_MAX &&
1931  isLegalArithImmed((uint32_t)(C + 1))) ||
1932  (VT == MVT::i64 && C != UINT64_MAX &&
1933  isLegalArithImmed(C + 1ULL))) {
1934  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1935  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1936  RHS = DAG.getConstant(C, dl, VT);
1937  }
1938  break;
1939  }
1940  }
1941  }
1942 
1943  // Comparisons are canonicalized so that the RHS operand is simpler than the
1944  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
1945  // can fold some shift+extend operations on the RHS operand, so swap the
1946  // operands if that can be done.
1947  //
1948  // For example:
1949  // lsl w13, w11, #1
1950  // cmp w13, w12
1951  // can be turned into:
1952  // cmp w12, w11, lsl #1
1953  if (!isa<ConstantSDNode>(RHS) ||
1954  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
1955  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
1956 
1957  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
1958  std::swap(LHS, RHS);
1959  CC = ISD::getSetCCSwappedOperands(CC);
1960  }
1961  }
1962 
1963  SDValue Cmp;
1964  AArch64CC::CondCode AArch64CC;
1965  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1966  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1967 
1968  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1969  // For the i8 operand, the largest immediate is 255, so this can be easily
1970  // encoded in the compare instruction. For the i16 operand, however, the
1971  // largest immediate cannot be encoded in the compare.
1972  // Therefore, use a sign extending load and cmn to avoid materializing the
1973  // -1 constant. For example,
1974  // movz w1, #65535
1975  // ldrh w0, [x0, #0]
1976  // cmp w0, w1
1977  // >
1978  // ldrsh w0, [x0, #0]
1979  // cmn w0, #1
1980  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1981  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1982  // ensure both the LHS and RHS are truly zero extended and to make sure the
1983  // transformation is profitable.
1984  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1985  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1986  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1987  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1988  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1989  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1990  SDValue SExt =
1991  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1992  DAG.getValueType(MVT::i16));
1993  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1994  RHS.getValueType()),
1995  CC, dl, DAG);
1996  AArch64CC = changeIntCCToAArch64CC(CC);
1997  }
1998  }
1999 
2000  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2001  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2002  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2003  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2004  }
2005  }
2006  }
2007 
2008  if (!Cmp) {
2009  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2010  AArch64CC = changeIntCCToAArch64CC(CC);
2011  }
2012  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2013  return Cmp;
2014 }
2015 
2016 static std::pair<SDValue, SDValue>
2017 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2018  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2019  "Unsupported value type");
2020  SDValue Value, Overflow;
2021  SDLoc DL(Op);
2022  SDValue LHS = Op.getOperand(0);
2023  SDValue RHS = Op.getOperand(1);
2024  unsigned Opc = 0;
2025  switch (Op.getOpcode()) {
2026  default:
2027  llvm_unreachable("Unknown overflow instruction!");
2028  case ISD::SADDO:
2029  Opc = AArch64ISD::ADDS;
2030  CC = AArch64CC::VS;
2031  break;
2032  case ISD::UADDO:
2033  Opc = AArch64ISD::ADDS;
2034  CC = AArch64CC::HS;
2035  break;
2036  case ISD::SSUBO:
2037  Opc = AArch64ISD::SUBS;
2038  CC = AArch64CC::VS;
2039  break;
2040  case ISD::USUBO:
2041  Opc = AArch64ISD::SUBS;
2042  CC = AArch64CC::LO;
2043  break;
2044  // Multiply needs a little bit of extra work.
2045  case ISD::SMULO:
2046  case ISD::UMULO: {
2047  CC = AArch64CC::NE;
2048  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2049  if (Op.getValueType() == MVT::i32) {
2050  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2051  // For a 32 bit multiply with overflow check we want the instruction
2052  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2053  // need to generate the following pattern:
2054  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2055  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2056  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2057  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2058  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2059  DAG.getConstant(0, DL, MVT::i64));
2060  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2061  // operation. We need to clear out the upper 32 bits, because we used a
2062  // widening multiply that wrote all 64 bits. In the end this should be a
2063  // noop.
2064  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2065  if (IsSigned) {
2066  // The signed overflow check requires more than just a simple check for
2067  // any bit set in the upper 32 bits of the result. These bits could be
2068  // just the sign bits of a negative number. To perform the overflow
2069  // check we arithmetic-shift-right the lower 32 bits of the result by
2070  // 31 bits and compare that against the upper 32 bits.
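 // A worked example (illustrative): if the widened product is
 // 0x0000000080000000, the truncated i32 result is 0x80000000; the upper 32
 // bits are 0x00000000 while the low word shifted right by 31 is 0xFFFFFFFF.
 // The two differ, so signed overflow is reported.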
2071  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2072  DAG.getConstant(32, DL, MVT::i64));
2073  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2074  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2075  DAG.getConstant(31, DL, MVT::i64));
2076  // It is important that LowerBits is last, otherwise the arithmetic
2077  // shift will not be folded into the compare (SUBS).
2078  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2079  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2080  .getValue(1);
2081  } else {
2082  // The overflow check for unsigned multiply is easy. We only need to
2083  // check if any of the upper 32 bits are set. This can be done with a
2084  // CMP (shifted register). For that we need to generate the following
2085  // pattern:
2086  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
2087  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2088  DAG.getConstant(32, DL, MVT::i64));
2089  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2090  Overflow =
2091  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2092  DAG.getConstant(0, DL, MVT::i64),
2093  UpperBits).getValue(1);
2094  }
2095  break;
2096  }
2097  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2098  // For the 64-bit multiply, the overflow check uses the high half of the product.
2099  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2100  if (IsSigned) {
2101  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2102  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2103  DAG.getConstant(63, DL, MVT::i64));
2104  // It is important that LowerBits is last, otherwise the arithmetic
2105  // shift will not be folded into the compare (SUBS).
2106  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2107  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2108  .getValue(1);
2109  } else {
2110  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2111  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2112  Overflow =
2113  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2114  DAG.getConstant(0, DL, MVT::i64),
2115  UpperBits).getValue(1);
2116  }
2117  break;
2118  }
2119  } // switch (...)
2120 
2121  if (Opc) {
2122  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2123 
2124  // Emit the AArch64 operation with overflow check.
2125  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2126  Overflow = Value.getValue(1);
2127  }
2128  return std::make_pair(Value, Overflow);
2129 }
2130 
2131 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2132  RTLIB::Libcall Call) const {
2133  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2134  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
2135 }
2136 
2137 // Returns true if the given Op is the overflow flag result of an overflow
2138 // intrinsic operation.
2139 static bool isOverflowIntrOpRes(SDValue Op) {
2140  unsigned Opc = Op.getOpcode();
2141  return (Op.getResNo() == 1 &&
2142  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2143  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2144 }
2145 
2146 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2147  SDValue Sel = Op.getOperand(0);
2148  SDValue Other = Op.getOperand(1);
2149  SDLoc dl(Sel);
2150 
2151  // If the operand is an overflow checking operation, invert the condition
2152  // code and kill the Not operation. I.e., transform:
2153  // (xor (overflow_op_bool, 1))
2154  // -->
2155  // (csel 1, 0, invert(cc), overflow_op_bool)
2156  // ... which later gets transformed to just a cset instruction with an
2157  // inverted condition code, rather than a cset + eor sequence.
2158  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2159  // Only lower legal XALUO ops.
2160  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2161  return SDValue();
2162 
2163  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2164  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2165  AArch64CC::CondCode CC;
2166  SDValue Value, Overflow;
2167  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2168  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2169  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2170  CCVal, Overflow);
2171  }
2172  // If neither operand is a SELECT_CC, give up.
2173  if (Sel.getOpcode() != ISD::SELECT_CC)
2174  std::swap(Sel, Other);
2175  if (Sel.getOpcode() != ISD::SELECT_CC)
2176  return Op;
2177 
2178  // The folding we want to perform is:
2179  // (xor x, (select_cc a, b, cc, 0, -1) )
2180  // -->
2181  // (csel x, (xor x, -1), cc ...)
2182  //
2183  // The latter will get matched to a CSINV instruction.
2184 
2185  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2186  SDValue LHS = Sel.getOperand(0);
2187  SDValue RHS = Sel.getOperand(1);
2188  SDValue TVal = Sel.getOperand(2);
2189  SDValue FVal = Sel.getOperand(3);
2190 
2191  // FIXME: This could be generalized to non-integer comparisons.
2192  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2193  return Op;
2194 
2195  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2196  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2197 
2198  // The values aren't constants, this isn't the pattern we're looking for.
2199  if (!CFVal || !CTVal)
2200  return Op;
2201 
2202  // We can commute the SELECT_CC by inverting the condition. This
2203  // might be needed to make this fit into a CSINV pattern.
2204  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2205  std::swap(TVal, FVal);
2206  std::swap(CTVal, CFVal);
2207  CC = ISD::getSetCCInverse(CC, true);
2208  }
2209 
2210  // If the constants line up, perform the transform!
2211  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2212  SDValue CCVal;
2213  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2214 
2215  FVal = Other;
2216  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2217  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2218 
2219  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2220  CCVal, Cmp);
2221  }
2222 
2223  return Op;
2224 }
2225 
2226 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2227  EVT VT = Op.getValueType();
2228 
2229  // Let legalize expand this if it isn't a legal type yet.
2230  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2231  return SDValue();
2232 
2233  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2234 
2235  unsigned Opc;
2236  bool ExtraOp = false;
2237  switch (Op.getOpcode()) {
2238  default:
2239  llvm_unreachable("Invalid code");
2240  case ISD::ADDC:
2241  Opc = AArch64ISD::ADDS;
2242  break;
2243  case ISD::SUBC:
2244  Opc = AArch64ISD::SUBS;
2245  break;
2246  case ISD::ADDE:
2247  Opc = AArch64ISD::ADCS;
2248  ExtraOp = true;
2249  break;
2250  case ISD::SUBE:
2251  Opc = AArch64ISD::SBCS;
2252  ExtraOp = true;
2253  break;
2254  }
2255 
2256  if (!ExtraOp)
2257  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2258  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2259  Op.getOperand(2));
2260 }
2261 
2262 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2263  // Let legalize expand this if it isn't a legal type yet.
2264  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2265  return SDValue();
2266 
2267  SDLoc dl(Op);
2268  AArch64CC::CondCode CC;
2269  // The actual operation that sets the overflow or carry flag.
2270  SDValue Value, Overflow;
2271  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2272 
2273  // We use 0 and 1 as false and true values.
2274  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2275  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2276 
2277  // We use an inverted condition, because the conditional select is inverted
2278  // too. This will allow it to be selected to a single instruction:
2279  // CSINC Wd, WZR, WZR, invert(cond).
2280  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2281  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2282  CCVal, Overflow);
2283 
2284  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2285  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2286 }
2287 
2288 // Prefetch operands are:
2289 // 1: Address to prefetch
2290 // 2: bool isWrite
2291 // 3: int locality (0 = no locality ... 3 = extreme locality)
2292 // 4: bool isDataCache
2293 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2294  SDLoc DL(Op);
2295  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2296  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2297  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2298 
2299  bool IsStream = !Locality;
2300  // When the locality number is set
2301  if (Locality) {
2302  // The front-end should have filtered out the out-of-range values
2303  assert(Locality <= 3 && "Prefetch locality out-of-range");
2304  // The locality degree is the opposite of the cache level: higher locality
2305  // means a closer (lower-level) cache, so invert the number. The target
2306  // encoding starts at 0 for L1.
2307  Locality = 3 - Locality;
2308  }
2309 
2310  // Build the mask value encoding the expected behavior.
2311  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2312  (!IsData << 3) | // IsDataCache bit
2313  (Locality << 1) | // Cache level bits
2314  (unsigned)IsStream; // Stream bit
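 // For example (illustrative): a read data prefetch with locality 3 encodes
 // as 0b00000 (PLDL1KEEP), while a write prefetch with locality 0 encodes as
 // 0b10001 (PSTL1STRM).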
2315  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2316  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2317 }
2318 
2319 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2320  SelectionDAG &DAG) const {
2321  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2322 
2323  RTLIB::Libcall LC;
2324  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2325 
2326  return LowerF128Call(Op, DAG, LC);
2327 }
2328 
2329 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2330  SelectionDAG &DAG) const {
2331  if (Op.getOperand(0).getValueType() != MVT::f128) {
2332  // It's legal except when f128 is involved
2333  return Op;
2334  }
2335 
2336  RTLIB::Libcall LC;
2337  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2338 
2339  // FP_ROUND node has a second operand indicating whether it is known to be
2340  // precise. That doesn't take part in the LibCall so we can't directly use
2341  // LowerF128Call.
2342  SDValue SrcVal = Op.getOperand(0);
2343  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2344  SDLoc(Op)).first;
2345 }
2346 
2347 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2348  SelectionDAG &DAG) const {
2349  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2350  // Any additional optimization in this function should be recorded
2351  // in the cost tables.
2352  EVT InVT = Op.getOperand(0).getValueType();
2353  EVT VT = Op.getValueType();
2354  unsigned NumElts = InVT.getVectorNumElements();
2355 
2356  // f16 conversions are promoted to f32 when full fp16 is not supported.
2357  if (InVT.getVectorElementType() == MVT::f16 &&
2358  !Subtarget->hasFullFP16()) {
2359  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2360  SDLoc dl(Op);
2361  return DAG.getNode(
2362  Op.getOpcode(), dl, Op.getValueType(),
2363  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2364  }
2365 
2366  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2367  SDLoc dl(Op);
2368  SDValue Cv =
2369  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2370  Op.getOperand(0));
2371  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2372  }
2373 
2374  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2375  SDLoc dl(Op);
2376  MVT ExtVT =
2378  VT.getVectorNumElements());
2379  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2380  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2381  }
2382 
2383  // Type changing conversions are illegal.
2384  return Op;
2385 }
2386 
2387 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2388  SelectionDAG &DAG) const {
2389  if (Op.getOperand(0).getValueType().isVector())
2390  return LowerVectorFP_TO_INT(Op, DAG);
2391 
2392  // f16 conversions are promoted to f32 when full fp16 is not supported.
2393  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2394  !Subtarget->hasFullFP16()) {
2395  SDLoc dl(Op);
2396  return DAG.getNode(
2397  Op.getOpcode(), dl, Op.getValueType(),
2398  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2399  }
2400 
2401  if (Op.getOperand(0).getValueType() != MVT::f128) {
2402  // It's legal except when f128 is involved
2403  return Op;
2404  }
2405 
2406  RTLIB::Libcall LC;
2407  if (Op.getOpcode() == ISD::FP_TO_SINT)
2408  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2409  else
2410  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2411 
2412  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2413  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2414 }
2415 
2417  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2418  // Any additional optimization in this function should be recorded
2419  // in the cost tables.
2420  EVT VT = Op.getValueType();
2421  SDLoc dl(Op);
2422  SDValue In = Op.getOperand(0);
2423  EVT InVT = In.getValueType();
2424 
2425  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2426  MVT CastVT =
2428  InVT.getVectorNumElements());
2429  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2430  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2431  }
2432 
2433  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2434  unsigned CastOpc =
2435  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2436  EVT CastVT = VT.changeVectorElementTypeToInteger();
2437  In = DAG.getNode(CastOpc, dl, CastVT, In);
2438  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2439  }
2440 
2441  return Op;
2442 }
2443 
2444 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2445  SelectionDAG &DAG) const {
2446  if (Op.getValueType().isVector())
2447  return LowerVectorINT_TO_FP(Op, DAG);
2448 
2449  // f16 conversions are promoted to f32 when full fp16 is not supported.
2450  if (Op.getValueType() == MVT::f16 &&
2451  !Subtarget->hasFullFP16()) {
2452  SDLoc dl(Op);
2453  return DAG.getNode(
2454  ISD::FP_ROUND, dl, MVT::f16,
2455  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2456  DAG.getIntPtrConstant(0, dl));
2457  }
2458 
2459  // i128 conversions are libcalls.
2460  if (Op.getOperand(0).getValueType() == MVT::i128)
2461  return SDValue();
2462 
2463  // Other conversions are legal, unless it's to the completely software-based
2464  // fp128.
2465  if (Op.getValueType() != MVT::f128)
2466  return Op;
2467 
2468  RTLIB::Libcall LC;
2469  if (Op.getOpcode() == ISD::SINT_TO_FP)
2470  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2471  else
2472  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2473 
2474  return LowerF128Call(Op, DAG, LC);
2475 }
2476 
2477 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2478  SelectionDAG &DAG) const {
2479  // For iOS, we want to call an alternative entry point: __sincos_stret,
2480  // which returns the values in two S / D registers.
2481  SDLoc dl(Op);
2482  SDValue Arg = Op.getOperand(0);
2483  EVT ArgVT = Arg.getValueType();
2484  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2485 
2486  ArgListTy Args;
2487  ArgListEntry Entry;
2488 
2489  Entry.Node = Arg;
2490  Entry.Ty = ArgTy;
2491  Entry.IsSExt = false;
2492  Entry.IsZExt = false;
2493  Args.push_back(Entry);
2494 
2495  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2496  : RTLIB::SINCOS_STRET_F32;
2497  const char *LibcallName = getLibcallName(LC);
2498  SDValue Callee =
2499  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2500 
2501  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2502  TargetLowering::CallLoweringInfo CLI(DAG);
2503  CLI.setDebugLoc(dl)
2504  .setChain(DAG.getEntryNode())
2505  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2506 
2507  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2508  return CallResult.first;
2509 }
2510 
2512  if (Op.getValueType() != MVT::f16)
2513  return SDValue();
2514 
2515  assert(Op.getOperand(0).getValueType() == MVT::i16);
2516  SDLoc DL(Op);
2517 
2518  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2519  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2520  return SDValue(
2521  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2522  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2523  0);
2524 }
2525 
2526 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2527  if (OrigVT.getSizeInBits() >= 64)
2528  return OrigVT;
2529 
2530  assert(OrigVT.isSimple() && "Expecting a simple value type");
2531 
2532  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2533  switch (OrigSimpleTy) {
2534  default: llvm_unreachable("Unexpected Vector Type");
2535  case MVT::v2i8:
2536  case MVT::v2i16:
2537  return MVT::v2i32;
2538  case MVT::v4i8:
2539  return MVT::v4i16;
2540  }
2541 }
2542 
2543 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2544  const EVT &OrigTy,
2545  const EVT &ExtTy,
2546  unsigned ExtOpcode) {
2547  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2548  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2549  // 64-bits we need to insert a new extension so that it will be 64-bits.
2550  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2551  if (OrigTy.getSizeInBits() >= 64)
2552  return N;
2553 
2554  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2555  EVT NewVT = getExtensionTo64Bits(OrigTy);
2556 
2557  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2558 }
2559 
2560 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2561  bool isSigned) {
2562  EVT VT = N->getValueType(0);
2563 
2564  if (N->getOpcode() != ISD::BUILD_VECTOR)
2565  return false;
2566 
2567  for (const SDValue &Elt : N->op_values()) {
2568  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2569  unsigned EltSize = VT.getScalarSizeInBits();
2570  unsigned HalfSize = EltSize / 2;
2571  if (isSigned) {
2572  if (!isIntN(HalfSize, C->getSExtValue()))
2573  return false;
2574  } else {
2575  if (!isUIntN(HalfSize, C->getZExtValue()))
2576  return false;
2577  }
2578  continue;
2579  }
2580  return false;
2581  }
2582 
2583  return true;
2584 }
2585 
2586 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2587  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2588  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2589  N->getOperand(0)->getValueType(0),
2590  N->getValueType(0),
2591  N->getOpcode());
2592 
2593  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2594  EVT VT = N->getValueType(0);
2595  SDLoc dl(N);
2596  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2597  unsigned NumElts = VT.getVectorNumElements();
2598  MVT TruncVT = MVT::getIntegerVT(EltSize);
2600  for (unsigned i = 0; i != NumElts; ++i) {
2601  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2602  const APInt &CInt = C->getAPIntValue();
2603  // Element types smaller than 32 bits are not legal, so use i32 elements.
2604  // The values are implicitly truncated so sext vs. zext doesn't matter.
2605  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2606  }
2607  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2608 }
2609 
2610 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2611  return N->getOpcode() == ISD::SIGN_EXTEND ||
2612  isExtendedBUILD_VECTOR(N, DAG, true);
2613 }
2614 
2615 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2616  return N->getOpcode() == ISD::ZERO_EXTEND ||
2617  isExtendedBUILD_VECTOR(N, DAG, false);
2618 }
2619 
2620 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2621  unsigned Opcode = N->getOpcode();
2622  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2623  SDNode *N0 = N->getOperand(0).getNode();
2624  SDNode *N1 = N->getOperand(1).getNode();
2625  return N0->hasOneUse() && N1->hasOneUse() &&
2626  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2627  }
2628  return false;
2629 }
2630 
2631 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2632  unsigned Opcode = N->getOpcode();
2633  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2634  SDNode *N0 = N->getOperand(0).getNode();
2635  SDNode *N1 = N->getOperand(1).getNode();
2636  return N0->hasOneUse() && N1->hasOneUse() &&
2637  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2638  }
2639  return false;
2640 }
2641 
2642 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2643  SelectionDAG &DAG) const {
2644  // The rounding mode is in bits 23:22 of the FPCR.
2645  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
2646  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
2647  // so that the shift + and get folded into a bitfield extract.
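 // Worked example (illustrative): if FPCR.RMode is 0b11 (round towards zero),
 // adding 1 << 22 carries out of bits 23:22, the extracted field is 0b00, and
 // FLT_ROUNDS reports 0, matching the 3->0 entry in the mapping above.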
2648  SDLoc dl(Op);
2649 
2650  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2651  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2652  MVT::i64));
2653  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2654  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2655  DAG.getConstant(1U << 22, dl, MVT::i32));
2656  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2657  DAG.getConstant(22, dl, MVT::i32));
2658  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2659  DAG.getConstant(3, dl, MVT::i32));
2660 }
2661 
2662 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2663  // Multiplications are only custom-lowered for 128-bit vectors so that
2664  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
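 // For example (illustrative): (mul (sext v8i8:%a), (sext v8i8:%b)) : v8i16
 // is selected to something like smull v0.8h, v1.8b, v2.8b.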
2665  EVT VT = Op.getValueType();
2666  assert(VT.is128BitVector() && VT.isInteger() &&
2667  "unexpected type for custom-lowering ISD::MUL");
2668  SDNode *N0 = Op.getOperand(0).getNode();
2669  SDNode *N1 = Op.getOperand(1).getNode();
2670  unsigned NewOpc = 0;
2671  bool isMLA = false;
2672  bool isN0SExt = isSignExtended(N0, DAG);
2673  bool isN1SExt = isSignExtended(N1, DAG);
2674  if (isN0SExt && isN1SExt)
2675  NewOpc = AArch64ISD::SMULL;
2676  else {
2677  bool isN0ZExt = isZeroExtended(N0, DAG);
2678  bool isN1ZExt = isZeroExtended(N1, DAG);
2679  if (isN0ZExt && isN1ZExt)
2680  NewOpc = AArch64ISD::UMULL;
2681  else if (isN1SExt || isN1ZExt) {
2682  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2683  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2684  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2685  NewOpc = AArch64ISD::SMULL;
2686  isMLA = true;
2687  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2688  NewOpc = AArch64ISD::UMULL;
2689  isMLA = true;
2690  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2691  std::swap(N0, N1);
2692  NewOpc = AArch64ISD::UMULL;
2693  isMLA = true;
2694  }
2695  }
2696 
2697  if (!NewOpc) {
2698  if (VT == MVT::v2i64)
2699  // Fall through to expand this. It is not legal.
2700  return SDValue();
2701  else
2702  // Other vector multiplications are legal.
2703  return Op;
2704  }
2705  }
2706 
2707  // Legalize to a S/UMULL instruction
2708  SDLoc DL(Op);
2709  SDValue Op0;
2710  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2711  if (!isMLA) {
2712  Op0 = skipExtensionForVectorMULL(N0, DAG);
2713  assert(Op0.getValueType().is64BitVector() &&
2714  Op1.getValueType().is64BitVector() &&
2715  "unexpected types for extended operands to VMULL");
2716  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2717  }
2718  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2719  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2720  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2721  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2722  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2723  EVT Op1VT = Op1.getValueType();
2724  return DAG.getNode(N0->getOpcode(), DL, VT,
2725  DAG.getNode(NewOpc, DL, VT,
2726  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2727  DAG.getNode(NewOpc, DL, VT,
2728  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2729 }
2730 
2731 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2732  SelectionDAG &DAG) const {
2733  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2734  SDLoc dl(Op);
2735  switch (IntNo) {
2736  default: return SDValue(); // Don't custom lower most intrinsics.
2737  case Intrinsic::thread_pointer: {
2738  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2739  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2740  }
2741  case Intrinsic::aarch64_neon_abs: {
2742  EVT Ty = Op.getValueType();
2743  if (Ty == MVT::i64) {
2744  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2745  Op.getOperand(1));
2746  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2747  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2748  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2749  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2750  } else {
2751  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2752  }
2753  }
2754  case Intrinsic::aarch64_neon_smax:
2755  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2756  Op.getOperand(1), Op.getOperand(2));
2757  case Intrinsic::aarch64_neon_umax:
2758  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2759  Op.getOperand(1), Op.getOperand(2));
2760  case Intrinsic::aarch64_neon_smin:
2761  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2762  Op.getOperand(1), Op.getOperand(2));
2763  case Intrinsic::aarch64_neon_umin:
2764  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2765  Op.getOperand(1), Op.getOperand(2));
2766 
2767  case Intrinsic::localaddress: {
2768  const auto &MF = DAG.getMachineFunction();
2769  const auto *RegInfo = Subtarget->getRegisterInfo();
2770  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2771  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2772  Op.getSimpleValueType());
2773  }
2774 
2775  case Intrinsic::eh_recoverfp: {
2776  // FIXME: This needs to be implemented to correctly handle highly aligned
2777  // stack objects. For now we simply return the incoming FP. Refer D53541
2778  // for more details.
2779  SDValue FnOp = Op.getOperand(1);
2780  SDValue IncomingFPOp = Op.getOperand(2);
2781  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2782  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2783  if (!Fn)
2784  report_fatal_error(
2785  "llvm.eh.recoverfp must take a function as the first argument");
2786  return IncomingFPOp;
2787  }
2788  }
2789 }
2790 
2791 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2792 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2793  EVT VT, EVT MemVT,
2794  SelectionDAG &DAG) {
2795  assert(VT.isVector() && "VT should be a vector type");
2796  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2797 
2798  SDValue Value = ST->getValue();
2799 
2800  // We first extend the promoted v4i16 to v8i16, truncate it to v8i8, and
2801  // extract the word lane which represents the v4i8 subvector. This optimizes the store
2802  // to:
2803  //
2804  // xtn v0.8b, v0.8h
2805  // str s0, [x0]
2806 
2807  SDValue Undef = DAG.getUNDEF(MVT::i16);
2808  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2809  {Undef, Undef, Undef, Undef});
2810 
2811  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2812  Value, UndefVec);
2813  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2814 
2815  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2816  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2817  Trunc, DAG.getConstant(0, DL, MVT::i64));
2818 
2819  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2820  ST->getBasePtr(), ST->getMemOperand());
2821 }
2822 
2823 // Custom lowering for any store, vector or scalar, with or without a
2824 // truncating operation. Currently we only custom-lower a truncating store
2825 // from vector v4i16 to v4i8.
2826 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2827  SelectionDAG &DAG) const {
2828  SDLoc Dl(Op);
2829  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2830  assert (StoreNode && "Can only custom lower store nodes");
2831 
2832  SDValue Value = StoreNode->getValue();
2833 
2834  EVT VT = Value.getValueType();
2835  EVT MemVT = StoreNode->getMemoryVT();
2836 
2837  assert (VT.isVector() && "Can only custom lower vector store types");
2838 
2839  unsigned AS = StoreNode->getAddressSpace();
2840  unsigned Align = StoreNode->getAlignment();
2841  if (Align < MemVT.getStoreSize() &&
2842  !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
2843  return scalarizeVectorStore(StoreNode, DAG);
2844  }
2845 
2846  if (StoreNode->isTruncatingStore()) {
2847  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2848  }
2849 
2850  return SDValue();
2851 }
2852 
2853 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2854  SelectionDAG &DAG) const {
2855  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2856  LLVM_DEBUG(Op.dump());
2857 
2858  switch (Op.getOpcode()) {
2859  default:
2860  llvm_unreachable("unimplemented operand");
2861  return SDValue();
2862  case ISD::BITCAST:
2863  return LowerBITCAST(Op, DAG);
2864  case ISD::GlobalAddress:
2865  return LowerGlobalAddress(Op, DAG);
2866  case ISD::GlobalTLSAddress:
2867  return LowerGlobalTLSAddress(Op, DAG);
2868  case ISD::SETCC:
2869  return LowerSETCC(Op, DAG);
2870  case ISD::BR_CC:
2871  return LowerBR_CC(Op, DAG);
2872  case ISD::SELECT:
2873  return LowerSELECT(Op, DAG);
2874  case ISD::SELECT_CC:
2875  return LowerSELECT_CC(Op, DAG);
2876  case ISD::JumpTable:
2877  return LowerJumpTable(Op, DAG);
2878  case ISD::BR_JT:
2879  return LowerBR_JT(Op, DAG);
2880  case ISD::ConstantPool:
2881  return LowerConstantPool(Op, DAG);
2882  case ISD::BlockAddress:
2883  return LowerBlockAddress(Op, DAG);
2884  case ISD::VASTART:
2885  return LowerVASTART(Op, DAG);
2886  case ISD::VACOPY:
2887  return LowerVACOPY(Op, DAG);
2888  case ISD::VAARG:
2889  return LowerVAARG(Op, DAG);
2890  case ISD::ADDC:
2891  case ISD::ADDE:
2892  case ISD::SUBC:
2893  case ISD::SUBE:
2894  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2895  case ISD::SADDO:
2896  case ISD::UADDO:
2897  case ISD::SSUBO:
2898  case ISD::USUBO:
2899  case ISD::SMULO:
2900  case ISD::UMULO:
2901  return LowerXALUO(Op, DAG);
2902  case ISD::FADD:
2903  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2904  case ISD::FSUB:
2905  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2906  case ISD::FMUL:
2907  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2908  case ISD::FDIV:
2909  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2910  case ISD::FP_ROUND:
2911  return LowerFP_ROUND(Op, DAG);
2912  case ISD::FP_EXTEND:
2913  return LowerFP_EXTEND(Op, DAG);
2914  case ISD::FRAMEADDR:
2915  return LowerFRAMEADDR(Op, DAG);
2916  case ISD::SPONENTRY:
2917  return LowerSPONENTRY(Op, DAG);
2918  case ISD::RETURNADDR:
2919  return LowerRETURNADDR(Op, DAG);
2920  case ISD::ADDROFRETURNADDR:
2921  return LowerADDROFRETURNADDR(Op, DAG);
2922  case ISD::INSERT_VECTOR_ELT:
2923  return LowerINSERT_VECTOR_ELT(Op, DAG);
2924  case ISD::EXTRACT_VECTOR_ELT:
2925  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2926  case ISD::BUILD_VECTOR:
2927  return LowerBUILD_VECTOR(Op, DAG);
2928  case ISD::VECTOR_SHUFFLE:
2929  return LowerVECTOR_SHUFFLE(Op, DAG);
2930  case ISD::EXTRACT_SUBVECTOR:
2931  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2932  case ISD::SRA:
2933  case ISD::SRL:
2934  case ISD::SHL:
2935  return LowerVectorSRA_SRL_SHL(Op, DAG);
2936  case ISD::SHL_PARTS:
2937  return LowerShiftLeftParts(Op, DAG);
2938  case ISD::SRL_PARTS:
2939  case ISD::SRA_PARTS:
2940  return LowerShiftRightParts(Op, DAG);
2941  case ISD::CTPOP:
2942  return LowerCTPOP(Op, DAG);
2943  case ISD::FCOPYSIGN:
2944  return LowerFCOPYSIGN(Op, DAG);
2945  case ISD::OR:
2946  return LowerVectorOR(Op, DAG);
2947  case ISD::XOR:
2948  return LowerXOR(Op, DAG);
2949  case ISD::PREFETCH:
2950  return LowerPREFETCH(Op, DAG);
2951  case ISD::SINT_TO_FP:
2952  case ISD::UINT_TO_FP:
2953  return LowerINT_TO_FP(Op, DAG);
2954  case ISD::FP_TO_SINT:
2955  case ISD::FP_TO_UINT:
2956  return LowerFP_TO_INT(Op, DAG);
2957  case ISD::FSINCOS:
2958  return LowerFSINCOS(Op, DAG);
2959  case ISD::FLT_ROUNDS_:
2960  return LowerFLT_ROUNDS_(Op, DAG);
2961  case ISD::MUL:
2962  return LowerMUL(Op, DAG);
2963  case ISD::INTRINSIC_WO_CHAIN:
2964  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2965  case ISD::STORE:
2966  return LowerSTORE(Op, DAG);
2967  case ISD::VECREDUCE_ADD:
2968  case ISD::VECREDUCE_SMAX:
2969  case ISD::VECREDUCE_SMIN:
2970  case ISD::VECREDUCE_UMAX:
2971  case ISD::VECREDUCE_UMIN:
2972  case ISD::VECREDUCE_FMAX:
2973  case ISD::VECREDUCE_FMIN:
2974  return LowerVECREDUCE(Op, DAG);
2975  case ISD::ATOMIC_LOAD_SUB:
2976  return LowerATOMIC_LOAD_SUB(Op, DAG);
2977  case ISD::ATOMIC_LOAD_AND:
2978  return LowerATOMIC_LOAD_AND(Op, DAG);
2979  case ISD::DYNAMIC_STACKALLOC:
2980  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2981  }
2982 }
2983 
2984 //===----------------------------------------------------------------------===//
2985 // Calling Convention Implementation
2986 //===----------------------------------------------------------------------===//
2987 
2988 /// Selects the correct CCAssignFn for a given CallingConvention value.
2989 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2990  bool IsVarArg) const {
2991  switch (CC) {
2992  default:
2993  report_fatal_error("Unsupported calling convention.");
2994  case CallingConv::WebKit_JS:
2995  return CC_AArch64_WebKit_JS;
2996  case CallingConv::GHC:
2997  return CC_AArch64_GHC;
2998  case CallingConv::C:
2999  case CallingConv::Fast:
3000  case CallingConv::PreserveMost:
3001  case CallingConv::CXX_FAST_TLS:
3002  case CallingConv::Swift:
3003  if (Subtarget->isTargetWindows() && IsVarArg)
3004  return CC_AArch64_Win64_VarArg;
3005  if (!Subtarget->isTargetDarwin())
3006  return CC_AArch64_AAPCS;
3007  return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
3008  case CallingConv::Win64:
3009  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3010  case CallingConv::AArch64_VectorCall:
3011  return CC_AArch64_AAPCS;
3012  }
3013 }
3014 
3015 CCAssignFn *
3016 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3017  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3018  : RetCC_AArch64_AAPCS;
3019 }
3020 
3021 SDValue AArch64TargetLowering::LowerFormalArguments(
3022  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3023  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3024  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3025  MachineFunction &MF = DAG.getMachineFunction();
3026  MachineFrameInfo &MFI = MF.getFrameInfo();
3027  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3028 
3029  // Assign locations to all of the incoming arguments.
3030  SmallVector<CCValAssign, 16> ArgLocs;
3031  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3032  *DAG.getContext());
3033 
3034  // At this point, Ins[].VT may already be promoted to i32. To correctly
3035  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3036  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3037  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3038  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3039  // LocVT.
3040  unsigned NumArgs = Ins.size();
3041  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3042  unsigned CurArgIdx = 0;
3043  for (unsigned i = 0; i != NumArgs; ++i) {
3044  MVT ValVT = Ins[i].VT;
3045  if (Ins[i].isOrigArg()) {
3046  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3047  CurArgIdx = Ins[i].getOrigArgIndex();
3048 
3049  // Get type of the original argument.
3050  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3051  /*AllowUnknown*/ true);
3052  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3053  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3054  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3055  ValVT = MVT::i8;
3056  else if (ActualMVT == MVT::i16)
3057  ValVT = MVT::i16;
3058  }
3059  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3060  bool Res =
3061  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3062  assert(!Res && "Call operand has unhandled type");
3063  (void)Res;
3064  }
3065  assert(ArgLocs.size() == Ins.size());
3066  SmallVector<SDValue, 16> ArgValues;
3067  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3068  CCValAssign &VA = ArgLocs[i];
3069 
3070  if (Ins[i].Flags.isByVal()) {
3071  // Byval is used for HFAs in the PCS, but the system should work in a
3072  // non-compliant manner for larger structs.
3073  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3074  int Size = Ins[i].Flags.getByValSize();
3075  unsigned NumRegs = (Size + 7) / 8;
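 // For example, a 12-byte byval argument gives NumRegs = 2 and a 16-byte
 // fixed stack object below (sizes rounded up to whole 8-byte registers).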
3076 
3077  // FIXME: This works on big-endian for composite byvals, which are the common
3078  // case. It should work for fundamental types too.
3079  unsigned FrameIdx =
3080  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3081  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3082  InVals.push_back(FrameIdxN);
3083 
3084  continue;
3085  }
3086 
3087  if (VA.isRegLoc()) {
3088  // Arguments stored in registers.
3089  EVT RegVT = VA.getLocVT();
3090 
3091  SDValue ArgValue;
3092  const TargetRegisterClass *RC;
3093 
3094  if (RegVT == MVT::i32)
3095  RC = &AArch64::GPR32RegClass;
3096  else if (RegVT == MVT::i64)
3097  RC = &AArch64::GPR64RegClass;
3098  else if (RegVT == MVT::f16)
3099  RC = &AArch64::FPR16RegClass;
3100  else if (RegVT == MVT::f32)
3101  RC = &AArch64::FPR32RegClass;
3102  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3103  RC = &AArch64::FPR64RegClass;
3104  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3105  RC = &AArch64::FPR128RegClass;
3106  else
3107  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3108 
3109  // Transform the arguments in physical registers into virtual ones.
3110  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3111  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3112 
3113  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3114  // to 64 bits. Insert an assert[sz]ext to capture this, then
3115  // truncate to the right size.
3116  switch (VA.getLocInfo()) {
3117  default:
3118  llvm_unreachable("Unknown loc info!");
3119  case CCValAssign::Full:
3120  break;
3121  case CCValAssign::BCvt:
3122  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3123  break;
3124  case CCValAssign::AExt:
3125  case CCValAssign::SExt:
3126  case CCValAssign::ZExt:
3127  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3128  // nodes after our lowering.
3129  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3130  break;
3131  }
3132 
3133  InVals.push_back(ArgValue);
3134 
3135  } else { // VA.isRegLoc()
3136  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3137  unsigned ArgOffset = VA.getLocMemOffset();
3138  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3139 
3140  uint32_t BEAlign = 0;
3141  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3142  !Ins[i].Flags.isInConsecutiveRegs())
3143  BEAlign = 8 - ArgSize;
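 // For example, a 4-byte stack argument on a big-endian target gets
 // BEAlign = 4, since the value lives in the higher-addressed half of its
 // 8-byte slot.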
3144 
3145  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3146 
3147  // Create load nodes to retrieve arguments from the stack.
3148  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3149  SDValue ArgValue;
3150 
3151  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3152  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3153  MVT MemVT = VA.getValVT();
3154 
3155  switch (VA.getLocInfo()) {
3156  default:
3157  break;
3158  case CCValAssign::BCvt:
3159  MemVT = VA.getLocVT();
3160  break;
3161  case CCValAssign::SExt:
3162  ExtType = ISD::SEXTLOAD;
3163  break;
3164  case CCValAssign::ZExt:
3165  ExtType = ISD::ZEXTLOAD;
3166  break;
3167  case CCValAssign::AExt:
3168  ExtType = ISD::EXTLOAD;
3169  break;
3170  }
3171 
3172  ArgValue = DAG.getExtLoad(
3173  ExtType, DL, VA.getLocVT(), Chain, FIN,
3174  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3175  MemVT);
3176 
3177  InVals.push_back(ArgValue);
3178  }
3179  }
3180 
3181  // varargs
3182  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3183  if (isVarArg) {
3184  if (!Subtarget->isTargetDarwin() || IsWin64) {
3185  // The AAPCS variadic function ABI is identical to the non-variadic
3186  // one. As a result there may be more arguments in registers and we should
3187  // save them for future reference.
3188  // Win64 variadic functions also pass arguments in registers, but all float
3189  // arguments are passed in integer registers.
3190  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3191  }
3192 
3193  // This will point to the next argument passed via stack.
3194  unsigned StackOffset = CCInfo.getNextStackOffset();
3195  // We currently pass all varargs at 8-byte alignment.
3196  StackOffset = ((StackOffset + 7) & ~7);
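 // For example, a StackOffset of 12 is rounded up to 16 here, while a value
 // that is already 8-byte aligned is left unchanged.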
3197  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3198 
3199  if (MFI.hasMustTailInVarArgFunc()) {
3200  SmallVector<MVT, 2> RegParmTypes;
3201  RegParmTypes.push_back(MVT::i64);
3202  RegParmTypes.push_back(MVT::f128);
3203  // Compute the set of forwarded registers. The rest are scratch.
3204  SmallVectorImpl<ForwardedRegister> &Forwards =
3205  FuncInfo->getForwardedMustTailRegParms();
3206  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3207  CC_AArch64_AAPCS);
3208  }
3209  }
3210 
3211  unsigned StackArgSize = CCInfo.getNextStackOffset();
3212  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3213  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3214  // This is a non-standard ABI so by fiat I say we're allowed to make full
3215  // use of the stack area to be popped, which must be aligned to 16 bytes in
3216  // any case:
3217  StackArgSize = alignTo(StackArgSize, 16);
3218 
3219  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3220  // a multiple of 16.
3221  FuncInfo->setArgumentStackToRestore(StackArgSize);
3222 
3223  // This realignment carries over to the available bytes below. Our own
3224  // callers will guarantee the space is free by giving an aligned value to
3225  // CALLSEQ_START.
3226  }
3227  // Even if we're not expected to free up the space, it's useful to know how
3228  // much is there while considering tail calls (because we can reuse it).
3229  FuncInfo->setBytesInStackArgArea(StackArgSize);
3230 
3231  if (Subtarget->hasCustomCallingConv())
3232  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3233 
3234  return Chain;
3235 }
3236 
3237 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3238  SelectionDAG &DAG,
3239  const SDLoc &DL,
3240  SDValue &Chain) const {
3241  MachineFunction &MF = DAG.getMachineFunction();
3242  MachineFrameInfo &MFI = MF.getFrameInfo();
3243  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3244  auto PtrVT = getPointerTy(DAG.getDataLayout());
3245  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3246 
3247  SmallVector<SDValue, 8> MemOps;
3248 
3249  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3250  AArch64::X3, AArch64::X4, AArch64::X5,
3251  AArch64::X6, AArch64::X7 };
3252  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3253  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3254 
3255  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
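 // For example, if the fixed arguments use x0-x2, FirstVariadicGPR is 3 and
 // GPRSaveSize is 40 bytes, covering x3-x7.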
3256  int GPRIdx = 0;
3257  if (GPRSaveSize != 0) {
3258  if (IsWin64) {
3259  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3260  if (GPRSaveSize & 15)
3261  // The extra size here, if triggered, will always be 8.
3262  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3263  } else
3264  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3265 
3266  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3267 
3268  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3269  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3270  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3271  SDValue Store = DAG.getStore(
3272  Val.getValue(1), DL, Val, FIN,
3273  IsWin64
3275  GPRIdx,
3276  (i - FirstVariadicGPR) * 8)
3278  MemOps.push_back(Store);
3279  FIN =
3280  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3281  }
3282  }
3283  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3284  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3285 
3286  if (Subtarget->hasFPARMv8() && !IsWin64) {
3287  static const MCPhysReg FPRArgRegs[] = {
3288  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3289  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3290  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3291  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3292 
3293  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3294  int FPRIdx = 0;
3295  if (FPRSaveSize != 0) {
3296  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3297 
3298  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3299 
3300  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3301  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3302  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3303 
3304  SDValue Store = DAG.getStore(
3305  Val.getValue(1), DL, Val, FIN,
3307  MemOps.push_back(Store);
3308  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3309  DAG.getConstant(16, DL, PtrVT));
3310  }
3311  }
3312  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3313  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3314  }
3315 
3316  if (!MemOps.empty()) {
3317  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3318  }
3319 }
3320 
3321 /// LowerCallResult - Lower the result values of a call into the
3322 /// appropriate copies out of appropriate physical registers.
3323 SDValue AArch64TargetLowering::LowerCallResult(
3324  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3325  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3326  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3327  SDValue ThisVal) const {
3328  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3329  ? RetCC_AArch64_WebKit_JS
3330  : RetCC_AArch64_AAPCS;
3331  // Assign locations to each value returned by this call.
3332  SmallVector<CCValAssign, 16> RVLocs;
3333  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3334  *DAG.getContext());
3335  CCInfo.AnalyzeCallResult(Ins, RetCC);
3336 
3337  // Copy all of the result registers out of their specified physreg.
3338  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3339  CCValAssign VA = RVLocs[i];
3340 
3341  // Pass 'this' value directly from the argument to return value, to avoid
3342  // reg unit interference
3343  if (i == 0 && isThisReturn) {
3344  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3345  "unexpected return calling convention register assignment");
3346  InVals.push_back(ThisVal);
3347  continue;
3348  }
3349 
3350  SDValue Val =
3351  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3352  Chain = Val.getValue(1);
3353  InFlag = Val.getValue(2);
3354 
3355  switch (VA.getLocInfo()) {
3356  default:
3357  llvm_unreachable("Unknown loc info!");
3358  case CCValAssign::Full:
3359  break;
3360  case CCValAssign::BCvt:
3361  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3362  break;
3363  }
3364 
3365  InVals.push_back(Val);
3366  }
3367 
3368  return Chain;
3369 }
3370 
3371 /// Return true if the calling convention is one that we can guarantee TCO for.
3372 static bool canGuaranteeTCO(CallingConv::ID CC) {
3373  return CC == CallingConv::Fast;
3374 }
3375 
3376 /// Return true if we might ever do TCO for calls with this calling convention.
3377 static bool mayTailCallThisCC(CallingConv::ID CC) {
3378  switch (CC) {
3379  case CallingConv::C:
3380  case CallingConv::PreserveMost:
3381  case CallingConv::Swift:
3382  return true;
3383  default:
3384  return canGuaranteeTCO(CC);
3385  }
3386 }
3387 
3388 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3389  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3390  const SmallVectorImpl<ISD::OutputArg> &Outs,
3391  const SmallVectorImpl<SDValue> &OutVals,
3392  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3393  if (!mayTailCallThisCC(CalleeCC))
3394  return false;
3395 
3396  MachineFunction &MF = DAG.getMachineFunction();
3397  const Function &CallerF = MF.getFunction();
3398  CallingConv::ID CallerCC = CallerF.getCallingConv();
3399  bool CCMatch = CallerCC == CalleeCC;
3400 
3401  // Byval parameters hand the function a pointer directly into the stack area
3402  // we want to reuse during a tail call. Working around this *is* possible (see
3403  // X86) but less efficient and uglier in LowerCall.
3404  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3405  e = CallerF.arg_end();
3406  i != e; ++i)
3407  if (i->hasByValAttr())
3408  return false;
3409 
3410  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3411  return canGuaranteeTCO(CalleeCC) && CCMatch;
3412 
3413  // Externally-defined functions with weak linkage should not be
3414  // tail-called on AArch64 when the OS does not support dynamic
3415  // pre-emption of symbols, as the AAELF spec requires normal calls
3416  // to undefined weak functions to be replaced with a NOP or jump to the
3417  // next instruction. The behaviour of branch instructions in this
3418  // situation (as used for tail calls) is implementation-defined, so we
3419  // cannot rely on the linker replacing the tail call with a return.
3420  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3421  const GlobalValue *GV = G->getGlobal();
3422  const Triple &TT = getTargetMachine().getTargetTriple();
3423  if (GV->hasExternalWeakLinkage() &&
3424  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3425  return false;
3426  }
3427 
3428  // Now we search for cases where we can use a tail call without changing the
3429  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3430  // concept.
3431 
3432  // I want anyone implementing a new calling convention to think long and hard
3433  // about this assert.
3434  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3435  "Unexpected variadic calling convention");
3436 
3437  LLVMContext &C = *DAG.getContext();
3438  if (isVarArg && !Outs.empty()) {
3439  // At least two cases here: if caller is fastcc then we can't have any
3440  // memory arguments (we'd be expected to clean up the stack afterwards). If
3441  // caller is C then we could potentially use its argument area.
3442 
3443  // FIXME: for now we take the most conservative of these in both cases:
3444  // disallow all variadic memory operands.
3445  SmallVector<CCValAssign, 16> ArgLocs;
3446  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3447 
3448  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3449  for (const CCValAssign &ArgLoc : ArgLocs)
3450  if (!ArgLoc.isRegLoc())
3451  return false;
3452  }
3453 
3454  // Check that the call results are passed in the same way.
3455  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3456  CCAssignFnForCall(CalleeCC, isVarArg),
3457  CCAssignFnForCall(CallerCC, isVarArg)))
3458  return false;
3459  // The callee has to preserve all registers the caller needs to preserve.
3460  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3461  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3462  if (!CCMatch) {
3463  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3464  if (Subtarget->hasCustomCallingConv()) {
3465  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3466  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3467  }
3468  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3469  return false;
3470  }
3471 
3472  // Nothing more to check if the callee is taking no arguments
3473  if (Outs.empty())
3474  return true;
3475 
3476  SmallVector<CCValAssign, 16> ArgLocs;
3477  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3478 
3479  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3480 
3481  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3482 
3483  // If the stack arguments for this call do not fit into our own save area then
3484  // the call cannot be made tail.
3485  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3486  return false;
3487 
3488  const MachineRegisterInfo &MRI = MF.getRegInfo();
3489  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3490  return false;
3491 
3492  return true;
3493 }
3494 
3495 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3496  SelectionDAG &DAG,
3497  MachineFrameInfo &MFI,
3498  int ClobberedFI) const {
3499  SmallVector<SDValue, 8> ArgChains;
3500  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3501  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3502 
3503  // Include the original chain at the beginning of the list. When this is
3504  // used by target LowerCall hooks, this helps legalize find the
3505  // CALLSEQ_BEGIN node.
3506  ArgChains.push_back(Chain);
3507 
3508  // Add a chain value for each stack argument load that overlaps the clobbered slot.
3509  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3510  UE = DAG.getEntryNode().getNode()->use_end();
3511  U != UE; ++U)
3512  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3513  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3514  if (FI->getIndex() < 0) {
3515  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3516  int64_t InLastByte = InFirstByte;
3517  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3518 
3519  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3520  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3521  ArgChains.push_back(SDValue(L, 1));
3522  }
3523 
3524  // Build a tokenfactor for all the chains.
3525  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3526 }
3527 
3528 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3529  bool TailCallOpt) const {
3530  return CallCC == CallingConv::Fast && TailCallOpt;
3531 }
3532 
3533 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3534 /// and add input and output parameter nodes.
3535 SDValue
3536 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3537  SmallVectorImpl<SDValue> &InVals) const {
3538  SelectionDAG &DAG = CLI.DAG;
3539  SDLoc &DL = CLI.DL;
3540  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3541  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3542  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3543  SDValue Chain = CLI.Chain;
3544  SDValue Callee = CLI.Callee;
3545  bool &IsTailCall = CLI.IsTailCall;
3546  CallingConv::ID CallConv = CLI.CallConv;
3547  bool IsVarArg = CLI.IsVarArg;
3548 
3549  MachineFunction &MF = DAG.getMachineFunction();
3550  bool IsThisReturn = false;
3551 
3552  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3553  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3554  bool IsSibCall = false;
3555 
3556  if (IsTailCall) {
3557  // Check if it's really possible to do a tail call.
3558  IsTailCall = isEligibleForTailCallOptimization(
3559  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3560  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3561  report_fatal_error("failed to perform tail call elimination on a call "
3562  "site marked musttail");
3563 
3564  // A sibling call is one where we're under the usual C ABI and not planning
3565  // to change that but can still do a tail call:
3566  if (!TailCallOpt && IsTailCall)
3567  IsSibCall = true;
3568 
3569  if (IsTailCall)
3570  ++NumTailCalls;
3571  }
3572 
3573  // Analyze operands of the call, assigning locations to each operand.
3574  SmallVector<CCValAssign, 16> ArgLocs;
3575  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3576  *DAG.getContext());
3577 
3578  if (IsVarArg) {
3579  // Handle fixed and variable vector arguments differently.
3580  // Variable vector arguments always go into memory.
3581  unsigned NumArgs = Outs.size();
3582 
3583  for (unsigned i = 0; i != NumArgs; ++i) {
3584  MVT ArgVT = Outs[i].VT;
3585  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3586  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3587  /*IsVarArg=*/ !Outs[i].IsFixed);
3588  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3589  assert(!Res && "Call operand has unhandled type");
3590  (void)Res;
3591  }
3592  } else {
3593  // At this point, Outs[].VT may already be promoted to i32. To correctly
3594  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3595  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3596  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3597  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3598  // LocVT.
3599  unsigned NumArgs = Outs.size();
3600  for (unsigned i = 0; i != NumArgs; ++i) {
3601  MVT ValVT = Outs[i].VT;
3602  // Get type of the original argument.
3603  EVT ActualVT = getValueType(DAG.getDataLayout(),
3604  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3605  /*AllowUnknown*/ true);
3606  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3607  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3608  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3609  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3610  ValVT = MVT::i8;
3611  else if (ActualMVT == MVT::i16)
3612  ValVT = MVT::i16;
3613 
3614  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3615  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3616  assert(!Res && "Call operand has unhandled type");
3617  (void)Res;
3618  }
3619  }
3620 
3621  // Get a count of how many bytes are to be pushed on the stack.
3622  unsigned NumBytes = CCInfo.getNextStackOffset();
3623 
3624  if (IsSibCall) {
3625  // Since we're not changing the ABI to make this a tail call, the memory
3626  // operands are already available in the caller's incoming argument space.
3627  NumBytes = 0;
3628  }
3629 
3630  // FPDiff is the byte offset of the call's argument area from the callee's.
3631  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3632  // by this amount for a tail call. In a sibling call it must be 0 because the
3633  // caller will deallocate the entire stack and the callee still expects its
3634  // arguments to begin at SP+0. Completely unused for non-tail calls.
3635  int FPDiff = 0;
3636 
3637  if (IsTailCall && !IsSibCall) {
3638  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3639 
3640  // Since callee will pop argument stack as a tail call, we must keep the
3641  // popped size 16-byte aligned.
3642  NumBytes = alignTo(NumBytes, 16);
3643 
3644  // FPDiff will be negative if this tail call requires more space than we
3645  // would automatically have in our incoming argument space. Positive if we
3646  // can actually shrink the stack.
3647  FPDiff = NumReusableBytes - NumBytes;
3648 
3649  // The stack pointer must be 16-byte aligned at all times it's used for a
3650  // memory operation, which in practice means at *all* times and in
3651  // particular across call boundaries. Therefore our own arguments started at
3652  // a 16-byte aligned SP and the delta applied for the tail call should
3653  // satisfy the same constraint.
3654  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3655  }
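// Illustrative note (not in the original source): a worked example of the
// FPDiff arithmetic above. If the caller received a 16-byte incoming
// stack-argument area (NumReusableBytes == 16) and this tail call needs 40
// bytes of outgoing arguments:
//
//   NumBytes = alignTo(40, 16) = 48   // popped size stays 16-byte aligned
//   FPDiff   = 16 - 48         = -32  // tail call needs 32 bytes more stack
//
// Every fixed stack slot created for the call below is then offset by FPDiff.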
3656 
3657  // Adjust the stack pointer for the new arguments...
3658  // These operations are automatically eliminated by the prolog/epilog pass
3659  if (!IsSibCall)
3660  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3661 
3662  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3663  getPointerTy(DAG.getDataLayout()));
3664 
3665  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3666  SmallVector<SDValue, 8> MemOpChains;
3667  auto PtrVT = getPointerTy(DAG.getDataLayout());
3668 
3669  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3670  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3671  for (const auto &F : Forwards) {
3672  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3673  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3674  }
3675  }
3676 
3677  // Walk the register/memloc assignments, inserting copies/loads.
3678  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3679  ++i, ++realArgIdx) {
3680  CCValAssign &VA = ArgLocs[i];
3681  SDValue Arg = OutVals[realArgIdx];
3682  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3683 
3684  // Promote the value if needed.
3685  switch (VA.getLocInfo()) {
3686  default:
3687  llvm_unreachable("Unknown loc info!");
3688  case CCValAssign::Full:
3689  break;
3690  case CCValAssign::SExt:
3691  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3692  break;
3693  case CCValAssign::ZExt:
3694  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3695  break;
3696  case CCValAssign::AExt:
3697  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3698  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3699  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3700  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3701  }
3702  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3703  break;
3704  case CCValAssign::BCvt:
3705  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3706  break;
3707  case CCValAssign::FPExt:
3708  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3709  break;
3710  }
3711 
3712  if (VA.isRegLoc()) {
3713  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3714  Outs[0].VT == MVT::i64) {
3715  assert(VA.getLocVT() == MVT::i64 &&
3716  "unexpected calling convention register assignment");
3717  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3718  "unexpected use of 'returned'");
3719  IsThisReturn = true;
3720  }
3721  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3722  } else {
3723  assert(VA.isMemLoc());
3724 
3725  SDValue DstAddr;
3726  MachinePointerInfo DstInfo;
3727 
3728  // FIXME: This works on big-endian for composite byvals, which are the
3729  // common case. It should also work for fundamental types.
3730  uint32_t BEAlign = 0;
3731  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3732  : VA.getValVT().getSizeInBits();
3733  OpSize = (OpSize + 7) / 8;
3734  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3735  !Flags.isInConsecutiveRegs()) {
3736  if (OpSize < 8)
3737  BEAlign = 8 - OpSize;
3738  }
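// Illustrative note (not in the original source): on big-endian targets the
// least significant bytes of an 8-byte stack slot are the highest-addressed
// ones, so a small scalar is nudged toward the end of its slot. For an i16
// argument:
//
//   OpSize  = (16 + 7) / 8 = 2
//   BEAlign = 8 - 2        = 6   // the store lands at LocMemOffset + 6
//
// Little-endian targets, byval aggregates, and arguments flagged as part of a
// consecutive-register block keep BEAlign == 0.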
3739  unsigned LocMemOffset = VA.getLocMemOffset();
3740  int32_t Offset = LocMemOffset + BEAlign;
3741  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3742  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3743 
3744  if (IsTailCall) {
3745  Offset = Offset + FPDiff;
3746  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3747 
3748  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3749  DstInfo =
3750  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3751 
3752  // Make sure any stack arguments overlapping with where we're storing
3753  // are loaded before this eventual operation. Otherwise they'll be
3754  // clobbered.
3755  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3756  } else {
3757  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3758 
3759  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3760  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3761  LocMemOffset);
3762  }
3763 
3764  if (Outs[i].Flags.isByVal()) {
3765  SDValue SizeNode =
3766  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3767  SDValue Cpy = DAG.getMemcpy(
3768  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3769  /*isVol = */ false, /*AlwaysInline = */ false,
3770  /*isTailCall = */ false,
3771  DstInfo, MachinePointerInfo());
3772 
3773  MemOpChains.push_back(Cpy);
3774  } else {
3775  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3776  // promoted to a legal register type i32, we should truncate Arg back to
3777  // i1/i8/i16.
3778  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3779  VA.getValVT() == MVT::i16)
3780  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3781 
3782  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3783  MemOpChains.push_back(Store);
3784  }
3785  }
3786  }
3787 
3788  if (!MemOpChains.empty())
3789  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3790 
3791  // Build a sequence of copy-to-reg nodes chained together with token chain
3792  // and flag operands which copy the outgoing args into the appropriate regs.
3793  SDValue InFlag;
3794  for (auto &RegToPass : RegsToPass) {
3795  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3796  RegToPass.second, InFlag);
3797  InFlag = Chain.getValue(1);
3798  }
3799 
3800  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3801  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3802  // node so that legalize doesn't hack it.
3803  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3804  auto GV = G->getGlobal();
3805  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3806  AArch64II::MO_GOT) {
3807  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3808  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3809  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3810  assert(Subtarget->isTargetWindows() &&
3811  "Windows is the only supported COFF target");
3812  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3813  } else {
3814  const GlobalValue *GV = G->getGlobal();
3815  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3816  }
3817  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3818  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3819  Subtarget->isTargetMachO()) {
3820  const char *Sym = S->getSymbol();
3821  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3822  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3823  } else {
3824  const char *Sym = S->getSymbol();
3825  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3826  }
3827  }
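// Illustrative note (not in the original source): the classification above
// decides how the callee operand is materialized. For a direct call to a
// function `fn` (name hypothetical), roughly:
//   - GOT-indirect callee:  adrp x8, :got:fn ; ldr x8, [x8, :got_lo12:fn] ; blr x8
//   - dllimport callee:     the address is loaded through the import-table
//                           pointer __imp_fn before an indirect blr
//   - plain direct callee:  bl fn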
3828 
3829  // We don't usually want to end the call-sequence here because we would tidy
3830  // the frame up *after* the call; however, in the ABI-changing tail-call case
3831  // we've carefully laid out the parameters so that when sp is reset they'll be
3832  // in the correct location.
3833  if (IsTailCall && !IsSibCall) {
3834  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3835  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3836  InFlag = Chain.getValue(1);
3837  }
3838 
3839  std::vector<SDValue> Ops;
3840  Ops.push_back(Chain);
3841  Ops.push_back(Callee);
3842 
3843  if (IsTailCall) {
3844  // Each tail call may have to adjust the stack by a different amount, so
3845  // this information must travel along with the operation for eventual
3846  // consumption by emitEpilogue.
3847  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3848  }
3849 
3850  // Add argument registers to the end of the list so that they are known live
3851  // into the call.
3852  for (auto &RegToPass : RegsToPass)
3853  Ops.push_back(DAG.getRegister(RegToPass.first,
3854  RegToPass.second.getValueType()));
3855 
3856  // Add a register mask operand representing the call-preserved registers.
3857  const uint32_t *Mask;
3858  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3859  if (IsThisReturn) {
3860  // For 'this' returns, use the X0-preserving mask if applicable
3861  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3862  if (!Mask) {
3863  IsThisReturn = false;
3864  Mask = TRI->getCallPreservedMask(MF, CallConv);
3865  }
3866  } else
3867  Mask = TRI->getCallPreservedMask(MF, CallConv);
3868 
3869  if (Subtarget->hasCustomCallingConv())
3870  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
3871 
3872  if (TRI->isAnyArgRegReserved(MF))
3873  TRI->emitReservedArgRegCallError(MF);
3874 
3875  assert(Mask && "Missing call preserved mask for calling convention");
3876  Ops.push_back(DAG.getRegisterMask(Mask));
3877 
3878  if (InFlag.getNode())
3879  Ops.push_back(InFlag);
3880 
3881  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3882 
3883  // If we're doing a tail call, use a TC_RETURN here rather than an
3884  // actual call instruction.
3885  if (IsTailCall) {
3886  MF.getFrameInfo().setHasTailCall();
3887  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3888  }
3889 
3890  // Returns a chain and a flag for retval copy to use.
3891  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3892  InFlag = Chain.getValue(1);
3893 
3894  uint64_t CalleePopBytes =
3895  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3896 
3897  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3898  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3899  InFlag, DL);
3900  if (!Ins.empty())
3901  InFlag = Chain.getValue(1);
3902 
3903  // Handle result values, copying them out of physregs into vregs that we
3904  // return.
3905  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3906  InVals, IsThisReturn,
3907  IsThisReturn ? OutVals[0] : SDValue());
3908 }
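// Illustrative note (not in the original source): the DAG built above has
// roughly this shape for an ordinary call,
//
//   callseq_start -> CopyToReg (arg regs) -> AArch64ISD::CALL -> callseq_end
//     -> CopyFromReg (results, via LowerCallResult)
//
// whereas a tail call closes the call sequence early and ends the block with a
// single AArch64ISD::TC_RETURN node carrying FPDiff, so no result copies are
// emitted and emitEpilogue can adjust SP by the recorded amount.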
3909 
3910 bool AArch64TargetLowering::CanLowerReturn(
3911  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3912  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3913  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3914  ? RetCC_AArch64_WebKit_JS
3915  : RetCC_AArch64_AAPCS;
3916  SmallVector<CCValAssign, 16> RVLocs;
3917  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3918  return CCInfo.CheckReturn(Outs, RetCC);
3919 }
3920 
3921 SDValue
3922 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3923  bool isVarArg,
3924  const SmallVectorImpl<ISD::OutputArg> &Outs,
3925  const SmallVectorImpl<SDValue> &OutVals,
3926  const SDLoc &DL, SelectionDAG &DAG) const {
3927  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3928  ? RetCC_AArch64_WebKit_JS
3929  : RetCC_AArch64_AAPCS;
3930  SmallVector<CCValAssign, 16> RVLocs;
3931  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3932  *DAG.getContext());
3933  CCInfo.AnalyzeReturn(Outs, RetCC);
3934 
3935  // Copy the result values into the output registers.
3936  SDValue Flag;
3937  SmallVector<SDValue, 4> RetOps(1, Chain);
3938  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3939  ++i, ++realRVLocIdx) {
3940  CCValAssign &VA = RVLocs[i];
3941  assert(VA.isRegLoc() && "Can only return in registers!");
3942  SDValue Arg = OutVals[realRVLocIdx];
3943 
3944  switch (VA.getLocInfo()) {
3945  default:
3946  llvm_unreachable("Unknown loc info!");
3947  case CCValAssign::Full:
3948  if (Outs[i].ArgVT == MVT::i1) {
3949  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3950  // value. This is strictly redundant on Darwin (which uses "zeroext
3951  // i1"), but will be optimised out before ISel.
3952  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3953  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3954  }
3955  break;
3956  case CCValAssign::BCvt:
3957  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3958  break;
3959  }
3960 
3961  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3962  Flag = Chain.getValue(1);
3963  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3964  }
3965  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3966  const MCPhysReg *I =
3967  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3968  if (I) {
3969  for (; *I; ++I) {
3970  if (AArch64::GPR64RegClass.contains(*I))
3971  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3972  else if (AArch64::FPR64RegClass.contains(*I))
3973  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3974  else
3975  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3976  }
3977  }
3978 
3979  RetOps[0] = Chain; // Update chain.
3980 
3981  // Add the flag if we have it.
3982  if (Flag.getNode())
3983  RetOps.push_back(Flag);
3984 
3985  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3986 }
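// Illustrative note (not in the original source): a minimal example of the i1
// special case handled above. For
//
//   define i1 @flag() { ret i1 true }
//
// the returned value is truncated to i1 and then zero-extended to the location
// type, so the low bits of w0 hold a clean 0/1 before RET_FLAG is emitted, as
// the AAPCS requires of the producer of an i1 value.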
3987 
3988 //===----------------------------------------------------------------------===//
3989 // Other Lowering Code
3990 //===----------------------------------------------------------------------===//
3991 
3992 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3993  SelectionDAG &DAG,
3994  unsigned Flag) const {
3995  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
3996  N->getOffset(), Flag);
3997 }
3998 
3999 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
4000  SelectionDAG &DAG,
4001  unsigned Flag) const {
4002  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
4003 }
4004 
4005 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
4006  SelectionDAG &DAG,
4007  unsigned Flag) const {
4008  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
4009  N->getOffset(), Flag);
4010 }
4011 
4012 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
4013  SelectionDAG &DAG,
4014  unsigned Flag) const {
4015  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
4016 }
4017 
4018 // (loadGOT sym)
4019 template <class NodeTy>
4020 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
4021  unsigned Flags) const {
4022  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
4023  SDLoc DL(N);
4024  EVT Ty = getPointerTy(DAG.getDataLayout());
4025  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
4026  // FIXME: Once remat is capable of dealing with instructions with register
4027  // operands, expand this into two nodes instead of using a wrapper node.
4028  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
4029 }
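// Illustrative note (not in the original source): the LOADgot wrapper above is
// normally selected to a two-instruction GOT access, e.g. for a symbol `var`
// (name hypothetical):
//
//   adrp x0, :got:var
//   ldr  x0, [x0, :got_lo12:var]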
4030 
4031 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
4032 template <class NodeTy>
4033 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
4034  unsigned Flags) const {
4035  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
4036  SDLoc DL(N);
4037  EVT Ty = getPointerTy(DAG.getDataLayout());
4038  const unsigned char MO_NC = AArch64II::MO_NC;
4039  return DAG.getNode(
4040  AArch64ISD::WrapperLarge, DL, Ty,
4041  getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
4042  getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
4043  getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
4044  getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
4045 }
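// Illustrative note (not in the original source): in the large code model the
// wrapper above is matched to a MOVZ/MOVK chain that materializes the 64-bit
// address 16 bits at a time, roughly:
//
//   movz x0, #:abs_g0_nc:var
//   movk x0, #:abs_g1_nc:var
//   movk x0, #:abs_g2_nc:var
//   movk x0, #:abs_g3:var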
4046 
4047 // (addlow (adrp %hi(sym)) %lo(sym))
4048 template <class NodeTy>
4049 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
4050  unsigned Flags) const {
4051  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
4052  SDLoc DL(N);
4053  EVT Ty = getPointerTy(DAG.getDataLayout());
4054  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
4055  SDValue Lo = getTargetNode(N, Ty, DAG,
4056  AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
4057  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
4058  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
4059 }
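// Illustrative note (not in the original source): in the default small code
// model this lowers to the familiar ADRP + ADD pair, e.g.:
//
//   adrp x0, var
//   add  x0, x0, :lo12:var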
4060 
4061 // (adr sym)
4062 template <class NodeTy>
4063 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
4064  unsigned Flags) const {
4065  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
4066  SDLoc DL(N);
4067  EVT Ty = getPointerTy(DAG.getDataLayout());
4068  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
4069  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
4070 }
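// Illustrative note (not in the original source): the tiny code model assumes
// the symbol is within +/-1MiB of the code, so a single PC-relative ADR is
// enough:
//
//   adr x0, var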
4071 
4072 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
4073  SelectionDAG &DAG) const {
4074  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
4075  const GlobalValue *GV = GN->getGlobal();
4076  unsigned char OpFlags =
4077  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4078 
4079  if (OpFlags != AArch64II::MO_NO_FLAG)
4080  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
4081  "unexpected offset in global node");
4082 
4083  // This also catches the large code model case for Darwin, and tiny code
4084  // model with got relocations.
4085  if ((OpFlags & AArch64II::MO_GOT) != 0) {
4086  return getGOT(GN, DAG, OpFlags);
4087  }
4088 
4089  SDValue Result;