AArch64ISelLowering.cpp (LLVM 10.0.0svn)
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/PatternMatch.h"
63 #include "llvm/IR/Type.h"
64 #include "llvm/IR/Use.h"
65 #include "llvm/IR/Value.h"
66 #include "llvm/MC/MCRegisterInfo.h"
67 #include "llvm/Support/Casting.h"
68 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/Compiler.h"
71 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/KnownBits.h"
79 #include <algorithm>
80 #include <bitset>
81 #include <cassert>
82 #include <cctype>
83 #include <cstdint>
84 #include <cstdlib>
85 #include <iterator>
86 #include <limits>
87 #include <tuple>
88 #include <utility>
89 #include <vector>
90 
91 using namespace llvm;
92 using namespace llvm::PatternMatch;
93 
94 #define DEBUG_TYPE "aarch64-lower"
95 
96 STATISTIC(NumTailCalls, "Number of tail calls");
97 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
99 
100 static cl::opt<bool>
101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
102  cl::desc("Allow AArch64 SLI/SRI formation"),
103  cl::init(false));
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 /// Value type used for condition codes.
120 static const MVT MVT_CC = MVT::i32;
121 
123  const AArch64Subtarget &STI)
124  : TargetLowering(TM), Subtarget(&STI) {
125  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
126  // we have to make something up. Arbitrarily, choose ZeroOrOne.
128 // When comparing vectors, each element of the result is set to all-ones
129 // or all-zeros.
131 
132  // Set up the register classes.
133  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
134  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
135 
136  if (Subtarget->hasFPARMv8()) {
137  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
138  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
139  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
140  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
141  }
142 
143  if (Subtarget->hasNEON()) {
144  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
145  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
146  // Someone set us up the NEON.
147  addDRTypeForNEON(MVT::v2f32);
148  addDRTypeForNEON(MVT::v8i8);
149  addDRTypeForNEON(MVT::v4i16);
150  addDRTypeForNEON(MVT::v2i32);
151  addDRTypeForNEON(MVT::v1i64);
152  addDRTypeForNEON(MVT::v1f64);
153  addDRTypeForNEON(MVT::v4f16);
154 
155  addQRTypeForNEON(MVT::v4f32);
156  addQRTypeForNEON(MVT::v2f64);
157  addQRTypeForNEON(MVT::v16i8);
158  addQRTypeForNEON(MVT::v8i16);
159  addQRTypeForNEON(MVT::v4i32);
160  addQRTypeForNEON(MVT::v2i64);
161  addQRTypeForNEON(MVT::v8f16);
162  }
163 
164  if (Subtarget->hasSVE()) {
165  // Add legal sve predicate types
166  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
167  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
168  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
169  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
170 
171  // Add legal sve data types
172  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
173  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
174  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
175  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
176 
177  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
178  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
179  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
180  addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
181  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
182  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
183  addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
184  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
185  }
186 
187  // Compute derived properties from the register classes
189 
190  // Provide all sorts of operation actions
218 
222 
226 
228 
229  // Custom lowering hooks are needed for XOR
230  // to fold it into CSINC/CSINV.
233 
234  // Virtually no operation on f128 is legal, but LLVM can't expand them when
235  // there's a valid register class, so we need custom operations in most cases.
257 
258  // Lowering for many of the conversions is actually specified by the non-f128
259  // type. The LowerXXX function will be trivial when f128 isn't involved.
274 
275  // Variable arguments.
280 
281  // Variable-sized objects.
284 
285  if (Subtarget->isTargetWindows())
287  else
289 
290  // Constant pool entries
292 
293  // BlockAddress
295 
296  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
305 
306  // AArch64 lacks both left-rotate and popcount instructions.
309  for (MVT VT : MVT::vector_valuetypes()) {
312  }
313 
314  // AArch64 doesn't have {U|S}MUL_LOHI.
317 
320 
323  for (MVT VT : MVT::vector_valuetypes()) {
326  }
333 
334  // Custom lower Add/Sub/Mul with overflow.
347 
356  if (Subtarget->hasFullFP16())
358  else
360 
394 
395  if (!Subtarget->hasFullFP16()) {
418 
419  // promote v4f16 to v4f32 when that is known to be safe.
432 
448 
469  }
470 
471  // AArch64 has implementations of a lot of rounding-like FP operations.
472  for (MVT Ty : {MVT::f32, MVT::f64}) {
487  }
488 
489  if (Subtarget->hasFullFP16()) {
500  }
501 
503 
505 
511 
512  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
513  // This requires the Performance Monitors extension.
514  if (Subtarget->hasPerfMon())
516 
517  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
518  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
519  // Issue __sincos_stret if available.
522  } else {
525  }
526 
527  // Make floating-point constants legal for the large code model, so they don't
528  // become loads from the constant pool.
529  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
532  }
533 
534  // AArch64 does not have floating-point extending loads, i1 sign-extending
535  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
536  for (MVT VT : MVT::fp_valuetypes()) {
541  }
542  for (MVT VT : MVT::integer_valuetypes())
544 
552 
555 
556  // Indexed loads and stores are supported.
557  for (unsigned im = (unsigned)ISD::PRE_INC;
573  }
574 
575  // Trap.
577  if (Subtarget->isTargetWindows())
579 
580  // We combine OR nodes for bitfield operations.
582  // Try to create BICs for vector ANDs.
584 
585  // Vector add and sub nodes may conceal a high-half opportunity.
586 // Also, try to fold ADD into CSINC/CSINV.
593 
597 
599 
606  if (Subtarget->supportsAddressTopByteIgnored())
608 
610 
613 
617 
619 
620  // In case of strict alignment, avoid an excessive number of byte wide stores.
624 
629 
631 
635 
637 
639 
640  EnableExtLdPromotion = true;
641 
642  // Set required alignment.
644  // Set preferred alignments.
647 
648  // Only change the limit for entries in a jump table if specified by
649  // the sub target, but not at the command line.
650  unsigned MaxJT = STI.getMaximumJumpTableSize();
651  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
653 
654  setHasExtractBitsInsn(true);
655 
657 
658  if (Subtarget->hasNEON()) {
659  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
660  // silliness like this:
686 
692 
694 
695 // AArch64 doesn't have direct vector -> f32 conversion instructions for
696 // elements smaller than i32, so promote the input to i32 first.
699  // i8 vector elements also need promotion to i32 for v8i8
702  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
707 // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
708  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
711 
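// As a rough sketch of what the promotions above amount to (illustrative
// only; the actual instruction selection may differ): a v4i16 -> v4f32
// uitofp is widened to v4i32 first and then converted, e.g.
//   ushll v0.4s, v0.4h, #0   // zero-extend the i16 lanes to i32
//   ucvtf v0.4s, v0.4s       // convert the i32 lanes to f32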
712  if (Subtarget->hasFullFP16()) {
717  } else {
718 // When AArch64 doesn't have full fp16 support, promote the input
719 // to i32 first.
724  }
725 
728 
729  // AArch64 doesn't have MUL.2d:
731  // Custom handling for some quad-vector types to detect MULL.
735 
736  // Vector reductions
737  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
744  }
745  for (MVT VT : { MVT::v4f16, MVT::v2f32,
749  }
750 
753  // Likewise, narrowing and extending vector loads/stores aren't handled
754  // directly.
755  for (MVT VT : MVT::vector_valuetypes()) {
757 
758  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
761  } else {
764  }
767 
770 
771  for (MVT InnerVT : MVT::vector_valuetypes()) {
772  setTruncStoreAction(VT, InnerVT, Expand);
773  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
774  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
775  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
776  }
777  }
778 
779  // AArch64 has implementations of a lot of rounding-like FP operations.
780  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
787  }
788 
789  if (Subtarget->hasFullFP16()) {
790  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
797  }
798  }
799 
801  }
802 
804 }
805 
806 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
807  assert(VT.isVector() && "VT should be a vector type");
808 
809  if (VT.isFloatingPoint()) {
811  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
812  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
813  }
814 
815  // Mark vector float intrinsics as expand.
816  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
825 
826  // But we do support custom-lowering for FCOPYSIGN.
828  }
829 
841 
845  for (MVT InnerVT : MVT::all_valuetypes())
846  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
847 
848 // CNT supports only B element sizes; wider element counts are formed by
// following it with UADDLP to pairwise-widen the byte counts.
849  if (VT != MVT::v8i8 && VT != MVT::v16i8)
851 
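// For illustration only (a sketch of the intended CNT+UADDLP strategy, not
// the exact code emitted here): a v4i16 popcount counts bits per byte and
// then pairwise-widens the byte counts, roughly:
//   cnt    v0.8b, v0.8b    // per-byte population count
//   uaddlp v0.4h, v0.8b    // add adjacent byte counts into halfword lanes
// Wider element types (v2i32, v1i64) would chain further uaddlp steps.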
857 
860 
861  if (!VT.isFloatingPoint())
863 
864  // [SU][MIN|MAX] are available for all NEON types apart from i64.
865  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
866  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
867  setOperationAction(Opcode, VT, Legal);
868 
869  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
870  if (VT.isFloatingPoint() &&
871  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
872  for (unsigned Opcode :
874  setOperationAction(Opcode, VT, Legal);
875 
876  if (Subtarget->isLittleEndian()) {
877  for (unsigned im = (unsigned)ISD::PRE_INC;
881  }
882  }
883 }
884 
885 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
886  addRegisterClass(VT, &AArch64::FPR64RegClass);
887  addTypeForNEON(VT, MVT::v2i32);
888 }
889 
890 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
891  addRegisterClass(VT, &AArch64::FPR128RegClass);
892  addTypeForNEON(VT, MVT::v4i32);
893 }
894 
896  EVT VT) const {
897  if (!VT.isVector())
898  return MVT::i32;
900 }
901 
902 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
903  const APInt &Demanded,
905  unsigned NewOpc) {
906  uint64_t OldImm = Imm, NewImm, Enc;
907  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
908 
909  // Return if the immediate is already all zeros, all ones, a bimm32 or a
910  // bimm64.
911  if (Imm == 0 || Imm == Mask ||
912  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
913  return false;
914 
915  unsigned EltSize = Size;
916  uint64_t DemandedBits = Demanded.getZExtValue();
917 
918  // Clear bits that are not demanded.
919  Imm &= DemandedBits;
920 
921  while (true) {
922  // The goal here is to set the non-demanded bits in a way that minimizes
923  // the number of switching between 0 and 1. In order to achieve this goal,
924  // we set the non-demanded bits to the value of the preceding demanded bits.
925  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
926  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
927  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
928  // The final result is 0b11000011.
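// Working that 8-bit example through the code below (with EltSize taken as
// 8 purely for illustration):
//   DemandedBits    = 0b01100101,  Imm & DemandedBits = 0b01000001
//   NonDemandedBits = 0b10011010
//   InvertedImm     = ~Imm & DemandedBits                  = 0b00100100
//   RotatedImm      = (InvertedImm << 1) & NonDemandedBits = 0b00001000
//   Sum             = RotatedImm + NonDemandedBits         = 0b10100010
//   Ones            = Sum & NonDemandedBits (Carry = 0)    = 0b10000010
//   NewImm          = (Imm | Ones) & Mask                  = 0b11000011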
929  uint64_t NonDemandedBits = ~DemandedBits;
930  uint64_t InvertedImm = ~Imm & DemandedBits;
931  uint64_t RotatedImm =
932  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
933  NonDemandedBits;
934  uint64_t Sum = RotatedImm + NonDemandedBits;
935  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
936  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
937  NewImm = (Imm | Ones) & Mask;
938 
939  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
940  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
941  // we halve the element size and continue the search.
942  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
943  break;
944 
945  // We cannot shrink the element size any further if it is 2-bits.
946  if (EltSize == 2)
947  return false;
948 
949  EltSize /= 2;
950  Mask >>= EltSize;
951  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
952 
953 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
954  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
955  return false;
956 
957  // Merge the upper and lower halves of Imm and DemandedBits.
958  Imm |= Hi;
959  DemandedBits |= DemandedBitsHi;
960  }
961 
962  ++NumOptimizedImms;
963 
964  // Replicate the element across the register width.
965  while (EltSize < Size) {
966  NewImm |= NewImm << EltSize;
967  EltSize *= 2;
968  }
969 
970  (void)OldImm;
971  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
972  "demanded bits should never be altered");
973  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
974 
975  // Create the new constant immediate node.
976  EVT VT = Op.getValueType();
977  SDLoc DL(Op);
978  SDValue New;
979 
980  // If the new constant immediate is all-zeros or all-ones, let the target
981  // independent DAG combine optimize this node.
982  if (NewImm == 0 || NewImm == OrigMask) {
983  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
984  TLO.DAG.getConstant(NewImm, DL, VT));
985  // Otherwise, create a machine node so that target independent DAG combine
986  // doesn't undo this optimization.
987  } else {
988  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
989  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
990  New = SDValue(
991  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
992  }
993 
994  return TLO.CombineTo(Op, New);
995 }
996 
998  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
999  // Delay this optimization to as late as possible.
1000  if (!TLO.LegalOps)
1001  return false;
1002 
1004  return false;
1005 
1006  EVT VT = Op.getValueType();
1007  if (VT.isVector())
1008  return false;
1009 
1010  unsigned Size = VT.getSizeInBits();
1011  assert((Size == 32 || Size == 64) &&
1012  "i32 or i64 is expected after legalization.");
1013 
1014  // Exit early if we demand all bits.
1015  if (Demanded.countPopulation() == Size)
1016  return false;
1017 
1018  unsigned NewOpc;
1019  switch (Op.getOpcode()) {
1020  default:
1021  return false;
1022  case ISD::AND:
1023  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1024  break;
1025  case ISD::OR:
1026  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1027  break;
1028  case ISD::XOR:
1029  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1030  break;
1031  }
1033  if (!C)
1034  return false;
1035  uint64_t Imm = C->getZExtValue();
1036  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1037 }
1038 
1039 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1040 /// Mask are known to be either zero or one and return them in Known.
1042  const SDValue Op, KnownBits &Known,
1043  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1044  switch (Op.getOpcode()) {
1045  default:
1046  break;
1047  case AArch64ISD::CSEL: {
1048  KnownBits Known2;
1049  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1050  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
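// A CSEL returns one of its two inputs, so a bit is known in the result only
// if it is known to the same value in both operands; hence the intersection.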
1051  Known.Zero &= Known2.Zero;
1052  Known.One &= Known2.One;
1053  break;
1054  }
1055  case ISD::INTRINSIC_W_CHAIN: {
1056  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1057  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1058  switch (IntID) {
1059  default: return;
1060  case Intrinsic::aarch64_ldaxr:
1061  case Intrinsic::aarch64_ldxr: {
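// LDXR/LDAXR zero-extend the loaded value to the full register width, so
// every bit above the memory access size is known to be zero.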
1062  unsigned BitWidth = Known.getBitWidth();
1063  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1064  unsigned MemBits = VT.getScalarSizeInBits();
1065  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1066  return;
1067  }
1068  }
1069  break;
1070  }
1072  case ISD::INTRINSIC_VOID: {
1073  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1074  switch (IntNo) {
1075  default:
1076  break;
1077  case Intrinsic::aarch64_neon_umaxv:
1078  case Intrinsic::aarch64_neon_uminv: {
1079 // Figure out the datatype of the vector operand. The UMINV instruction
1080 // will zero extend the result, so we can mark all bits wider than the
1081 // element type as known zero. 32-bit or larger elements don't need
1082 // this, as those are legal types and will be handled by isel directly.
1083  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1084  unsigned BitWidth = Known.getBitWidth();
1085  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1086  assert(BitWidth >= 8 && "Unexpected width!");
1087  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1088  Known.Zero |= Mask;
1089  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1090  assert(BitWidth >= 16 && "Unexpected width!");
1091  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1092  Known.Zero |= Mask;
1093  }
1094  break;
1095  } break;
1096  }
1097  }
1098  }
1099 }
1100 
1102  EVT) const {
1103  return MVT::i64;
1104 }
1105 
1107  EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1108  bool *Fast) const {
1109  if (Subtarget->requiresStrictAlign())
1110  return false;
1111 
1112  if (Fast) {
1113  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1114  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1115  // See comments in performSTORECombine() for more details about
1116  // these conditions.
1117 
1118  // Code that uses clang vector extensions can mark that it
1119  // wants unaligned accesses to be treated as fast by
1120  // underspecifying alignment to be 1 or 2.
1121  Align <= 2 ||
1122 
1123  // Disregard v2i64. Memcpy lowering produces those and splitting
1124  // them regresses performance on micro-benchmarks and olden/bh.
1125  VT == MVT::v2i64;
1126  }
1127  return true;
1128 }
1129 
1130 // Same as above but handling LLTs instead.
1132  LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1133  bool *Fast) const {
1134  if (Subtarget->requiresStrictAlign())
1135  return false;
1136 
1137  if (Fast) {
1138  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1139  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1140  Ty.getSizeInBytes() != 16 ||
1141  // See comments in performSTORECombine() for more details about
1142  // these conditions.
1143 
1144  // Code that uses clang vector extensions can mark that it
1145  // wants unaligned accesses to be treated as fast by
1146  // underspecifying alignment to be 1 or 2.
1147  Align <= 2 ||
1148 
1149  // Disregard v2i64. Memcpy lowering produces those and splitting
1150  // them regresses performance on micro-benchmarks and olden/bh.
1151  Ty == LLT::vector(2, 64);
1152  }
1153  return true;
1154 }
1155 
1156 FastISel *
1158  const TargetLibraryInfo *libInfo) const {
1159  return AArch64::createFastISel(funcInfo, libInfo);
1160 }
1161 
1162 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1163  switch ((AArch64ISD::NodeType)Opcode) {
1164  case AArch64ISD::FIRST_NUMBER: break;
1165  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1166  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1167  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1168  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1169  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1170  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1171  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1172  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1173  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1174  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1175  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1176  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1177  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1178  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1179  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1180  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1181  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1182  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1183  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1184  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1185  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1186  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1187  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1188  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1189  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1190  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1191  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1192  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1193  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1194  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1195  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1196  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1197  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1198  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1199  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1200  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1201  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1202  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1203  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1204  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1205  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1206  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1207  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1208  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1209  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1210  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1211  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1212  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1213  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1214  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1215  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1216  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1217  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1218  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1219  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1220  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1221  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1222  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1223  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1224  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1225  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1226  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1227  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1228  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1229  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1230  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1231  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1232  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1233  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1234  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1235  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1236  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1237  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1238  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1239  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1240  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1241  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1242  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1243  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1244  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1245  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1246  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1247  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1248  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1249  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1250  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1251  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1252  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1253  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1254  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1255  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1256  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1257  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1258  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1259  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1260  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1261  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1262  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1263  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1264  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1265  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1266  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1267  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1268  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1269  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1270  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1271  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1272  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1273  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1274  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1275  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1276  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1277  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1278  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1279  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1280  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1281  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1282  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1283  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1284  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1285  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1286  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1287  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1288  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1289  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1290  case AArch64ISD::STG: return "AArch64ISD::STG";
1291  case AArch64ISD::STZG: return "AArch64ISD::STZG";
1292  case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
1293  case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
1294  }
1295  return nullptr;
1296 }
1297 
1300  MachineBasicBlock *MBB) const {
1301  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1302  // phi node:
1303 
1304  // OrigBB:
1305  // [... previous instrs leading to comparison ...]
1306  // b.ne TrueBB
1307  // b EndBB
1308  // TrueBB:
1309  // ; Fallthrough
1310  // EndBB:
1311  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1312 
1313  MachineFunction *MF = MBB->getParent();
1314  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1315  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1316  DebugLoc DL = MI.getDebugLoc();
1317  MachineFunction::iterator It = ++MBB->getIterator();
1318 
1319  Register DestReg = MI.getOperand(0).getReg();
1320  Register IfTrueReg = MI.getOperand(1).getReg();
1321  Register IfFalseReg = MI.getOperand(2).getReg();
1322  unsigned CondCode = MI.getOperand(3).getImm();
1323  bool NZCVKilled = MI.getOperand(4).isKill();
1324 
1325  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1326  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1327  MF->insert(It, TrueBB);
1328  MF->insert(It, EndBB);
1329 
1330  // Transfer rest of current basic-block to EndBB
1331  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1332  MBB->end());
1333  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1334 
1335  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1336  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1337  MBB->addSuccessor(TrueBB);
1338  MBB->addSuccessor(EndBB);
1339 
1340  // TrueBB falls through to the end.
1341  TrueBB->addSuccessor(EndBB);
1342 
1343  if (!NZCVKilled) {
1344  TrueBB->addLiveIn(AArch64::NZCV);
1345  EndBB->addLiveIn(AArch64::NZCV);
1346  }
1347 
1348  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1349  .addReg(IfTrueReg)
1350  .addMBB(TrueBB)
1351  .addReg(IfFalseReg)
1352  .addMBB(MBB);
1353 
1354  MI.eraseFromParent();
1355  return EndBB;
1356 }
1357 
1359  MachineInstr &MI, MachineBasicBlock *BB) const {
1361  BB->getParent()->getFunction().getPersonalityFn())) &&
1362  "SEH does not use catchret!");
1363  return BB;
1364 }
1365 
1367  MachineInstr &MI, MachineBasicBlock *BB) const {
1368  MI.eraseFromParent();
1369  return BB;
1370 }
1371 
1373  MachineInstr &MI, MachineBasicBlock *BB) const {
1374  switch (MI.getOpcode()) {
1375  default:
1376 #ifndef NDEBUG
1377  MI.dump();
1378 #endif
1379  llvm_unreachable("Unexpected instruction for custom inserter!");
1380 
1381  case AArch64::F128CSEL:
1382  return EmitF128CSEL(MI, BB);
1383 
1384  case TargetOpcode::STACKMAP:
1385  case TargetOpcode::PATCHPOINT:
1386  return emitPatchPoint(MI, BB);
1387 
1388  case AArch64::CATCHRET:
1389  return EmitLoweredCatchRet(MI, BB);
1390  case AArch64::CATCHPAD:
1391  return EmitLoweredCatchPad(MI, BB);
1392  }
1393 }
1394 
1395 //===----------------------------------------------------------------------===//
1396 // AArch64 Lowering private implementation.
1397 //===----------------------------------------------------------------------===//
1398 
1399 //===----------------------------------------------------------------------===//
1400 // Lowering Code
1401 //===----------------------------------------------------------------------===//
1402 
1403 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1404 /// CC
1406  switch (CC) {
1407  default:
1408  llvm_unreachable("Unknown condition code!");
1409  case ISD::SETNE:
1410  return AArch64CC::NE;
1411  case ISD::SETEQ:
1412  return AArch64CC::EQ;
1413  case ISD::SETGT:
1414  return AArch64CC::GT;
1415  case ISD::SETGE:
1416  return AArch64CC::GE;
1417  case ISD::SETLT:
1418  return AArch64CC::LT;
1419  case ISD::SETLE:
1420  return AArch64CC::LE;
1421  case ISD::SETUGT:
1422  return AArch64CC::HI;
1423  case ISD::SETUGE:
1424  return AArch64CC::HS;
1425  case ISD::SETULT:
1426  return AArch64CC::LO;
1427  case ISD::SETULE:
1428  return AArch64CC::LS;
1429  }
1430 }
1431 
1432 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1435  AArch64CC::CondCode &CondCode2) {
1436  CondCode2 = AArch64CC::AL;
1437  switch (CC) {
1438  default:
1439  llvm_unreachable("Unknown FP condition!");
1440  case ISD::SETEQ:
1441  case ISD::SETOEQ:
1442  CondCode = AArch64CC::EQ;
1443  break;
1444  case ISD::SETGT:
1445  case ISD::SETOGT:
1446  CondCode = AArch64CC::GT;
1447  break;
1448  case ISD::SETGE:
1449  case ISD::SETOGE:
1450  CondCode = AArch64CC::GE;
1451  break;
1452  case ISD::SETOLT:
1453  CondCode = AArch64CC::MI;
1454  break;
1455  case ISD::SETOLE:
1456  CondCode = AArch64CC::LS;
1457  break;
1458  case ISD::SETONE:
1459  CondCode = AArch64CC::MI;
1460  CondCode2 = AArch64CC::GT;
1461  break;
1462  case ISD::SETO:
1463  CondCode = AArch64CC::VC;
1464  break;
1465  case ISD::SETUO:
1466  CondCode = AArch64CC::VS;
1467  break;
1468  case ISD::SETUEQ:
1469  CondCode = AArch64CC::EQ;
1470  CondCode2 = AArch64CC::VS;
1471  break;
1472  case ISD::SETUGT:
1473  CondCode = AArch64CC::HI;
1474  break;
1475  case ISD::SETUGE:
1476  CondCode = AArch64CC::PL;
1477  break;
1478  case ISD::SETLT:
1479  case ISD::SETULT:
1480  CondCode = AArch64CC::LT;
1481  break;
1482  case ISD::SETLE:
1483  case ISD::SETULE:
1484  CondCode = AArch64CC::LE;
1485  break;
1486  case ISD::SETNE:
1487  case ISD::SETUNE:
1488  CondCode = AArch64CC::NE;
1489  break;
1490  }
1491 }
1492 
1493 /// Convert a DAG fp condition code to an AArch64 CC.
1494 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1495 /// should be AND'ed instead of OR'ed.
1498  AArch64CC::CondCode &CondCode2) {
1499  CondCode2 = AArch64CC::AL;
1500  switch (CC) {
1501  default:
1502  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1503  assert(CondCode2 == AArch64CC::AL);
1504  break;
1505  case ISD::SETONE:
1506  // (a one b)
1507  // == ((a olt b) || (a ogt b))
1508  // == ((a ord b) && (a une b))
1509  CondCode = AArch64CC::VC;
1510  CondCode2 = AArch64CC::NE;
1511  break;
1512  case ISD::SETUEQ:
1513  // (a ueq b)
1514  // == ((a uno b) || (a oeq b))
1515  // == ((a ule b) && (a uge b))
1516  CondCode = AArch64CC::PL;
1517  CondCode2 = AArch64CC::LE;
1518  break;
1519  }
1520 }
1521 
1522 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1523 /// CC usable with the vector instructions. Fewer operations are available
1524 /// without a real NZCV register, so we have to use less efficient combinations
1525 /// to get the same effect.
1528  AArch64CC::CondCode &CondCode2,
1529  bool &Invert) {
1530  Invert = false;
1531  switch (CC) {
1532  default:
1533  // Mostly the scalar mappings work fine.
1534  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1535  break;
1536  case ISD::SETUO:
1537  Invert = true;
1539  case ISD::SETO:
1540  CondCode = AArch64CC::MI;
1541  CondCode2 = AArch64CC::GE;
1542  break;
1543  case ISD::SETUEQ:
1544  case ISD::SETULT:
1545  case ISD::SETULE:
1546  case ISD::SETUGT:
1547  case ISD::SETUGE:
1548  // All of the compare-mask comparisons are ordered, but we can switch
1549  // between the two by a double inversion. E.g. ULE == !OGT.
1550  Invert = true;
1551  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1552  break;
1553  }
1554 }
1555 
1556 static bool isLegalArithImmed(uint64_t C) {
1557  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1558  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1559  LLVM_DEBUG(dbgs() << "Is imm " << C
1560  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1561  return IsLegal;
1562 }
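// For example, by the check above: 0xFFF is legal (plain 12-bit immediate),
// 0x1000 and 0xABC000 are legal (12-bit immediate shifted left by 12), while
// 0x1001 is not and would have to be materialized into a register first.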
1563 
1564 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1565 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1566 // can be set differently by this operation. It comes down to whether
1567 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1568 // everything is fine. If not then the optimization is wrong. Thus general
1569 // comparisons are only valid if op2 != 0.
1570 //
1571 // So, finally, the only LLVM-native comparisons that don't mention C and V
1572 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1573 // the absence of information about op2.
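// Concretely: for any op1, "cmp w0, #0" (subs) sets C = 1, while
// "cmn w0, #0" (adds) sets C = 0, so C-based conditions such as HS/LO would
// observe different flags. Z and N depend only on the result value, which is
// the same either way, so EQ/NE remain safe.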
1574 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1575  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1576  (CC == ISD::SETEQ || CC == ISD::SETNE);
1577 }
1578 
1580  const SDLoc &dl, SelectionDAG &DAG) {
1581  EVT VT = LHS.getValueType();
1582  const bool FullFP16 =
1583  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1584 
1585  if (VT.isFloatingPoint()) {
1586  assert(VT != MVT::f128);
1587  if (VT == MVT::f16 && !FullFP16) {
1588  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1589  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1590  VT = MVT::f32;
1591  }
1592  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1593  }
1594 
1595  // The CMP instruction is just an alias for SUBS, and representing it as
1596  // SUBS means that it's possible to get CSE with subtract operations.
1597  // A later phase can perform the optimization of setting the destination
1598  // register to WZR/XZR if it ends up being unused.
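// (In assembly terms, "cmp w0, w1" is simply "subs wzr, w0, w1"; keeping the
// SUBS form means a real destination can later replace WZR/XZR if the
// subtraction result turns out to be wanted.)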
1599  unsigned Opcode = AArch64ISD::SUBS;
1600 
1601  if (isCMN(RHS, CC)) {
1602 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1603  Opcode = AArch64ISD::ADDS;
1604  RHS = RHS.getOperand(1);
1605  } else if (isCMN(LHS, CC)) {
1606 // As we are looking for EQ/NE compares, the operands can be commuted; can
1607 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1608  Opcode = AArch64ISD::ADDS;
1609  LHS = LHS.getOperand(1);
1610  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1611  !isUnsignedIntSetCC(CC)) {
1612  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1613  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1614  // of the signed comparisons.
1615  Opcode = AArch64ISD::ANDS;
1616  RHS = LHS.getOperand(1);
1617  LHS = LHS.getOperand(0);
1618  }
1619 
1620  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1621  .getValue(1);
1622 }
1623 
1624 /// \defgroup AArch64CCMP CMP;CCMP matching
1625 ///
1626 /// These functions deal with the formation of CMP;CCMP;... sequences.
1627 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1628 /// a comparison. They set the NZCV flags to a predefined value if their
1629 /// predicate is false. This allows to express arbitrary conjunctions, for
1630 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1631 /// expressed as:
1632 /// cmp A
1633 /// ccmp B, inv(CB), CA
1634 /// check for CB flags
1635 ///
1636 /// This naturally lets us implement chains of AND operations with SETCC
1637 /// operands. And we can even implement some other situations by transforming
1638 /// them:
1639 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1640 /// negating the flags used in a CCMP/FCCMP operations.
1641 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1642 /// by negating the flags we test for afterwards. i.e.
1643 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1644 /// - Note that we can only ever negate all previously processed results.
1645 /// What we can not implement by flipping the flags to test is a negation
1646 /// of two sub-trees (because the negation affects all sub-trees emitted so
1647 /// far, so the 2nd sub-tree we emit would also affect the first).
1648 /// With those tools we can implement some OR operations:
1649 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1650 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1651 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1652 /// elimination rules from earlier to implement the whole thing as a
1653 /// CCMP/FCCMP chain.
1654 ///
1655 /// As complete example:
1656 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1657 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1658 /// can be reassociated to:
1659 /// or (and (setCC (cmp C)) (setCD (cmp D)))
1660 /// (or (setCA (cmp A)) (setCB (cmp B)))
1661 /// can be transformed to:
1662 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1663 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1664 /// which can be implemented as:
1665 /// cmp C
1666 /// ccmp D, inv(CD), CC
1667 /// ccmp A, CA, inv(CD)
1668 /// ccmp B, CB, inv(CA)
1669 /// check for CB flags
1670 ///
1671 /// A counterexample is "or (and A B) (and C D)" which translates to
1672 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
1673 /// can only implement one of the inner (not) operations, but not both!
1674 /// @{
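// A small concrete illustration of the simplest case (a sketch, not taken
// from the original sources): for "a == 0 && b == c" one would expect
// roughly
//   cmp  w0, #0           // Z set iff a == 0
//   ccmp w1, w2, #0, eq   // if Z was set, compare b and c; otherwise write
//                         // NZCV = 0000 so that the final EQ test fails
//   b.eq taken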
1675 
1676 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1678  ISD::CondCode CC, SDValue CCOp,
1680  AArch64CC::CondCode OutCC,
1681  const SDLoc &DL, SelectionDAG &DAG) {
1682  unsigned Opcode = 0;
1683  const bool FullFP16 =
1684  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1685 
1686  if (LHS.getValueType().isFloatingPoint()) {
1687  assert(LHS.getValueType() != MVT::f128);
1688  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1689  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1690  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1691  }
1692  Opcode = AArch64ISD::FCCMP;
1693  } else if (RHS.getOpcode() == ISD::SUB) {
1694  SDValue SubOp0 = RHS.getOperand(0);
1695  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1696  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1697  Opcode = AArch64ISD::CCMN;
1698  RHS = RHS.getOperand(1);
1699  }
1700  }
1701  if (Opcode == 0)
1702  Opcode = AArch64ISD::CCMP;
1703 
1704  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
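// The NZCV immediate is what CCMP/FCCMP writes to the flags when its
// predicate is false; it is chosen to satisfy the inverse of OutCC, so a
// failed predicate makes the condition being built evaluate to false.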
1706  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1707  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1708  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1709 }
1710 
1711 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1712 /// expressed as a conjunction. See \ref AArch64CCMP.
1713 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1714 /// changing the conditions on the SETCC tests.
1715 /// (this means we can call emitConjunctionRec() with
1716 /// Negate==true on this sub-tree)
1717 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1718 /// cannot do the negation naturally. We are required to
1719 /// emit the subtree first in this case.
1720 /// \param WillNegate Is true if we are called when the result of this
1721 /// subexpression must be negated. This happens when the
1722 /// outer expression is an OR. We can use this fact to know
1723 /// that we have a double negation (or (or ...) ...) that
1724 /// can be implemented for free.
1725 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1726  bool &MustBeFirst, bool WillNegate,
1727  unsigned Depth = 0) {
1728  if (!Val.hasOneUse())
1729  return false;
1730  unsigned Opcode = Val->getOpcode();
1731  if (Opcode == ISD::SETCC) {
1732  if (Val->getOperand(0).getValueType() == MVT::f128)
1733  return false;
1734  CanNegate = true;
1735  MustBeFirst = false;
1736  return true;
1737  }
1738  // Protect against exponential runtime and stack overflow.
1739  if (Depth > 6)
1740  return false;
1741  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1742  bool IsOR = Opcode == ISD::OR;
1743  SDValue O0 = Val->getOperand(0);
1744  SDValue O1 = Val->getOperand(1);
1745  bool CanNegateL;
1746  bool MustBeFirstL;
1747  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1748  return false;
1749  bool CanNegateR;
1750  bool MustBeFirstR;
1751  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1752  return false;
1753 
1754  if (MustBeFirstL && MustBeFirstR)
1755  return false;
1756 
1757  if (IsOR) {
1758  // For an OR expression we need to be able to naturally negate at least
1759  // one side or we cannot do the transformation at all.
1760  if (!CanNegateL && !CanNegateR)
1761  return false;
1762 // If the result of the OR will be negated and we can naturally negate
1763 // the leaves, then this sub-tree as a whole negates naturally.
1764  CanNegate = WillNegate && CanNegateL && CanNegateR;
1765  // If we cannot naturally negate the whole sub-tree, then this must be
1766  // emitted first.
1767  MustBeFirst = !CanNegate;
1768  } else {
1769  assert(Opcode == ISD::AND && "Must be OR or AND");
1770  // We cannot naturally negate an AND operation.
1771  CanNegate = false;
1772  MustBeFirst = MustBeFirstL || MustBeFirstR;
1773  }
1774  return true;
1775  }
1776  return false;
1777 }
1778 
1779 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1780 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1781 /// Tries to transform the given i1 producing node @p Val to a series compare
1782 /// and conditional compare operations. @returns an NZCV flags producing node
1783 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1784 /// transformation was not possible.
1785 /// \p Negate is true if we want this sub-tree being negated just by changing
1786 /// SETCC conditions.
1788  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1790  // We're at a tree leaf, produce a conditional comparison operation.
1791  unsigned Opcode = Val->getOpcode();
1792  if (Opcode == ISD::SETCC) {
1793  SDValue LHS = Val->getOperand(0);
1794  SDValue RHS = Val->getOperand(1);
1795  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1796  bool isInteger = LHS.getValueType().isInteger();
1797  if (Negate)
1798  CC = getSetCCInverse(CC, isInteger);
1799  SDLoc DL(Val);
1800  // Determine OutCC and handle FP special case.
1801  if (isInteger) {
1802  OutCC = changeIntCCToAArch64CC(CC);
1803  } else {
1805  AArch64CC::CondCode ExtraCC;
1806  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1807  // Some floating point conditions can't be tested with a single condition
1808  // code. Construct an additional comparison in this case.
1809  if (ExtraCC != AArch64CC::AL) {
1810  SDValue ExtraCmp;
1811  if (!CCOp.getNode())
1812  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1813  else
1814  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1815  ExtraCC, DL, DAG);
1816  CCOp = ExtraCmp;
1817  Predicate = ExtraCC;
1818  }
1819  }
1820 
1821  // Produce a normal comparison if we are first in the chain
1822  if (!CCOp)
1823  return emitComparison(LHS, RHS, CC, DL, DAG);
1824  // Otherwise produce a ccmp.
1825  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1826  DAG);
1827  }
1828  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1829 
1830  bool IsOR = Opcode == ISD::OR;
1831 
1832  SDValue LHS = Val->getOperand(0);
1833  bool CanNegateL;
1834  bool MustBeFirstL;
1835  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1836  assert(ValidL && "Valid conjunction/disjunction tree");
1837  (void)ValidL;
1838 
1839  SDValue RHS = Val->getOperand(1);
1840  bool CanNegateR;
1841  bool MustBeFirstR;
1842  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1843  assert(ValidR && "Valid conjunction/disjunction tree");
1844  (void)ValidR;
1845 
1846  // Swap sub-tree that must come first to the right side.
1847  if (MustBeFirstL) {
1848  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1849  std::swap(LHS, RHS);
1850  std::swap(CanNegateL, CanNegateR);
1851  std::swap(MustBeFirstL, MustBeFirstR);
1852  }
1853 
1854  bool NegateR;
1855  bool NegateAfterR;
1856  bool NegateL;
1857  bool NegateAfterAll;
1858  if (Opcode == ISD::OR) {
1859  // Swap the sub-tree that we can negate naturally to the left.
1860  if (!CanNegateL) {
1861  assert(CanNegateR && "at least one side must be negatable");
1862  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1863  assert(!Negate);
1864  std::swap(LHS, RHS);
1865  NegateR = false;
1866  NegateAfterR = true;
1867  } else {
1868  // Negate the left sub-tree if possible, otherwise negate the result.
1869  NegateR = CanNegateR;
1870  NegateAfterR = !CanNegateR;
1871  }
1872  NegateL = true;
1873  NegateAfterAll = !Negate;
1874  } else {
1875  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1876  assert(!Negate && "Valid conjunction/disjunction tree");
1877 
1878  NegateL = false;
1879  NegateR = false;
1880  NegateAfterR = false;
1881  NegateAfterAll = false;
1882  }
1883 
1884  // Emit sub-trees.
1885  AArch64CC::CondCode RHSCC;
1886  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1887  if (NegateAfterR)
1888  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1889  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1890  if (NegateAfterAll)
1891  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1892  return CmpL;
1893 }
1894 
1895 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
1896 /// In some cases this is even possible with OR operations in the expression.
1897 /// See \ref AArch64CCMP.
1898 /// \see emitConjunctionRec().
1900  AArch64CC::CondCode &OutCC) {
1901  bool DummyCanNegate;
1902  bool DummyMustBeFirst;
1903  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1904  return SDValue();
1905 
1906  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1907 }
1908 
1909 /// @}
1910 
1911 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1912 /// extension operations.
1914  auto isSupportedExtend = [&](SDValue V) {
1915  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1916  return true;
1917 
1918  if (V.getOpcode() == ISD::AND)
1919  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1920  uint64_t Mask = MaskCst->getZExtValue();
1921  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1922  }
1923 
1924  return false;
1925  };
1926 
1927  if (!Op.hasOneUse())
1928  return 0;
1929 
1930  if (isSupportedExtend(Op))
1931  return 1;
1932 
1933  unsigned Opc = Op.getOpcode();
1934  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1935  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1936  uint64_t Shift = ShiftCst->getZExtValue();
1937  if (isSupportedExtend(Op.getOperand(0)))
1938  return (Shift <= 4) ? 2 : 1;
1939  EVT VT = Op.getValueType();
1940  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1941  return 1;
1942  }
1943 
1944  return 0;
1945 }
1946 
1948  SDValue &AArch64cc, SelectionDAG &DAG,
1949  const SDLoc &dl) {
1950  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1951  EVT VT = RHS.getValueType();
1952  uint64_t C = RHSC->getZExtValue();
1953  if (!isLegalArithImmed(C)) {
1954  // Constant does not fit, try adjusting it by one?
1955  switch (CC) {
1956  default:
1957  break;
1958  case ISD::SETLT:
1959  case ISD::SETGE:
1960  if ((VT == MVT::i32 && C != 0x80000000 &&
1961  isLegalArithImmed((uint32_t)(C - 1))) ||
1962  (VT == MVT::i64 && C != 0x80000000ULL &&
1963  isLegalArithImmed(C - 1ULL))) {
1964  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1965  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1966  RHS = DAG.getConstant(C, dl, VT);
1967  }
1968  break;
1969  case ISD::SETULT:
1970  case ISD::SETUGE:
1971  if ((VT == MVT::i32 && C != 0 &&
1972  isLegalArithImmed((uint32_t)(C - 1))) ||
1973  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1974  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1975  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1976  RHS = DAG.getConstant(C, dl, VT);
1977  }
1978  break;
1979  case ISD::SETLE:
1980  case ISD::SETGT:
1981  if ((VT == MVT::i32 && C != INT32_MAX &&
1982  isLegalArithImmed((uint32_t)(C + 1))) ||
1983  (VT == MVT::i64 && C != INT64_MAX &&
1984  isLegalArithImmed(C + 1ULL))) {
1985  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1986  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1987  RHS = DAG.getConstant(C, dl, VT);
1988  }
1989  break;
1990  case ISD::SETULE:
1991  case ISD::SETUGT:
1992  if ((VT == MVT::i32 && C != UINT32_MAX &&
1993  isLegalArithImmed((uint32_t)(C + 1))) ||
1994  (VT == MVT::i64 && C != UINT64_MAX &&
1995  isLegalArithImmed(C + 1ULL))) {
1996  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1997  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1998  RHS = DAG.getConstant(C, dl, VT);
1999  }
2000  break;
2001  }
2002  }
2003  }
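// Example of the adjustment above (illustrative): "x s< 0x1001" cannot use
// 0x1001 directly (not encodable as a 12-bit immediate, shifted or not), but
// it is equivalent to "x s<= 0x1000", and 0x1000 is encodable, so the
// comparison becomes "cmp w0, #4096" tested with LE.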
2004 
2005  // Comparisons are canonicalized so that the RHS operand is simpler than the
2006  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2007  // can fold some shift+extend operations on the RHS operand, so swap the
2008  // operands if that can be done.
2009  //
2010  // For example:
2011  // lsl w13, w11, #1
2012  // cmp w13, w12
2013  // can be turned into:
2014  // cmp w12, w11, lsl #1
2015  if (!isa<ConstantSDNode>(RHS) ||
2016  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2017  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2018 
2020  std::swap(LHS, RHS);
2022  }
2023  }
2024 
2025  SDValue Cmp;
2026  AArch64CC::CondCode AArch64CC;
2027  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2028  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2029 
2030  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2031  // For the i8 operand, the largest immediate is 255, so this can be easily
2032  // encoded in the compare instruction. For the i16 operand, however, the
2033  // largest immediate cannot be encoded in the compare.
2034  // Therefore, use a sign extending load and cmn to avoid materializing the
2035  // -1 constant. For example,
2036  // movz w1, #65535
2037  // ldrh w0, [x0, #0]
2038  // cmp w0, w1
2039  // >
2040  // ldrsh w0, [x0, #0]
2041  // cmn w0, #1
2042 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2043  // if and only if (sext LHS) == (sext RHS). The checks are in place to
2044  // ensure both the LHS and RHS are truly zero extended and to make sure the
2045  // transformation is profitable.
2046  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2047  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2048  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2049  LHS.getNode()->hasNUsesOfValue(1, 0)) {
2050  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2051  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2052  SDValue SExt =
2053  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2054  DAG.getValueType(MVT::i16));
2055  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2056  RHS.getValueType()),
2057  CC, dl, DAG);
2058  AArch64CC = changeIntCCToAArch64CC(CC);
2059  }
2060  }
2061 
2062  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2063  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2064  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2065  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2066  }
2067  }
2068  }
2069 
2070  if (!Cmp) {
2071  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2072  AArch64CC = changeIntCCToAArch64CC(CC);
2073  }
2074  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2075  return Cmp;
2076 }
2077 
2078 static std::pair<SDValue, SDValue>
2079 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2080  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2081  "Unsupported value type");
2082  SDValue Value, Overflow;
2083  SDLoc DL(Op);
2084  SDValue LHS = Op.getOperand(0);
2085  SDValue RHS = Op.getOperand(1);
2086  unsigned Opc = 0;
2087  switch (Op.getOpcode()) {
2088  default:
2089  llvm_unreachable("Unknown overflow instruction!");
2090  case ISD::SADDO:
2091  Opc = AArch64ISD::ADDS;
2092  CC = AArch64CC::VS;
2093  break;
2094  case ISD::UADDO:
2095  Opc = AArch64ISD::ADDS;
2096  CC = AArch64CC::HS;
2097  break;
2098  case ISD::SSUBO:
2099  Opc = AArch64ISD::SUBS;
2100  CC = AArch64CC::VS;
2101  break;
2102  case ISD::USUBO:
2103  Opc = AArch64ISD::SUBS;
2104  CC = AArch64CC::LO;
2105  break;
2106  // Multiply needs a little bit of extra work.
2107  case ISD::SMULO:
2108  case ISD::UMULO: {
2109  CC = AArch64CC::NE;
2110  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2111  if (Op.getValueType() == MVT::i32) {
2112  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2113  // For a 32 bit multiply with overflow check we want the instruction
2114  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2115  // need to generate the following pattern:
2116  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
2117  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2118  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2119  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2120  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2121  DAG.getConstant(0, DL, MVT::i64));
2122  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2123  // operation. We need to clear out the upper 32 bits, because we used a
2124  // widening multiply that wrote all 64 bits. In the end this should be a
2125  // noop.
2126  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2127  if (IsSigned) {
2128  // The signed overflow check requires more than just a simple check for
2129  // any bit set in the upper 32 bits of the result. These bits could be
2130  // just the sign bits of a negative number. To perform the overflow
2131  // check we arithmetic shift right the lower 32 bits of the result by 31
2132  // to replicate the sign bit, then compare that to the upper 32 bits.
2133  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2134  DAG.getConstant(32, DL, MVT::i64));
2135  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2136  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2137  DAG.getConstant(31, DL, MVT::i64));
2138  // It is important that LowerBits is last, otherwise the arithmetic
2139  // shift will not be folded into the compare (SUBS).
2140  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2141  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2142  .getValue(1);
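  // Worked example: 0x10000 * 0x10000 == 1 << 32, so Value (the low 32 bits)
  // is 0, UpperBits is 1 and LowerBits (the sign-replication of Value) is 0;
  // the SUBS comparison therefore fails and overflow is reported. For -1 * 1
  // both UpperBits and LowerBits are 0xFFFFFFFF, so no overflow is flagged.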
2143  } else {
2144  // The overflow check for unsigned multiply is easy. We only need to
2145  // check if any of the upper 32 bits are set. This can be done with a
2146  // CMP (shifted register). For that we need to generate the following
2147  // pattern:
2148  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
2149  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2150  DAG.getConstant(32, DL, MVT::i64));
2151  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2152  Overflow =
2153  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2154  DAG.getConstant(0, DL, MVT::i64),
2155  UpperBits).getValue(1);
2156  }
2157  break;
2158  }
2159  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2160  // For the 64 bit multiply
2161  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2162  if (IsSigned) {
2163  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2164  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2165  DAG.getConstant(63, DL, MVT::i64));
2166  // It is important that LowerBits is last, otherwise the arithmetic
2167  // shift will not be folded into the compare (SUBS).
2168  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2169  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2170  .getValue(1);
2171  } else {
2172  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2173  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2174  Overflow =
2175  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2176  DAG.getConstant(0, DL, MVT::i64),
2177  UpperBits).getValue(1);
2178  }
2179  break;
2180  }
2181  } // switch (...)
2182 
2183  if (Opc) {
2184  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2185 
2186  // Emit the AArch64 operation with overflow check.
2187  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2188  Overflow = Value.getValue(1);
2189  }
2190  return std::make_pair(Value, Overflow);
2191 }
2192 
2193 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2194  RTLIB::Libcall Call) const {
2195  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2196  MakeLibCallOptions CallOptions;
2197  return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
2198 }
2199 
2200 // Returns true if the given Op is the overflow flag result of an overflow
2201 // intrinsic operation.
2202 static bool isOverflowIntrOpRes(SDValue Op) {
2203  unsigned Opc = Op.getOpcode();
2204  return (Op.getResNo() == 1 &&
2205  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2206  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2207 }
2208 
2209 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2210  SDValue Sel = Op.getOperand(0);
2211  SDValue Other = Op.getOperand(1);
2212  SDLoc dl(Sel);
2213 
2214  // If the operand is an overflow checking operation, invert the condition
2215  // code and kill the Not operation. I.e., transform:
2216  // (xor (overflow_op_bool, 1))
2217  // -->
2218  // (csel 1, 0, invert(cc), overflow_op_bool)
2219  // ... which later gets transformed to just a cset instruction with an
2220  // inverted condition code, rather than a cset + eor sequence.
2221  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2222  // Only lower legal XALUO ops.
2223  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2224  return SDValue();
2225 
2226  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2227  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2228  AArch64CC::CondCode CC;
2229  SDValue Value, Overflow;
2230  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2231  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2232  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2233  CCVal, Overflow);
2234  }
2235  // If neither operand is a SELECT_CC, give up.
2236  if (Sel.getOpcode() != ISD::SELECT_CC)
2237  std::swap(Sel, Other);
2238  if (Sel.getOpcode() != ISD::SELECT_CC)
2239  return Op;
2240 
2241  // The folding we want to perform is:
2242  // (xor x, (select_cc a, b, cc, 0, -1) )
2243  // -->
2244  // (csel x, (xor x, -1), cc ...)
2245  //
2246  // The latter will get matched to a CSINV instruction.
2247 
2248  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2249  SDValue LHS = Sel.getOperand(0);
2250  SDValue RHS = Sel.getOperand(1);
2251  SDValue TVal = Sel.getOperand(2);
2252  SDValue FVal = Sel.getOperand(3);
2253 
2254  // FIXME: This could be generalized to non-integer comparisons.
2255  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2256  return Op;
2257 
2258  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2259  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2260 
2261  // The values aren't constants, this isn't the pattern we're looking for.
2262  if (!CFVal || !CTVal)
2263  return Op;
2264 
2265  // We can commute the SELECT_CC by inverting the condition. This
2266  // might be needed to make this fit into a CSINV pattern.
2267  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2268  std::swap(TVal, FVal);
2269  std::swap(CTVal, CFVal);
2270  CC = ISD::getSetCCInverse(CC, true);
2271  }
2272 
2273  // If the constants line up, perform the transform!
2274  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2275  SDValue CCVal;
2276  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2277 
2278  FVal = Other;
2279  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2280  DAG.getConstant(-1ULL, dl, Other.getValueType()));
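  // XOR with all-ones is a bitwise NOT, so the (csel x, (xor x, -1), cc)
  // node built here is exactly the shape the CSINV pattern
  // (Wn if cond, else ~Wm) matches.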
2281 
2282  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2283  CCVal, Cmp);
2284  }
2285 
2286  return Op;
2287 }
2288 
2289 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2290  EVT VT = Op.getValueType();
2291 
2292  // Let legalize expand this if it isn't a legal type yet.
2293  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2294  return SDValue();
2295 
2296  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2297 
2298  unsigned Opc;
2299  bool ExtraOp = false;
2300  switch (Op.getOpcode()) {
2301  default:
2302  llvm_unreachable("Invalid code");
2303  case ISD::ADDC:
2304  Opc = AArch64ISD::ADDS;
2305  break;
2306  case ISD::SUBC:
2307  Opc = AArch64ISD::SUBS;
2308  break;
2309  case ISD::ADDE:
2310  Opc = AArch64ISD::ADCS;
2311  ExtraOp = true;
2312  break;
2313  case ISD::SUBE:
2314  Opc = AArch64ISD::SBCS;
2315  ExtraOp = true;
2316  break;
2317  }
2318 
2319  if (!ExtraOp)
2320  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2321  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2322  Op.getOperand(2));
2323 }
2324 
2325 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2326  // Let legalize expand this if it isn't a legal type yet.
2327  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2328  return SDValue();
2329 
2330  SDLoc dl(Op);
2331  AArch64CC::CondCode CC;
2332  // The actual operation that sets the overflow or carry flag.
2333  SDValue Value, Overflow;
2334  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2335 
2336  // We use 0 and 1 as false and true values.
2337  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2338  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2339 
2340  // We use an inverted condition, because the conditional select is inverted
2341  // too. This will allow it to be selected to a single instruction:
2342  // CSINC Wd, WZR, WZR, invert(cond).
2343  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2344  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2345  CCVal, Overflow);
2346 
2347  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2348  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2349 }
2350 
2351 // Prefetch operands are:
2352 // 1: Address to prefetch
2353 // 2: bool isWrite
2354 // 3: int locality (0 = no locality ... 3 = extreme locality)
2355 // 4: bool isDataCache
2356 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2357  SDLoc DL(Op);
2358  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2359  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2360  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2361 
2362  bool IsStream = !Locality;
2363  // When the locality number is set
2364  if (Locality) {
2365  // The front-end should have filtered out the out-of-range values
2366  assert(Locality <= 3 && "Prefetch locality out-of-range");
2367  // The locality degree is the inverse of the target cache level.
2368  // Flip the number around, since the prefetch operand encoding
2369  // starts at 0 for level 1 (L1).
2370  Locality = 3 - Locality;
2371  }
2372 
2373  // Build the mask value encoding the expected behavior.
2374  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2375  (!IsData << 3) | // IsDataCache bit
2376  (Locality << 1) | // Cache level bits
2377  (unsigned)IsStream; // Stream bit
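  // For example, a read prefetch of data with locality 3 encodes as
  // PrfOp == 0b00000 (PLDL1KEEP), while locality 0 only sets the stream bit,
  // giving 0b00001 (PLDL1STRM).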
2378  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2379  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2380 }
2381 
2382 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2383  SelectionDAG &DAG) const {
2384  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2385 
2386  RTLIB::Libcall LC;
2387  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2388 
2389  return LowerF128Call(Op, DAG, LC);
2390 }
2391 
2392 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2393  SelectionDAG &DAG) const {
2394  if (Op.getOperand(0).getValueType() != MVT::f128) {
2395  // It's legal except when f128 is involved
2396  return Op;
2397  }
2398 
2399  RTLIB::Libcall LC;
2400  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2401 
2402  // FP_ROUND node has a second operand indicating whether it is known to be
2403  // precise. That doesn't take part in the LibCall so we can't directly use
2404  // LowerF128Call.
2405  SDValue SrcVal = Op.getOperand(0);
2406  MakeLibCallOptions CallOptions;
2407  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
2408  SDLoc(Op)).first;
2409 }
2410 
2411 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2412  SelectionDAG &DAG) const {
2413  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2414  // Any additional optimization in this function should be recorded
2415  // in the cost tables.
2416  EVT InVT = Op.getOperand(0).getValueType();
2417  EVT VT = Op.getValueType();
2418  unsigned NumElts = InVT.getVectorNumElements();
2419 
2420  // f16 conversions are promoted to f32 when full fp16 is not supported.
2421  if (InVT.getVectorElementType() == MVT::f16 &&
2422  !Subtarget->hasFullFP16()) {
2423  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2424  SDLoc dl(Op);
2425  return DAG.getNode(
2426  Op.getOpcode(), dl, Op.getValueType(),
2427  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2428  }
2429 
2430  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2431  SDLoc dl(Op);
2432  SDValue Cv =
2433  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2434  Op.getOperand(0));
2435  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2436  }
2437 
2438  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2439  SDLoc dl(Op);
2440  MVT ExtVT =
2441  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2442  VT.getVectorNumElements());
2443  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2444  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2445  }
2446 
2447  // Type changing conversions are illegal.
2448  return Op;
2449 }
2450 
2451 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2452  SelectionDAG &DAG) const {
2453  if (Op.getOperand(0).getValueType().isVector())
2454  return LowerVectorFP_TO_INT(Op, DAG);
2455 
2456  // f16 conversions are promoted to f32 when full fp16 is not supported.
2457  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2458  !Subtarget->hasFullFP16()) {
2459  SDLoc dl(Op);
2460  return DAG.getNode(
2461  Op.getOpcode(), dl, Op.getValueType(),
2462  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2463  }
2464 
2465  if (Op.getOperand(0).getValueType() != MVT::f128) {
2466  // It's legal except when f128 is involved
2467  return Op;
2468  }
2469 
2470  RTLIB::Libcall LC;
2471  if (Op.getOpcode() == ISD::FP_TO_SINT)
2472  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2473  else
2474  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2475 
2476  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2477  MakeLibCallOptions CallOptions;
2478  return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
2479 }
2480 
2482  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2483  // Any additional optimization in this function should be recorded
2484  // in the cost tables.
2485  EVT VT = Op.getValueType();
2486  SDLoc dl(Op);
2487  SDValue In = Op.getOperand(0);
2488  EVT InVT = In.getValueType();
2489 
2490  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2491  MVT CastVT =
2492  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2493  InVT.getVectorNumElements());
2494  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2495  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2496  }
2497 
2498  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2499  unsigned CastOpc =
2500  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2501  EVT CastVT = VT.changeVectorElementTypeToInteger();
2502  In = DAG.getNode(CastOpc, dl, CastVT, In);
2503  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2504  }
2505 
2506  return Op;
2507 }
2508 
2509 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2510  SelectionDAG &DAG) const {
2511  if (Op.getValueType().isVector())
2512  return LowerVectorINT_TO_FP(Op, DAG);
2513 
2514  // f16 conversions are promoted to f32 when full fp16 is not supported.
2515  if (Op.getValueType() == MVT::f16 &&
2516  !Subtarget->hasFullFP16()) {
2517  SDLoc dl(Op);
2518  return DAG.getNode(
2519  ISD::FP_ROUND, dl, MVT::f16,
2520  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2521  DAG.getIntPtrConstant(0, dl));
2522  }
2523 
2524  // i128 conversions are libcalls.
2525  if (Op.getOperand(0).getValueType() == MVT::i128)
2526  return SDValue();
2527 
2528  // Other conversions are legal, unless it's to the completely software-based
2529  // fp128.
2530  if (Op.getValueType() != MVT::f128)
2531  return Op;
2532 
2533  RTLIB::Libcall LC;
2534  if (Op.getOpcode() == ISD::SINT_TO_FP)
2535  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2536  else
2537  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2538 
2539  return LowerF128Call(Op, DAG, LC);
2540 }
2541 
2542 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2543  SelectionDAG &DAG) const {
2544  // For iOS, we want to call an alternative entry point: __sincos_stret,
2545  // which returns the values in two S / D registers.
2546  SDLoc dl(Op);
2547  SDValue Arg = Op.getOperand(0);
2548  EVT ArgVT = Arg.getValueType();
2549  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2550 
2551  ArgListTy Args;
2552  ArgListEntry Entry;
2553 
2554  Entry.Node = Arg;
2555  Entry.Ty = ArgTy;
2556  Entry.IsSExt = false;
2557  Entry.IsZExt = false;
2558  Args.push_back(Entry);
2559 
2560  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2561  : RTLIB::SINCOS_STRET_F32;
2562  const char *LibcallName = getLibcallName(LC);
2563  SDValue Callee =
2564  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2565 
2566  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2567  TargetLowering::CallLoweringInfo CLI(DAG);
2568  CLI.setDebugLoc(dl)
2569  .setChain(DAG.getEntryNode())
2570  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2571 
2572  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2573  return CallResult.first;
2574 }
2575 
2576 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2577  if (Op.getValueType() != MVT::f16)
2578  return SDValue();
2579 
2580  assert(Op.getOperand(0).getValueType() == MVT::i16);
2581  SDLoc DL(Op);
2582 
2583  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2584  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2585  return SDValue(
2586  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2587  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2588  0);
2589 }
2590 
2591 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2592  if (OrigVT.getSizeInBits() >= 64)
2593  return OrigVT;
2594 
2595  assert(OrigVT.isSimple() && "Expecting a simple value type");
2596 
2597  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2598  switch (OrigSimpleTy) {
2599  default: llvm_unreachable("Unexpected Vector Type");
2600  case MVT::v2i8:
2601  case MVT::v2i16:
2602  return MVT::v2i32;
2603  case MVT::v4i8:
2604  return MVT::v4i16;
2605  }
2606 }
2607 
2608 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2609  const EVT &OrigTy,
2610  const EVT &ExtTy,
2611  unsigned ExtOpcode) {
2612  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2613  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2614  // 64-bits we need to insert a new extension so that it will be 64-bits.
2615  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2616  if (OrigTy.getSizeInBits() >= 64)
2617  return N;
2618 
2619  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2620  EVT NewVT = getExtensionTo64Bits(OrigTy);
2621 
2622  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2623 }
2624 
2625 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2626  bool isSigned) {
2627  EVT VT = N->getValueType(0);
2628 
2629  if (N->getOpcode() != ISD::BUILD_VECTOR)
2630  return false;
2631 
2632  for (const SDValue &Elt : N->op_values()) {
2633  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2634  unsigned EltSize = VT.getScalarSizeInBits();
2635  unsigned HalfSize = EltSize / 2;
2636  if (isSigned) {
2637  if (!isIntN(HalfSize, C->getSExtValue()))
2638  return false;
2639  } else {
2640  if (!isUIntN(HalfSize, C->getZExtValue()))
2641  return false;
2642  }
2643  continue;
2644  }
2645  return false;
2646  }
2647 
2648  return true;
2649 }
2650 
2651 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2652  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2653  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2654  N->getOperand(0)->getValueType(0),
2655  N->getValueType(0),
2656  N->getOpcode());
2657 
2658  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2659  EVT VT = N->getValueType(0);
2660  SDLoc dl(N);
2661  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2662  unsigned NumElts = VT.getVectorNumElements();
2663  MVT TruncVT = MVT::getIntegerVT(EltSize);
2664  SmallVector<SDValue, 8> Ops;
2665  for (unsigned i = 0; i != NumElts; ++i) {
2666  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2667  const APInt &CInt = C->getAPIntValue();
2668  // Element types smaller than 32 bits are not legal, so use i32 elements.
2669  // The values are implicitly truncated so sext vs. zext doesn't matter.
2670  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2671  }
2672  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2673 }
2674 
2675 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2676  return N->getOpcode() == ISD::SIGN_EXTEND ||
2677  isExtendedBUILD_VECTOR(N, DAG, true);
2678 }
2679 
2680 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2681  return N->getOpcode() == ISD::ZERO_EXTEND ||
2682  isExtendedBUILD_VECTOR(N, DAG, false);
2683 }
2684 
2685 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2686  unsigned Opcode = N->getOpcode();
2687  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2688  SDNode *N0 = N->getOperand(0).getNode();
2689  SDNode *N1 = N->getOperand(1).getNode();
2690  return N0->hasOneUse() && N1->hasOneUse() &&
2691  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2692  }
2693  return false;
2694 }
2695 
2696 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2697  unsigned Opcode = N->getOpcode();
2698  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2699  SDNode *N0 = N->getOperand(0).getNode();
2700  SDNode *N1 = N->getOperand(1).getNode();
2701  return N0->hasOneUse() && N1->hasOneUse() &&
2702  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2703  }
2704  return false;
2705 }
2706 
2707 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2708  SelectionDAG &DAG) const {
2709  // The rounding mode is in bits 23:22 of the FPCR.
2710  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
2711  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
2712  // so that the shift + and get folded into a bitfield extract.
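  // For instance, RMode == 0b00 (round to nearest) gives
  // ((0 + (1 << 22)) >> 22) & 3 == 1, and RMode == 0b11 (round toward zero)
  // carries out of bit 23 and gives 0, matching the FLT_ROUNDS values.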
2713  SDLoc dl(Op);
2714 
2715  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2716  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2717  MVT::i64));
2718  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2719  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2720  DAG.getConstant(1U << 22, dl, MVT::i32));
2721  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2722  DAG.getConstant(22, dl, MVT::i32));
2723  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2724  DAG.getConstant(3, dl, MVT::i32));
2725 }
2726 
2727 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2728  // Multiplications are only custom-lowered for 128-bit vectors so that
2729  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2730  EVT VT = Op.getValueType();
2731  assert(VT.is128BitVector() && VT.isInteger() &&
2732  "unexpected type for custom-lowering ISD::MUL");
2733  SDNode *N0 = Op.getOperand(0).getNode();
2734  SDNode *N1 = Op.getOperand(1).getNode();
2735  unsigned NewOpc = 0;
2736  bool isMLA = false;
2737  bool isN0SExt = isSignExtended(N0, DAG);
2738  bool isN1SExt = isSignExtended(N1, DAG);
2739  if (isN0SExt && isN1SExt)
2740  NewOpc = AArch64ISD::SMULL;
2741  else {
2742  bool isN0ZExt = isZeroExtended(N0, DAG);
2743  bool isN1ZExt = isZeroExtended(N1, DAG);
2744  if (isN0ZExt && isN1ZExt)
2745  NewOpc = AArch64ISD::UMULL;
2746  else if (isN1SExt || isN1ZExt) {
2747  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2748  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2749  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2750  NewOpc = AArch64ISD::SMULL;
2751  isMLA = true;
2752  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2753  NewOpc = AArch64ISD::UMULL;
2754  isMLA = true;
2755  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2756  std::swap(N0, N1);
2757  NewOpc = AArch64ISD::UMULL;
2758  isMLA = true;
2759  }
2760  }
2761 
2762  if (!NewOpc) {
2763  if (VT == MVT::v2i64)
2764  // Fall through to expand this. It is not legal.
2765  return SDValue();
2766  else
2767  // Other vector multiplications are legal.
2768  return Op;
2769  }
2770  }
2771 
2772  // Legalize to a S/UMULL instruction
2773  SDLoc DL(Op);
2774  SDValue Op0;
2775  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2776  if (!isMLA) {
2777  Op0 = skipExtensionForVectorMULL(N0, DAG);
2778  assert(Op0.getValueType().is64BitVector() &&
2779  Op1.getValueType().is64BitVector() &&
2780  "unexpected types for extended operands to VMULL");
2781  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2782  }
2783  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2784  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2785  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
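  // For example, with N0 == (add (zext v8i8 A), (zext v8i8 B)) and
  // N1 == (zext v8i8 C) this emits (add (umull A, C), (umull B, C)) instead
  // of a full-width v8i16 multiply.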
2786  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2787  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2788  EVT Op1VT = Op1.getValueType();
2789  return DAG.getNode(N0->getOpcode(), DL, VT,
2790  DAG.getNode(NewOpc, DL, VT,
2791  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2792  DAG.getNode(NewOpc, DL, VT,
2793  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2794 }
2795 
2796 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2797  SelectionDAG &DAG) const {
2798  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2799  SDLoc dl(Op);
2800  switch (IntNo) {
2801  default: return SDValue(); // Don't custom lower most intrinsics.
2802  case Intrinsic::thread_pointer: {
2803  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2804  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2805  }
2806  case Intrinsic::aarch64_neon_abs: {
2807  EVT Ty = Op.getValueType();
2808  if (Ty == MVT::i64) {
2809  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2810  Op.getOperand(1));
2811  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2812  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2813  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2814  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2815  } else {
2816  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2817  }
2818  }
2819  case Intrinsic::aarch64_neon_smax:
2820  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2821  Op.getOperand(1), Op.getOperand(2));
2822  case Intrinsic::aarch64_neon_umax:
2823  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2824  Op.getOperand(1), Op.getOperand(2));
2825  case Intrinsic::aarch64_neon_smin:
2826  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2827  Op.getOperand(1), Op.getOperand(2));
2828  case Intrinsic::aarch64_neon_umin:
2829  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2830  Op.getOperand(1), Op.getOperand(2));
2831 
2832  case Intrinsic::localaddress: {
2833  const auto &MF = DAG.getMachineFunction();
2834  const auto *RegInfo = Subtarget->getRegisterInfo();
2835  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2836  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2837  Op.getSimpleValueType());
2838  }
2839 
2840  case Intrinsic::eh_recoverfp: {
2841  // FIXME: This needs to be implemented to correctly handle highly aligned
2842  // stack objects. For now we simply return the incoming FP. Refer D53541
2843  // for more details.
2844  SDValue FnOp = Op.getOperand(1);
2845  SDValue IncomingFPOp = Op.getOperand(2);
2846  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2847  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2848  if (!Fn)
2849  report_fatal_error(
2850  "llvm.eh.recoverfp must take a function as the first argument");
2851  return IncomingFPOp;
2852  }
2853  }
2854 }
2855 
2856 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2857 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2858  EVT VT, EVT MemVT,
2859  SelectionDAG &DAG) {
2860  assert(VT.isVector() && "VT should be a vector type");
2861  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2862 
2863  SDValue Value = ST->getValue();
2864 
2865  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
2866  // extracts the word lane which represents the v4i8 subvector. It optimizes
2867  // the store to:
2868  //
2869  // xtn v0.8b, v0.8h
2870  // str s0, [x0]
2871 
2872  SDValue Undef = DAG.getUNDEF(MVT::i16);
2873  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2874  {Undef, Undef, Undef, Undef});
2875 
2876  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2877  Value, UndefVec);
2878  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2879 
2880  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2881  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2882  Trunc, DAG.getConstant(0, DL, MVT::i64));
2883 
2884  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2885  ST->getBasePtr(), ST->getMemOperand());
2886 }
2887 
2888 // Custom lowering for any store, vector or scalar, default or truncating.
2889 // Currently we only custom-lower truncating stores from vector v4i16 to
2890 // v4i8.
2891 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2892  SelectionDAG &DAG) const {
2893  SDLoc Dl(Op);
2894  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2895  assert (StoreNode && "Can only custom lower store nodes");
2896 
2897  SDValue Value = StoreNode->getValue();
2898 
2899  EVT VT = Value.getValueType();
2900  EVT MemVT = StoreNode->getMemoryVT();
2901 
2902  assert (VT.isVector() && "Can only custom lower vector store types");
2903 
2904  unsigned AS = StoreNode->getAddressSpace();
2905  unsigned Align = StoreNode->getAlignment();
2906  if (Align < MemVT.getStoreSize() &&
2907  !allowsMisalignedMemoryAccesses(
2908  MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
2909  return scalarizeVectorStore(StoreNode, DAG);
2910  }
2911 
2912  if (StoreNode->isTruncatingStore()) {
2913  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2914  }
2915 
2916  return SDValue();
2917 }
2918 
2919 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2920  SelectionDAG &DAG) const {
2921  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2922  LLVM_DEBUG(Op.dump());
2923 
2924  switch (Op.getOpcode()) {
2925  default:
2926  llvm_unreachable("unimplemented operand");
2927  return SDValue();
2928  case ISD::BITCAST:
2929  return LowerBITCAST(Op, DAG);
2930  case ISD::GlobalAddress:
2931  return LowerGlobalAddress(Op, DAG);
2932  case ISD::GlobalTLSAddress:
2933  return LowerGlobalTLSAddress(Op, DAG);
2934  case ISD::SETCC:
2935  return LowerSETCC(Op, DAG);
2936  case ISD::BR_CC:
2937  return LowerBR_CC(Op, DAG);
2938  case ISD::SELECT:
2939  return LowerSELECT(Op, DAG);
2940  case ISD::SELECT_CC:
2941  return LowerSELECT_CC(Op, DAG);
2942  case ISD::JumpTable:
2943  return LowerJumpTable(Op, DAG);
2944  case ISD::BR_JT:
2945  return LowerBR_JT(Op, DAG);
2946  case ISD::ConstantPool:
2947  return LowerConstantPool(Op, DAG);
2948  case ISD::BlockAddress:
2949  return LowerBlockAddress(Op, DAG);
2950  case ISD::VASTART:
2951  return LowerVASTART(Op, DAG);
2952  case ISD::VACOPY:
2953  return LowerVACOPY(Op, DAG);
2954  case ISD::VAARG:
2955  return LowerVAARG(Op, DAG);
2956  case ISD::ADDC:
2957  case ISD::ADDE:
2958  case ISD::SUBC:
2959  case ISD::SUBE:
2960  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2961  case ISD::SADDO:
2962  case ISD::UADDO:
2963  case ISD::SSUBO:
2964  case ISD::USUBO:
2965  case ISD::SMULO:
2966  case ISD::UMULO:
2967  return LowerXALUO(Op, DAG);
2968  case ISD::FADD:
2969  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2970  case ISD::FSUB:
2971  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2972  case ISD::FMUL:
2973  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2974  case ISD::FDIV:
2975  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2976  case ISD::FP_ROUND:
2977  return LowerFP_ROUND(Op, DAG);
2978  case ISD::FP_EXTEND:
2979  return LowerFP_EXTEND(Op, DAG);
2980  case ISD::FRAMEADDR:
2981  return LowerFRAMEADDR(Op, DAG);
2982  case ISD::SPONENTRY:
2983  return LowerSPONENTRY(Op, DAG);
2984  case ISD::RETURNADDR:
2985  return LowerRETURNADDR(Op, DAG);
2986  case ISD::ADDROFRETURNADDR:
2987  return LowerADDROFRETURNADDR(Op, DAG);
2988  case ISD::INSERT_VECTOR_ELT:
2989  return LowerINSERT_VECTOR_ELT(Op, DAG);
2990  case ISD::EXTRACT_VECTOR_ELT:
2991  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2992  case ISD::BUILD_VECTOR:
2993  return LowerBUILD_VECTOR(Op, DAG);
2994  case ISD::VECTOR_SHUFFLE:
2995  return LowerVECTOR_SHUFFLE(Op, DAG);
2996  case ISD::EXTRACT_SUBVECTOR:
2997  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2998  case ISD::SRA:
2999  case ISD::SRL:
3000  case ISD::SHL:
3001  return LowerVectorSRA_SRL_SHL(Op, DAG);
3002  case ISD::SHL_PARTS:
3003  return LowerShiftLeftParts(Op, DAG);
3004  case ISD::SRL_PARTS:
3005  case ISD::SRA_PARTS:
3006  return LowerShiftRightParts(Op, DAG);
3007  case ISD::CTPOP:
3008  return LowerCTPOP(Op, DAG);
3009  case ISD::FCOPYSIGN:
3010  return LowerFCOPYSIGN(Op, DAG);
3011  case ISD::OR:
3012  return LowerVectorOR(Op, DAG);
3013  case ISD::XOR:
3014  return LowerXOR(Op, DAG);
3015  case ISD::PREFETCH:
3016  return LowerPREFETCH(Op, DAG);
3017  case ISD::SINT_TO_FP:
3018  case ISD::UINT_TO_FP:
3019  return LowerINT_TO_FP(Op, DAG);
3020  case ISD::FP_TO_SINT:
3021  case ISD::FP_TO_UINT:
3022  return LowerFP_TO_INT(Op, DAG);
3023  case ISD::FSINCOS:
3024  return LowerFSINCOS(Op, DAG);
3025  case ISD::FLT_ROUNDS_:
3026  return LowerFLT_ROUNDS_(Op, DAG);
3027  case ISD::MUL:
3028  return LowerMUL(Op, DAG);
3029  case ISD::INTRINSIC_WO_CHAIN:
3030  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3031  case ISD::STORE:
3032  return LowerSTORE(Op, DAG);
3033  case ISD::VECREDUCE_ADD:
3034  case ISD::VECREDUCE_SMAX:
3035  case ISD::VECREDUCE_SMIN:
3036  case ISD::VECREDUCE_UMAX:
3037  case ISD::VECREDUCE_UMIN:
3038  case ISD::VECREDUCE_FMAX:
3039  case ISD::VECREDUCE_FMIN:
3040  return LowerVECREDUCE(Op, DAG);
3041  case ISD::ATOMIC_LOAD_SUB:
3042  return LowerATOMIC_LOAD_SUB(Op, DAG);
3043  case ISD::ATOMIC_LOAD_AND:
3044  return LowerATOMIC_LOAD_AND(Op, DAG);
3045  case ISD::DYNAMIC_STACKALLOC:
3046  return LowerDYNAMIC_STACKALLOC(Op, DAG);
3047  }
3048 }
3049 
3050 //===----------------------------------------------------------------------===//
3051 // Calling Convention Implementation
3052 //===----------------------------------------------------------------------===//
3053 
3054 /// Selects the correct CCAssignFn for a given CallingConvention value.
3055 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
3056  bool IsVarArg) const {
3057  switch (CC) {
3058  default:
3059  report_fatal_error("Unsupported calling convention.");
3060  case CallingConv::WebKit_JS:
3061  return CC_AArch64_WebKit_JS;
3062  case CallingConv::GHC:
3063  return CC_AArch64_GHC;
3064  case CallingConv::C:
3065  case CallingConv::Fast:
3066  case CallingConv::PreserveMost:
3067  case CallingConv::CXX_FAST_TLS:
3068  case CallingConv::Swift:
3069  if (Subtarget->isTargetWindows() && IsVarArg)
3070  return CC_AArch64_Win64_VarArg;
3071  if (!Subtarget->isTargetDarwin())
3072  return CC_AArch64_AAPCS;
3073  return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
3074  case CallingConv::Win64:
3075  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3076  case CallingConv::AArch64_VectorCall:
3077  return CC_AArch64_AAPCS;
3078  }
3079 }
3080 
3081 CCAssignFn *
3082 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3083  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3084  : RetCC_AArch64_AAPCS;
3085 }
3086 
3087 SDValue AArch64TargetLowering::LowerFormalArguments(
3088  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3089  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3090  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3091  MachineFunction &MF = DAG.getMachineFunction();
3092  MachineFrameInfo &MFI = MF.getFrameInfo();
3093  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3094 
3095  // Assign locations to all of the incoming arguments.
3096  SmallVector<CCValAssign, 16> ArgLocs;
3097  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3098  *DAG.getContext());
3099 
3100  // At this point, Ins[].VT may already be promoted to i32. To correctly
3101  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3102  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3103  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3104  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3105  // LocVT.
3106  unsigned NumArgs = Ins.size();
3107  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3108  unsigned CurArgIdx = 0;
3109  for (unsigned i = 0; i != NumArgs; ++i) {
3110  MVT ValVT = Ins[i].VT;
3111  if (Ins[i].isOrigArg()) {
3112  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3113  CurArgIdx = Ins[i].getOrigArgIndex();
3114 
3115  // Get type of the original argument.
3116  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3117  /*AllowUnknown*/ true);
3118  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3119  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3120  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3121  ValVT = MVT::i8;
3122  else if (ActualMVT == MVT::i16)
3123  ValVT = MVT::i16;
3124  }
3125  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3126  bool Res =
3127  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3128  assert(!Res && "Call operand has unhandled type");
3129  (void)Res;
3130  }
3131  assert(ArgLocs.size() == Ins.size());
3132  SmallVector<SDValue, 16> ArgValues;
3133  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3134  CCValAssign &VA = ArgLocs[i];
3135 
3136  if (Ins[i].Flags.isByVal()) {
3137  // Byval is used for HFAs in the PCS, but the system should work in a
3138  // non-compliant manner for larger structs.
3139  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3140  int Size = Ins[i].Flags.getByValSize();
3141  unsigned NumRegs = (Size + 7) / 8;
3142 
3143  // FIXME: This works on big-endian for composite byvals, which are the common
3144  // case. It should also work for fundamental types too.
3145  unsigned FrameIdx =
3146  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3147  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3148  InVals.push_back(FrameIdxN);
3149 
3150  continue;
3151  }
3152 
3153  if (VA.isRegLoc()) {
3154  // Arguments stored in registers.
3155  EVT RegVT = VA.getLocVT();
3156 
3157  SDValue ArgValue;
3158  const TargetRegisterClass *RC;
3159 
3160  if (RegVT == MVT::i32)
3161  RC = &AArch64::GPR32RegClass;
3162  else if (RegVT == MVT::i64)
3163  RC = &AArch64::GPR64RegClass;
3164  else if (RegVT == MVT::f16)
3165  RC = &AArch64::FPR16RegClass;
3166  else if (RegVT == MVT::f32)
3167  RC = &AArch64::FPR32RegClass;
3168  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3169  RC = &AArch64::FPR64RegClass;
3170  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3171  RC = &AArch64::FPR128RegClass;
3172  else if (RegVT.isScalableVector() &&
3173  RegVT.getVectorElementType() == MVT::i1)
3174  RC = &AArch64::PPRRegClass;
3175  else if (RegVT.isScalableVector())
3176  RC = &AArch64::ZPRRegClass;
3177  else
3178  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3179 
3180  // Transform the arguments in physical registers into virtual ones.
3181  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3182  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3183 
3184  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3185  // to 64 bits. Insert an assert[sz]ext to capture this, then
3186  // truncate to the right size.
3187  switch (VA.getLocInfo()) {
3188  default:
3189  llvm_unreachable("Unknown loc info!");
3190  case CCValAssign::Full:
3191  break;
3192  case CCValAssign::Indirect:
3193  assert(VA.getValVT().isScalableVector() &&
3194  "Only scalable vectors can be passed indirectly");
3195  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3196  case CCValAssign::BCvt:
3197  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3198  break;
3199  case CCValAssign::AExt:
3200  case CCValAssign::SExt:
3201  case CCValAssign::ZExt:
3202  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3203  // nodes after our lowering.
3204  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3205  break;
3206  }
3207 
3208  InVals.push_back(ArgValue);
3209 
3210  } else { // VA.isRegLoc()
3211  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3212  unsigned ArgOffset = VA.getLocMemOffset();
3213  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3214 
3215  uint32_t BEAlign = 0;
3216  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3217  !Ins[i].Flags.isInConsecutiveRegs())
3218  BEAlign = 8 - ArgSize;
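  // For example, a 4-byte argument on a big-endian target sits in the high
  // half of its 8-byte slot, so the load offset is bumped by 8 - 4 = 4.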
3219 
3220  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3221 
3222  // Create load nodes to retrieve arguments from the stack.
3223  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3224  SDValue ArgValue;
3225 
3226  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3227  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3228  MVT MemVT = VA.getValVT();
3229 
3230  switch (VA.getLocInfo()) {
3231  default:
3232  break;
3233  case CCValAssign::BCvt:
3234  MemVT = VA.getLocVT();
3235  break;
3236  case CCValAssign::Indirect:
3237  assert(VA.getValVT().isScalableVector() &&
3238  "Only scalable vectors can be passed indirectly");
3239  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3240  case CCValAssign::SExt:
3241  ExtType = ISD::SEXTLOAD;
3242  break;
3243  case CCValAssign::ZExt:
3244  ExtType = ISD::ZEXTLOAD;
3245  break;
3246  case CCValAssign::AExt:
3247  ExtType = ISD::EXTLOAD;
3248  break;
3249  }
3250 
3251  ArgValue = DAG.getExtLoad(
3252  ExtType, DL, VA.getLocVT(), Chain, FIN,
3253  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3254  MemVT);
3255 
3256  InVals.push_back(ArgValue);
3257  }
3258  }
3259 
3260  // varargs
3261  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3262  if (isVarArg) {
3263  if (!Subtarget->isTargetDarwin() || IsWin64) {
3264  // The AAPCS variadic function ABI is identical to the non-variadic
3265  // one. As a result there may be more arguments in registers and we should
3266  // save them for future reference.
3267  // Win64 variadic functions also pass arguments in registers, but all float
3268  // arguments are passed in integer registers.
3269  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3270  }
3271 
3272  // This will point to the next argument passed via stack.
3273  unsigned StackOffset = CCInfo.getNextStackOffset();
3274  // We currently pass all varargs at 8-byte alignment.
3275  StackOffset = ((StackOffset + 7) & ~7);
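  // For example, a next stack offset of 12 is rounded up to 16 here.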
3276  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3277 
3278  if (MFI.hasMustTailInVarArgFunc()) {
3279  SmallVector<MVT, 2> RegParmTypes;
3280  RegParmTypes.push_back(MVT::i64);
3281  RegParmTypes.push_back(MVT::f128);
3282  // Compute the set of forwarded registers. The rest are scratch.
3283  SmallVectorImpl<ForwardedRegister> &Forwards =
3284  FuncInfo->getForwardedMustTailRegParms();
3285  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3286  CC_AArch64_AAPCS);
3287 
3288  // Conservatively forward X8, since it might be used for aggregate return.
3289  if (!CCInfo.isAllocated(AArch64::X8)) {
3290  unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3291  Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3292  }
3293  }
3294  }
3295 
3296  // On Windows, InReg pointers must be returned, so record the pointer in a
3297  // virtual register at the start of the function so it can be returned in the
3298  // epilogue.
3299  if (IsWin64) {
3300  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3301  if (Ins[I].Flags.isInReg()) {
3302  assert(!FuncInfo->getSRetReturnReg());
3303 
3304  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3305  Register Reg =
3306  MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3307  FuncInfo->setSRetReturnReg(Reg);
3308 
3309  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3310  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3311  break;
3312  }
3313  }
3314  }
3315 
3316  unsigned StackArgSize = CCInfo.getNextStackOffset();
3317  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3318  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3319  // This is a non-standard ABI so by fiat I say we're allowed to make full
3320  // use of the stack area to be popped, which must be aligned to 16 bytes in
3321  // any case:
3322  StackArgSize = alignTo(StackArgSize, 16);
3323 
3324  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3325  // a multiple of 16.
3326  FuncInfo->setArgumentStackToRestore(StackArgSize);
3327 
3328  // This realignment carries over to the available bytes below. Our own
3329  // callers will guarantee the space is free by giving an aligned value to
3330  // CALLSEQ_START.
3331  }
3332  // Even if we're not expected to free up the space, it's useful to know how
3333  // much is there while considering tail calls (because we can reuse it).
3334  FuncInfo->setBytesInStackArgArea(StackArgSize);
3335 
3336  if (Subtarget->hasCustomCallingConv())
3337  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3338 
3339  return Chain;
3340 }
3341 
3342 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3343  SelectionDAG &DAG,
3344  const SDLoc &DL,
3345  SDValue &Chain) const {
3346  MachineFunction &MF = DAG.getMachineFunction();
3347  MachineFrameInfo &MFI = MF.getFrameInfo();
3348  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3349  auto PtrVT = getPointerTy(DAG.getDataLayout());
3350  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3351 
3352  SmallVector<SDValue, 8> MemOps;
3353 
3354  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3355  AArch64::X3, AArch64::X4, AArch64::X5,
3356  AArch64::X6, AArch64::X7 };
3357  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3358  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3359 
3360  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
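  // For example, a variadic callee with one fixed integer argument has X0
  // already allocated, so FirstVariadicGPR == 1 and X1-X7 (56 bytes) are
  // saved below.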
3361  int GPRIdx = 0;
3362  if (GPRSaveSize != 0) {
3363  if (IsWin64) {
3364  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3365  if (GPRSaveSize & 15)
3366  // The extra size here, if triggered, will always be 8.
3367  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
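  // For example, saving X1-X7 takes 56 bytes, so an extra 8-byte fixed
  // object pads the save area out to the 16-byte-aligned 64 bytes.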
3368  } else
3369  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3370 
3371  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3372 
3373  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3374  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3375  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3376  SDValue Store = DAG.getStore(
3377  Val.getValue(1), DL, Val, FIN,
3378  IsWin64
3379  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3380  GPRIdx,
3381  (i - FirstVariadicGPR) * 8)
3382  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3383  MemOps.push_back(Store);
3384  FIN =
3385  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3386  }
3387  }
3388  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3389  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3390 
3391  if (Subtarget->hasFPARMv8() && !IsWin64) {
3392  static const MCPhysReg FPRArgRegs[] = {
3393  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3394  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3395  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3396  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3397 
3398  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3399  int FPRIdx = 0;
3400  if (FPRSaveSize != 0) {
3401  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3402 
3403  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3404 
3405  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3406  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3407  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3408 
3409  SDValue Store = DAG.getStore(
3410  Val.getValue(1), DL, Val, FIN,
3411  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3412  MemOps.push_back(Store);
3413  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3414  DAG.getConstant(16, DL, PtrVT));
3415  }
3416  }
3417  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3418  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3419  }
3420 
3421  if (!MemOps.empty()) {
3422  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3423  }
3424 }
3425 
3426 /// LowerCallResult - Lower the result values of a call into the
3427 /// appropriate copies out of appropriate physical registers.
3428 SDValue AArch64TargetLowering::LowerCallResult(
3429  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3430  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3431  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3432  SDValue ThisVal) const {
3433  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3434  ? RetCC_AArch64_WebKit_JS
3435  : RetCC_AArch64_AAPCS;
3436  // Assign locations to each value returned by this call.
3437  SmallVector<CCValAssign, 16> RVLocs;
3438  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3439  *DAG.getContext());
3440  CCInfo.AnalyzeCallResult(Ins, RetCC);
3441 
3442  // Copy all of the result registers out of their specified physreg.
3443  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3444  CCValAssign VA = RVLocs[i];
3445 
3446  // Pass 'this' value directly from the argument to return value, to avoid
3447  // reg unit interference
3448  if (i == 0 && isThisReturn) {
3449  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3450  "unexpected return calling convention register assignment");
3451  InVals.push_back(ThisVal);
3452  continue;
3453  }
3454 
3455  SDValue Val =
3456  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3457  Chain = Val.getValue(1);
3458  InFlag = Val.getValue(2);
3459 
3460  switch (VA.getLocInfo()) {
3461  default:
3462  llvm_unreachable("Unknown loc info!");
3463  case CCValAssign::Full:
3464  break;
3465  case CCValAssign::BCvt:
3466  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3467  break;
3468  }
3469 
3470  InVals.push_back(Val);
3471  }
3472 
3473  return Chain;
3474 }
3475 
3476 /// Return true if the calling convention is one that we can guarantee TCO for.
3477 static bool canGuaranteeTCO(CallingConv::ID CC) {
3478  return CC == CallingConv::Fast;
3479 }
3480 
3481 /// Return true if we might ever do TCO for calls with this calling convention.
3482 static bool mayTailCallThisCC(CallingConv::ID CC) {
3483  switch (CC) {
3484  case CallingConv::C:
3485  case CallingConv::PreserveMost:
3486  case CallingConv::Swift:
3487  return true;
3488  default:
3489  return canGuaranteeTCO(CC);
3490  }
3491 }
3492 
3493 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3494  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3495  const SmallVectorImpl<ISD::OutputArg> &Outs,
3496  const SmallVectorImpl<SDValue> &OutVals,
3497  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3498  if (!mayTailCallThisCC(CalleeCC))
3499  return false;
3500 
3501  MachineFunction &MF = DAG.getMachineFunction();
3502  const Function &CallerF = MF.getFunction();
3503  CallingConv::ID CallerCC = CallerF.getCallingConv();
3504  bool CCMatch = CallerCC == CalleeCC;
3505 
3506  // Byval parameters hand the function a pointer directly into the stack area
3507  // we want to reuse during a tail call. Working around this *is* possible (see
3508  // X86) but less efficient and uglier in LowerCall.
3509  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3510  e = CallerF.arg_end();
3511  i != e; ++i) {
3512  if (i->hasByValAttr())
3513  return false;
3514 
3515  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3516  // In this case, it is necessary to save/restore X0 in the callee. Tail
3517  // call opt interferes with this. So we disable tail call opt when the
3518  // caller has an argument with "inreg" attribute.
3519 
3520  // FIXME: Check whether the callee also has an "inreg" argument.
3521  if (i->hasInRegAttr())
3522  return false;
3523  }
3524 
3525  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3526  return canGuaranteeTCO(CalleeCC) && CCMatch;
3527 
3528  // Externally-defined functions with weak linkage should not be
3529  // tail-called on AArch64 when the OS does not support dynamic
3530  // pre-emption of symbols, as the AAELF spec requires normal calls
3531  // to undefined weak functions to be replaced with a NOP or jump to the
3532  // next instruction. The behaviour of branch instructions in this
3533  // situation (as used for tail calls) is implementation-defined, so we
3534  // cannot rely on the linker replacing the tail call with a return.
3535  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3536  const GlobalValue *GV = G->getGlobal();
3537  const Triple &TT = getTargetMachine().getTargetTriple();
3538  if (GV->hasExternalWeakLinkage() &&
3539  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3540  return false;
3541  }
3542 
3543  // Now we search for cases where we can use a tail call without changing the
3544  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3545  // concept.
3546 
3547  // I want anyone implementing a new calling convention to think long and hard
3548  // about this assert.
3549  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3550  "Unexpected variadic calling convention");
3551 
3552  LLVMContext &C = *DAG.getContext();
3553  if (isVarArg && !Outs.empty()) {
3554  // At least two cases here: if caller is fastcc then we can't have any
3555  // memory arguments (we'd be expected to clean up the stack afterwards). If
3556  // caller is C then we could potentially use its argument area.
3557 
3558  // FIXME: for now we take the most conservative of these in both cases:
3559  // disallow all variadic memory operands.
3560  SmallVector<CCValAssign, 16> ArgLocs;
3561  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3562 
3563  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3564  for (const CCValAssign &ArgLoc : ArgLocs)
3565  if (!ArgLoc.isRegLoc())
3566  return false;
3567  }
3568 
3569  // Check that the call results are passed in the same way.
3570  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3571  CCAssignFnForCall(CalleeCC, isVarArg),
3572  CCAssignFnForCall(CallerCC, isVarArg)))
3573  return false;
3574  // The callee has to preserve all registers the caller needs to preserve.
3575  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3576  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3577  if (!CCMatch) {
3578  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3579  if (Subtarget->hasCustomCallingConv()) {
3580  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3581  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3582  }
3583  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3584  return false;
3585  }
3586 
3587  // Nothing more to check if the callee is taking no arguments
3588  if (Outs.empty())
3589  return true;
3590 
3591  SmallVector<CCValAssign, 16> ArgLocs;
3592  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3593 
3594  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3595 
3596  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3597 
3598  // If the stack arguments for this call do not fit into our own save area then
3599  // the call cannot be made tail.
3600  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3601  return false;
3602 
3603  const MachineRegisterInfo &MRI = MF.getRegInfo();
3604  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3605  return false;
3606 
3607  return true;
3608 }
3609 
3610 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3611  SelectionDAG &DAG,
3612  MachineFrameInfo &MFI,
3613  int ClobberedFI) const {
3614  SmallVector<SDValue, 8> ArgChains;
3615  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3616  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3617 
3618  // Include the original chain at the beginning of the list. When this is
3619  // used by target LowerCall hooks, this helps legalize find the
3620  // CALLSEQ_BEGIN node.
3621  ArgChains.push_back(Chain);
3622 
3623  // Add a chain value for each stack-argument load whose bytes overlap the
3624  // clobbered object, so those loads are ordered before the store.
3624  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3625  UE = DAG.getEntryNode().getNode()->use_end();
3626  U != UE; ++U)
3627  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3628  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3629  if (FI->getIndex() < 0) {
3630  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3631  int64_t InLastByte = InFirstByte;
3632  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3633 
3634  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3635  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3636  ArgChains.push_back(SDValue(L, 1));
3637  }
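  // Note: negative frame indices denote fixed objects, i.e. the caller's
  // incoming argument slots; only loads from those can alias the stores a
  // tail call makes into the argument area.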
3638 
3639  // Build a tokenfactor for all the chains.
3640  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3641 }
3642 
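/// With the fastcc convention and GuaranteedTailCallOpt (-tailcallopt) the
/// callee pops its own stack arguments; LowerCall uses this to decide how
/// many callee-popped bytes CALLSEQ_END should report.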
3643 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3644  bool TailCallOpt) const {
3645  return CallCC == CallingConv::Fast && TailCallOpt;
3646 }
3647 
3648 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3649 /// and add input and output parameter nodes.
3650 SDValue
3651 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3652  SmallVectorImpl<SDValue> &InVals) const {
3653  SelectionDAG &DAG = CLI.DAG;
3654  SDLoc &DL = CLI.DL;
3655  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3656  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3657  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3658  SDValue Chain = CLI.Chain;
3659  SDValue Callee = CLI.Callee;
3660  bool &IsTailCall = CLI.IsTailCall;
3661  CallingConv::ID CallConv = CLI.CallConv;
3662  bool IsVarArg = CLI.IsVarArg;
3663 
3664  MachineFunction &MF = DAG.getMachineFunction();
3665  bool IsThisReturn = false;
3666 
3667  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3668  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3669  bool IsSibCall = false;
3670 
3671  if (IsTailCall) {
3672  // Check if it's really possible to do a tail call.
3673  IsTailCall = isEligibleForTailCallOptimization(
3674  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3675  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3676  report_fatal_error("failed to perform tail call elimination on a call "
3677  "site marked musttail");
3678 
3679  // A sibling call is one where we're under the usual C ABI and not planning
3680  // to change that but can still do a tail call:
3681  if (!TailCallOpt && IsTailCall)
3682  IsSibCall = true;
3683 
3684  if (IsTailCall)
3685  ++NumTailCalls;
3686  }
3687 
3688  // Analyze operands of the call, assigning locations to each operand.
3689  SmallVector<CCValAssign, 16> ArgLocs;
3690  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3691  *DAG.getContext());
3692 
3693  if (IsVarArg) {
3694  // Handle fixed and variable vector arguments differently.
3695  // Variable vector arguments always go into memory.
3696  unsigned NumArgs = Outs.size();
3697 
3698  for (unsigned i = 0; i != NumArgs; ++i) {
3699  MVT ArgVT = Outs[i].VT;
3700  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3701  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3702  /*IsVarArg=*/ !Outs[i].IsFixed);
3703  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3704  assert(!Res && "Call operand has unhandled type");
3705  (void)Res;
3706  }
3707  } else {
3708  // At this point, Outs[].VT may already be promoted to i32. To correctly
3709  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3710  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3711  // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
3712  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3713  // LocVT.
3714  unsigned NumArgs = Outs.size();
3715  for (unsigned i = 0; i != NumArgs; ++i) {
3716  MVT ValVT = Outs[i].VT;
3717  // Get type of the original argument.
3718  EVT ActualVT = getValueType(DAG.getDataLayout(),
3719  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3720  /*AllowUnknown*/ true);
3721  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3722  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3723  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3724  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3725  ValVT = MVT::i8;
3726  else if (ActualMVT == MVT::i16)
3727  ValVT = MVT::i16;
3728 
3729  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3730  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3731  assert(!Res && "Call operand has unhandled type");
3732  (void)Res;
3733  }
3734  }
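  // e.g. an i8 argument that Outs[] reports as i32 is analyzed with ValVT ==
  // MVT::i8 above, so a convention that packs small stack arguments (such as
  // the Darwin PCS) can give it a byte-sized slot rather than a word-sized one.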
3735 
3736  // Get a count of how many bytes are to be pushed on the stack.
3737  unsigned NumBytes = CCInfo.getNextStackOffset();
3738 
3739  if (IsSibCall) {
3740  // Since we're not changing the ABI to make this a tail call, the memory
3741  // operands are already available in the caller's incoming argument space.
3742  NumBytes = 0;
3743  }
3744 
3745  // FPDiff is the byte offset of the call's argument area from the callee's.
3746  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3747  // by this amount for a tail call. In a sibling call it must be 0 because the
3748  // caller will deallocate the entire stack and the callee still expects its
3749  // arguments to begin at SP+0. Completely unused for non-tail calls.
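  // For example (illustrative numbers): with 32 bytes of incoming stack
  // arguments and 48 bytes of outgoing arguments for the tail call,
  // FPDiff = 32 - 48 = -16, so callee stack slots are created 16 bytes below
  // where the caller's own arguments started.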
3750  int FPDiff = 0;
3751 
3752  if (IsTailCall && !IsSibCall) {
3753  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3754 
3755  // Since callee will pop argument stack as a tail call, we must keep the
3756  // popped size 16-byte aligned.
3757  NumBytes = alignTo(NumBytes, 16);
3758 
3759  // FPDiff will be negative if this tail call requires more space than we
3760  // would automatically have in our incoming argument space. Positive if we
3761  // can actually shrink the stack.
3762  FPDiff = NumReusableBytes - NumBytes;
3763 
3764  // The stack pointer must be 16-byte aligned at all times it's used for a
3765  // memory operation, which in practice means at *all* times and in
3766  // particular across call boundaries. Therefore our own arguments started at
3767  // a 16-byte aligned SP and the delta applied for the tail call should
3768  // satisfy the same constraint.
3769  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3770  }
3771 
3772  // Adjust the stack pointer for the new arguments...
3773  // These operations are automatically eliminated by the prolog/epilog pass
3774  if (!IsSibCall)
3775  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3776 
3777  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3778  getPointerTy(DAG.getDataLayout()));
3779 
3780  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3781  SmallVector<SDValue, 8> MemOpChains;
3782  auto PtrVT = getPointerTy(DAG.getDataLayout());
3783 
3784  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3785  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3786  for (const auto &F : Forwards) {
3787  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3788  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3789  }
3790  }
3791 
3792  // Walk the register/memloc assignments, inserting copies/loads.
3793  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3794  ++i, ++realArgIdx) {
3795  CCValAssign &VA = ArgLocs[i];
3796  SDValue Arg = OutVals[realArgIdx];
3797  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3798 
3799  // Promote the value if needed.
3800  switch (VA.getLocInfo()) {
3801  default:
3802  llvm_unreachable("Unknown loc info!");
3803  case CCValAssign::Full:
3804  break;
3805  case CCValAssign::SExt:
3806  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3807  break;
3808  case CCValAssign::ZExt:
3809  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3810  break;
3811  case CCValAssign::AExt:
3812  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3813  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3814  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3815  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3816  }
3817  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3818  break;
3819  case CCValAssign::BCvt:
3820  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3821  break;
3822  case CCValAssign::FPExt:
3823  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3824  break;
3825  case CCValAssign::Indirect:
3826  assert(VA.getValVT().isScalableVector() &&
3827  "Only scalable vectors can be passed indirectly");
3828  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3829  }
3830 
3831  if (VA.isRegLoc()) {
3832  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3833  Outs[0].VT == MVT::i64) {
3834  assert(VA.getLocVT() == MVT::i64 &&
3835  "unexpected calling convention register assignment");
3836  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3837  "unexpected use of 'returned'");
3838  IsThisReturn = true;
3839  }
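  // With IsThisReturn set, the register mask chosen below preserves X0 and
  // LowerCallResult reuses OutVals[0] instead of copying the result back out.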
3840  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3841  } else {
3842  assert(VA.isMemLoc());
3843 
3844  SDValue DstAddr;
3845  MachinePointerInfo DstInfo;
3846 
3847  // FIXME: This works on big-endian for composite byvals, which are the
3848  // common case. It should also work for fundamental types too.
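  // On big-endian targets a small value stored into an 8-byte stack slot must
  // sit at the high-address end of the slot; e.g. a 2-byte value gets
  // BEAlign = 8 - 2 = 6 added to its offset below.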
3849  uint32_t BEAlign = 0;
3850  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3851  : VA.getValVT().getSizeInBits();
3852  OpSize = (OpSize + 7) / 8;
3853  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3854  !Flags.isInConsecutiveRegs()) {
3855  if (OpSize < 8)
3856  BEAlign = 8 - OpSize;
3857  }
3858  unsigned LocMemOffset = VA.getLocMemOffset();
3859  int32_t Offset = LocMemOffset + BEAlign;
3860  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3861  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3862 
3863  if (IsTailCall) {
3864  Offset = Offset + FPDiff;
3865  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3866 
3867  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3868  DstInfo =
3869  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3870 
3871  // Make sure any stack arguments overlapping with where we're storing
3872  // are loaded before this eventual operation. Otherwise they'll be
3873  // clobbered.
3874  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3875  } else {
3876  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3877 
3878  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3879  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3880  LocMemOffset);
3881  }
3882 
3883  if (Outs[i].Flags.isByVal()) {
3884  SDValue SizeNode =
3885  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3886  SDValue Cpy = DAG.getMemcpy(
3887  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3888  /*isVol = */ false, /*AlwaysInline = */ false,
3889  /*isTailCall = */ false,
3890  DstInfo, MachinePointerInfo());
3891 
3892  MemOpChains.push_back(Cpy);
3893  } else {
3894  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3895  // promoted to a legal register type i32, we should truncate Arg back to
3896  // i1/i8/i16.
3897  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3898  VA.getValVT() == MVT::i16)
3899  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3900 
3901  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3902  MemOpChains.push_back(Store);
3903  }
3904  }
3905  }
3906 
3907  if (!MemOpChains.empty())
3908  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3909 
3910  // Build a sequence of copy-to-reg nodes chained together with token chain
3911  // and flag operands which copy the outgoing args into the appropriate regs.
3912  SDValue InFlag;
3913  for (auto &RegToPass : RegsToPass) {
3914  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3915  RegToPass.second, InFlag);
3916  InFlag = Chain.getValue(1);
3917  }
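  // Chaining each copy through InFlag (glue) keeps these register copies
  // immediately before the call node so the scheduler cannot separate them.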
3918 
3919  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3920  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3921  // node so that legalize doesn't hack it.
3922  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3923  auto GV = G->getGlobal();
3924  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3925  AArch64II::MO_GOT) {
3926  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3927  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3928  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3929  assert(Subtarget->isTargetWindows() &&
3930  "Windows is the only supported COFF target");
3931  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3932  } else {
3933  const GlobalValue *GV = G->getGlobal();
3934  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3935  }
3936  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3937  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3938  Subtarget->isTargetMachO()) {
3939  const char *Sym = S->getSymbol();
3940  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3941  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3942  } else {
3943  const char *Sym = S->getSymbol();
3944  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3945  }
3946  }
3947 
3948  // We don't usually want to end the call-sequence here because we would tidy
3949  // the frame up *after* the call. However, in the ABI-changing tail-call case
3950  // we've carefully laid out the parameters so that when SP is reset they'll be
3951  // in the correct location.
3952  if (IsTailCall && !IsSibCall) {
3953  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3954  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3955  InFlag = Chain.getValue(1);
3956  }
3957 
3958  std::vector<SDValue> Ops;
3959  Ops.push_back(Chain);
3960  Ops.push_back(Callee);
3961 
3962  if (IsTailCall) {
3963  // Each tail call may have to adjust the stack by a different amount, so
3964  // this information must travel along with the operation for eventual
3965  // consumption by emitEpilogue.
3966  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3967  }
3968 
3969  // Add argument registers to the end of the list so that they are known live
3970  // into the call.
3971  for (auto &RegToPass : RegsToPass)
3972  Ops.push_back(DAG.getRegister(RegToPass.first,
3973  RegToPass.second.getValueType()));
3974 
3975  // Check callee args/returns for SVE registers and set calling convention
3976  // accordingly.
3977  if (CallConv == CallingConv::C) {
3978  bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
3979  return Out.VT.isScalableVector();
3980  });
3981  bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
3982  return In.VT.isScalableVector();
3983  });
3984 
3985  if (CalleeInSVE || CalleeOutSVE)
3986  CallConv = CallingConv::AArch64_SVE_VectorCall;
3987  }
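  // Using the SVE vector-call convention here makes the register mask added
  // below preserve the additional SVE callee-saved state required by the SVE
  // vector PCS.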
3988 
3989  // Add a register mask operand representing the call-preserved registers.
3990  const uint32_t *Mask;
3991  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3992  if (IsThisReturn) {
3993  // For 'this' returns, use the X0-preserving mask if applicable
3994  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3995  if (!Mask) {
3996  IsThisReturn = false;
3997  Mask = TRI->getCallPreservedMask(MF, CallConv);
3998  }
3999  } else
4000  Mask = TRI->getCallPreservedMask(MF, CallConv);
4001 
4002  if (Subtarget->hasCustomCallingConv())
4003  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
4004 
4005  if (TRI->isAnyArgRegReserved(MF))
4006  TRI->emitReservedArgRegCallError(MF);
4007 
4008  assert(Mask && "Missing call preserved mask for calling convention");
4009  Ops.push_back(DAG.getRegisterMask(Mask));
4010 
4011  if (InFlag.getNode())
4012  Ops.push_back(InFlag);
4013 
4014  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4015 
4016  // If we're doing a tail call, use a TC_RETURN here rather than an
4017  // actual call instruction.
4018  if (IsTailCall) {
4019  MF.getFrameInfo().setHasTailCall();
4020  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
4021  }
4022 
4023  // Returns a chain and a flag for retval copy to use.
4024  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
4025  InFlag = Chain.getValue(1);
4026 
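  // The second constant passed to CALLSEQ_END is the number of bytes the
  // callee pops itself; it is non-zero only for the fastcc + -tailcallopt
  // case handled by DoesCalleeRestoreStack above.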
4027  uint64_t CalleePopBytes =
4028  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
4029 
4030  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
4031  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
4032  InFlag, DL);
4033  if (!Ins.empty())
4034  InFlag = Chain.getValue(1);
4035 
4036  // Handle result values, copying them out of physregs into vregs that we
4037  // return.
4038  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
4039  InVals, IsThisReturn,
4040  IsThisReturn ? OutVals[0] : SDValue());
4041 }
4042 
4043 bool AArch64TargetLowering::CanLowerReturn(
4044  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
4045  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
4046  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
4047  ? RetCC_AArch64_WebKit_JS
4048  : RetCC_AArch64_AAPCS;
4049  SmallVector<CCValAssign, 16> RVLocs;
4050  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
4051  return CCInfo.CheckReturn(Outs, RetCC);
4052 }
4053 
4054 SDValue
4055 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
4056  bool isVarArg,
4057  const SmallVectorImpl<ISD::OutputArg> &Outs,
4058  const SmallVectorImpl<SDValue> &OutVals,
4059  const SDLoc &DL, SelectionDAG &DAG) const {
4060  auto &MF = DAG.getMachineFunction();
4061  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4062 
4063  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
4064  ? RetCC_AArch64_WebKit_JS
4065  : RetCC_AArch64_AAPCS;
4066  SmallVector<CCValAssign, 16> RVLocs;
4067  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
4068  *DAG.getContext());
4069  CCInfo.AnalyzeReturn(Outs, RetCC);
4070 
4071  // Copy the result values into the output registers.
4072  SDValue Flag;
4073  SmallVector<SDValue, 4> RetOps(1, Chain);
4074  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
4075  ++i, ++realRVLocIdx) {
4076  CCValAssign &VA = RVLocs[i];
4077  assert(VA.isRegLoc() && "Can only return in registers!");
4078  SDValue Arg = OutVals[realRVLocIdx];
4079 
4080  switch (VA.getLocInfo()) {
4081  default:
4082  llvm_unreachable("Unknown loc info!");
4083  case CCValAssign::Full:
4084