1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/StringSwitch.h"
31 #include "llvm/ADT/Triple.h"
32 #include "llvm/ADT/Twine.h"
48 #include "llvm/IR/Attributes.h"
49 #include "llvm/IR/Constants.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GlobalValue.h"
56 #include "llvm/IR/IRBuilder.h"
57 #include "llvm/IR/Instruction.h"
58 #include "llvm/IR/Instructions.h"
59 #include "llvm/IR/IntrinsicInst.h"
60 #include "llvm/IR/Intrinsics.h"
61 #include "llvm/IR/Module.h"
62 #include "llvm/IR/OperandTraits.h"
63 #include "llvm/IR/PatternMatch.h"
64 #include "llvm/IR/Type.h"
65 #include "llvm/IR/Use.h"
66 #include "llvm/IR/Value.h"
67 #include "llvm/MC/MCRegisterInfo.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/Debug.h"
74 #include "llvm/Support/KnownBits.h"
80 #include <algorithm>
81 #include <bitset>
82 #include <cassert>
83 #include <cctype>
84 #include <cstdint>
85 #include <cstdlib>
86 #include <iterator>
87 #include <limits>
88 #include <tuple>
89 #include <utility>
90 #include <vector>
91 
92 using namespace llvm;
93 using namespace llvm::PatternMatch;
94 
95 #define DEBUG_TYPE "aarch64-lower"
96 
97 STATISTIC(NumTailCalls, "Number of tail calls");
98 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
99 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
100 
101 static cl::opt<bool>
102 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
103  cl::desc("Allow AArch64 SLI/SRI formation"),
104  cl::init(false));
105 
106 // FIXME: The necessary dtprel relocations don't seem to be supported
107 // well in the GNU bfd and gold linkers at the moment. Therefore, by
108 // default, for now, fall back to GeneralDynamic code generation.
 109 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
 110  "aarch64-elf-ldtls-generation", cl::Hidden,
 111  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
 112  cl::init(false));
113 
114 static cl::opt<bool>
115 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
116  cl::desc("Enable AArch64 logical imm instruction "
117  "optimization"),
118  cl::init(true));
119 
120 /// Value type used for condition codes.
121 static const MVT MVT_CC = MVT::i32;
122 
 123 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 124  const AArch64Subtarget &STI)
125  : TargetLowering(TM), Subtarget(&STI) {
126  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
127  // we have to make something up. Arbitrarily, choose ZeroOrOne.
129  // When comparing vectors the result sets the different elements in the
130  // vector to all-one or all-zero.
132 
133  // Set up the register classes.
134  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
135  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
136 
137  if (Subtarget->hasFPARMv8()) {
138  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
139  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
140  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
141  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
142  }
143 
144  if (Subtarget->hasNEON()) {
145  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
146  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
147  // Someone set us up the NEON.
148  addDRTypeForNEON(MVT::v2f32);
149  addDRTypeForNEON(MVT::v8i8);
150  addDRTypeForNEON(MVT::v4i16);
151  addDRTypeForNEON(MVT::v2i32);
152  addDRTypeForNEON(MVT::v1i64);
153  addDRTypeForNEON(MVT::v1f64);
154  addDRTypeForNEON(MVT::v4f16);
155 
156  addQRTypeForNEON(MVT::v4f32);
157  addQRTypeForNEON(MVT::v2f64);
158  addQRTypeForNEON(MVT::v16i8);
159  addQRTypeForNEON(MVT::v8i16);
160  addQRTypeForNEON(MVT::v4i32);
161  addQRTypeForNEON(MVT::v2i64);
162  addQRTypeForNEON(MVT::v8f16);
163  }
164 
165  if (Subtarget->hasSVE()) {
166  // Add legal sve predicate types
167  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
168  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
169  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
170  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
171 
172  // Add legal sve data types
173  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
174  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
175  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
176  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
177 
178  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
179  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
180  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
181  addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
182  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
183  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
184  addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
185  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
186  }
187 
188  // Compute derived properties from the register classes
190 
191  // Provide all sorts of operation actions
219 
223 
227 
229 
230  // Custom lowering hooks are needed for XOR
231  // to fold it into CSINC/CSINV.
234 
235  // Virtually no operation on f128 is legal, but LLVM can't expand them when
236  // there's a valid register class, so we need custom operations in most cases.
258 
259  // Lowering for many of the conversions is actually specified by the non-f128
260  // type. The LowerXXX function will be trivial when f128 isn't involved.
275 
276  // Variable arguments.
281 
282  // Variable-sized objects.
285 
286  if (Subtarget->isTargetWindows())
288  else
290 
291  // Constant pool entries
293 
294  // BlockAddress
296 
297  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
306 
307  // AArch64 lacks both left-rotate and popcount instructions.
310  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
313  }
314 
315  // AArch64 doesn't have {U|S}MUL_LOHI.
318 
321 
324  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
327  }
334 
335  // Custom lower Add/Sub/Mul with overflow.
348 
357  if (Subtarget->hasFullFP16())
359  else
361 
395 
396  if (!Subtarget->hasFullFP16()) {
419 
420  // promote v4f16 to v4f32 when that is known to be safe.
433 
449 
470  }
471 
472  // AArch64 has implementations of a lot of rounding-like FP operations.
473  for (MVT Ty : {MVT::f32, MVT::f64}) {
488  }
489 
490  if (Subtarget->hasFullFP16()) {
501  }
502 
504 
506 
512 
513  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
514  // This requires the Performance Monitors extension.
515  if (Subtarget->hasPerfMon())
517 
518  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
519  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
520  // Issue __sincos_stret if available.
523  } else {
526  }
527 
528  // Make floating-point constants legal for the large code model, so they don't
529  // become loads from the constant pool.
530  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
533  }
534 
535  // AArch64 does not have floating-point extending loads, i1 sign-extending
 536  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
537  for (MVT VT : MVT::fp_valuetypes()) {
542  }
543  for (MVT VT : MVT::integer_valuetypes())
545 
553 
556 
557  // Indexed loads and stores are supported.
558  for (unsigned im = (unsigned)ISD::PRE_INC;
574  }
575 
576  // Trap.
578  if (Subtarget->isTargetWindows())
580 
581  // We combine OR nodes for bitfield operations.
583  // Try to create BICs for vector ANDs.
585 
586  // Vector add and sub nodes may conceal a high-half opportunity.
 587  // Also, try to fold ADD into CSINC/CSINV.
594 
598 
600 
607  if (Subtarget->supportsAddressTopByteIgnored())
609 
611 
614 
618 
620 
621  // In case of strict alignment, avoid an excessive number of byte wide stores.
625 
630 
632 
636 
638 
640 
641  EnableExtLdPromotion = true;
642 
643  // Set required alignment.
645  // Set preferred alignments.
648 
649  // Only change the limit for entries in a jump table if specified by
 650  // the subtarget, but not at the command line.
651  unsigned MaxJT = STI.getMaximumJumpTableSize();
652  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
654 
655  setHasExtractBitsInsn(true);
656 
658 
659  if (Subtarget->hasNEON()) {
660  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
661  // silliness like this:
687 
693 
695 
 696  // AArch64 doesn't have direct vector->f32 conversion instructions for
697  // elements smaller than i32, so promote the input to i32 first.
700  // i8 vector elements also need promotion to i32 for v8i8
703  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
 708  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
709  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
712 
713  if (Subtarget->hasFullFP16()) {
718  } else {
 719  // When AArch64 doesn't have full fp16 support, promote the input
720  // to i32 first.
725  }
726 
729 
730  // AArch64 doesn't have MUL.2d:
732  // Custom handling for some quad-vector types to detect MULL.
736 
737  // Vector reductions
738  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
745  }
746  for (MVT VT : { MVT::v4f16, MVT::v2f32,
750  }
751 
754  // Likewise, narrowing and extending vector loads/stores aren't handled
755  // directly.
756  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
758 
759  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
762  } else {
765  }
768 
771 
772  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
773  setTruncStoreAction(VT, InnerVT, Expand);
774  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
775  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
776  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
777  }
778  }
779 
780  // AArch64 has implementations of a lot of rounding-like FP operations.
781  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
788  }
789 
790  if (Subtarget->hasFullFP16()) {
791  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
798  }
799  }
800 
802  }
803 
804  if (Subtarget->hasSVE()) {
806  if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1)
808  }
809  }
810 
812 }
813 
814 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
815  assert(VT.isVector() && "VT should be a vector type");
816 
817  if (VT.isFloatingPoint()) {
819  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
820  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
821  }
822 
823  // Mark vector float intrinsics as expand.
824  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
833 
834  // But we do support custom-lowering for FCOPYSIGN.
836  }
837 
849 
853  for (MVT InnerVT : MVT::all_valuetypes())
854  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
855 
856  // CNT supports only B element sizes, then use UADDLP to widen.
857  if (VT != MVT::v8i8 && VT != MVT::v16i8)
859 
865 
868 
869  if (!VT.isFloatingPoint())
871 
872  // [SU][MIN|MAX] are available for all NEON types apart from i64.
873  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
874  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
875  setOperationAction(Opcode, VT, Legal);
876 
877  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
878  if (VT.isFloatingPoint() &&
879  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
880  for (unsigned Opcode :
882  setOperationAction(Opcode, VT, Legal);
883 
884  if (Subtarget->isLittleEndian()) {
885  for (unsigned im = (unsigned)ISD::PRE_INC;
889  }
890  }
891 }
892 
893 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
894  addRegisterClass(VT, &AArch64::FPR64RegClass);
895  addTypeForNEON(VT, MVT::v2i32);
896 }
897 
898 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
899  addRegisterClass(VT, &AArch64::FPR128RegClass);
900  addTypeForNEON(VT, MVT::v4i32);
901 }
902 
 903 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
 904  EVT VT) const {
 905  if (!VT.isVector())
 906  return MVT::i32;
 907  return VT.changeVectorElementTypeToInteger();
 908 }
909 
910 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
911  const APInt &Demanded,
913  unsigned NewOpc) {
914  uint64_t OldImm = Imm, NewImm, Enc;
915  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
916 
917  // Return if the immediate is already all zeros, all ones, a bimm32 or a
918  // bimm64.
919  if (Imm == 0 || Imm == Mask ||
920  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
921  return false;
922 
923  unsigned EltSize = Size;
924  uint64_t DemandedBits = Demanded.getZExtValue();
925 
926  // Clear bits that are not demanded.
927  Imm &= DemandedBits;
928 
929  while (true) {
930  // The goal here is to set the non-demanded bits in a way that minimizes
931  // the number of switching between 0 and 1. In order to achieve this goal,
932  // we set the non-demanded bits to the value of the preceding demanded bits.
933  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
934  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
935  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
936  // The final result is 0b11000011.
937  uint64_t NonDemandedBits = ~DemandedBits;
938  uint64_t InvertedImm = ~Imm & DemandedBits;
939  uint64_t RotatedImm =
940  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
941  NonDemandedBits;
942  uint64_t Sum = RotatedImm + NonDemandedBits;
943  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
944  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
945  NewImm = (Imm | Ones) & Mask;
946 
947  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
948  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
949  // we halve the element size and continue the search.
950  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
951  break;
952 
953  // We cannot shrink the element size any further if it is 2-bits.
954  if (EltSize == 2)
955  return false;
956 
957  EltSize /= 2;
958  Mask >>= EltSize;
959  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
960 
 961  // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
962  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
963  return false;
964 
965  // Merge the upper and lower halves of Imm and DemandedBits.
966  Imm |= Hi;
967  DemandedBits |= DemandedBitsHi;
968  }
969 
970  ++NumOptimizedImms;
971 
972  // Replicate the element across the register width.
973  while (EltSize < Size) {
974  NewImm |= NewImm << EltSize;
975  EltSize *= 2;
976  }
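 // Illustrative note (added here, not in the original source): with Size == 32,
 // if the search above settled on EltSize == 8 with NewImm == 0xC3 (0b11000011,
 // a rotated run of four ones), the loop widens it to 0xC3C3 and then to
 // 0xC3C3C3C3, exactly the replicated pattern the logical-immediate encoding
 // accepts.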
977 
978  (void)OldImm;
979  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
980  "demanded bits should never be altered");
981  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
982 
983  // Create the new constant immediate node.
984  EVT VT = Op.getValueType();
985  SDLoc DL(Op);
986  SDValue New;
987 
988  // If the new constant immediate is all-zeros or all-ones, let the target
989  // independent DAG combine optimize this node.
990  if (NewImm == 0 || NewImm == OrigMask) {
991  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
992  TLO.DAG.getConstant(NewImm, DL, VT));
993  // Otherwise, create a machine node so that target independent DAG combine
994  // doesn't undo this optimization.
995  } else {
996  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
997  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
998  New = SDValue(
999  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1000  }
1001 
1002  return TLO.CombineTo(Op, New);
1003 }
1004 
 1005 bool AArch64TargetLowering::targetShrinkDemandedConstant(
 1006  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
1007  // Delay this optimization to as late as possible.
1008  if (!TLO.LegalOps)
1009  return false;
1010 
 1011  if (!EnableOptimizeLogicalImm)
 1012  return false;
1013 
1014  EVT VT = Op.getValueType();
1015  if (VT.isVector())
1016  return false;
1017 
1018  unsigned Size = VT.getSizeInBits();
1019  assert((Size == 32 || Size == 64) &&
1020  "i32 or i64 is expected after legalization.");
1021 
1022  // Exit early if we demand all bits.
1023  if (Demanded.countPopulation() == Size)
1024  return false;
1025 
1026  unsigned NewOpc;
1027  switch (Op.getOpcode()) {
1028  default:
1029  return false;
1030  case ISD::AND:
1031  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1032  break;
1033  case ISD::OR:
1034  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1035  break;
1036  case ISD::XOR:
1037  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1038  break;
1039  }
 1040  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
 1041  if (!C)
1042  return false;
1043  uint64_t Imm = C->getZExtValue();
1044  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1045 }
1046 
1047 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 1048 /// Mask are known to be either zero or one and return them in Known.
 1049 void AArch64TargetLowering::computeKnownBitsForTargetNode(
 1050  const SDValue Op, KnownBits &Known,
1051  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1052  switch (Op.getOpcode()) {
1053  default:
1054  break;
1055  case AArch64ISD::CSEL: {
1056  KnownBits Known2;
1057  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1058  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1059  Known.Zero &= Known2.Zero;
1060  Known.One &= Known2.One;
1061  break;
1062  }
1063  case AArch64ISD::LOADgot:
1064  case AArch64ISD::ADDlow: {
1065  if (!Subtarget->isTargetILP32())
1066  break;
1067  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1068  Known.Zero = APInt::getHighBitsSet(64, 32);
1069  break;
1070  }
1071  case ISD::INTRINSIC_W_CHAIN: {
1072  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1073  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1074  switch (IntID) {
1075  default: return;
1076  case Intrinsic::aarch64_ldaxr:
1077  case Intrinsic::aarch64_ldxr: {
1078  unsigned BitWidth = Known.getBitWidth();
1079  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1080  unsigned MemBits = VT.getScalarSizeInBits();
1081  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1082  return;
1083  }
1084  }
1085  break;
1086  }
1088  case ISD::INTRINSIC_VOID: {
1089  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1090  switch (IntNo) {
1091  default:
1092  break;
1093  case Intrinsic::aarch64_neon_umaxv:
1094  case Intrinsic::aarch64_neon_uminv: {
1095  // Figure out the datatype of the vector operand. The UMINV instruction
1096  // will zero extend the result, so we can mark as known zero all the
 1097  // bits larger than the element datatype. 32-bit or larger doesn't need
1098  // this as those are legal types and will be handled by isel directly.
1099  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1100  unsigned BitWidth = Known.getBitWidth();
1101  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1102  assert(BitWidth >= 8 && "Unexpected width!");
1103  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1104  Known.Zero |= Mask;
1105  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1106  assert(BitWidth >= 16 && "Unexpected width!");
1107  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1108  Known.Zero |= Mask;
1109  }
1110  break;
1111  } break;
1112  }
1113  }
1114  }
1115 }
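// Worked example (added for illustration, not part of the original source): an
// @llvm.aarch64.ldxr on an i8 location returns an i64 whose memory width is 8,
// so the INTRINSIC_W_CHAIN case above reports the top 56 bits as known zero,
// which lets later combines drop redundant masking such as "and x0, x0, #0xff".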
1116 
 1117 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
 1118  EVT) const {
1119  return MVT::i64;
1120 }
1121 
 1122 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
 1123  EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1124  bool *Fast) const {
1125  if (Subtarget->requiresStrictAlign())
1126  return false;
1127 
1128  if (Fast) {
1129  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1130  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1131  // See comments in performSTORECombine() for more details about
1132  // these conditions.
1133 
1134  // Code that uses clang vector extensions can mark that it
1135  // wants unaligned accesses to be treated as fast by
1136  // underspecifying alignment to be 1 or 2.
1137  Align <= 2 ||
1138 
1139  // Disregard v2i64. Memcpy lowering produces those and splitting
1140  // them regresses performance on micro-benchmarks and olden/bh.
1141  VT == MVT::v2i64;
1142  }
1143  return true;
1144 }
1145 
1146 // Same as above but handling LLTs instead.
 1147 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
 1148  LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1149  bool *Fast) const {
1150  if (Subtarget->requiresStrictAlign())
1151  return false;
1152 
1153  if (Fast) {
1154  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1155  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1156  Ty.getSizeInBytes() != 16 ||
1157  // See comments in performSTORECombine() for more details about
1158  // these conditions.
1159 
1160  // Code that uses clang vector extensions can mark that it
1161  // wants unaligned accesses to be treated as fast by
1162  // underspecifying alignment to be 1 or 2.
1163  Align <= 2 ||
1164 
1165  // Disregard v2i64. Memcpy lowering produces those and splitting
1166  // them regresses performance on micro-benchmarks and olden/bh.
1167  Ty == LLT::vector(2, 64);
1168  }
1169  return true;
1170 }
1171 
1172 FastISel *
 1173 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
 1174  const TargetLibraryInfo *libInfo) const {
1175  return AArch64::createFastISel(funcInfo, libInfo);
1176 }
1177 
1178 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1179  switch ((AArch64ISD::NodeType)Opcode) {
1180  case AArch64ISD::FIRST_NUMBER: break;
1181  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1182  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1183  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1184  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1185  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1186  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1187  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1188  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1189  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1190  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1191  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1192  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1193  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1194  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1195  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1196  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1197  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1198  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1199  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1200  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1201  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1202  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1203  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1204  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1205  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1206  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1207  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1208  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1209  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1210  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1211  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1212  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1213  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1214  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1215  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1216  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1217  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1218  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1219  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1220  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1221  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1222  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1223  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1224  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1225  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1226  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1227  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1228  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1229  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1230  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1231  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1232  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1233  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1234  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1235  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1236  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1237  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1238  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1239  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1240  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1241  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1242  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1243  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1244  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1245  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1246  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1247  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1248  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1249  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1250  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1251  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1252  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1253  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1254  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1255  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1256  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1257  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1258  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1259  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1260  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1261  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1262  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1263  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1264  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1265  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1266  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1267  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1268  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1269  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1270  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1271  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1272  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1273  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1274  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1275  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1276  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1277  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1278  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1279  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1280  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1281  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1282  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1283  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1284  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1285  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1286  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1287  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1288  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1289  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1290  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1291  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1292  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1293  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1294  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1295  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1296  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1297  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1298  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1299  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1300  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1301  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1302  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1303  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1304  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1305  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1306  case AArch64ISD::STG: return "AArch64ISD::STG";
1307  case AArch64ISD::STZG: return "AArch64ISD::STZG";
1308  case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
1309  case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
1310  case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
1311  case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
1312  case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
1313  case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
1314  }
1315  return nullptr;
1316 }
1317 
 1318 MachineBasicBlock *
 1319 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
 1320  MachineBasicBlock *MBB) const {
1321  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1322  // phi node:
1323 
1324  // OrigBB:
1325  // [... previous instrs leading to comparison ...]
1326  // b.ne TrueBB
1327  // b EndBB
1328  // TrueBB:
1329  // ; Fallthrough
1330  // EndBB:
1331  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1332 
1333  MachineFunction *MF = MBB->getParent();
1334  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1335  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1336  DebugLoc DL = MI.getDebugLoc();
1337  MachineFunction::iterator It = ++MBB->getIterator();
1338 
1339  Register DestReg = MI.getOperand(0).getReg();
1340  Register IfTrueReg = MI.getOperand(1).getReg();
1341  Register IfFalseReg = MI.getOperand(2).getReg();
1342  unsigned CondCode = MI.getOperand(3).getImm();
1343  bool NZCVKilled = MI.getOperand(4).isKill();
1344 
1345  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1346  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1347  MF->insert(It, TrueBB);
1348  MF->insert(It, EndBB);
1349 
1350  // Transfer rest of current basic-block to EndBB
1351  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1352  MBB->end());
1353  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1354 
1355  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1356  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1357  MBB->addSuccessor(TrueBB);
1358  MBB->addSuccessor(EndBB);
1359 
1360  // TrueBB falls through to the end.
1361  TrueBB->addSuccessor(EndBB);
1362 
1363  if (!NZCVKilled) {
1364  TrueBB->addLiveIn(AArch64::NZCV);
1365  EndBB->addLiveIn(AArch64::NZCV);
1366  }
1367 
1368  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1369  .addReg(IfTrueReg)
1370  .addMBB(TrueBB)
1371  .addReg(IfFalseReg)
1372  .addMBB(MBB);
1373 
1374  MI.eraseFromParent();
1375  return EndBB;
1376 }
1377 
 1378 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
 1379  MachineInstr &MI, MachineBasicBlock *BB) const {
 1380  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
 1381  BB->getParent()->getFunction().getPersonalityFn())) &&
 1382  "SEH does not use catchret!");
1383  return BB;
1384 }
1385 
 1386 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
 1387  MachineInstr &MI, MachineBasicBlock *BB) const {
1388  MI.eraseFromParent();
1389  return BB;
1390 }
1391 
 1392 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
 1393  MachineInstr &MI, MachineBasicBlock *BB) const {
1394  switch (MI.getOpcode()) {
1395  default:
1396 #ifndef NDEBUG
1397  MI.dump();
1398 #endif
1399  llvm_unreachable("Unexpected instruction for custom inserter!");
1400 
1401  case AArch64::F128CSEL:
1402  return EmitF128CSEL(MI, BB);
1403 
1404  case TargetOpcode::STACKMAP:
1405  case TargetOpcode::PATCHPOINT:
1406  return emitPatchPoint(MI, BB);
1407 
1408  case AArch64::CATCHRET:
1409  return EmitLoweredCatchRet(MI, BB);
1410  case AArch64::CATCHPAD:
1411  return EmitLoweredCatchPad(MI, BB);
1412  }
1413 }
1414 
1415 //===----------------------------------------------------------------------===//
1416 // AArch64 Lowering private implementation.
1417 //===----------------------------------------------------------------------===//
1418 
1419 //===----------------------------------------------------------------------===//
1420 // Lowering Code
1421 //===----------------------------------------------------------------------===//
1422 
1423 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1424 /// CC
 1425 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
 1426  switch (CC) {
1427  default:
1428  llvm_unreachable("Unknown condition code!");
1429  case ISD::SETNE:
1430  return AArch64CC::NE;
1431  case ISD::SETEQ:
1432  return AArch64CC::EQ;
1433  case ISD::SETGT:
1434  return AArch64CC::GT;
1435  case ISD::SETGE:
1436  return AArch64CC::GE;
1437  case ISD::SETLT:
1438  return AArch64CC::LT;
1439  case ISD::SETLE:
1440  return AArch64CC::LE;
1441  case ISD::SETUGT:
1442  return AArch64CC::HI;
1443  case ISD::SETUGE:
1444  return AArch64CC::HS;
1445  case ISD::SETULT:
1446  return AArch64CC::LO;
1447  case ISD::SETULE:
1448  return AArch64CC::LS;
1449  }
1450 }
1451 
1452 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
 1453 static void changeFPCCToAArch64CC(ISD::CondCode CC,
 1454  AArch64CC::CondCode &CondCode,
 1455  AArch64CC::CondCode &CondCode2) {
1456  CondCode2 = AArch64CC::AL;
1457  switch (CC) {
1458  default:
1459  llvm_unreachable("Unknown FP condition!");
1460  case ISD::SETEQ:
1461  case ISD::SETOEQ:
1462  CondCode = AArch64CC::EQ;
1463  break;
1464  case ISD::SETGT:
1465  case ISD::SETOGT:
1466  CondCode = AArch64CC::GT;
1467  break;
1468  case ISD::SETGE:
1469  case ISD::SETOGE:
1470  CondCode = AArch64CC::GE;
1471  break;
1472  case ISD::SETOLT:
1473  CondCode = AArch64CC::MI;
1474  break;
1475  case ISD::SETOLE:
1476  CondCode = AArch64CC::LS;
1477  break;
1478  case ISD::SETONE:
1479  CondCode = AArch64CC::MI;
1480  CondCode2 = AArch64CC::GT;
1481  break;
1482  case ISD::SETO:
1483  CondCode = AArch64CC::VC;
1484  break;
1485  case ISD::SETUO:
1486  CondCode = AArch64CC::VS;
1487  break;
1488  case ISD::SETUEQ:
1489  CondCode = AArch64CC::EQ;
1490  CondCode2 = AArch64CC::VS;
1491  break;
1492  case ISD::SETUGT:
1493  CondCode = AArch64CC::HI;
1494  break;
1495  case ISD::SETUGE:
1496  CondCode = AArch64CC::PL;
1497  break;
1498  case ISD::SETLT:
1499  case ISD::SETULT:
1500  CondCode = AArch64CC::LT;
1501  break;
1502  case ISD::SETLE:
1503  case ISD::SETULE:
1504  CondCode = AArch64CC::LE;
1505  break;
1506  case ISD::SETNE:
1507  case ISD::SETUNE:
1508  CondCode = AArch64CC::NE;
1509  break;
1510  }
1511 }
1512 
1513 /// Convert a DAG fp condition code to an AArch64 CC.
1514 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1515 /// should be AND'ed instead of OR'ed.
 1516 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
 1517  AArch64CC::CondCode &CondCode,
 1518  AArch64CC::CondCode &CondCode2) {
1519  CondCode2 = AArch64CC::AL;
1520  switch (CC) {
1521  default:
1522  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1523  assert(CondCode2 == AArch64CC::AL);
1524  break;
1525  case ISD::SETONE:
1526  // (a one b)
1527  // == ((a olt b) || (a ogt b))
1528  // == ((a ord b) && (a une b))
1529  CondCode = AArch64CC::VC;
1530  CondCode2 = AArch64CC::NE;
1531  break;
1532  case ISD::SETUEQ:
1533  // (a ueq b)
1534  // == ((a uno b) || (a oeq b))
1535  // == ((a ule b) && (a uge b))
1536  CondCode = AArch64CC::PL;
1537  CondCode2 = AArch64CC::LE;
1538  break;
1539  }
1540 }
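// Example of the mapping above (added for illustration, not in the original
// source): for SETONE the caller tests VC ("ordered") on one compare and NE on
// a chained conditional compare, so the predicate only holds when both AND'ed
// conditions are true, matching "(a ord b) && (a une b)".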
1541 
1542 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1543 /// CC usable with the vector instructions. Fewer operations are available
1544 /// without a real NZCV register, so we have to use less efficient combinations
1545 /// to get the same effect.
 1546 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
 1547  AArch64CC::CondCode &CondCode,
 1548  AArch64CC::CondCode &CondCode2,
1549  bool &Invert) {
1550  Invert = false;
1551  switch (CC) {
1552  default:
1553  // Mostly the scalar mappings work fine.
1554  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1555  break;
1556  case ISD::SETUO:
 1557  Invert = true;
 1558  LLVM_FALLTHROUGH;
1559  case ISD::SETO:
1560  CondCode = AArch64CC::MI;
1561  CondCode2 = AArch64CC::GE;
1562  break;
1563  case ISD::SETUEQ:
1564  case ISD::SETULT:
1565  case ISD::SETULE:
1566  case ISD::SETUGT:
1567  case ISD::SETUGE:
1568  // All of the compare-mask comparisons are ordered, but we can switch
1569  // between the two by a double inversion. E.g. ULE == !OGT.
1570  Invert = true;
1571  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1572  break;
1573  }
1574 }
1575 
1576 static bool isLegalArithImmed(uint64_t C) {
1577  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1578  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1579  LLVM_DEBUG(dbgs() << "Is imm " << C
1580  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1581  return IsLegal;
1582 }
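// Illustrative examples (added, not part of the original source): 0xFFF and
// 0xFFF000 both satisfy the check above (a 12-bit value, optionally shifted
// left by 12), while 0x1001 does not, since it neither fits in 12 bits nor has
// its low 12 bits clear.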
1583 
 1584 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
 1585 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
1586 // can be set differently by this operation. It comes down to whether
1587 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1588 // everything is fine. If not then the optimization is wrong. Thus general
1589 // comparisons are only valid if op2 != 0.
1590 //
1591 // So, finally, the only LLVM-native comparisons that don't mention C and V
1592 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1593 // the absence of information about op2.
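// A concrete case (added for illustration, not in the original source): with
// op1 == 0 and op2 == 0, "subs wzr, w0, #0" sets the carry flag (no borrow),
// while "adds wzr, w0, #0" leaves it clear, so an unsigned HS/LO test would
// disagree between CMP and CMN; EQ/NE only read the Z flag and are unaffected.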
1594 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1595  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1596  (CC == ISD::SETEQ || CC == ISD::SETNE);
1597 }
1598 
 1599 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 1600  const SDLoc &dl, SelectionDAG &DAG) {
1601  EVT VT = LHS.getValueType();
1602  const bool FullFP16 =
1603  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1604 
1605  if (VT.isFloatingPoint()) {
1606  assert(VT != MVT::f128);
1607  if (VT == MVT::f16 && !FullFP16) {
1608  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1609  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1610  VT = MVT::f32;
1611  }
1612  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1613  }
1614 
1615  // The CMP instruction is just an alias for SUBS, and representing it as
1616  // SUBS means that it's possible to get CSE with subtract operations.
1617  // A later phase can perform the optimization of setting the destination
1618  // register to WZR/XZR if it ends up being unused.
1619  unsigned Opcode = AArch64ISD::SUBS;
1620 
1621  if (isCMN(RHS, CC)) {
 1622  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1623  Opcode = AArch64ISD::ADDS;
1624  RHS = RHS.getOperand(1);
1625  } else if (isCMN(LHS, CC)) {
 1626  // As we are looking for EQ/NE compares, the operands can be commuted; can
 1627  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1628  Opcode = AArch64ISD::ADDS;
1629  LHS = LHS.getOperand(1);
1630  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1631  !isUnsignedIntSetCC(CC)) {
1632  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1633  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1634  // of the signed comparisons.
1635  Opcode = AArch64ISD::ANDS;
1636  RHS = LHS.getOperand(1);
1637  LHS = LHS.getOperand(0);
1638  }
1639 
1640  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1641  .getValue(1);
1642 }
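// For illustration (added, not from the original source): because the compare
// above is emitted as SUBS, a later "sub w2, w0, w1" in the same block can CSE
// with "cmp w0, w1"; when the subtraction result itself is unused, a later pass
// simply retargets the destination to WZR/XZR as noted in the comment above.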
1643 
1644 /// \defgroup AArch64CCMP CMP;CCMP matching
1645 ///
1646 /// These functions deal with the formation of CMP;CCMP;... sequences.
1647 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1648 /// a comparison. They set the NZCV flags to a predefined value if their
1649 /// predicate is false. This allows to express arbitrary conjunctions, for
1650 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1651 /// expressed as:
1652 /// cmp A
1653 /// ccmp B, inv(CB), CA
1654 /// check for CB flags
1655 ///
1656 /// This naturally lets us implement chains of AND operations with SETCC
1657 /// operands. And we can even implement some other situations by transforming
1658 /// them:
1659 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1660 /// negating the flags used in a CCMP/FCCMP operations.
1661 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1662 /// by negating the flags we test for afterwards. i.e.
1663 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1664 /// - Note that we can only ever negate all previously processed results.
 1665 /// What we cannot implement by flipping the flags to test is a negation
1666 /// of two sub-trees (because the negation affects all sub-trees emitted so
1667 /// far, so the 2nd sub-tree we emit would also affect the first).
1668 /// With those tools we can implement some OR operations:
1669 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1670 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1671 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1672 /// elimination rules from earlier to implement the whole thing as a
1673 /// CCMP/FCCMP chain.
1674 ///
1675 /// As complete example:
1676 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1677 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1678 /// can be reassociated to:
1679 /// or (and (setCC (cmp C)) setCD (cmp D))
 1680 /// (or (setCA (cmp A)) (setCB (cmp B)))
1681 /// can be transformed to:
1682 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1683 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1684 /// which can be implemented as:
1685 /// cmp C
1686 /// ccmp D, inv(CD), CC
1687 /// ccmp A, CA, inv(CD)
1688 /// ccmp B, CB, inv(CA)
1689 /// check for CB flags
1690 ///
1691 /// A counterexample is "or (and A B) (and C D)" which translates to
1692 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1693 /// can only implement 1 of the inner (not) operations, but not both!
1694 /// @{
1695 
1696 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
 1697 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
 1698  ISD::CondCode CC, SDValue CCOp,
 1699  AArch64CC::CondCode Predicate,
 1700  AArch64CC::CondCode OutCC,
 1701  const SDLoc &DL, SelectionDAG &DAG) {
1702  unsigned Opcode = 0;
1703  const bool FullFP16 =
1704  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1705 
1706  if (LHS.getValueType().isFloatingPoint()) {
1707  assert(LHS.getValueType() != MVT::f128);
1708  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1709  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1710  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1711  }
1712  Opcode = AArch64ISD::FCCMP;
1713  } else if (RHS.getOpcode() == ISD::SUB) {
1714  SDValue SubOp0 = RHS.getOperand(0);
1715  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1716  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1717  Opcode = AArch64ISD::CCMN;
1718  RHS = RHS.getOperand(1);
1719  }
1720  }
1721  if (Opcode == 0)
1722  Opcode = AArch64ISD::CCMP;
1723 
 1724  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
 1725  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1726  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1727  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1728  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1729 }
1730 
1731 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1732 /// expressed as a conjunction. See \ref AArch64CCMP.
1733 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1734 /// changing the conditions on the SETCC tests.
1735 /// (this means we can call emitConjunctionRec() with
1736 /// Negate==true on this sub-tree)
1737 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1738 /// cannot do the negation naturally. We are required to
1739 /// emit the subtree first in this case.
1740 /// \param WillNegate Is true if are called when the result of this
1741 /// subexpression must be negated. This happens when the
1742 /// outer expression is an OR. We can use this fact to know
1743 /// that we have a double negation (or (or ...) ...) that
1744 /// can be implemented for free.
1745 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1746  bool &MustBeFirst, bool WillNegate,
1747  unsigned Depth = 0) {
1748  if (!Val.hasOneUse())
1749  return false;
1750  unsigned Opcode = Val->getOpcode();
1751  if (Opcode == ISD::SETCC) {
1752  if (Val->getOperand(0).getValueType() == MVT::f128)
1753  return false;
1754  CanNegate = true;
1755  MustBeFirst = false;
1756  return true;
1757  }
1758  // Protect against exponential runtime and stack overflow.
1759  if (Depth > 6)
1760  return false;
1761  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1762  bool IsOR = Opcode == ISD::OR;
1763  SDValue O0 = Val->getOperand(0);
1764  SDValue O1 = Val->getOperand(1);
1765  bool CanNegateL;
1766  bool MustBeFirstL;
1767  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1768  return false;
1769  bool CanNegateR;
1770  bool MustBeFirstR;
1771  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1772  return false;
1773 
1774  if (MustBeFirstL && MustBeFirstR)
1775  return false;
1776 
1777  if (IsOR) {
1778  // For an OR expression we need to be able to naturally negate at least
1779  // one side or we cannot do the transformation at all.
1780  if (!CanNegateL && !CanNegateR)
1781  return false;
 1782  // If the result of the OR will be negated and we can naturally negate
 1783  // the leaves, then this sub-tree as a whole negates naturally.
1784  CanNegate = WillNegate && CanNegateL && CanNegateR;
1785  // If we cannot naturally negate the whole sub-tree, then this must be
1786  // emitted first.
1787  MustBeFirst = !CanNegate;
1788  } else {
1789  assert(Opcode == ISD::AND && "Must be OR or AND");
1790  // We cannot naturally negate an AND operation.
1791  CanNegate = false;
1792  MustBeFirst = MustBeFirstL || MustBeFirstR;
1793  }
1794  return true;
1795  }
1796  return false;
1797 }
1798 
1799 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
 1800 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
 1801 /// Tries to transform the given i1 producing node @p Val to a series of compare
1802 /// and conditional compare operations. @returns an NZCV flags producing node
1803 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1804 /// transformation was not possible.
 1805 /// \p Negate is true if we want this sub-tree to be negated just by changing
1806 /// SETCC conditions.
 1807 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
 1808  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
 1809  AArch64CC::CondCode Predicate) {
1810  // We're at a tree leaf, produce a conditional comparison operation.
1811  unsigned Opcode = Val->getOpcode();
1812  if (Opcode == ISD::SETCC) {
1813  SDValue LHS = Val->getOperand(0);
1814  SDValue RHS = Val->getOperand(1);
1815  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1816  bool isInteger = LHS.getValueType().isInteger();
1817  if (Negate)
1818  CC = getSetCCInverse(CC, isInteger);
1819  SDLoc DL(Val);
1820  // Determine OutCC and handle FP special case.
1821  if (isInteger) {
1822  OutCC = changeIntCCToAArch64CC(CC);
1823  } else {
1825  AArch64CC::CondCode ExtraCC;
1826  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1827  // Some floating point conditions can't be tested with a single condition
1828  // code. Construct an additional comparison in this case.
1829  if (ExtraCC != AArch64CC::AL) {
1830  SDValue ExtraCmp;
1831  if (!CCOp.getNode())
1832  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1833  else
1834  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1835  ExtraCC, DL, DAG);
1836  CCOp = ExtraCmp;
1837  Predicate = ExtraCC;
1838  }
1839  }
1840 
1841  // Produce a normal comparison if we are first in the chain
1842  if (!CCOp)
1843  return emitComparison(LHS, RHS, CC, DL, DAG);
1844  // Otherwise produce a ccmp.
1845  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1846  DAG);
1847  }
1848  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1849 
1850  bool IsOR = Opcode == ISD::OR;
1851 
1852  SDValue LHS = Val->getOperand(0);
1853  bool CanNegateL;
1854  bool MustBeFirstL;
1855  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1856  assert(ValidL && "Valid conjunction/disjunction tree");
1857  (void)ValidL;
1858 
1859  SDValue RHS = Val->getOperand(1);
1860  bool CanNegateR;
1861  bool MustBeFirstR;
1862  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1863  assert(ValidR && "Valid conjunction/disjunction tree");
1864  (void)ValidR;
1865 
1866  // Swap sub-tree that must come first to the right side.
1867  if (MustBeFirstL) {
1868  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1869  std::swap(LHS, RHS);
1870  std::swap(CanNegateL, CanNegateR);
1871  std::swap(MustBeFirstL, MustBeFirstR);
1872  }
1873 
1874  bool NegateR;
1875  bool NegateAfterR;
1876  bool NegateL;
1877  bool NegateAfterAll;
1878  if (Opcode == ISD::OR) {
1879  // Swap the sub-tree that we can negate naturally to the left.
1880  if (!CanNegateL) {
1881  assert(CanNegateR && "at least one side must be negatable");
1882  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1883  assert(!Negate);
1884  std::swap(LHS, RHS);
1885  NegateR = false;
1886  NegateAfterR = true;
1887  } else {
1888  // Negate the left sub-tree if possible, otherwise negate the result.
1889  NegateR = CanNegateR;
1890  NegateAfterR = !CanNegateR;
1891  }
1892  NegateL = true;
1893  NegateAfterAll = !Negate;
1894  } else {
1895  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1896  assert(!Negate && "Valid conjunction/disjunction tree");
1897 
1898  NegateL = false;
1899  NegateR = false;
1900  NegateAfterR = false;
1901  NegateAfterAll = false;
1902  }
1903 
1904  // Emit sub-trees.
1905  AArch64CC::CondCode RHSCC;
1906  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1907  if (NegateAfterR)
1908  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1909  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1910  if (NegateAfterAll)
1911  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1912  return CmpL;
1913 }
1914 
 1915 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
1916 /// In some cases this is even possible with OR operations in the expression.
1917 /// See \ref AArch64CCMP.
1918 /// \see emitConjunctionRec().
 1919 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
 1920  AArch64CC::CondCode &OutCC) {
1921  bool DummyCanNegate;
1922  bool DummyMustBeFirst;
1923  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1924  return SDValue();
1925 
1926  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1927 }
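// Usage sketch (added for illustration, not from the original source): for a
// value such as (and (setcc a, b, setlt) (setcc c, d, setgt)), this produces
// "cmp c, d" followed by a CCMP of a and b predicated on GT, and sets OutCC to
// LT, so the caller checks a single final condition instead of materializing
// two i1 values.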
1928 
1929 /// @}
1930 
1931 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1932 /// extension operations.
 1933 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
 1934  auto isSupportedExtend = [&](SDValue V) {
1935  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1936  return true;
1937 
1938  if (V.getOpcode() == ISD::AND)
1939  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1940  uint64_t Mask = MaskCst->getZExtValue();
1941  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1942  }
1943 
1944  return false;
1945  };
1946 
1947  if (!Op.hasOneUse())
1948  return 0;
1949 
1950  if (isSupportedExtend(Op))
1951  return 1;
1952 
1953  unsigned Opc = Op.getOpcode();
1954  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1955  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1956  uint64_t Shift = ShiftCst->getZExtValue();
1957  if (isSupportedExtend(Op.getOperand(0)))
1958  return (Shift <= 4) ? 2 : 1;
1959  EVT VT = Op.getValueType();
1960  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1961  return 1;
1962  }
1963 
1964  return 0;
1965 }
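// Rough scoring examples (added for illustration, not in the original source):
// (shl (sign_extend_inreg x, i8), 2) returns 2 because both the sxtb and the
// small shift can fold into the compare operand, a bare (shl x, 5) on an i32
// returns 1, and any operand with more than one use returns 0.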
1966 
 1967 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 1968  SDValue &AArch64cc, SelectionDAG &DAG,
1969  const SDLoc &dl) {
1970  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1971  EVT VT = RHS.getValueType();
1972  uint64_t C = RHSC->getZExtValue();
1973  if (!isLegalArithImmed(C)) {
1974  // Constant does not fit, try adjusting it by one?
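 // For example (illustrative, not in the original source): #4097 cannot be
 // encoded, but "x < 4097" is equivalent to "x <= 4096", and #4096 is a legal
 // shifted 12-bit immediate, so the SETLT case below rewrites the comparison
 // instead of materializing the constant in a register.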
1975  switch (CC) {
1976  default:
1977  break;
1978  case ISD::SETLT:
1979  case ISD::SETGE:
1980  if ((VT == MVT::i32 && C != 0x80000000 &&
1981  isLegalArithImmed((uint32_t)(C - 1))) ||
1982  (VT == MVT::i64 && C != 0x80000000ULL &&
1983  isLegalArithImmed(C - 1ULL))) {
1984  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1985  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1986  RHS = DAG.getConstant(C, dl, VT);
1987  }
1988  break;
1989  case ISD::SETULT:
1990  case ISD::SETUGE:
1991  if ((VT == MVT::i32 && C != 0 &&
1992  isLegalArithImmed((uint32_t)(C - 1))) ||
1993  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1994  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1995  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1996  RHS = DAG.getConstant(C, dl, VT);
1997  }
1998  break;
1999  case ISD::SETLE:
2000  case ISD::SETGT:
2001  if ((VT == MVT::i32 && C != INT32_MAX &&
2002  isLegalArithImmed((uint32_t)(C + 1))) ||
2003  (VT == MVT::i64 && C != INT64_MAX &&
2004  isLegalArithImmed(C + 1ULL))) {
2005  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2006  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2007  RHS = DAG.getConstant(C, dl, VT);
2008  }
2009  break;
2010  case ISD::SETULE:
2011  case ISD::SETUGT:
2012  if ((VT == MVT::i32 && C != UINT32_MAX &&
2013  isLegalArithImmed((uint32_t)(C + 1))) ||
2014  (VT == MVT::i64 && C != UINT64_MAX &&
2015  isLegalArithImmed(C + 1ULL))) {
2016  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2017  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2018  RHS = DAG.getConstant(C, dl, VT);
2019  }
2020  break;
2021  }
2022  }
2023  }
2024 
2025  // Comparisons are canonicalized so that the RHS operand is simpler than the
2026  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2027  // can fold some shift+extend operations on the RHS operand, so swap the
2028  // operands if that can be done.
2029  //
2030  // For example:
2031  // lsl w13, w11, #1
2032  // cmp w13, w12
2033  // can be turned into:
2034  // cmp w12, w11, lsl #1
2035  if (!isa<ConstantSDNode>(RHS) ||
2036  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2037  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2038 
 2039  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
 2040  std::swap(LHS, RHS);
 2041  CC = ISD::getSetCCSwappedOperands(CC);
 2042  }
2043  }
2044 
2045  SDValue Cmp;
2046  AArch64CC::CondCode AArch64CC;
2047  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2048  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2049 
2050  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2051  // For the i8 operand, the largest immediate is 255, so this can be easily
2052  // encoded in the compare instruction. For the i16 operand, however, the
2053  // largest immediate cannot be encoded in the compare.
2054  // Therefore, use a sign extending load and cmn to avoid materializing the
2055  // -1 constant. For example,
2056  // movz w1, #65535
2057  // ldrh w0, [x0, #0]
2058  // cmp w0, w1
2059  // >
2060  // ldrsh w0, [x0, #0]
2061  // cmn w0, #1
 2062  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2063  // if and only if (sext LHS) == (sext RHS). The checks are in place to
2064  // ensure both the LHS and RHS are truly zero extended and to make sure the
2065  // transformation is profitable.
2066  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2067  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2068  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2069  LHS.getNode()->hasNUsesOfValue(1, 0)) {
2070  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2071  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2072  SDValue SExt =
2073  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2074  DAG.getValueType(MVT::i16));
2075  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2076  RHS.getValueType()),
2077  CC, dl, DAG);
2078  AArch64CC = changeIntCCToAArch64CC(CC);
2079  }
2080  }
2081 
2082  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2083  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2084  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2085  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2086  }
2087  }
2088  }
2089 
2090  if (!Cmp) {
2091  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2092  AArch64CC = changeIntCCToAArch64CC(CC);
2093  }
2094  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2095  return Cmp;
2096 }
2097 
2098 static std::pair<SDValue, SDValue>
2099 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2100  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2101  "Unsupported value type");
2102  SDValue Value, Overflow;
2103  SDLoc DL(Op);
2104  SDValue LHS = Op.getOperand(0);
2105  SDValue RHS = Op.getOperand(1);
2106  unsigned Opc = 0;
2107  switch (Op.getOpcode()) {
2108  default:
2109  llvm_unreachable("Unknown overflow instruction!");
2110  case ISD::SADDO:
2111  Opc = AArch64ISD::ADDS;
2112  CC = AArch64CC::VS;
2113  break;
2114  case ISD::UADDO:
2115  Opc = AArch64ISD::ADDS;
2116  CC = AArch64CC::HS;
2117  break;
2118  case ISD::SSUBO:
2119  Opc = AArch64ISD::SUBS;
2120  CC = AArch64CC::VS;
2121  break;
2122  case ISD::USUBO:
2123  Opc = AArch64ISD::SUBS;
2124  CC = AArch64CC::LO;
2125  break;
2126  // Multiply needs a little bit extra work.
2127  case ISD::SMULO:
2128  case ISD::UMULO: {
2129  CC = AArch64CC::NE;
2130  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2131  if (Op.getValueType() == MVT::i32) {
2132  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2133  // For a 32 bit multiply with overflow check we want the instruction
2134  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2135  // need to generate the following pattern:
2136  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2137  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2138  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2139  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2140  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2141  DAG.getConstant(0, DL, MVT::i64));
2142  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2143  // operation. We need to clear out the upper 32 bits, because we used a
2144  // widening multiply that wrote all 64 bits. In the end this should be a
2145  // noop.
2146  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2147  if (IsSigned) {
2148  // The signed overflow check requires more than just a simple check for
2149  // any bit set in the upper 32 bits of the result. These bits could be
2150  // just the sign bits of a negative number. To perform the overflow
2151  // check we arithmetically shift the low 32 bits of the result right by
2152  // 31 bits (replicating the sign bit) and compare that to the upper 32 bits.
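  // Illustrative example: a product of -1 has upper bits 0xFFFFFFFF and
  // (lower >> 31) == 0xFFFFFFFF, so the SUBS below compares equal and no
  // overflow is flagged; a product of 0x80000000 has upper bits 0 but
  // (lower >> 31) == 0xFFFFFFFF, so NE is set and overflow is reported.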
2153  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2154  DAG.getConstant(32, DL, MVT::i64));
2155  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2156  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2157  DAG.getConstant(31, DL, MVT::i64));
2158  // It is important that LowerBits is last, otherwise the arithmetic
2159  // shift will not be folded into the compare (SUBS).
2160  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2161  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2162  .getValue(1);
2163  } else {
2164  // The overflow check for unsigned multiply is easy. We only need to
2165  // check if any of the upper 32 bits are set. This can be done with a
2166  // CMP (shifted register). For that we need to generate the following
2167  // pattern:
2168  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
2169  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2170  DAG.getConstant(32, DL, MVT::i64));
2171  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2172  Overflow =
2173  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2174  DAG.getConstant(0, DL, MVT::i64),
2175  UpperBits).getValue(1);
2176  }
2177  break;
2178  }
2179  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2180  // For the 64 bit multiply, the overflow check uses the high half of the product.
2181  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2182  if (IsSigned) {
2183  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2184  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2185  DAG.getConstant(63, DL, MVT::i64));
2186  // It is important that LowerBits is last, otherwise the arithmetic
2187  // shift will not be folded into the compare (SUBS).
2188  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2189  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2190  .getValue(1);
2191  } else {
2192  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2193  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2194  Overflow =
2195  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2196  DAG.getConstant(0, DL, MVT::i64),
2197  UpperBits).getValue(1);
2198  }
2199  break;
2200  }
2201  } // switch (...)
2202 
2203  if (Opc) {
2204  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2205 
2206  // Emit the AArch64 operation with overflow check.
2207  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2208  Overflow = Value.getValue(1);
2209  }
2210  return std::make_pair(Value, Overflow);
2211 }
2212 
2213 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2214  RTLIB::Libcall Call) const {
2215  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2216  MakeLibCallOptions CallOptions;
2217  return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
2218 }
2219 
2220 // Returns true if the given Op is the overflow flag result of an overflow
2221 // intrinsic operation.
2222 static bool isOverflowIntrOpRes(SDValue Op) {
2223  unsigned Opc = Op.getOpcode();
2224  return (Op.getResNo() == 1 &&
2225  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2226  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2227 }
2228 
2229 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2230  SDValue Sel = Op.getOperand(0);
2231  SDValue Other = Op.getOperand(1);
2232  SDLoc dl(Sel);
2233 
2234  // If the operand is an overflow checking operation, invert the condition
2235  // code and kill the Not operation. I.e., transform:
2236  // (xor (overflow_op_bool, 1))
2237  // -->
2238  // (csel 1, 0, invert(cc), overflow_op_bool)
2239  // ... which later gets transformed to just a cset instruction with an
2240  // inverted condition code, rather than a cset + eor sequence.
2241  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2242  // Only lower legal XALUO ops.
2243  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2244  return SDValue();
2245 
2246  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2247  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2248  AArch64CC::CondCode CC;
2249  SDValue Value, Overflow;
2250  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2251  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2252  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2253  CCVal, Overflow);
2254  }
2255  // If neither operand is a SELECT_CC, give up.
2256  if (Sel.getOpcode() != ISD::SELECT_CC)
2257  std::swap(Sel, Other);
2258  if (Sel.getOpcode() != ISD::SELECT_CC)
2259  return Op;
2260 
2261  // The folding we want to perform is:
2262  // (xor x, (select_cc a, b, cc, 0, -1) )
2263  // -->
2264  // (csel x, (xor x, -1), cc ...)
2265  //
2266  // The latter will get matched to a CSINV instruction.
2267 
2268  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2269  SDValue LHS = Sel.getOperand(0);
2270  SDValue RHS = Sel.getOperand(1);
2271  SDValue TVal = Sel.getOperand(2);
2272  SDValue FVal = Sel.getOperand(3);
2273 
2274  // FIXME: This could be generalized to non-integer comparisons.
2275  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2276  return Op;
2277 
2278  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2279  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2280 
2281  // The values aren't constants, this isn't the pattern we're looking for.
2282  if (!CFVal || !CTVal)
2283  return Op;
2284 
2285  // We can commute the SELECT_CC by inverting the condition. This
2286  // might be needed to make this fit into a CSINV pattern.
2287  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2288  std::swap(TVal, FVal);
2289  std::swap(CTVal, CFVal);
2290  CC = ISD::getSetCCInverse(CC, true);
2291  }
2292 
2293  // If the constants line up, perform the transform!
2294  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2295  SDValue CCVal;
2296  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2297 
2298  FVal = Other;
2299  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2300  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2301 
2302  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2303  CCVal, Cmp);
2304  }
2305 
2306  return Op;
2307 }
2308 
2309 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2310  EVT VT = Op.getValueType();
2311 
2312  // Let legalize expand this if it isn't a legal type yet.
2313  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2314  return SDValue();
2315 
2316  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2317 
2318  unsigned Opc;
2319  bool ExtraOp = false;
2320  switch (Op.getOpcode()) {
2321  default:
2322  llvm_unreachable("Invalid code");
2323  case ISD::ADDC:
2324  Opc = AArch64ISD::ADDS;
2325  break;
2326  case ISD::SUBC:
2327  Opc = AArch64ISD::SUBS;
2328  break;
2329  case ISD::ADDE:
2330  Opc = AArch64ISD::ADCS;
2331  ExtraOp = true;
2332  break;
2333  case ISD::SUBE:
2334  Opc = AArch64ISD::SBCS;
2335  ExtraOp = true;
2336  break;
2337  }
2338 
2339  if (!ExtraOp)
2340  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2341  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2342  Op.getOperand(2));
2343 }
2344 
2345 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2346  // Let legalize expand this if it isn't a legal type yet.
2347  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2348  return SDValue();
2349 
2350  SDLoc dl(Op);
2351  AArch64CC::CondCode CC;
2352  // The actual operation that sets the overflow or carry flag.
2353  SDValue Value, Overflow;
2354  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2355 
2356  // We use 0 and 1 as false and true values.
2357  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2358  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2359 
2360  // We use an inverted condition, because the conditional select is inverted
2361  // too. This will allow it to be selected to a single instruction:
2362  // CSINC Wd, WZR, WZR, invert(cond).
2363  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2364  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2365  CCVal, Overflow);
2366 
2367  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2368  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2369 }
2370 
2371 // Prefetch operands are:
2372 // 1: Address to prefetch
2373 // 2: bool isWrite
2374 // 3: int locality (0 = no locality ... 3 = extreme locality)
2375 // 4: bool isDataCache
2376 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2377  SDLoc DL(Op);
2378  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2379  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2380  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2381 
2382  bool IsStream = !Locality;
2383  // When the locality hint is nonzero
2384  if (Locality) {
2385  // The front-end should have filtered out the out-of-range values
2386  assert(Locality <= 3 && "Prefetch locality out-of-range");
2387  // The locality degree runs in the opposite direction to the cache level
2388  // encoding, which starts at 0 for L1, so flip the number around before
2389  // encoding it.
2390  Locality = 3 - Locality;
2391  }
2392 
2393  // Build the mask value encoding the expected behavior.
2394  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2395  (!IsData << 3) | // IsDataCache bit
2396  (Locality << 1) | // Cache level bits
2397  (unsigned)IsStream; // Stream bit
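  // Illustrative example: a read data prefetch with locality 3 (keep in L1)
  // has IsWrite=0, IsData=1 and an inverted Locality of 0, so PrfOp encodes
  // as 0b00000, i.e. PLDL1KEEP.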
2398  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2399  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2400 }
2401 
2402 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2403  SelectionDAG &DAG) const {
2404  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2405 
2406  RTLIB::Libcall LC;
2407  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2408 
2409  return LowerF128Call(Op, DAG, LC);
2410 }
2411 
2412 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2413  SelectionDAG &DAG) const {
2414  if (Op.getOperand(0).getValueType() != MVT::f128) {
2415  // It's legal except when f128 is involved
2416  return Op;
2417  }
2418 
2419  RTLIB::Libcall LC;
2420  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2421 
2422  // FP_ROUND node has a second operand indicating whether it is known to be
2423  // precise. That doesn't take part in the LibCall so we can't directly use
2424  // LowerF128Call.
2425  SDValue SrcVal = Op.getOperand(0);
2426  MakeLibCallOptions CallOptions;
2427  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
2428  SDLoc(Op)).first;
2429 }
2430 
2431 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2432  SelectionDAG &DAG) const {
2433  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2434  // Any additional optimization in this function should be recorded
2435  // in the cost tables.
2436  EVT InVT = Op.getOperand(0).getValueType();
2437  EVT VT = Op.getValueType();
2438  unsigned NumElts = InVT.getVectorNumElements();
2439 
2440  // f16 conversions are promoted to f32 when full fp16 is not supported.
2441  if (InVT.getVectorElementType() == MVT::f16 &&
2442  !Subtarget->hasFullFP16()) {
2443  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2444  SDLoc dl(Op);
2445  return DAG.getNode(
2446  Op.getOpcode(), dl, Op.getValueType(),
2447  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2448  }
2449 
2450  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2451  SDLoc dl(Op);
2452  SDValue Cv =
2453  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2454  Op.getOperand(0));
2455  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2456  }
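  // Illustrative example: an FP_TO_UINT from v4f32 to v4i16 is converted to
  // v4i32 above and then truncated to v4i16.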
2457 
2458  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2459  SDLoc dl(Op);
2460  MVT ExtVT =
2461  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2462  VT.getVectorNumElements());
2463  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2464  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2465  }
2466 
2467  // Type changing conversions are illegal.
2468  return Op;
2469 }
2470 
2471 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2472  SelectionDAG &DAG) const {
2473  if (Op.getOperand(0).getValueType().isVector())
2474  return LowerVectorFP_TO_INT(Op, DAG);
2475 
2476  // f16 conversions are promoted to f32 when full fp16 is not supported.
2477  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2478  !Subtarget->hasFullFP16()) {
2479  SDLoc dl(Op);
2480  return DAG.getNode(
2481  Op.getOpcode(), dl, Op.getValueType(),
2482  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2483  }
2484 
2485  if (Op.getOperand(0).getValueType() != MVT::f128) {
2486  // It's legal except when f128 is involved
2487  return Op;
2488  }
2489 
2490  RTLIB::Libcall LC;
2491  if (Op.getOpcode() == ISD::FP_TO_SINT)
2492  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2493  else
2494  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2495 
2496  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2497  MakeLibCallOptions CallOptions;
2498  return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
2499 }
2500 
2501 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
2502  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2503  // Any additional optimization in this function should be recorded
2504  // in the cost tables.
2505  EVT VT = Op.getValueType();
2506  SDLoc dl(Op);
2507  SDValue In = Op.getOperand(0);
2508  EVT InVT = In.getValueType();
2509 
2510  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2511  MVT CastVT =
2512  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2513  InVT.getVectorNumElements());
2514  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2515  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2516  }
2517 
2518  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2519  unsigned CastOpc =
2520  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2521  EVT CastVT = VT.changeVectorElementTypeToInteger();
2522  In = DAG.getNode(CastOpc, dl, CastVT, In);
2523  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2524  }
2525 
2526  return Op;
2527 }
2528 
2529 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2530  SelectionDAG &DAG) const {
2531  if (Op.getValueType().isVector())
2532  return LowerVectorINT_TO_FP(Op, DAG);
2533 
2534  // f16 conversions are promoted to f32 when full fp16 is not supported.
2535  if (Op.getValueType() == MVT::f16 &&
2536  !Subtarget->hasFullFP16()) {
2537  SDLoc dl(Op);
2538  return DAG.getNode(
2539  ISD::FP_ROUND, dl, MVT::f16,
2540  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2541  DAG.getIntPtrConstant(0, dl));
2542  }
2543 
2544  // i128 conversions are libcalls.
2545  if (Op.getOperand(0).getValueType() == MVT::i128)
2546  return SDValue();
2547 
2548  // Other conversions are legal, unless it's to the completely software-based
2549  // fp128.
2550  if (Op.getValueType() != MVT::f128)
2551  return Op;
2552 
2553  RTLIB::Libcall LC;
2554  if (Op.getOpcode() == ISD::SINT_TO_FP)
2555  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2556  else
2557  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2558 
2559  return LowerF128Call(Op, DAG, LC);
2560 }
2561 
2562 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2563  SelectionDAG &DAG) const {
2564  // For iOS, we want to call an alternative entry point: __sincos_stret,
2565  // which returns the values in two S / D registers.
2566  SDLoc dl(Op);
2567  SDValue Arg = Op.getOperand(0);
2568  EVT ArgVT = Arg.getValueType();
2569  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2570 
2571  ArgListTy Args;
2572  ArgListEntry Entry;
2573 
2574  Entry.Node = Arg;
2575  Entry.Ty = ArgTy;
2576  Entry.IsSExt = false;
2577  Entry.IsZExt = false;
2578  Args.push_back(Entry);
2579 
2580  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2581  : RTLIB::SINCOS_STRET_F32;
2582  const char *LibcallName = getLibcallName(LC);
2583  SDValue Callee =
2584  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2585 
2586  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2587  TargetLowering::CallLoweringInfo CLI(DAG);
2588  CLI.setDebugLoc(dl)
2589  .setChain(DAG.getEntryNode())
2590  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2591 
2592  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2593  return CallResult.first;
2594 }
2595 
2596 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2597  if (Op.getValueType() != MVT::f16)
2598  return SDValue();
2599 
2600  assert(Op.getOperand(0).getValueType() == MVT::i16);
2601  SDLoc DL(Op);
2602 
2603  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2604  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2605  return SDValue(
2606  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2607  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2608  0);
2609 }
2610 
2611 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2612  if (OrigVT.getSizeInBits() >= 64)
2613  return OrigVT;
2614 
2615  assert(OrigVT.isSimple() && "Expecting a simple value type");
2616 
2617  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2618  switch (OrigSimpleTy) {
2619  default: llvm_unreachable("Unexpected Vector Type");
2620  case MVT::v2i8:
2621  case MVT::v2i16:
2622  return MVT::v2i32;
2623  case MVT::v4i8:
2624  return MVT::v4i16;
2625  }
2626 }
2627 
2628 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2629  const EVT &OrigTy,
2630  const EVT &ExtTy,
2631  unsigned ExtOpcode) {
2632  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2633  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2634  // 64-bits we need to insert a new extension so that it will be 64-bits.
2635  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2636  if (OrigTy.getSizeInBits() >= 64)
2637  return N;
2638 
2639  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2640  EVT NewVT = getExtensionTo64Bits(OrigTy);
2641 
2642  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2643 }
2644 
2645 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2646  bool isSigned) {
2647  EVT VT = N->getValueType(0);
2648 
2649  if (N->getOpcode() != ISD::BUILD_VECTOR)
2650  return false;
2651 
2652  for (const SDValue &Elt : N->op_values()) {
2653  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2654  unsigned EltSize = VT.getScalarSizeInBits();
2655  unsigned HalfSize = EltSize / 2;
2656  if (isSigned) {
2657  if (!isIntN(HalfSize, C->getSExtValue()))
2658  return false;
2659  } else {
2660  if (!isUIntN(HalfSize, C->getZExtValue()))
2661  return false;
2662  }
2663  continue;
2664  }
2665  return false;
2666  }
2667 
2668  return true;
2669 }
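// Illustrative example: a v4i32 BUILD_VECTOR whose constants all fit in
// 16 bits (signed or unsigned, as requested) is treated as extended and can
// therefore feed the SMULL/UMULL patterns below.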
2670 
2671 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2672  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2673  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2674  N->getOperand(0)->getValueType(0),
2675  N->getValueType(0),
2676  N->getOpcode());
2677 
2678  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2679  EVT VT = N->getValueType(0);
2680  SDLoc dl(N);
2681  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2682  unsigned NumElts = VT.getVectorNumElements();
2683  MVT TruncVT = MVT::getIntegerVT(EltSize);
2684  SmallVector<SDValue, 8> Ops;
2685  for (unsigned i = 0; i != NumElts; ++i) {
2686  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2687  const APInt &CInt = C->getAPIntValue();
2688  // Element types smaller than 32 bits are not legal, so use i32 elements.
2689  // The values are implicitly truncated so sext vs. zext doesn't matter.
2690  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2691  }
2692  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2693 }
2694 
2695 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2696  return N->getOpcode() == ISD::SIGN_EXTEND ||
2697  isExtendedBUILD_VECTOR(N, DAG, true);
2698 }
2699 
2700 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2701  return N->getOpcode() == ISD::ZERO_EXTEND ||
2702  isExtendedBUILD_VECTOR(N, DAG, false);
2703 }
2704 
2705 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2706  unsigned Opcode = N->getOpcode();
2707  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2708  SDNode *N0 = N->getOperand(0).getNode();
2709  SDNode *N1 = N->getOperand(1).getNode();
2710  return N0->hasOneUse() && N1->hasOneUse() &&
2711  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2712  }
2713  return false;
2714 }
2715 
2716 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2717  unsigned Opcode = N->getOpcode();
2718  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2719  SDNode *N0 = N->getOperand(0).getNode();
2720  SDNode *N1 = N->getOperand(1).getNode();
2721  return N0->hasOneUse() && N1->hasOneUse() &&
2722  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2723  }
2724  return false;
2725 }
2726 
2727 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2728  SelectionDAG &DAG) const {
2729  // The rounding mode is in bits 23:22 of the FPCR.
2730  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
2731  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
2732  // so that the shift + and get folded into a bitfield extract.
2733  SDLoc dl(Op);
2734 
2735  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2736  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2737  MVT::i64));
2738  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2739  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2740  DAG.getConstant(1U << 22, dl, MVT::i32));
2741  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2742  DAG.getConstant(22, dl, MVT::i32));
2743  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2744  DAG.getConstant(3, dl, MVT::i32));
2745 }
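// Illustrative example: with the FPCR RMode field equal to 0b01 (round
// towards +infinity), ((FPCR + (1 << 22)) >> 22) & 3 evaluates to 2, which
// is the FLT_ROUNDS value for upward rounding.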
2746 
2747 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2748  // Multiplications are only custom-lowered for 128-bit vectors so that
2749  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2750  EVT VT = Op.getValueType();
2751  assert(VT.is128BitVector() && VT.isInteger() &&
2752  "unexpected type for custom-lowering ISD::MUL");
2753  SDNode *N0 = Op.getOperand(0).getNode();
2754  SDNode *N1 = Op.getOperand(1).getNode();
2755  unsigned NewOpc = 0;
2756  bool isMLA = false;
2757  bool isN0SExt = isSignExtended(N0, DAG);
2758  bool isN1SExt = isSignExtended(N1, DAG);
2759  if (isN0SExt && isN1SExt)
2760  NewOpc = AArch64ISD::SMULL;
2761  else {
2762  bool isN0ZExt = isZeroExtended(N0, DAG);
2763  bool isN1ZExt = isZeroExtended(N1, DAG);
2764  if (isN0ZExt && isN1ZExt)
2765  NewOpc = AArch64ISD::UMULL;
2766  else if (isN1SExt || isN1ZExt) {
2767  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2768  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2769  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2770  NewOpc = AArch64ISD::SMULL;
2771  isMLA = true;
2772  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2773  NewOpc = AArch64ISD::UMULL;
2774  isMLA = true;
2775  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2776  std::swap(N0, N1);
2777  NewOpc = AArch64ISD::UMULL;
2778  isMLA = true;
2779  }
2780  }
2781 
2782  if (!NewOpc) {
2783  if (VT == MVT::v2i64)
2784  // Fall through to expand this. It is not legal.
2785  return SDValue();
2786  else
2787  // Other vector multiplications are legal.
2788  return Op;
2789  }
2790  }
2791 
2792  // Legalize to a S/UMULL instruction
2793  SDLoc DL(Op);
2794  SDValue Op0;
2795  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2796  if (!isMLA) {
2797  Op0 = skipExtensionForVectorMULL(N0, DAG);
2798  assert(Op0.getValueType().is64BitVector() &&
2799  Op1.getValueType().is64BitVector() &&
2800  "unexpected types for extended operands to VMULL");
2801  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2802  }
2803  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2804  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2805  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2806  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2807  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2808  EVT Op1VT = Op1.getValueType();
2809  return DAG.getNode(N0->getOpcode(), DL, VT,
2810  DAG.getNode(NewOpc, DL, VT,
2811  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2812  DAG.getNode(NewOpc, DL, VT,
2813  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2814 }
2815 
2816 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2817  SelectionDAG &DAG) const {
2818  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2819  SDLoc dl(Op);
2820  switch (IntNo) {
2821  default: return SDValue(); // Don't custom lower most intrinsics.
2822  case Intrinsic::thread_pointer: {
2823  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2824  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2825  }
2826  case Intrinsic::aarch64_neon_abs: {
2827  EVT Ty = Op.getValueType();
2828  if (Ty == MVT::i64) {
2829  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2830  Op.getOperand(1));
2831  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2832  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2833  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2834  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2835  } else {
2836  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2837  }
2838  }
2839  case Intrinsic::aarch64_neon_smax:
2840  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2841  Op.getOperand(1), Op.getOperand(2));
2842  case Intrinsic::aarch64_neon_umax:
2843  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2844  Op.getOperand(1), Op.getOperand(2));
2845  case Intrinsic::aarch64_neon_smin:
2846  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2847  Op.getOperand(1), Op.getOperand(2));
2848  case Intrinsic::aarch64_neon_umin:
2849  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2850  Op.getOperand(1), Op.getOperand(2));
2851 
2852  case Intrinsic::aarch64_sve_sunpkhi:
2853  return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
2854  Op.getOperand(1));
2855  case Intrinsic::aarch64_sve_sunpklo:
2856  return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
2857  Op.getOperand(1));
2858  case Intrinsic::aarch64_sve_uunpkhi:
2859  return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
2860  Op.getOperand(1));
2861  case Intrinsic::aarch64_sve_uunpklo:
2862  return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
2863  Op.getOperand(1));
2864 
2865  case Intrinsic::localaddress: {
2866  const auto &MF = DAG.getMachineFunction();
2867  const auto *RegInfo = Subtarget->getRegisterInfo();
2868  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2869  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2870  Op.getSimpleValueType());
2871  }
2872 
2873  case Intrinsic::eh_recoverfp: {
2874  // FIXME: This needs to be implemented to correctly handle highly aligned
2875  // stack objects. For now we simply return the incoming FP. Refer D53541
2876  // for more details.
2877  SDValue FnOp = Op.getOperand(1);
2878  SDValue IncomingFPOp = Op.getOperand(2);
2879  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2880  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2881  if (!Fn)
2882  report_fatal_error(
2883  "llvm.eh.recoverfp must take a function as the first argument");
2884  return IncomingFPOp;
2885  }
2886  }
2887 }
2888 
2889 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2890 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2891  EVT VT, EVT MemVT,
2892  SelectionDAG &DAG) {
2893  assert(VT.isVector() && "VT should be a vector type");
2894  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2895 
2896  SDValue Value = ST->getValue();
2897 
2898  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
2899  // the word lane which represents the v4i8 subvector. It optimizes the store
2900  // to:
2901  //
2902  // xtn v0.8b, v0.8h
2903  // str s0, [x0]
2904 
2905  SDValue Undef = DAG.getUNDEF(MVT::i16);
2906  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2907  {Undef, Undef, Undef, Undef});
2908 
2909  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2910  Value, UndefVec);
2911  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2912 
2913  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2914  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2915  Trunc, DAG.getConstant(0, DL, MVT::i64));
2916 
2917  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2918  ST->getBasePtr(), ST->getMemOperand());
2919 }
2920 
2921 // Custom lowering for any store, vector or scalar, with or without
2922 // a truncate operation. Currently we only custom lower the truncating
2923 // store from vector v4i16 to v4i8.
2924 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2925  SelectionDAG &DAG) const {
2926  SDLoc Dl(Op);
2927  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2928  assert (StoreNode && "Can only custom lower store nodes");
2929 
2930  SDValue Value = StoreNode->getValue();
2931 
2932  EVT VT = Value.getValueType();
2933  EVT MemVT = StoreNode->getMemoryVT();
2934 
2935  assert (VT.isVector() && "Can only custom lower vector store types");
2936 
2937  unsigned AS = StoreNode->getAddressSpace();
2938  unsigned Align = StoreNode->getAlignment();
2939  if (Align < MemVT.getStoreSize() &&
2940  !allowsMisalignedMemoryAccesses(
2941  MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
2942  return scalarizeVectorStore(StoreNode, DAG);
2943  }
2944 
2945  if (StoreNode->isTruncatingStore()) {
2946  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2947  }
2948 
2949  return SDValue();
2950 }
2951 
2952 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2953  SelectionDAG &DAG) const {
2954  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2955  LLVM_DEBUG(Op.dump());
2956 
2957  switch (Op.getOpcode()) {
2958  default:
2959  llvm_unreachable("unimplemented operand");
2960  return SDValue();
2961  case ISD::BITCAST:
2962  return LowerBITCAST(Op, DAG);
2963  case ISD::GlobalAddress:
2964  return LowerGlobalAddress(Op, DAG);
2965  case ISD::GlobalTLSAddress:
2966  return LowerGlobalTLSAddress(Op, DAG);
2967  case ISD::SETCC:
2968  return LowerSETCC(Op, DAG);
2969  case ISD::BR_CC:
2970  return LowerBR_CC(Op, DAG);
2971  case ISD::SELECT:
2972  return LowerSELECT(Op, DAG);
2973  case ISD::SELECT_CC:
2974  return LowerSELECT_CC(Op, DAG);
2975  case ISD::JumpTable:
2976  return LowerJumpTable(Op, DAG);
2977  case ISD::BR_JT:
2978  return LowerBR_JT(Op, DAG);
2979  case ISD::ConstantPool:
2980  return LowerConstantPool(Op, DAG);
2981  case ISD::BlockAddress:
2982  return LowerBlockAddress(Op, DAG);
2983  case ISD::VASTART:
2984  return LowerVASTART(Op, DAG);
2985  case ISD::VACOPY:
2986  return LowerVACOPY(Op, DAG);
2987  case ISD::VAARG:
2988  return LowerVAARG(Op, DAG);
2989  case ISD::ADDC:
2990  case ISD::ADDE:
2991  case ISD::SUBC:
2992  case ISD::SUBE:
2993  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2994  case ISD::SADDO:
2995  case ISD::UADDO:
2996  case ISD::SSUBO:
2997  case ISD::USUBO:
2998  case ISD::SMULO:
2999  case ISD::UMULO:
3000  return LowerXALUO(Op, DAG);
3001  case ISD::FADD:
3002  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
3003  case ISD::FSUB:
3004  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
3005  case ISD::FMUL:
3006  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
3007  case ISD::FDIV:
3008  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
3009  case ISD::FP_ROUND:
3010  return LowerFP_ROUND(Op, DAG);
3011  case ISD::FP_EXTEND:
3012  return LowerFP_EXTEND(Op, DAG);
3013  case ISD::FRAMEADDR:
3014  return LowerFRAMEADDR(Op, DAG);
3015  case ISD::SPONENTRY:
3016  return LowerSPONENTRY(Op, DAG);
3017  case ISD::RETURNADDR:
3018  return LowerRETURNADDR(Op, DAG);
3019  case ISD::ADDROFRETURNADDR:
3020  return LowerADDROFRETURNADDR(Op, DAG);
3021  case ISD::INSERT_VECTOR_ELT:
3022  return LowerINSERT_VECTOR_ELT(Op, DAG);
3023  case ISD::EXTRACT_VECTOR_ELT:
3024  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3025  case ISD::BUILD_VECTOR:
3026  return LowerBUILD_VECTOR(Op, DAG);
3027  case ISD::VECTOR_SHUFFLE:
3028  return LowerVECTOR_SHUFFLE(Op, DAG);
3029  case ISD::SPLAT_VECTOR:
3030  return LowerSPLAT_VECTOR(Op, DAG);
3031  case ISD::EXTRACT_SUBVECTOR:
3032  return LowerEXTRACT_SUBVECTOR(Op, DAG);
3033  case ISD::SRA:
3034  case ISD::SRL:
3035  case ISD::SHL:
3036  return LowerVectorSRA_SRL_SHL(Op, DAG);
3037  case ISD::SHL_PARTS:
3038  return LowerShiftLeftParts(Op, DAG);
3039  case ISD::SRL_PARTS:
3040  case ISD::SRA_PARTS:
3041  return LowerShiftRightParts(Op, DAG);
3042  case ISD::CTPOP:
3043  return LowerCTPOP(Op, DAG);
3044  case ISD::FCOPYSIGN:
3045  return LowerFCOPYSIGN(Op, DAG);
3046  case ISD::OR:
3047  return LowerVectorOR(Op, DAG);
3048  case ISD::XOR:
3049  return LowerXOR(Op, DAG);
3050  case ISD::PREFETCH:
3051  return LowerPREFETCH(Op, DAG);
3052  case ISD::SINT_TO_FP:
3053  case ISD::UINT_TO_FP:
3054  return LowerINT_TO_FP(Op, DAG);
3055  case ISD::FP_TO_SINT:
3056  case ISD::FP_TO_UINT:
3057  return LowerFP_TO_INT(Op, DAG);
3058  case ISD::FSINCOS:
3059  return LowerFSINCOS(Op, DAG);
3060  case ISD::FLT_ROUNDS_:
3061  return LowerFLT_ROUNDS_(Op, DAG);
3062  case ISD::MUL:
3063  return LowerMUL(Op, DAG);
3064  case ISD::INTRINSIC_WO_CHAIN:
3065  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3066  case ISD::STORE:
3067  return LowerSTORE(Op, DAG);
3068  case ISD::VECREDUCE_ADD:
3069  case ISD::VECREDUCE_SMAX:
3070  case ISD::VECREDUCE_SMIN:
3071  case ISD::VECREDUCE_UMAX:
3072  case ISD::VECREDUCE_UMIN:
3073  case ISD::VECREDUCE_FMAX:
3074  case ISD::VECREDUCE_FMIN:
3075  return LowerVECREDUCE(Op, DAG);
3076  case ISD::ATOMIC_LOAD_SUB:
3077  return LowerATOMIC_LOAD_SUB(Op, DAG);
3078  case ISD::ATOMIC_LOAD_AND:
3079  return LowerATOMIC_LOAD_AND(Op, DAG);
3080  case ISD::DYNAMIC_STACKALLOC:
3081  return LowerDYNAMIC_STACKALLOC(Op, DAG);
3082  }
3083 }
3084 
3085 //===----------------------------------------------------------------------===//
3086 // Calling Convention Implementation
3087 //===----------------------------------------------------------------------===//
3088 
3089 /// Selects the correct CCAssignFn for a given CallingConvention value.
3090 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
3091  bool IsVarArg) const {
3092  switch (CC) {
3093  default:
3094  report_fatal_error("Unsupported calling convention.");
3095  case CallingConv::WebKit_JS:
3096  return CC_AArch64_WebKit_JS;
3097  case CallingConv::GHC:
3098  return CC_AArch64_GHC;
3099  case CallingConv::C:
3100  case CallingConv::Fast:
3101  case CallingConv::PreserveMost:
3102  case CallingConv::CXX_FAST_TLS:
3103  case CallingConv::Swift:
3104  if (Subtarget->isTargetWindows() && IsVarArg)
3105  return CC_AArch64_Win64_VarArg;
3106  if (!Subtarget->isTargetDarwin())
3107  return CC_AArch64_AAPCS;
3108  if (!IsVarArg)
3109  return CC_AArch64_DarwinPCS;
3110  return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
3111  : CC_AArch64_DarwinPCS_VarArg;
3112  case CallingConv::Win64:
3113  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3114  case CallingConv::AArch64_VectorCall:
3115  return CC_AArch64_AAPCS;
3116  }
3117 }
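// Illustrative note: a variadic C call on a Windows target is therefore
// assigned with CC_AArch64_Win64_VarArg, while the same call on a non-Darwin
// ELF target uses the plain CC_AArch64_AAPCS assignment function.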
3118 
3119 CCAssignFn *
3120 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3121  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3122  : RetCC_AArch64_AAPCS;
3123 }
3124 
3125 SDValue AArch64TargetLowering::LowerFormalArguments(
3126  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3127  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3128  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3129  MachineFunction &MF = DAG.getMachineFunction();
3130  MachineFrameInfo &MFI = MF.getFrameInfo();
3131  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3132 
3133  // Assign locations to all of the incoming arguments.
3134  SmallVector<CCValAssign, 16> ArgLocs;
3135  DenseMap<unsigned, SDValue> CopiedRegs;
3136  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3137  *DAG.getContext());
3138 
3139  // At this point, Ins[].VT may already be promoted to i32. To correctly
3140  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3141  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3142  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3143  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3144  // LocVT.
3145  unsigned NumArgs = Ins.size();
3146  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3147  unsigned CurArgIdx = 0;
3148  for (unsigned i = 0; i != NumArgs; ++i) {
3149  MVT ValVT = Ins[i].VT;
3150  if (Ins[i].isOrigArg()) {
3151  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3152  CurArgIdx = Ins[i].getOrigArgIndex();
3153 
3154  // Get type of the original argument.
3155  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3156  /*AllowUnknown*/ true);
3157  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3158  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3159  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3160  ValVT = MVT::i8;
3161  else if (ActualMVT == MVT::i16)
3162  ValVT = MVT::i16;
3163  }
3164  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3165  bool Res =
3166  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3167  assert(!Res && "Call operand has unhandled type");
3168  (void)Res;
3169  }
3170  assert(ArgLocs.size() == Ins.size());
3171  SmallVector<SDValue, 16> ArgValues;
3172  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3173  CCValAssign &VA = ArgLocs[i];
3174 
3175  if (Ins[i].Flags.isByVal()) {
3176  // Byval is used for HFAs in the PCS, but the system should work in a
3177  // non-compliant manner for larger structs.
3178  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3179  int Size = Ins[i].Flags.getByValSize();
3180  unsigned NumRegs = (Size + 7) / 8;
3181 
3182  // FIXME: This works on big-endian for composite byvals, which are the common
3183  // case. It should also work for fundamental types too.
3184  unsigned FrameIdx =
3185  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3186  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3187  InVals.push_back(FrameIdxN);
3188 
3189  continue;
3190  }
3191 
3192  SDValue ArgValue;
3193  if (VA.isRegLoc()) {
3194  // Arguments stored in registers.
3195  EVT RegVT = VA.getLocVT();
3196  const TargetRegisterClass *RC;
3197 
3198  if (RegVT == MVT::i32)
3199  RC = &AArch64::GPR32RegClass;
3200  else if (RegVT == MVT::i64)
3201  RC = &AArch64::GPR64RegClass;
3202  else if (RegVT == MVT::f16)
3203  RC = &AArch64::FPR16RegClass;
3204  else if (RegVT == MVT::f32)
3205  RC = &AArch64::FPR32RegClass;
3206  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3207  RC = &AArch64::FPR64RegClass;
3208  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3209  RC = &AArch64::FPR128RegClass;
3210  else if (RegVT.isScalableVector() &&
3211  RegVT.getVectorElementType() == MVT::i1)
3212  RC = &AArch64::PPRRegClass;
3213  else if (RegVT.isScalableVector())
3214  RC = &AArch64::ZPRRegClass;
3215  else
3216  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3217 
3218  // Transform the arguments in physical registers into virtual ones.
3219  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3220  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3221 
3222  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3223  // to 64 bits. Insert an assert[sz]ext to capture this, then
3224  // truncate to the right size.
3225  switch (VA.getLocInfo()) {
3226  default:
3227  llvm_unreachable("Unknown loc info!");
3228  case CCValAssign::Full:
3229  break;
3230  case CCValAssign::Indirect:
3231  assert(VA.getValVT().isScalableVector() &&
3232  "Only scalable vectors can be passed indirectly");
3233  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3234  case CCValAssign::BCvt:
3235  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3236  break;
3237  case CCValAssign::AExt:
3238  case CCValAssign::SExt:
3239  case CCValAssign::ZExt:
3240  break;
3241  case CCValAssign::AExtUpper:
3242  ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
3243  DAG.getConstant(32, DL, RegVT));
3244  ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
3245  break;
3246  }
3247  } else { // VA.isRegLoc()
3248  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3249  unsigned ArgOffset = VA.getLocMemOffset();
3250  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3251 
3252  uint32_t BEAlign = 0;
3253  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3254  !Ins[i].Flags.isInConsecutiveRegs())
3255  BEAlign = 8 - ArgSize;
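  // Illustrative example: for a 4-byte stack argument on a big-endian
  // target, BEAlign becomes 8 - 4 = 4, offsetting the load to the part of
  // the 8-byte slot that actually holds the value.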
3256 
3257  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3258 
3259  // Create load nodes to retrieve arguments from the stack.
3260  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3261 
3262  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT
3263  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3264  MVT MemVT = VA.getValVT();
3265 
3266  switch (VA.getLocInfo()) {
3267  default:
3268  break;
3269  case CCValAssign::Trunc:
3270  case CCValAssign::BCvt:
3271  MemVT = VA.getLocVT();
3272  break;
3273  case CCValAssign::Indirect:
3274  assert(VA.getValVT().isScalableVector() &&
3275  "Only scalable vectors can be passed indirectly");
3276  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3277  case CCValAssign::SExt:
3278  ExtType = ISD::SEXTLOAD;
3279  break;
3280  case CCValAssign::ZExt:
3281  ExtType = ISD::ZEXTLOAD;
3282  break;
3283  case CCValAssign::AExt:
3284  ExtType = ISD::EXTLOAD;
3285  break;
3286  }
3287 
3288  ArgValue = DAG.getExtLoad(
3289  ExtType, DL, VA.getLocVT(), Chain, FIN,
3290  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3291  MemVT);
3292 
3293  }
3294  if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
3295  ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
3296  ArgValue, DAG.getValueType(MVT::i32));
3297  InVals.push_back(ArgValue);
3298  }
3299 
3300  // varargs
3301  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3302  if (isVarArg) {
3303  if (!Subtarget->isTargetDarwin() || IsWin64) {
3304  // The AAPCS variadic function ABI is identical to the non-variadic
3305  // one. As a result there may be more arguments in registers and we should
3306  // save them for future reference.
3307  // Win64 variadic functions also pass arguments in registers, but all float
3308  // arguments are passed in integer registers.
3309  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3310  }
3311 
3312  // This will point to the next argument passed via stack.
3313  unsigned StackOffset = CCInfo.getNextStackOffset();
3314  // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
3315  StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
3316  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3317 
3318  if (MFI.hasMustTailInVarArgFunc()) {
3319  SmallVector<MVT, 2> RegParmTypes;
3320  RegParmTypes.push_back(MVT::i64);
3321  RegParmTypes.push_back(MVT::f128);
3322  // Compute the set of forwarded registers. The rest are scratch.
3323  SmallVectorImpl<ForwardedRegister> &Forwards =
3324  FuncInfo->getForwardedMustTailRegParms();
3325  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3326  CC_AArch64_AAPCS);
3327 
3328  // Conservatively forward X8, since it might be used for aggregate return.
3329  if (!CCInfo.isAllocated(AArch64::X8)) {
3330  unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3331  Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3332  }
3333  }
3334  }
3335 
3336  // On Windows, InReg pointers must be returned, so record the pointer in a
3337  // virtual register at the start of the function so it can be returned in the
3338  // epilogue.
3339  if (IsWin64) {
3340  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3341  if (Ins[I].Flags.isInReg()) {
3342  assert(!FuncInfo->getSRetReturnReg());
3343 
3344  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3345  Register Reg =
3346  MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3347  FuncInfo->setSRetReturnReg(Reg);
3348 
3349  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3350  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3351  break;
3352  }
3353  }
3354  }
3355 
3356  unsigned StackArgSize = CCInfo.getNextStackOffset();
3357  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3358  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3359  // This is a non-standard ABI so by fiat I say we're allowed to make full
3360  // use of the stack area to be popped, which must be aligned to 16 bytes in
3361  // any case:
3362  StackArgSize = alignTo(StackArgSize, 16);
3363 
3364  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3365  // a multiple of 16.
3366  FuncInfo->setArgumentStackToRestore(StackArgSize);
3367 
3368  // This realignment carries over to the available bytes below. Our own
3369  // callers will guarantee the space is free by giving an aligned value to
3370  // CALLSEQ_START.
3371  }
3372  // Even if we're not expected to free up the space, it's useful to know how
3373  // much is there while considering tail calls (because we can reuse it).
3374  FuncInfo->setBytesInStackArgArea(StackArgSize);
3375 
3376  if (Subtarget->hasCustomCallingConv())
3377  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3378 
3379  return Chain;
3380 }
3381 
3382 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3383  SelectionDAG &DAG,
3384  const SDLoc &DL,
3385  SDValue &Chain) const {
3386  MachineFunction &MF = DAG.getMachineFunction();
3387  MachineFrameInfo &MFI = MF.getFrameInfo();
3388  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3389  auto PtrVT = getPointerTy(DAG.getDataLayout());
3390  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3391 
3392  SmallVector<SDValue, 8> MemOps;
3393 
3394  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3395  AArch64::X3, AArch64::X4, AArch64::X5,
3396  AArch64::X6, AArch64::X7 };
3397  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3398  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3399 
3400  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3401  int GPRIdx = 0;
3402  if (GPRSaveSize != 0) {
3403  if (IsWin64) {
3404  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3405  if (GPRSaveSize & 15)
3406  // The extra size here, if triggered, will always be 8.
3407  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3408  } else
3409  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3410 
3411  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3412 
3413  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3414  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3415  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3416  SDValue Store = DAG.getStore(
3417  Val.getValue(1), DL, Val, FIN,
3418  IsWin64
3419  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3420  GPRIdx,
3421  (i - FirstVariadicGPR) * 8)
3422  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3423  MemOps.push_back(Store);
3424  FIN =
3425  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3426  }
3427  }
3428  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3429  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3430 
3431  if (Subtarget->hasFPARMv8() && !IsWin64) {
3432  static const MCPhysReg FPRArgRegs[] = {
3433  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3434  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3435  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3436  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3437 
3438  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3439  int FPRIdx = 0;
3440  if (FPRSaveSize != 0) {
3441  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3442 
3443  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3444 
3445  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3446  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3447  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3448 
3449  SDValue Store = DAG.getStore(
3450  Val.getValue(1), DL, Val, FIN,
3451  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3452  MemOps.push_back(Store);
3453  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3454  DAG.getConstant(16, DL, PtrVT));
3455  }
3456  }
3457  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3458  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3459  }
3460 
3461  if (!MemOps.empty()) {
3462  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3463  }
3464 }
3465 
3466 /// LowerCallResult - Lower the result values of a call into the
3467 /// appropriate copies out of appropriate physical registers.
3468 SDValue AArch64TargetLowering::LowerCallResult(
3469  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3470  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3471  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3472  SDValue ThisVal) const {
3473  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3474  ? RetCC_AArch64_WebKit_JS
3475  : RetCC_AArch64_AAPCS;
3476  // Assign locations to each value returned by this call.
3477  SmallVector<CCValAssign, 16> RVLocs;
3478  DenseMap<unsigned, SDValue> CopiedRegs;
3479  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3480  *DAG.getContext());
3481  CCInfo.AnalyzeCallResult(Ins, RetCC);
3482 
3483  // Copy all of the result registers out of their specified physreg.
3484  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3485  CCValAssign VA = RVLocs[i];
3486 
3487  // Pass 'this' value directly from the argument to return value, to avoid
3488  // reg unit interference
3489  if (i == 0 && isThisReturn) {
3490  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3491  "unexpected return calling convention register assignment");
3492  InVals.push_back(ThisVal);
3493  continue;
3494  }
3495 
3496  // Avoid copying a physreg twice since RegAllocFast is incompetent and only
3497  // allows one use of a physreg per block.
3498  SDValue Val = CopiedRegs.lookup(VA.getLocReg());
3499  if (!Val) {
3500  Val =
3501  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3502  Chain = Val.getValue(1);
3503  InFlag = Val.getValue(2);
3504  CopiedRegs[VA.getLocReg()] = Val;
3505  }
3506 
3507  switch (VA.getLocInfo()) {
3508  default:
3509  llvm_unreachable("Unknown loc info!");
3510  case CCValAssign::Full:
3511  break;
3512  case CCValAssign::BCvt:
3513  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3514  break;
3515  case CCValAssign::AExtUpper:
3516  Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
3517  DAG.getConstant(32, DL, VA.getLocVT()));
3518  LLVM_FALLTHROUGH;
3519  case CCValAssign::AExt:
3520  LLVM_FALLTHROUGH;
3521  case CCValAssign::ZExt:
3522  Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
3523  break;
3524  }
3525 
3526  InVals.push_back(Val);
3527  }
3528 
3529  return Chain;
3530 }
3531 
3532 /// Return true if the calling convention is one that we can guarantee TCO for.
3533 static bool canGuaranteeTCO(CallingConv::ID CC) {
3534  return CC == CallingConv::Fast;
3535 }
3536 
3537 /// Return true if we might ever do TCO for calls with this calling convention.
3538 static bool mayTailCallThisCC(CallingConv::ID CC) {
3539  switch (CC) {
3540  case CallingConv::C:
3541  case CallingConv::PreserveMost:
3542  case CallingConv::Swift:
3543  return true;
3544  default:
3545  return canGuaranteeTCO(CC);
3546  }
3547 }
3548 
3549 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3550  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3551  const SmallVectorImpl<ISD::OutputArg> &Outs,
3552  const SmallVectorImpl<SDValue> &OutVals,
3553  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3554  if (!mayTailCallThisCC(CalleeCC))
3555  return false;
3556 
3557  MachineFunction &MF = DAG.getMachineFunction();
3558  const Function &CallerF = MF.getFunction();
3559  CallingConv::ID CallerCC = CallerF.getCallingConv();
3560  bool CCMatch = CallerCC == CalleeCC;
3561 
3562  // Byval parameters hand the function a pointer directly into the stack area
3563  // we want to reuse during a tail call. Working around this *is* possible (see
3564  // X86) but less efficient and uglier in LowerCall.
3565  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3566  e = CallerF.arg_end();
3567  i != e; ++i) {
3568  if (i->hasByValAttr())
3569  return false;
3570 
3571  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3572  // In this case, it is necessary to save/restore X0 in the callee. Tail
3573  // call opt interferes with this. So we disable tail call opt when the
3574  // caller has an argument with "inreg" attribute.
3575 
3576  // FIXME: Check whether the callee also has an "inreg" argument.
3577  if (i->hasInRegAttr())
3578  return false;
3579  }
3580 
3581  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3582  return canGuaranteeTCO(CalleeCC) && CCMatch;
3583 
3584  // Externally-defined functions with weak linkage should not be
3585  // tail-called on AArch64 when the OS does not support dynamic
3586  // pre-emption of symbols, as the AAELF spec requires normal calls
3587  // to undefined weak functions to be replaced with a NOP or jump to the
3588  // next instruction. The behaviour of branch instructions in this
3589  // situation (as used for tail calls) is implementation-defined, so we
3590  // cannot rely on the linker replacing the tail call with a return.
3591  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3592  const GlobalValue *GV = G->getGlobal();
3593  const Triple &TT = getTargetMachine().getTargetTriple();
3594  if (GV->hasExternalWeakLinkage() &&
3595  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3596  return false;
3597  }
3598 
3599  // Now we search for cases where we can use a tail call without changing the
3600  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3601  // concept.
3602 
3603  // I want anyone implementing a new calling convention to think long and hard
3604  // about this assert.
3605  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3606  "Unexpected variadic calling convention");
3607 
3608  LLVMContext &C = *DAG.getContext();
3609  if (isVarArg && !Outs.empty()) {
3610  // At least two cases here: if caller is fastcc then we can't have any
3611  // memory arguments (we'd be expected to clean up the stack afterwards). If
3612  // caller is C then we could potentially use its argument area.
3613 
3614  // FIXME: for now we take the most conservative of these in both cases:
3615  // disallow all variadic memory operands.
3616  SmallVector<CCValAssign, 16> ArgLocs;
3617  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3618 
3619  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3620  for (const CCValAssign &ArgLoc : ArgLocs)
3621  if (!ArgLoc.isRegLoc())
3622  return false;
3623  }
3624 
3625  // Check that the call results are passed in the same way.
3626  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3627  CCAssignFnForCall(CalleeCC, isVarArg),
3628  CCAssignFnForCall(CallerCC, isVarArg)))
3629  return false;
3630  // The callee has to preserve all registers the caller needs to preserve.
3631  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3632  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3633  if (!CCMatch) {
3634  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3635  if (Subtarget->hasCustomCallingConv()) {
3636  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3637  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3638  }
3639  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3640  return false;
3641  }
3642 
3643  // Nothing more to check if the callee is taking no arguments
3644  if (Outs.empty())
3645  return true;
3646 
3647  SmallVector<CCValAssign, 16> ArgLocs;
3648  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3649 
3650  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3651 
3652  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3653 
3654  // If the stack arguments for this call do not fit into our own save area then
3655  // the call cannot be made tail.
3656  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3657  return false;
3658 
3659  const MachineRegisterInfo &MRI = MF.getRegInfo();
3660  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3661  return false;
3662 
3663  return true;
3664 }
3665 
3666 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3667  SelectionDAG &DAG,
3668  MachineFrameInfo &MFI,
3669  int ClobberedFI) const {
3670  SmallVector<SDValue, 8> ArgChains;
3671  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3672  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3673 
3674  // Include the original chain at the beginning of the list. When this is
3675  // used by target LowerCall hooks, this helps legalize find the
3676  // CALLSEQ_BEGIN node.
3677  ArgChains.push_back(Chain);
3678 
3679  // Add a chain value for each stack argument load overlapping the clobbered slot.
3680  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3681  UE = DAG.getEntryNode().getNode()->use_end();
3682  U != UE; ++U)
3683  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3684  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3685  if (FI->getIndex() < 0) {
3686  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3687  int64_t InLastByte = InFirstByte;
3688  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3689 
3690  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3691  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3692  ArgChains.push_back(SDValue(L, 1));
3693  }
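  // The two range checks above detect any overlap between the incoming
  // stack-argument slot [InFirstByte, InLastByte] and the clobbered slot
  // [FirstByte, LastByte]. For example, an incoming argument occupying bytes
  // [8, 15] overlaps a clobbered slot at [12, 19] and its load is chained in,
  // while an argument at [16, 23] does not overlap a slot at [0, 7].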
3694 
3695  // Build a tokenfactor for all the chains.
3696  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3697 }
3698 
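/// With GuaranteedTailCallOpt (-tailcallopt), a fastcc callee pops its own
/// stack-argument area on return, so the caller must not deallocate it again.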
3699 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3700  bool TailCallOpt) const {
3701  return CallCC == CallingConv::Fast && TailCallOpt;
3702 }
3703 
3704 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3705 /// and add input and output parameter nodes.
3706 SDValue
3707 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3708  SmallVectorImpl<SDValue> &InVals) const {
3709  SelectionDAG &DAG = CLI.DAG;
3710  SDLoc &DL = CLI.DL;
3711  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3712  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3713  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3714  SDValue Chain = CLI.Chain;
3715  SDValue Callee = CLI.Callee;
3716  bool &IsTailCall = CLI.IsTailCall;
3717  CallingConv::ID CallConv = CLI.CallConv;
3718  bool IsVarArg = CLI.IsVarArg;
3719 
3720  MachineFunction &MF = DAG.getMachineFunction();
3721  MachineFunction::CallSiteInfo CSInfo;
3722  bool IsThisReturn = false;
3723 
3724  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3725  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3726  bool IsSibCall = false;
3727 
3728  if (IsTailCall) {
3729  // Check if it's really possible to do a tail call.
3730  IsTailCall = isEligibleForTailCallOptimization(
3731  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3732  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3733  report_fatal_error("failed to perform tail call elimination on a call "
3734  "site marked musttail");
3735 
3736  // A sibling call is one where we're under the usual C ABI and not planning
3737  // to change that but can still do a tail call:
3738  if (!TailCallOpt && IsTailCall)
3739  IsSibCall = true;
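  // For example, an ordinary C call in tail position compiled without
  // -tailcallopt becomes a sibcall: the stack pointer is left untouched, the
  // existing argument area is reused, and the call is emitted as a plain
  // branch rather than a branch-with-link.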
3740 
3741  if (IsTailCall)
3742  ++NumTailCalls;
3743  }
3744 
3745  // Analyze operands of the call, assigning locations to each operand.
3746  SmallVector<CCValAssign, 16> ArgLocs;
3747  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3748  *DAG.getContext());
3749 
3750  if (IsVarArg) {
3751  // Handle fixed and variable vector arguments differently.
3752  // Variable vector arguments always go into memory.
3753  unsigned NumArgs = Outs.size();
3754 
3755  for (unsigned i = 0; i != NumArgs; ++i) {
3756  MVT ArgVT = Outs[i].VT;
3757  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3758  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3759  /*IsVarArg=*/ !Outs[i].IsFixed);
3760  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3761  assert(!Res && "Call operand has unhandled type");
3762  (void)Res;
3763  }
3764  } else {
3765  // At this point, Outs[].VT may already be promoted to i32. To correctly
3766  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3767  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3768  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3769  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3770  // LocVT.
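  // For example, an i8 argument typically reaches this point with
  // Outs[i].VT == MVT::i32; querying the original IR type recovers MVT::i8,
  // so ValVT is reset to i8 below and the stack-store path later in LowerCall
  // truncates the value back to its original width.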
3771  unsigned NumArgs = Outs.size();
3772  for (unsigned i = 0; i != NumArgs; ++i) {
3773  MVT ValVT = Outs[i].VT;
3774  // Get type of the original argument.
3775  EVT ActualVT = getValueType(DAG.getDataLayout(),
3776  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3777  /*AllowUnknown*/ true);
3778  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3779  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3780  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3781  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3782  ValVT = MVT::i8;
3783  else if (ActualMVT == MVT::i16)
3784  ValVT = MVT::i16;
3785 
3786  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3787  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3788  assert(!Res && "Call operand has unhandled type");
3789  (void)Res;
3790  }
3791  }
3792 
3793  // Get a count of how many bytes are to be pushed on the stack.
3794  unsigned NumBytes = CCInfo.getNextStackOffset();
3795 
3796  if (IsSibCall) {
3797  // Since we're not changing the ABI to make this a tail call, the memory
3798  // operands are already available in the caller's incoming argument space.
3799  NumBytes = 0;
3800  }
3801 
3802  // FPDiff is the byte offset of the call's argument area from the callee's.
3803  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3804  // by this amount for a tail call. In a sibling call it must be 0 because the
3805  // caller will deallocate the entire stack and the callee still expects its
3806  // arguments to begin at SP+0. Completely unused for non-tail calls.
3807  int FPDiff = 0;
3808 
3809  if (IsTailCall && !IsSibCall) {
3810  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3811 
3812  // Since callee will pop argument stack as a tail call, we must keep the
3813  // popped size 16-byte aligned.
3814  NumBytes = alignTo(NumBytes, 16);
3815 
3816  // FPDiff will be negative if this tail call requires more space than we
3817  // would automatically have in our incoming argument space. Positive if we
3818  // can actually shrink the stack.
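  // For example, if the caller reserved 32 bytes of incoming stack-argument
  // space (NumReusableBytes == 32) and this call needs 48 bytes of outgoing
  // arguments after alignment, FPDiff is 32 - 48 = -16, i.e. the outgoing
  // argument area extends 16 bytes beyond the space already reserved.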
3819  FPDiff = NumReusableBytes - NumBytes;
3820 
3821  // The stack pointer must be 16-byte aligned at all times it's used for a
3822  // memory operation, which in practice means at *all* times and in
3823  // particular across call boundaries. Therefore our own arguments started at
3824  // a 16-byte aligned SP and the delta applied for the tail call should
3825  // satisfy the same constraint.
3826  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3827  }
3828 
3829  // Adjust the stack pointer for the new arguments...
3830  // These operations are automatically eliminated by the prolog/epilog pass
3831  if (!IsSibCall)
3832  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3833 
3834  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3835  getPointerTy(DAG.getDataLayout()));
3836 
3837  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3838  SmallSet<unsigned, 8> RegsUsed;
3839  SmallVector<SDValue, 8> MemOpChains;
3840  auto PtrVT = getPointerTy(DAG.getDataLayout());
3841 
3842  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3843  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3844  for (const auto &F : Forwards) {
3845  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3846  RegsToPass.emplace_back(F.PReg, Val);
3847  }
3848  }
3849 
3850  // Walk the register/memloc assignments, inserting copies/loads.
3851  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3852  ++i, ++realArgIdx) {
3853  CCValAssign &VA = ArgLocs[i];
3854  SDValue Arg = OutVals[realArgIdx];
3855  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3856 
3857  // Promote the value if needed.
3858  switch (VA.getLocInfo()) {
3859  default:
3860  llvm_unreachable("Unknown loc info!");
3861  case CCValAssign::Full:
3862  break;
3863  case CCValAssign::SExt:
3864  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3865  break;
3866  case CCValAssign::ZExt:
3867  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3868  break;
3869  case CCValAssign::AExt:
3870  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3871  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3872  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3873  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3874  }
3875  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3876  break;
3877  case CCValAssign::AExtUpper:
3878  assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
3879  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3880  Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
3881  DAG.getConstant(32, DL, VA.getLocVT()));
3882  break;
3883  case CCValAssign::BCvt:
3884  Arg = DAG.getBitcast(VA.getLocVT(), Arg);
3885  break;
3886  case CCValAssign::Trunc:
3887  Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
3888  break;
3889  case CCValAssign::FPExt:
3890  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3891  break;
3892  case CCValAssign::Indirect:
3893  assert(VA.getValVT().isScalableVector() &&
3894  "Only scalable vectors can be passed indirectly");
3895  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3896  }
3897 
3898  if (VA.isRegLoc()) {
3899  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3900  Outs[0].VT == MVT::i64) {
3901  assert(VA.getLocVT() == MVT::i64 &&
3902  "unexpected calling convention register assignment");
3903  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3904  "unexpected use of 'returned'");
3905  IsThisReturn = true;
3906  }
3907  if (RegsUsed.count(VA.getLocReg())) {
3908  // If this register has already been used then we're trying to pack
3909  // parts of an [N x i32] into an X-register. The extension type will
3910  // take care of putting the two halves in the right place but we have to
3911  // combine them.
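  // For example, when two i32 parts of an [N x i32] are both assigned to X0,
  // the first part arrives in bits [31:0] and the second part (see AExtUpper
  // above) has already been shifted into bits [63:32]; OR-ing them
  // reassembles the full 64-bit register value.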
3912  SDValue &Bits =
3913  std::find_if(RegsToPass.begin(), RegsToPass.end(),
3914  [=](const std::pair<unsigned, SDValue> &Elt) {
3915  return Elt.first == VA.getLocReg();
3916  })
3917  ->second;
3918  Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
3919  // Call site info is used for function's parameter entry value
3920  // tracking. For now we track only simple cases when parameter
3921  // is transferred through whole register.
3922  CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
3923  [&VA](MachineFunction::ArgRegPair ArgReg) {
3924  return ArgReg.Reg == VA.getLocReg();
3925  }),
3926  CSInfo.end());
3927  } else {
3928  RegsToPass.emplace_back(VA.getLocReg(), Arg);
3929  RegsUsed.insert(VA.getLocReg());
3930  const TargetOptions &Options = DAG.getTarget().Options;
3931  if (Options.EnableDebugEntryValues)
3932  CSInfo.emplace_back(VA.getLocReg(), i);
3933  }
3934  } else {
3935  assert(VA.isMemLoc());
3936 
3937  SDValue DstAddr;
3938  MachinePointerInfo DstInfo;
3939 
3940  // FIXME: This works on big-endian for composite byvals, which are the
3941  // common case. It should also work for fundamental types too.
3942  uint32_t BEAlign = 0;
3943  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3944  : VA.getValVT().getSizeInBits();
3945  OpSize = (OpSize + 7) / 8;
3946  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3947  !Flags.isInConsecutiveRegs()) {
3948  if (OpSize < 8)
3949  BEAlign = 8 - OpSize;
3950  }
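  // For example, an i16 stack argument (OpSize == 2) on a big-endian target
  // gets BEAlign == 6, so the store below lands in the last two bytes of the
  // argument's 8-byte stack slot, where a big-endian callee expects it.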
3951  unsigned LocMemOffset = VA.getLocMemOffset();
3952  int32_t Offset = LocMemOffset + BEAlign;
3953  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3954  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3955 
3956  if (IsTailCall) {
3957  Offset = Offset + FPDiff;
3958  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3959 
3960  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3961  DstInfo =
3962  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3963 
3964  // Make sure any stack arguments overlapping with where we're storing
3965  // are loaded before this eventual operation. Otherwise they'll be
3966  // clobbered.
3967  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3968  } else {
3969  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3970 
3971  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3972  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3973  LocMemOffset);
3974  }
3975 
3976  if (Outs[i].Flags.isByVal()) {
3977  SDValue SizeNode =
3978  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3979  SDValue Cpy = DAG.getMemcpy(
3980  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3981  /*isVol = */ false, /*AlwaysInline = */ false,
3982  /*isTailCall = */ false,
3983  DstInfo, MachinePointerInfo());
3984 
3985  MemOpChains.push_back(Cpy);
3986  } else {
3987  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3988  // promoted to a legal register type i32, we should truncate Arg back to
3989  // i1/i8/i16.
3990  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3991  VA.getValVT() == MVT::i16)
3992  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3993 
3994  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3995  MemOpChains.push_back(Store);
3996  }
3997  }
3998  }
3999 
4000  if (!MemOpChains.empty())
4001  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4002 
4003  // Build a sequence of copy-to-reg nodes chained together with token chain
4004  // and flag operands which copy the outgoing args into the appropriate regs.
4005  SDValue InFlag;
4006  for (auto &RegToPass : RegsToPass) {
4007  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
4008  RegToPass.second, InFlag);
4009  InFlag = Chain.getValue(1);
4010  }
4011 
4012  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
4013  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
4014  // node so that legalize doesn't hack it.
4015  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
4016  auto GV = G->getGlobal();
4017  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
4018  AArch64II::MO_GOT) {
4019  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
4020  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4021  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
4022  assert(Subtarget->isTargetWindows() &&
4023  "Windows is the only supported COFF target");
4024  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
4025  } else {
4026  const GlobalValue *GV = G->getGlobal();
4027  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
4028  }
4029  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
4030  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
4031  Subtarget->isTargetMachO()) {
4032  const char *Sym = S->getSymbol();
4033  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
4034  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
4035  } else {
4036  const char *Sym = S->getSymbol();
4037  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
4038  }
4039  }
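  // On Windows, the MO_DLLIMPORT path above reaches a dllimport'ed callee
  // indirectly through its import-table pointer (the __imp_ symbol),
  // analogous to the GOT load used for the MO_GOT case.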
4040 
4041  // We don't usually want to end the call-sequence here because we would tidy
4042  // the frame up *after* the call, however in the ABI-changing tail-call case
4043  // we've carefully laid out the parameters so that when sp is reset they'll be
4044  // in the correct location.
4045  if (IsTailCall && !IsSibCall) {