1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/StringSwitch.h"
31 #include "llvm/ADT/Triple.h"
32 #include "llvm/ADT/Twine.h"
48 #include "llvm/IR/Attributes.h"
49 #include "llvm/IR/Constants.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GlobalValue.h"
56 #include "llvm/IR/IRBuilder.h"
57 #include "llvm/IR/Instruction.h"
58 #include "llvm/IR/Instructions.h"
59 #include "llvm/IR/IntrinsicInst.h"
60 #include "llvm/IR/Intrinsics.h"
61 #include "llvm/IR/Module.h"
62 #include "llvm/IR/OperandTraits.h"
63 #include "llvm/IR/PatternMatch.h"
64 #include "llvm/IR/Type.h"
65 #include "llvm/IR/Use.h"
66 #include "llvm/IR/Value.h"
67 #include "llvm/MC/MCRegisterInfo.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/Debug.h"
74 #include "llvm/Support/KnownBits.h"
80 #include <algorithm>
81 #include <bitset>
82 #include <cassert>
83 #include <cctype>
84 #include <cstdint>
85 #include <cstdlib>
86 #include <iterator>
87 #include <limits>
88 #include <tuple>
89 #include <utility>
90 #include <vector>
91 
92 using namespace llvm;
93 using namespace llvm::PatternMatch;
94 
95 #define DEBUG_TYPE "aarch64-lower"
96 
97 STATISTIC(NumTailCalls, "Number of tail calls");
98 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
99 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
100 
101 static cl::opt<bool>
102 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
103  cl::desc("Allow AArch64 SLI/SRI formation"),
104  cl::init(false));
105 
106 // FIXME: The necessary dtprel relocations don't seem to be supported
107 // well in the GNU bfd and gold linkers at the moment. Therefore, by
108 // default, for now, fall back to GeneralDynamic code generation.
109 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
110  "aarch64-elf-ldtls-generation", cl::Hidden,
111  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
112  cl::init(false));
113 
114 static cl::opt<bool>
115 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
116  cl::desc("Enable AArch64 logical imm instruction "
117  "optimization"),
118  cl::init(true));
119 
120 /// Value type used for condition codes.
121 static const MVT MVT_CC = MVT::i32;
122 
123 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
124  const AArch64Subtarget &STI)
125  : TargetLowering(TM), Subtarget(&STI) {
126  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
127  // we have to make something up. Arbitrarily, choose ZeroOrOne.
129  // When comparing vectors the result sets the different elements in the
130  // vector to all-one or all-zero.
132 
133  // Set up the register classes.
134  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
135  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
136 
137  if (Subtarget->hasFPARMv8()) {
138  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
139  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
140  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
141  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
142  }
143 
144  if (Subtarget->hasNEON()) {
145  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
146  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
147  // Someone set us up the NEON.
148  addDRTypeForNEON(MVT::v2f32);
149  addDRTypeForNEON(MVT::v8i8);
150  addDRTypeForNEON(MVT::v4i16);
151  addDRTypeForNEON(MVT::v2i32);
152  addDRTypeForNEON(MVT::v1i64);
153  addDRTypeForNEON(MVT::v1f64);
154  addDRTypeForNEON(MVT::v4f16);
155 
156  addQRTypeForNEON(MVT::v4f32);
157  addQRTypeForNEON(MVT::v2f64);
158  addQRTypeForNEON(MVT::v16i8);
159  addQRTypeForNEON(MVT::v8i16);
160  addQRTypeForNEON(MVT::v4i32);
161  addQRTypeForNEON(MVT::v2i64);
162  addQRTypeForNEON(MVT::v8f16);
163  }
164 
165  if (Subtarget->hasSVE()) {
166  // Add legal sve predicate types
167  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
168  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
169  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
170  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
171 
172  // Add legal sve data types
173  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
174  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
175  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
176  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
177 
178  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
179  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
180  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
181  addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
182  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
183  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
184  addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
185  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
186  }
187 
188  // Compute derived properties from the register classes
190 
191  // Provide all sorts of operation actions
219 
223 
227 
229 
230  // Custom lowering hooks are needed for XOR
231  // to fold it into CSINC/CSINV.
234 
235  // Virtually no operation on f128 is legal, but LLVM can't expand them when
236  // there's a valid register class, so we need custom operations in most cases.
258 
259  // Lowering for many of the conversions is actually specified by the non-f128
260  // type. The LowerXXX function will be trivial when f128 isn't involved.
275 
276  // Variable arguments.
281 
282  // Variable-sized objects.
285 
286  if (Subtarget->isTargetWindows())
288  else
290 
291  // Constant pool entries
293 
294  // BlockAddress
296 
297  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
306 
307  // AArch64 lacks both left-rotate and popcount instructions.
310  for (MVT VT : MVT::vector_valuetypes()) {
313  }
314 
315  // AArch64 doesn't have {U|S}MUL_LOHI.
318 
321 
324  for (MVT VT : MVT::vector_valuetypes()) {
327  }
334 
335  // Custom lower Add/Sub/Mul with overflow.
348 
357  if (Subtarget->hasFullFP16())
359  else
361 
395 
396  if (!Subtarget->hasFullFP16()) {
419 
420  // promote v4f16 to v4f32 when that is known to be safe.
433 
449 
470  }
471 
472  // AArch64 has implementations of a lot of rounding-like FP operations.
473  for (MVT Ty : {MVT::f32, MVT::f64}) {
488  }
489 
490  if (Subtarget->hasFullFP16()) {
501  }
502 
504 
506 
512 
513  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
514  // This requires the Performance Monitors extension.
515  if (Subtarget->hasPerfMon())
517 
518  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
519  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
520  // Issue __sincos_stret if available.
523  } else {
526  }
527 
528  // Make floating-point constants legal for the large code model, so they don't
529  // become loads from the constant pool.
530  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
533  }
534 
535  // AArch64 does not have floating-point extending loads, i1 sign-extending
536  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
537  for (MVT VT : MVT::fp_valuetypes()) {
542  }
543  for (MVT VT : MVT::integer_valuetypes())
545 
553 
556 
557  // Indexed loads and stores are supported.
558  for (unsigned im = (unsigned)ISD::PRE_INC;
574  }
575 
576  // Trap.
578  if (Subtarget->isTargetWindows())
580 
581  // We combine OR nodes for bitfield operations.
583  // Try to create BICs for vector ANDs.
585 
586  // Vector add and sub nodes may conceal a high-half opportunity.
587  // Also, try to fold ADD into CSINC/CSINV..
594 
598 
600 
607  if (Subtarget->supportsAddressTopByteIgnored())
609 
611 
614 
618 
620 
621  // In case of strict alignment, avoid an excessive number of byte wide stores.
625 
630 
632 
636 
638 
640 
641  EnableExtLdPromotion = true;
642 
643  // Set required alignment.
645  // Set preferred alignments.
649 
650  // Only change the limit for entries in a jump table if specified by
651  // the sub target, but not at the command line.
652  unsigned MaxJT = STI.getMaximumJumpTableSize();
653  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
655 
656  setHasExtractBitsInsn(true);
657 
659 
660  if (Subtarget->hasNEON()) {
661  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
662  // silliness like this:
688 
694 
696 
697  // AArch64 doesn't have direct vector->f32 conversion instructions for
698  // elements smaller than i32, so promote the input to i32 first.
701  // i8 vector elements also need promotion to i32 for v8i8
704  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
709  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
710  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
713 
714  if (Subtarget->hasFullFP16()) {
719  } else {
720  // when AArch64 doesn't have fullfp16 support, promote the input
721  // to i32 first.
726  }
727 
730 
731  // AArch64 doesn't have MUL.2d:
733  // Custom handling for some quad-vector types to detect MULL.
737 
738  // Vector reductions
739  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
746  }
747  for (MVT VT : { MVT::v4f16, MVT::v2f32,
751  }
752 
755  // Likewise, narrowing and extending vector loads/stores aren't handled
756  // directly.
757  for (MVT VT : MVT::vector_valuetypes()) {
759 
760  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
763  } else {
766  }
769 
772 
773  for (MVT InnerVT : MVT::vector_valuetypes()) {
774  setTruncStoreAction(VT, InnerVT, Expand);
775  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
776  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
777  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
778  }
779  }
780 
781  // AArch64 has implementations of a lot of rounding-like FP operations.
782  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
789  }
790 
791  if (Subtarget->hasFullFP16()) {
792  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
799  }
800  }
801 
803  }
804 
806 }
807 
808 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
809  assert(VT.isVector() && "VT should be a vector type");
810 
811  if (VT.isFloatingPoint()) {
813  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
814  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
815  }
816 
817  // Mark vector float intrinsics as expand.
818  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
827 
828  // But we do support custom-lowering for FCOPYSIGN.
830  }
831 
843 
847  for (MVT InnerVT : MVT::all_valuetypes())
848  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
849 
850  // CNT supports only B element sizes, then use UADDLP to widen.
851  if (VT != MVT::v8i8 && VT != MVT::v16i8)
853 
859 
862 
863  if (!VT.isFloatingPoint())
865 
866  // [SU][MIN|MAX] are available for all NEON types apart from i64.
867  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
868  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
869  setOperationAction(Opcode, VT, Legal);
870 
871  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
872  if (VT.isFloatingPoint() &&
873  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
874  for (unsigned Opcode :
876  setOperationAction(Opcode, VT, Legal);
877 
878  if (Subtarget->isLittleEndian()) {
879  for (unsigned im = (unsigned)ISD::PRE_INC;
883  }
884  }
885 }
886 
887 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
888  addRegisterClass(VT, &AArch64::FPR64RegClass);
889  addTypeForNEON(VT, MVT::v2i32);
890 }
891 
892 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
893  addRegisterClass(VT, &AArch64::FPR128RegClass);
894  addTypeForNEON(VT, MVT::v4i32);
895 }
896 
897 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
898  EVT VT) const {
899  if (!VT.isVector())
900  return MVT::i32;
901  return VT.changeVectorElementTypeToInteger();
902 }
903 
904 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
905  const APInt &Demanded,
906  TargetLowering::TargetLoweringOpt &TLO,
907  unsigned NewOpc) {
908  uint64_t OldImm = Imm, NewImm, Enc;
909  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
910 
911  // Return if the immediate is already all zeros, all ones, a bimm32 or a
912  // bimm64.
913  if (Imm == 0 || Imm == Mask ||
914  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
915  return false;
916 
917  unsigned EltSize = Size;
918  uint64_t DemandedBits = Demanded.getZExtValue();
919 
920  // Clear bits that are not demanded.
921  Imm &= DemandedBits;
922 
923  while (true) {
924  // The goal here is to set the non-demanded bits in a way that minimizes
925  // the number of switching between 0 and 1. In order to achieve this goal,
926  // we set the non-demanded bits to the value of the preceding demanded bits.
927  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
928  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
929  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
930  // The final result is 0b11000011.
931  uint64_t NonDemandedBits = ~DemandedBits;
932  uint64_t InvertedImm = ~Imm & DemandedBits;
933  uint64_t RotatedImm =
934  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
935  NonDemandedBits;
936  uint64_t Sum = RotatedImm + NonDemandedBits;
937  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
938  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
939  NewImm = (Imm | Ones) & Mask;
940 
941  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
942  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
943  // we halve the element size and continue the search.
944  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
945  break;
946 
947  // We cannot shrink the element size any further if it is 2-bits.
948  if (EltSize == 2)
949  return false;
950 
951  EltSize /= 2;
952  Mask >>= EltSize;
953  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
954 
955  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
956  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
957  return false;
958 
959  // Merge the upper and lower halves of Imm and DemandedBits.
960  Imm |= Hi;
961  DemandedBits |= DemandedBitsHi;
962  }
963 
964  ++NumOptimizedImms;
965 
966  // Replicate the element across the register width.
967  while (EltSize < Size) {
968  NewImm |= NewImm << EltSize;
969  EltSize *= 2;
970  }
971 
972  (void)OldImm;
973  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
974  "demanded bits should never be altered");
975  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
976 
977  // Create the new constant immediate node.
978  EVT VT = Op.getValueType();
979  SDLoc DL(Op);
980  SDValue New;
981 
982  // If the new constant immediate is all-zeros or all-ones, let the target
983  // independent DAG combine optimize this node.
984  if (NewImm == 0 || NewImm == OrigMask) {
985  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
986  TLO.DAG.getConstant(NewImm, DL, VT));
987  // Otherwise, create a machine node so that target independent DAG combine
988  // doesn't undo this optimization.
989  } else {
990  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
991  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
992  New = SDValue(
993  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
994  }
995 
996  return TLO.CombineTo(Op, New);
997 }
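// A minimal sketch of the replication step above, with Size, EltSize and
// NewImm values assumed purely for illustration: if the search settles on an
// 8-bit element 0xC3 for a 32-bit logical immediate, the loop widens it as
//   NewImm = 0xC3;              // EltSize == 8, Size == 32
//   NewImm |= NewImm << 8;      // 0xC3C3
//   NewImm |= NewImm << 16;     // 0xC3C3C3C3
// which is the replicated-element form that
// AArch64_AM::encodeLogicalImmediate() expects.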
998 
999 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1000  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
1001  // Delay this optimization to as late as possible.
1002  if (!TLO.LegalOps)
1003  return false;
1004 
1005  if (!EnableOptimizeLogicalImm)
1006  return false;
1007 
1008  EVT VT = Op.getValueType();
1009  if (VT.isVector())
1010  return false;
1011 
1012  unsigned Size = VT.getSizeInBits();
1013  assert((Size == 32 || Size == 64) &&
1014  "i32 or i64 is expected after legalization.");
1015 
1016  // Exit early if we demand all bits.
1017  if (Demanded.countPopulation() == Size)
1018  return false;
1019 
1020  unsigned NewOpc;
1021  switch (Op.getOpcode()) {
1022  default:
1023  return false;
1024  case ISD::AND:
1025  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1026  break;
1027  case ISD::OR:
1028  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1029  break;
1030  case ISD::XOR:
1031  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1032  break;
1033  }
1034  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1035  if (!C)
1036  return false;
1037  uint64_t Imm = C->getZExtValue();
1038  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1039 }
1040 
1041 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1042 /// Mask are known to be either zero or one and return them Known.
1043 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1044  const SDValue Op, KnownBits &Known,
1045  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1046  switch (Op.getOpcode()) {
1047  default:
1048  break;
1049  case AArch64ISD::CSEL: {
1050  KnownBits Known2;
1051  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1052  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1053  Known.Zero &= Known2.Zero;
1054  Known.One &= Known2.One;
1055  break;
1056  }
1057  case AArch64ISD::LOADgot:
1058  case AArch64ISD::ADDlow: {
1059  if (!Subtarget->isTargetILP32())
1060  break;
1061  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1062  Known.Zero = APInt::getHighBitsSet(64, 32);
1063  break;
1064  }
1065  case ISD::INTRINSIC_W_CHAIN: {
1066  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1067  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1068  switch (IntID) {
1069  default: return;
1070  case Intrinsic::aarch64_ldaxr:
1071  case Intrinsic::aarch64_ldxr: {
1072  unsigned BitWidth = Known.getBitWidth();
1073  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1074  unsigned MemBits = VT.getScalarSizeInBits();
1075  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1076  return;
1077  }
1078  }
1079  break;
1080  }
1082  case ISD::INTRINSIC_VOID: {
1083  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1084  switch (IntNo) {
1085  default:
1086  break;
1087  case Intrinsic::aarch64_neon_umaxv:
1088  case Intrinsic::aarch64_neon_uminv: {
1089  // Figure out the datatype of the vector operand. The UMINV instruction
1090  // will zero extend the result, so we can mark as known zero all the
1091  // bits larger than the element datatype. 32-bit or larger doesn't need
1092  // this as those are legal types and will be handled by isel directly.
1093  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1094  unsigned BitWidth = Known.getBitWidth();
1095  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1096  assert(BitWidth >= 8 && "Unexpected width!");
1097  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1098  Known.Zero |= Mask;
1099  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1100  assert(BitWidth >= 16 && "Unexpected width!");
1101  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1102  Known.Zero |= Mask;
1103  }
1104  break;
1105  } break;
1106  }
1107  }
1108  }
1109 }
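// Example of the umaxv/uminv case above (intrinsic name mangling elided): for
//   %r = call i32 @llvm.aarch64.neon.uminv(<8 x i8> %v)
// the result fits in 8 bits, so bits [31:8] of %r are reported as known zero,
// matching the APInt::getHighBitsSet(BitWidth, BitWidth - 8) mask.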
1110 
1111 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1112  EVT) const {
1113  return MVT::i64;
1114 }
1115 
1116 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1117  EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1118  bool *Fast) const {
1119  if (Subtarget->requiresStrictAlign())
1120  return false;
1121 
1122  if (Fast) {
1123  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1124  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1125  // See comments in performSTORECombine() for more details about
1126  // these conditions.
1127 
1128  // Code that uses clang vector extensions can mark that it
1129  // wants unaligned accesses to be treated as fast by
1130  // underspecifying alignment to be 1 or 2.
1131  Align <= 2 ||
1132 
1133  // Disregard v2i64. Memcpy lowering produces those and splitting
1134  // them regresses performance on micro-benchmarks and olden/bh.
1135  VT == MVT::v2i64;
1136  }
1137  return true;
1138 }
1139 
1140 // Same as above but handling LLTs instead.
1141 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1142  LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1143  bool *Fast) const {
1144  if (Subtarget->requiresStrictAlign())
1145  return false;
1146 
1147  if (Fast) {
1148  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1149  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1150  Ty.getSizeInBytes() != 16 ||
1151  // See comments in performSTORECombine() for more details about
1152  // these conditions.
1153 
1154  // Code that uses clang vector extensions can mark that it
1155  // wants unaligned accesses to be treated as fast by
1156  // underspecifying alignment to be 1 or 2.
1157  Align <= 2 ||
1158 
1159  // Disregard v2i64. Memcpy lowering produces those and splitting
1160  // them regresses performance on micro-benchmarks and olden/bh.
1161  Ty == LLT::vector(2, 64);
1162  }
1163  return true;
1164 }
1165 
1166 FastISel *
1167 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1168  const TargetLibraryInfo *libInfo) const {
1169  return AArch64::createFastISel(funcInfo, libInfo);
1170 }
1171 
1172 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1173  switch ((AArch64ISD::NodeType)Opcode) {
1174  case AArch64ISD::FIRST_NUMBER: break;
1175  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1176  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1177  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1178  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1179  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1180  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1181  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1182  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1183  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1184  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1185  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1186  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1187  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1188  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1189  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1190  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1191  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1192  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1193  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1194  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1195  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1196  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1197  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1198  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1199  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1200  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1201  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1202  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1203  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1204  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1205  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1206  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1207  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1208  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1209  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1210  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1211  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1212  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1213  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1214  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1215  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1216  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1217  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1218  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1219  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1220  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1221  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1222  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1223  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1224  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1225  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1226  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1227  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1228  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1229  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1230  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1231  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1232  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1233  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1234  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1235  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1236  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1237  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1238  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1239  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1240  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1241  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1242  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1243  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1244  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1245  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1246  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1247  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1248  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1249  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1250  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1251  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1252  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1253  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1254  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1255  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1256  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1257  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1258  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1259  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1260  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1261  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1262  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1263  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1264  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1265  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1266  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1267  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1268  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1269  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1270  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1271  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1272  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1273  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1274  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1275  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1276  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1277  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1278  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1279  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1280  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1281  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1282  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1283  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1284  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1285  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1286  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1287  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1288  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1289  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1290  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1291  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1292  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1293  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1294  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1295  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1296  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1297  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1298  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1299  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1300  case AArch64ISD::STG: return "AArch64ISD::STG";
1301  case AArch64ISD::STZG: return "AArch64ISD::STZG";
1302  case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
1303  case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
1304  }
1305  return nullptr;
1306 }
1307 
1308 MachineBasicBlock *
1309 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1310  MachineBasicBlock *MBB) const {
1311  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1312  // phi node:
1313 
1314  // OrigBB:
1315  // [... previous instrs leading to comparison ...]
1316  // b.ne TrueBB
1317  // b EndBB
1318  // TrueBB:
1319  // ; Fallthrough
1320  // EndBB:
1321  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1322 
1323  MachineFunction *MF = MBB->getParent();
1324  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1325  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1326  DebugLoc DL = MI.getDebugLoc();
1327  MachineFunction::iterator It = ++MBB->getIterator();
1328 
1329  Register DestReg = MI.getOperand(0).getReg();
1330  Register IfTrueReg = MI.getOperand(1).getReg();
1331  Register IfFalseReg = MI.getOperand(2).getReg();
1332  unsigned CondCode = MI.getOperand(3).getImm();
1333  bool NZCVKilled = MI.getOperand(4).isKill();
1334 
1335  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1336  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1337  MF->insert(It, TrueBB);
1338  MF->insert(It, EndBB);
1339 
1340  // Transfer rest of current basic-block to EndBB
1341  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1342  MBB->end());
1343  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1344 
1345  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1346  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1347  MBB->addSuccessor(TrueBB);
1348  MBB->addSuccessor(EndBB);
1349 
1350  // TrueBB falls through to the end.
1351  TrueBB->addSuccessor(EndBB);
1352 
1353  if (!NZCVKilled) {
1354  TrueBB->addLiveIn(AArch64::NZCV);
1355  EndBB->addLiveIn(AArch64::NZCV);
1356  }
1357 
1358  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1359  .addReg(IfTrueReg)
1360  .addMBB(TrueBB)
1361  .addReg(IfFalseReg)
1362  .addMBB(MBB);
1363 
1364  MI.eraseFromParent();
1365  return EndBB;
1366 }
1367 
1368 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
1369  MachineInstr &MI, MachineBasicBlock *BB) const {
1370  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
1371  BB->getParent()->getFunction().getPersonalityFn())) &&
1372  "SEH does not use catchret!");
1373  return BB;
1374 }
1375 
1376 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
1377  MachineInstr &MI, MachineBasicBlock *BB) const {
1378  MI.eraseFromParent();
1379  return BB;
1380 }
1381 
1382 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1383  MachineInstr &MI, MachineBasicBlock *BB) const {
1384  switch (MI.getOpcode()) {
1385  default:
1386 #ifndef NDEBUG
1387  MI.dump();
1388 #endif
1389  llvm_unreachable("Unexpected instruction for custom inserter!");
1390 
1391  case AArch64::F128CSEL:
1392  return EmitF128CSEL(MI, BB);
1393 
1394  case TargetOpcode::STACKMAP:
1395  case TargetOpcode::PATCHPOINT:
1396  return emitPatchPoint(MI, BB);
1397 
1398  case AArch64::CATCHRET:
1399  return EmitLoweredCatchRet(MI, BB);
1400  case AArch64::CATCHPAD:
1401  return EmitLoweredCatchPad(MI, BB);
1402  }
1403 }
1404 
1405 //===----------------------------------------------------------------------===//
1406 // AArch64 Lowering private implementation.
1407 //===----------------------------------------------------------------------===//
1408 
1409 //===----------------------------------------------------------------------===//
1410 // Lowering Code
1411 //===----------------------------------------------------------------------===//
1412 
1413 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1414 /// CC
1415 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1416  switch (CC) {
1417  default:
1418  llvm_unreachable("Unknown condition code!");
1419  case ISD::SETNE:
1420  return AArch64CC::NE;
1421  case ISD::SETEQ:
1422  return AArch64CC::EQ;
1423  case ISD::SETGT:
1424  return AArch64CC::GT;
1425  case ISD::SETGE:
1426  return AArch64CC::GE;
1427  case ISD::SETLT:
1428  return AArch64CC::LT;
1429  case ISD::SETLE:
1430  return AArch64CC::LE;
1431  case ISD::SETUGT:
1432  return AArch64CC::HI;
1433  case ISD::SETUGE:
1434  return AArch64CC::HS;
1435  case ISD::SETULT:
1436  return AArch64CC::LO;
1437  case ISD::SETULE:
1438  return AArch64CC::LS;
1439  }
1440 }
1441 
1442 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1443 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1444  AArch64CC::CondCode &CondCode,
1445  AArch64CC::CondCode &CondCode2) {
1446  CondCode2 = AArch64CC::AL;
1447  switch (CC) {
1448  default:
1449  llvm_unreachable("Unknown FP condition!");
1450  case ISD::SETEQ:
1451  case ISD::SETOEQ:
1452  CondCode = AArch64CC::EQ;
1453  break;
1454  case ISD::SETGT:
1455  case ISD::SETOGT:
1456  CondCode = AArch64CC::GT;
1457  break;
1458  case ISD::SETGE:
1459  case ISD::SETOGE:
1460  CondCode = AArch64CC::GE;
1461  break;
1462  case ISD::SETOLT:
1463  CondCode = AArch64CC::MI;
1464  break;
1465  case ISD::SETOLE:
1466  CondCode = AArch64CC::LS;
1467  break;
1468  case ISD::SETONE:
1469  CondCode = AArch64CC::MI;
1470  CondCode2 = AArch64CC::GT;
1471  break;
1472  case ISD::SETO:
1473  CondCode = AArch64CC::VC;
1474  break;
1475  case ISD::SETUO:
1476  CondCode = AArch64CC::VS;
1477  break;
1478  case ISD::SETUEQ:
1479  CondCode = AArch64CC::EQ;
1480  CondCode2 = AArch64CC::VS;
1481  break;
1482  case ISD::SETUGT:
1483  CondCode = AArch64CC::HI;
1484  break;
1485  case ISD::SETUGE:
1486  CondCode = AArch64CC::PL;
1487  break;
1488  case ISD::SETLT:
1489  case ISD::SETULT:
1490  CondCode = AArch64CC::LT;
1491  break;
1492  case ISD::SETLE:
1493  case ISD::SETULE:
1494  CondCode = AArch64CC::LE;
1495  break;
1496  case ISD::SETNE:
1497  case ISD::SETUNE:
1498  CondCode = AArch64CC::NE;
1499  break;
1500  }
1501 }
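// For the two-code mappings above, the lowering has to test both codes. For
// example (assembly shape assumed for illustration), a scalar SETONE compare
// ends up as two checks on a single fcmp:
//   fcmp s0, s1
//   b.mi taken      ; ordered less-than
//   b.gt taken      ; ordered greater-than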
1502 
1503 /// Convert a DAG fp condition code to an AArch64 CC.
1504 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1505 /// should be AND'ed instead of OR'ed.
1506 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1507  AArch64CC::CondCode &CondCode,
1508  AArch64CC::CondCode &CondCode2) {
1509  CondCode2 = AArch64CC::AL;
1510  switch (CC) {
1511  default:
1512  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1513  assert(CondCode2 == AArch64CC::AL);
1514  break;
1515  case ISD::SETONE:
1516  // (a one b)
1517  // == ((a olt b) || (a ogt b))
1518  // == ((a ord b) && (a une b))
1519  CondCode = AArch64CC::VC;
1520  CondCode2 = AArch64CC::NE;
1521  break;
1522  case ISD::SETUEQ:
1523  // (a ueq b)
1524  // == ((a uno b) || (a oeq b))
1525  // == ((a ule b) && (a uge b))
1526  CondCode = AArch64CC::PL;
1527  CondCode2 = AArch64CC::LE;
1528  break;
1529  }
1530 }
1531 
1532 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1533 /// CC usable with the vector instructions. Fewer operations are available
1534 /// without a real NZCV register, so we have to use less efficient combinations
1535 /// to get the same effect.
1536 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1537  AArch64CC::CondCode &CondCode,
1538  AArch64CC::CondCode &CondCode2,
1539  bool &Invert) {
1540  Invert = false;
1541  switch (CC) {
1542  default:
1543  // Mostly the scalar mappings work fine.
1544  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1545  break;
1546  case ISD::SETUO:
1547  Invert = true;
1548  LLVM_FALLTHROUGH;
1549  case ISD::SETO:
1550  CondCode = AArch64CC::MI;
1551  CondCode2 = AArch64CC::GE;
1552  break;
1553  case ISD::SETUEQ:
1554  case ISD::SETULT:
1555  case ISD::SETULE:
1556  case ISD::SETUGT:
1557  case ISD::SETUGE:
1558  // All of the compare-mask comparisons are ordered, but we can switch
1559  // between the two by a double inversion. E.g. ULE == !OGT.
1560  Invert = true;
1561  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1562  break;
1563  }
1564 }
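// Sketch of the double inversion above (assembly shape assumed): a vector
// SETULE is lowered as the inverse of SETOGT, i.e. compare for ordered
// greater-than and then flip every lane:
//   fcmgt v0.4s, v1.4s, v2.4s   ; lanes where v1 > v2 (ordered)
//   mvn   v0.16b, v0.16b        ; invert: lanes where !(v1 > v2) == (v1 ule v2)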
1565 
1566 static bool isLegalArithImmed(uint64_t C) {
1567  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1568  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1569  LLVM_DEBUG(dbgs() << "Is imm " << C
1570  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1571  return IsLegal;
1572 }
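// A few values run through the predicate above, which accepts either a plain
// 12-bit immediate or a 12-bit immediate shifted left by 12:
//   isLegalArithImmed(4095)      -> true   (fits in 12 bits)
//   isLegalArithImmed(0x1000)    -> true   (low 12 bits clear, 1 << 12)
//   isLegalArithImmed(0x1001)    -> false  (needs two 12-bit fields)
//   isLegalArithImmed(0xFFF000)  -> true   (0xFFF shifted left by 12)
//   isLegalArithImmed(0x1000000) -> false  (shifted value exceeds 12 bits)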
1573 
1574 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1575 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1576 // can be set differently by this operation. It comes down to whether
1577 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1578 // everything is fine. If not then the optimization is wrong. Thus general
1579 // comparisons are only valid if op2 != 0.
1580 //
1581 // So, finally, the only LLVM-native comparisons that don't mention C and V
1582 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1583 // the absence of information about op2.
1584 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1585  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1586  (CC == ISD::SETEQ || CC == ISD::SETNE);
1587 }
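// Shape matched by isCMN() above (sketch): a compare of X against a negated Y,
//   (setcc eq/ne, X, (sub 0, Y))
// which emitComparison() below turns into ADDS, i.e. a CMN of X and Y.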
1588 
1589 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1590  const SDLoc &dl, SelectionDAG &DAG) {
1591  EVT VT = LHS.getValueType();
1592  const bool FullFP16 =
1593  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1594 
1595  if (VT.isFloatingPoint()) {
1596  assert(VT != MVT::f128);
1597  if (VT == MVT::f16 && !FullFP16) {
1598  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1599  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1600  VT = MVT::f32;
1601  }
1602  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1603  }
1604 
1605  // The CMP instruction is just an alias for SUBS, and representing it as
1606  // SUBS means that it's possible to get CSE with subtract operations.
1607  // A later phase can perform the optimization of setting the destination
1608  // register to WZR/XZR if it ends up being unused.
1609  unsigned Opcode = AArch64ISD::SUBS;
1610 
1611  if (isCMN(RHS, CC)) {
1612  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
1613  Opcode = AArch64ISD::ADDS;
1614  RHS = RHS.getOperand(1);
1615  } else if (isCMN(LHS, CC)) {
1616  // As we are looking for EQ/NE compares, the operands can be commuted; can
1617  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
1618  Opcode = AArch64ISD::ADDS;
1619  LHS = LHS.getOperand(1);
1620  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1621  !isUnsignedIntSetCC(CC)) {
1622  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1623  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1624  // of the signed comparisons.
1625  Opcode = AArch64ISD::ANDS;
1626  RHS = LHS.getOperand(1);
1627  LHS = LHS.getOperand(0);
1628  }
1629 
1630  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1631  .getValue(1);
1632 }
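// Sketch of the CSE effect described above (register choices assumed): a
// compare is just a flag-setting subtract whose result goes to the zero
// register, so
//   subs w8, w0, w1     ; the subtraction itself
//   cmp  w0, w1         ; == subs wzr, w0, w1
// map to the same SUBS node in the DAG and only one instruction needs to be
// emitted once the unused destination is rewritten to WZR/XZR.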
1633 
1634 /// \defgroup AArch64CCMP CMP;CCMP matching
1635 ///
1636 /// These functions deal with the formation of CMP;CCMP;... sequences.
1637 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1638 /// a comparison. They set the NZCV flags to a predefined value if their
1639 /// predicate is false. This allows us to express arbitrary conjunctions, for
1640 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1641 /// expressed as:
1642 /// cmp A
1643 /// ccmp B, inv(CB), CA
1644 /// check for CB flags
1645 ///
1646 /// This naturally lets us implement chains of AND operations with SETCC
1647 /// operands. And we can even implement some other situations by transforming
1648 /// them:
1649 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1650 /// negating the flags used in a CCMP/FCCMP operations.
1651 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1652 /// by negating the flags we test for afterwards. i.e.
1653 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1654 /// - Note that we can only ever negate all previously processed results.
1655 /// What we cannot implement by flipping the flags to test is a negation
1656 /// of two sub-trees (because the negation affects all sub-trees emitted so
1657 /// far, so the 2nd sub-tree we emit would also affect the first).
1658 /// With those tools we can implement some OR operations:
1659 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1660 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1661 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1662 /// elimination rules from earlier to implement the whole thing as a
1663 /// CCMP/FCCMP chain.
1664 ///
1665 /// As a complete example:
1666 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1667 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1668 /// can be reassociated to:
1669 /// or (and (setCC (cmp C)) (setCD (cmp D)))
1670 /// (or (setCA (cmp A)) (setCB (cmp B)))
1671 /// can be transformed to:
1672 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1673 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1674 /// which can be implemented as:
1675 /// cmp C
1676 /// ccmp D, inv(CD), CC
1677 /// ccmp A, CA, inv(CD)
1678 /// ccmp B, CB, inv(CA)
1679 /// check for CB flags
1680 ///
1681 /// A counterexample is "or (and A B) (and C D)" which translates to
1682 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1683 /// can only implement 1 of the inner (not) operations, but not both!
1684 /// @{
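// As a concrete shape of the sequences described above (registers and the
// NZCV immediate assumed for illustration), a conjunction like
//   if (a == 0 && b < c)
// becomes:
//   cmp  w0, #0            ; flags for "a == 0"
//   ccmp w1, w2, #0, eq    ; if EQ held, compare b and c; otherwise force
//                          ; flags that fail the final LT test
//   b.lt taken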
1685 
1686 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1687 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1688  ISD::CondCode CC, SDValue CCOp,
1689  AArch64CC::CondCode Predicate,
1690  AArch64CC::CondCode OutCC,
1691  const SDLoc &DL, SelectionDAG &DAG) {
1692  unsigned Opcode = 0;
1693  const bool FullFP16 =
1694  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1695 
1696  if (LHS.getValueType().isFloatingPoint()) {
1697  assert(LHS.getValueType() != MVT::f128);
1698  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1699  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1700  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1701  }
1702  Opcode = AArch64ISD::FCCMP;
1703  } else if (RHS.getOpcode() == ISD::SUB) {
1704  SDValue SubOp0 = RHS.getOperand(0);
1705  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1706  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1707  Opcode = AArch64ISD::CCMN;
1708  RHS = RHS.getOperand(1);
1709  }
1710  }
1711  if (Opcode == 0)
1712  Opcode = AArch64ISD::CCMP;
1713 
1714  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1715  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1716  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1717  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1718  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1719 }
1720 
1721 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1722 /// expressed as a conjunction. See \ref AArch64CCMP.
1723 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1724 /// changing the conditions on the SETCC tests.
1725 /// (this means we can call emitConjunctionRec() with
1726 /// Negate==true on this sub-tree)
1727 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1728 /// cannot do the negation naturally. We are required to
1729 /// emit the subtree first in this case.
1730 /// \param WillNegate Is true if we are called when the result of this
1731 /// subexpression must be negated. This happens when the
1732 /// outer expression is an OR. We can use this fact to know
1733 /// that we have a double negation (or (or ...) ...) that
1734 /// can be implemented for free.
1735 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1736  bool &MustBeFirst, bool WillNegate,
1737  unsigned Depth = 0) {
1738  if (!Val.hasOneUse())
1739  return false;
1740  unsigned Opcode = Val->getOpcode();
1741  if (Opcode == ISD::SETCC) {
1742  if (Val->getOperand(0).getValueType() == MVT::f128)
1743  return false;
1744  CanNegate = true;
1745  MustBeFirst = false;
1746  return true;
1747  }
1748  // Protect against exponential runtime and stack overflow.
1749  if (Depth > 6)
1750  return false;
1751  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1752  bool IsOR = Opcode == ISD::OR;
1753  SDValue O0 = Val->getOperand(0);
1754  SDValue O1 = Val->getOperand(1);
1755  bool CanNegateL;
1756  bool MustBeFirstL;
1757  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1758  return false;
1759  bool CanNegateR;
1760  bool MustBeFirstR;
1761  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1762  return false;
1763 
1764  if (MustBeFirstL && MustBeFirstR)
1765  return false;
1766 
1767  if (IsOR) {
1768  // For an OR expression we need to be able to naturally negate at least
1769  // one side or we cannot do the transformation at all.
1770  if (!CanNegateL && !CanNegateR)
1771  return false;
1772  // If the result of the OR will be negated and we can naturally negate
1773  // the leaves, then this sub-tree as a whole negates naturally.
1774  CanNegate = WillNegate && CanNegateL && CanNegateR;
1775  // If we cannot naturally negate the whole sub-tree, then this must be
1776  // emitted first.
1777  MustBeFirst = !CanNegate;
1778  } else {
1779  assert(Opcode == ISD::AND && "Must be OR or AND");
1780  // We cannot naturally negate an AND operation.
1781  CanNegate = false;
1782  MustBeFirst = MustBeFirstL || MustBeFirstR;
1783  }
1784  return true;
1785  }
1786  return false;
1787 }
1788 
1789 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1790 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1791 /// Tries to transform the given i1 producing node @p Val to a series compare
1792 /// and conditional compare operations. @returns an NZCV flags producing node
1793 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1794 /// transformation was not possible.
1795 /// \p Negate is true if we want this sub-tree being negated just by changing
1796 /// SETCC conditions.
1797 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
1798  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1799  AArch64CC::CondCode Predicate) {
1800  // We're at a tree leaf, produce a conditional comparison operation.
1801  unsigned Opcode = Val->getOpcode();
1802  if (Opcode == ISD::SETCC) {
1803  SDValue LHS = Val->getOperand(0);
1804  SDValue RHS = Val->getOperand(1);
1805  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1806  bool isInteger = LHS.getValueType().isInteger();
1807  if (Negate)
1808  CC = getSetCCInverse(CC, isInteger);
1809  SDLoc DL(Val);
1810  // Determine OutCC and handle FP special case.
1811  if (isInteger) {
1812  OutCC = changeIntCCToAArch64CC(CC);
1813  } else {
1815  AArch64CC::CondCode ExtraCC;
1816  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1817  // Some floating point conditions can't be tested with a single condition
1818  // code. Construct an additional comparison in this case.
1819  if (ExtraCC != AArch64CC::AL) {
1820  SDValue ExtraCmp;
1821  if (!CCOp.getNode())
1822  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1823  else
1824  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1825  ExtraCC, DL, DAG);
1826  CCOp = ExtraCmp;
1827  Predicate = ExtraCC;
1828  }
1829  }
1830 
1831  // Produce a normal comparison if we are first in the chain
1832  if (!CCOp)
1833  return emitComparison(LHS, RHS, CC, DL, DAG);
1834  // Otherwise produce a ccmp.
1835  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1836  DAG);
1837  }
1838  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1839 
1840  bool IsOR = Opcode == ISD::OR;
1841 
1842  SDValue LHS = Val->getOperand(0);
1843  bool CanNegateL;
1844  bool MustBeFirstL;
1845  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1846  assert(ValidL && "Valid conjunction/disjunction tree");
1847  (void)ValidL;
1848 
1849  SDValue RHS = Val->getOperand(1);
1850  bool CanNegateR;
1851  bool MustBeFirstR;
1852  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1853  assert(ValidR && "Valid conjunction/disjunction tree");
1854  (void)ValidR;
1855 
1856  // Swap sub-tree that must come first to the right side.
1857  if (MustBeFirstL) {
1858  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1859  std::swap(LHS, RHS);
1860  std::swap(CanNegateL, CanNegateR);
1861  std::swap(MustBeFirstL, MustBeFirstR);
1862  }
1863 
1864  bool NegateR;
1865  bool NegateAfterR;
1866  bool NegateL;
1867  bool NegateAfterAll;
1868  if (Opcode == ISD::OR) {
1869  // Swap the sub-tree that we can negate naturally to the left.
1870  if (!CanNegateL) {
1871  assert(CanNegateR && "at least one side must be negatable");
1872  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1873  assert(!Negate);
1874  std::swap(LHS, RHS);
1875  NegateR = false;
1876  NegateAfterR = true;
1877  } else {
1878  // Negate the left sub-tree if possible, otherwise negate the result.
1879  NegateR = CanNegateR;
1880  NegateAfterR = !CanNegateR;
1881  }
1882  NegateL = true;
1883  NegateAfterAll = !Negate;
1884  } else {
1885  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1886  assert(!Negate && "Valid conjunction/disjunction tree");
1887 
1888  NegateL = false;
1889  NegateR = false;
1890  NegateAfterR = false;
1891  NegateAfterAll = false;
1892  }
1893 
1894  // Emit sub-trees.
1895  AArch64CC::CondCode RHSCC;
1896  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1897  if (NegateAfterR)
1898  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1899  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1900  if (NegateAfterAll)
1901  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1902  return CmpL;
1903 }
1904 
1905 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
1906 /// In some cases this is even possible with OR operations in the expression.
1907 /// See \ref AArch64CCMP.
1908 /// \see emitConjunctionRec().
1909 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
1910  AArch64CC::CondCode &OutCC) {
1911  bool DummyCanNegate;
1912  bool DummyMustBeFirst;
1913  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1914  return SDValue();
1915 
1916  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1917 }
1918 
1919 /// @}
1920 
1921 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1922 /// extension operations.
1923 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
1924  auto isSupportedExtend = [&](SDValue V) {
1925  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1926  return true;
1927 
1928  if (V.getOpcode() == ISD::AND)
1929  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1930  uint64_t Mask = MaskCst->getZExtValue();
1931  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1932  }
1933 
1934  return false;
1935  };
1936 
1937  if (!Op.hasOneUse())
1938  return 0;
1939 
1940  if (isSupportedExtend(Op))
1941  return 1;
1942 
1943  unsigned Opc = Op.getOpcode();
1944  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1945  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1946  uint64_t Shift = ShiftCst->getZExtValue();
1947  if (isSupportedExtend(Op.getOperand(0)))
1948  return (Shift <= 4) ? 2 : 1;
1949  EVT VT = Op.getValueType();
1950  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1951  return 1;
1952  }
1953 
1954  return 0;
1955 }
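// A few operand shapes and the score the function above assigns them (all
// assumed to have a single use):
//   (sign_extend_inreg x)        -> 1   (folds as an extended register)
//   (and x, 0xFF)                -> 1   (uxtb-style extend)
//   (shl (and x, 0xFFFF), 2)     -> 2   (extend plus shift of at most 4)
//   (shl x, 5) with i32 x        -> 1   (plain shifted register)
//   anything else                -> 0   (nothing to fold)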
1956 
1957 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1958  SDValue &AArch64cc, SelectionDAG &DAG,
1959  const SDLoc &dl) {
1960  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1961  EVT VT = RHS.getValueType();
1962  uint64_t C = RHSC->getZExtValue();
1963  if (!isLegalArithImmed(C)) {
1964  // Constant does not fit, try adjusting it by one?
1965  switch (CC) {
1966  default:
1967  break;
1968  case ISD::SETLT:
1969  case ISD::SETGE:
1970  if ((VT == MVT::i32 && C != 0x80000000 &&
1971  isLegalArithImmed((uint32_t)(C - 1))) ||
1972  (VT == MVT::i64 && C != 0x80000000ULL &&
1973  isLegalArithImmed(C - 1ULL))) {
1974  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1975  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1976  RHS = DAG.getConstant(C, dl, VT);
1977  }
1978  break;
1979  case ISD::SETULT:
1980  case ISD::SETUGE:
1981  if ((VT == MVT::i32 && C != 0 &&
1982  isLegalArithImmed((uint32_t)(C - 1))) ||
1983  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1984  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1985  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1986  RHS = DAG.getConstant(C, dl, VT);
1987  }
1988  break;
1989  case ISD::SETLE:
1990  case ISD::SETGT:
1991  if ((VT == MVT::i32 && C != INT32_MAX &&
1992  isLegalArithImmed((uint32_t)(C + 1))) ||
1993  (VT == MVT::i64 && C != INT64_MAX &&
1994  isLegalArithImmed(C + 1ULL))) {
1995  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1996  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1997  RHS = DAG.getConstant(C, dl, VT);
1998  }
1999  break;
2000  case ISD::SETULE:
2001  case ISD::SETUGT:
2002  if ((VT == MVT::i32 && C != UINT32_MAX &&
2003  isLegalArithImmed((uint32_t)(C + 1))) ||
2004  (VT == MVT::i64 && C != UINT64_MAX &&
2005  isLegalArithImmed(C + 1ULL))) {
2006  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2007  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2008  RHS = DAG.getConstant(C, dl, VT);
2009  }
2010  break;
2011  }
2012  }
2013  }
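// Worked example of the adjustment above (constant assumed for illustration):
// for (setcc i32 %x, 0x1001, setlt) the constant 0x1001 is not a legal
// arithmetic immediate, but "x < 0x1001" is the same as "x <= 0x1000" and
// 0x1000 does encode, so the condition and constant are rewritten to
// (setcc i32 %x, 0x1000, setle).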
2014 
2015  // Comparisons are canonicalized so that the RHS operand is simpler than the
2016  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2017  // can fold some shift+extend operations on the RHS operand, so swap the
2018  // operands if that can be done.
2019  //
2020  // For example:
2021  // lsl w13, w11, #1
2022  // cmp w13, w12
2023  // can be turned into:
2024  // cmp w12, w11, lsl #1
2025  if (!isa<ConstantSDNode>(RHS) ||
2026  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2027  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2028 
2029  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2030  std::swap(LHS, RHS);
2031  CC = ISD::getSetCCSwappedOperands(CC);
2032  }
2033  }
2034 
2035  SDValue Cmp;
2036  AArch64CC::CondCode AArch64CC;
2037  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2038  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2039 
2040  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2041  // For the i8 operand, the largest immediate is 255, so this can be easily
2042  // encoded in the compare instruction. For the i16 operand, however, the
2043  // largest immediate cannot be encoded in the compare.
2044  // Therefore, use a sign extending load and cmn to avoid materializing the
2045  // -1 constant. For example,
2046  // movz w1, #65535
2047  // ldrh w0, [x0, #0]
2048  // cmp w0, w1
2049  // >
2050  // ldrsh w0, [x0, #0]
2051  // cmn w0, #1
2052  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2053  // if and only if (sext LHS) == (sext RHS). The checks are in place to
2054  // ensure both the LHS and RHS are truly zero extended and to make sure the
2055  // transformation is profitable.
2056  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2057  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2058  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2059  LHS.getNode()->hasNUsesOfValue(1, 0)) {
2060  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2061  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2062  SDValue SExt =
2063  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2064  DAG.getValueType(MVT::i16));
2065  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2066  RHS.getValueType()),
2067  CC, dl, DAG);
2068  AArch64CC = changeIntCCToAArch64CC(CC);
2069  }
2070  }
2071 
2072  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2073  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2074  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2075  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2076  }
2077  }
2078  }
2079 
2080  if (!Cmp) {
2081  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2082  AArch64CC = changeIntCCToAArch64CC(CC);
2083  }
2084  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2085  return Cmp;
2086 }
2087 
2088 static std::pair<SDValue, SDValue>
2089 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2090  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2091  "Unsupported value type");
2092  SDValue Value, Overflow;
2093  SDLoc DL(Op);
2094  SDValue LHS = Op.getOperand(0);
2095  SDValue RHS = Op.getOperand(1);
2096  unsigned Opc = 0;
2097  switch (Op.getOpcode()) {
2098  default:
2099  llvm_unreachable("Unknown overflow instruction!");
2100  case ISD::SADDO:
2101  Opc = AArch64ISD::ADDS;
2102  CC = AArch64CC::VS;
2103  break;
2104  case ISD::UADDO:
2105  Opc = AArch64ISD::ADDS;
2106  CC = AArch64CC::HS;
2107  break;
2108  case ISD::SSUBO:
2109  Opc = AArch64ISD::SUBS;
2110  CC = AArch64CC::VS;
2111  break;
2112  case ISD::USUBO:
2113  Opc = AArch64ISD::SUBS;
2114  CC = AArch64CC::LO;
2115  break;
2116  // Multiply needs a little bit of extra work.
2117  case ISD::SMULO:
2118  case ISD::UMULO: {
2119  CC = AArch64CC::NE;
2120  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2121  if (Op.getValueType() == MVT::i32) {
2122  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2123  // For a 32 bit multiply with overflow check we want the instruction
2124  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2125  // need to generate the following pattern:
2126  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2127  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2128  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2129  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2130  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2131  DAG.getConstant(0, DL, MVT::i64));
2132  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2133  // operation. We need to clear out the upper 32 bits, because we used a
2134  // widening multiply that wrote all 64 bits. In the end this should be a
2135  // noop.
2136  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2137  if (IsSigned) {
2138  // The signed overflow check requires more than just a simple check for
2139  // any bit set in the upper 32 bits of the result. These bits could be
2140  // just the sign bits of a negative number. To perform the overflow
2141  // check we arithmetic-shift the lower 32 bits of the result right by
2142  // 31 bits (replicating their sign bit) and compare that to the upper 32 bits.
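 // Illustrative example: for %a = 0x40000000 and %b = 4 the widened product is
 // 0x1'0000'0000, so UpperBits == 1 while LowerBits == (Value >> 31) == 0; the
 // SUBS therefore sets NE and signed overflow is reported.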
2143  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2144  DAG.getConstant(32, DL, MVT::i64));
2145  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2146  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2147  DAG.getConstant(31, DL, MVT::i64));
2148  // It is important that LowerBits is last, otherwise the arithmetic
2149  // shift will not be folded into the compare (SUBS).
2150  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2151  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2152  .getValue(1);
2153  } else {
2154  // The overflow check for unsigned multiply is easy. We only need to
2155  // check if any of the upper 32 bits are set. This can be done with a
2156  // CMP (shifted register). For that we need to generate the following
2157  // pattern:
2158  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
2159  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2160  DAG.getConstant(32, DL, MVT::i64));
2161  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2162  Overflow =
2163  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2164  DAG.getConstant(0, DL, MVT::i64),
2165  UpperBits).getValue(1);
2166  }
2167  break;
2168  }
2169  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2170  // For the 64 bit multiply
2171  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2172  if (IsSigned) {
2173  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2174  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2175  DAG.getConstant(63, DL, MVT::i64));
2176  // It is important that LowerBits is last, otherwise the arithmetic
2177  // shift will not be folded into the compare (SUBS).
2178  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2179  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2180  .getValue(1);
2181  } else {
2182  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2183  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2184  Overflow =
2185  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2186  DAG.getConstant(0, DL, MVT::i64),
2187  UpperBits).getValue(1);
2188  }
2189  break;
2190  }
2191  } // switch (...)
2192 
2193  if (Opc) {
2194  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2195 
2196  // Emit the AArch64 operation with overflow check.
2197  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2198  Overflow = Value.getValue(1);
2199  }
2200  return std::make_pair(Value, Overflow);
2201 }
2202 
2203 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2204  RTLIB::Libcall Call) const {
2205  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2206  MakeLibCallOptions CallOptions;
2207  return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
2208 }
2209 
2210 // Returns true if the given Op is the overflow flag result of an overflow
2211 // intrinsic operation.
2212 static bool isOverflowIntrOpRes(SDValue Op) {
2213  unsigned Opc = Op.getOpcode();
2214  return (Op.getResNo() == 1 &&
2215  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2216  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2217 }
2218 
2219 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2220  SDValue Sel = Op.getOperand(0);
2221  SDValue Other = Op.getOperand(1);
2222  SDLoc dl(Sel);
2223 
2224  // If the operand is an overflow checking operation, invert the condition
2225  // code and kill the Not operation. I.e., transform:
2226  // (xor (overflow_op_bool, 1))
2227  // -->
2228  // (csel 1, 0, invert(cc), overflow_op_bool)
2229  // ... which later gets transformed to just a cset instruction with an
2230  // inverted condition code, rather than a cset + eor sequence.
2231  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2232  // Only lower legal XALUO ops.
2233  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2234  return SDValue();
2235 
2236  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2237  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2238  AArch64CC::CondCode CC;
2239  SDValue Value, Overflow;
2240  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2241  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2242  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2243  CCVal, Overflow);
2244  }
2245  // If neither operand is a SELECT_CC, give up.
2246  if (Sel.getOpcode() != ISD::SELECT_CC)
2247  std::swap(Sel, Other);
2248  if (Sel.getOpcode() != ISD::SELECT_CC)
2249  return Op;
2250 
2251  // The folding we want to perform is:
2252  // (xor x, (select_cc a, b, cc, 0, -1) )
2253  // -->
2254  // (csel x, (xor x, -1), cc ...)
2255  //
2256  // The latter will get matched to a CSINV instruction.
2257 
2258  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2259  SDValue LHS = Sel.getOperand(0);
2260  SDValue RHS = Sel.getOperand(1);
2261  SDValue TVal = Sel.getOperand(2);
2262  SDValue FVal = Sel.getOperand(3);
2263 
2264  // FIXME: This could be generalized to non-integer comparisons.
2265  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2266  return Op;
2267 
2268  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2269  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2270 
2271  // The values aren't constants, this isn't the pattern we're looking for.
2272  if (!CFVal || !CTVal)
2273  return Op;
2274 
2275  // We can commute the SELECT_CC by inverting the condition. This
2276  // might be needed to make this fit into a CSINV pattern.
2277  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2278  std::swap(TVal, FVal);
2279  std::swap(CTVal, CFVal);
2280  CC = ISD::getSetCCInverse(CC, true);
2281  }
2282 
2283  // If the constants line up, perform the transform!
2284  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2285  SDValue CCVal;
2286  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2287 
2288  FVal = Other;
2289  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2290  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2291 
2292  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2293  CCVal, Cmp);
2294  }
2295 
2296  return Op;
2297 }
2298 
2299 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2300  EVT VT = Op.getValueType();
2301 
2302  // Let legalize expand this if it isn't a legal type yet.
2303  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2304  return SDValue();
2305 
2306  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2307 
2308  unsigned Opc;
2309  bool ExtraOp = false;
2310  switch (Op.getOpcode()) {
2311  default:
2312  llvm_unreachable("Invalid code");
2313  case ISD::ADDC:
2314  Opc = AArch64ISD::ADDS;
2315  break;
2316  case ISD::SUBC:
2317  Opc = AArch64ISD::SUBS;
2318  break;
2319  case ISD::ADDE:
2320  Opc = AArch64ISD::ADCS;
2321  ExtraOp = true;
2322  break;
2323  case ISD::SUBE:
2324  Opc = AArch64ISD::SBCS;
2325  ExtraOp = true;
2326  break;
2327  }
2328 
2329  if (!ExtraOp)
2330  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2331  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2332  Op.getOperand(2));
2333 }
2334 
2335 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2336  // Let legalize expand this if it isn't a legal type yet.
2337  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2338  return SDValue();
2339 
2340  SDLoc dl(Op);
2341  AArch64CC::CondCode CC;
2342  // The actual operation that sets the overflow or carry flag.
2343  SDValue Value, Overflow;
2344  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2345 
2346  // We use 0 and 1 as false and true values.
2347  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2348  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2349 
2350  // We use an inverted condition, because the conditional select is inverted
2351  // too. This will allow it to be selected to a single instruction:
2352  // CSINC Wd, WZR, WZR, invert(cond).
2353  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2354  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2355  CCVal, Overflow);
2356 
2357  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2358  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2359 }
2360 
2361 // Prefetch operands are:
2362 // 1: Address to prefetch
2363 // 2: bool isWrite
2364 // 3: int locality (0 = no locality ... 3 = extreme locality)
2365 // 4: bool isDataCache
2366 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2367  SDLoc DL(Op);
2368  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2369  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2370  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2371 
2372  bool IsStream = !Locality;
2373  // When the locality number is set
2374  if (Locality) {
2375  // The front-end should have filtered out the out-of-range values
2376  assert(Locality <= 3 && "Prefetch locality out-of-range");
2377  // The locality degree is the opposite of the cache level:
2378  // invert the number, since the encoding starts at 0 for L1,
2379  // the closest (fastest) cache.
2380  Locality = 3 - Locality;
2381  }
2382 
2383  // Build the mask value encoding the expected behavior.
2384  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2385  (!IsData << 3) | // IsDataCache bit
2386  (Locality << 1) | // Cache level bits
2387  (unsigned)IsStream; // Stream bit
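 // Illustrative example: a data-cache read prefetch with locality 3
 // (IsWrite = 0, IsData = 1, Locality inverted to 0, IsStream = 0) yields
 // PrfOp == 0b00000, i.e. PLDL1KEEP.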
2388  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2389  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2390 }
2391 
2392 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2393  SelectionDAG &DAG) const {
2394  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2395 
2396  RTLIB::Libcall LC;
2397  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2398 
2399  return LowerF128Call(Op, DAG, LC);
2400 }
2401 
2402 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2403  SelectionDAG &DAG) const {
2404  if (Op.getOperand(0).getValueType() != MVT::f128) {
2405  // It's legal except when f128 is involved
2406  return Op;
2407  }
2408 
2409  RTLIB::Libcall LC;
2410  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2411 
2412  // FP_ROUND node has a second operand indicating whether it is known to be
2413  // precise. That doesn't take part in the LibCall so we can't directly use
2414  // LowerF128Call.
2415  SDValue SrcVal = Op.getOperand(0);
2416  MakeLibCallOptions CallOptions;
2417  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
2418  SDLoc(Op)).first;
2419 }
2420 
2421 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2422  SelectionDAG &DAG) const {
2423  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2424  // Any additional optimization in this function should be recorded
2425  // in the cost tables.
2426  EVT InVT = Op.getOperand(0).getValueType();
2427  EVT VT = Op.getValueType();
2428  unsigned NumElts = InVT.getVectorNumElements();
2429 
2430  // f16 conversions are promoted to f32 when full fp16 is not supported.
2431  if (InVT.getVectorElementType() == MVT::f16 &&
2432  !Subtarget->hasFullFP16()) {
2433  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2434  SDLoc dl(Op);
2435  return DAG.getNode(
2436  Op.getOpcode(), dl, Op.getValueType(),
2437  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2438  }
2439 
2440  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2441  SDLoc dl(Op);
2442  SDValue Cv =
2443  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2444  Op.getOperand(0));
2445  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2446  }
2447 
2448  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2449  SDLoc dl(Op);
2450  MVT ExtVT =
2451  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2452  VT.getVectorNumElements());
2453  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2454  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2455  }
2456 
2457  // Type changing conversions are illegal.
2458  return Op;
2459 }
2460 
2461 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2462  SelectionDAG &DAG) const {
2463  if (Op.getOperand(0).getValueType().isVector())
2464  return LowerVectorFP_TO_INT(Op, DAG);
2465 
2466  // f16 conversions are promoted to f32 when full fp16 is not supported.
2467  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2468  !Subtarget->hasFullFP16()) {
2469  SDLoc dl(Op);
2470  return DAG.getNode(
2471  Op.getOpcode(), dl, Op.getValueType(),
2472  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2473  }
2474 
2475  if (Op.getOperand(0).getValueType() != MVT::f128) {
2476  // It's legal except when f128 is involved
2477  return Op;
2478  }
2479 
2480  RTLIB::Libcall LC;
2481  if (Op.getOpcode() == ISD::FP_TO_SINT)
2482  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2483  else
2484  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2485 
2486  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2487  MakeLibCallOptions CallOptions;
2488  return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
2489 }
2490 
2491 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
2492  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2493  // Any additional optimization in this function should be recorded
2494  // in the cost tables.
2495  EVT VT = Op.getValueType();
2496  SDLoc dl(Op);
2497  SDValue In = Op.getOperand(0);
2498  EVT InVT = In.getValueType();
2499 
2500  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2501  MVT CastVT =
2502  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2503  InVT.getVectorNumElements());
2504  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2505  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2506  }
2507 
2508  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2509  unsigned CastOpc =
2510  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2511  EVT CastVT = VT.changeVectorElementTypeToInteger();
2512  In = DAG.getNode(CastOpc, dl, CastVT, In);
2513  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2514  }
2515 
2516  return Op;
2517 }
2518 
2519 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2520  SelectionDAG &DAG) const {
2521  if (Op.getValueType().isVector())
2522  return LowerVectorINT_TO_FP(Op, DAG);
2523 
2524  // f16 conversions are promoted to f32 when full fp16 is not supported.
2525  if (Op.getValueType() == MVT::f16 &&
2526  !Subtarget->hasFullFP16()) {
2527  SDLoc dl(Op);
2528  return DAG.getNode(
2529  ISD::FP_ROUND, dl, MVT::f16,
2530  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2531  DAG.getIntPtrConstant(0, dl));
2532  }
2533 
2534  // i128 conversions are libcalls.
2535  if (Op.getOperand(0).getValueType() == MVT::i128)
2536  return SDValue();
2537 
2538  // Other conversions are legal, unless it's to the completely software-based
2539  // fp128.
2540  if (Op.getValueType() != MVT::f128)
2541  return Op;
2542 
2543  RTLIB::Libcall LC;
2544  if (Op.getOpcode() == ISD::SINT_TO_FP)
2545  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2546  else
2547  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2548 
2549  return LowerF128Call(Op, DAG, LC);
2550 }
2551 
2552 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2553  SelectionDAG &DAG) const {
2554  // For iOS, we want to call an alternative entry point: __sincos_stret,
2555  // which returns the values in two S / D registers.
2556  SDLoc dl(Op);
2557  SDValue Arg = Op.getOperand(0);
2558  EVT ArgVT = Arg.getValueType();
2559  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2560 
2561  ArgListTy Args;
2562  ArgListEntry Entry;
2563 
2564  Entry.Node = Arg;
2565  Entry.Ty = ArgTy;
2566  Entry.IsSExt = false;
2567  Entry.IsZExt = false;
2568  Args.push_back(Entry);
2569 
2570  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2571  : RTLIB::SINCOS_STRET_F32;
2572  const char *LibcallName = getLibcallName(LC);
2573  SDValue Callee =
2574  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2575 
2576  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2577  TargetLowering::CallLoweringInfo CLI(DAG);
2578  CLI.setDebugLoc(dl)
2579  .setChain(DAG.getEntryNode())
2580  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2581 
2582  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2583  return CallResult.first;
2584 }
2585 
2586 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2587  if (Op.getValueType() != MVT::f16)
2588  return SDValue();
2589 
2590  assert(Op.getOperand(0).getValueType() == MVT::i16);
2591  SDLoc DL(Op);
2592 
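 // Lower an i16 -> f16 bitcast by any-extending the i16 to i32, bitcasting
 // that to f32, and then extracting the 16-bit H subregister of the result.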
2593  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2594  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2595  return SDValue(
2596  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2597  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2598  0);
2599 }
2600 
2601 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2602  if (OrigVT.getSizeInBits() >= 64)
2603  return OrigVT;
2604 
2605  assert(OrigVT.isSimple() && "Expecting a simple value type");
2606 
2607  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2608  switch (OrigSimpleTy) {
2609  default: llvm_unreachable("Unexpected Vector Type");
2610  case MVT::v2i8:
2611  case MVT::v2i16:
2612  return MVT::v2i32;
2613  case MVT::v4i8:
2614  return MVT::v4i16;
2615  }
2616 }
2617 
2618 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2619  const EVT &OrigTy,
2620  const EVT &ExtTy,
2621  unsigned ExtOpcode) {
2622  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2623  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2624  // 64-bits we need to insert a new extension so that it will be 64-bits.
2625  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2626  if (OrigTy.getSizeInBits() >= 64)
2627  return N;
2628 
2629  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2630  EVT NewVT = getExtensionTo64Bits(OrigTy);
2631 
2632  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2633 }
2634 
2635 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2636  bool isSigned) {
2637  EVT VT = N->getValueType(0);
2638 
2639  if (N->getOpcode() != ISD::BUILD_VECTOR)
2640  return false;
2641 
2642  for (const SDValue &Elt : N->op_values()) {
2643  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2644  unsigned EltSize = VT.getScalarSizeInBits();
2645  unsigned HalfSize = EltSize / 2;
2646  if (isSigned) {
2647  if (!isIntN(HalfSize, C->getSExtValue()))
2648  return false;
2649  } else {
2650  if (!isUIntN(HalfSize, C->getZExtValue()))
2651  return false;
2652  }
2653  continue;
2654  }
2655  return false;
2656  }
2657 
2658  return true;
2659 }
2660 
2661 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2662  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2663  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2664  N->getOperand(0)->getValueType(0),
2665  N->getValueType(0),
2666  N->getOpcode());
2667 
2668  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2669  EVT VT = N->getValueType(0);
2670  SDLoc dl(N);
2671  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2672  unsigned NumElts = VT.getVectorNumElements();
2673  MVT TruncVT = MVT::getIntegerVT(EltSize);
2674  SmallVector<SDValue, 8> Ops;
2675  for (unsigned i = 0; i != NumElts; ++i) {
2676  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2677  const APInt &CInt = C->getAPIntValue();
2678  // Element types smaller than 32 bits are not legal, so use i32 elements.
2679  // The values are implicitly truncated so sext vs. zext doesn't matter.
2680  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2681  }
2682  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2683 }
2684 
2685 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2686  return N->getOpcode() == ISD::SIGN_EXTEND ||
2687  isExtendedBUILD_VECTOR(N, DAG, true);
2688 }
2689 
2690 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2691  return N->getOpcode() == ISD::ZERO_EXTEND ||
2692  isExtendedBUILD_VECTOR(N, DAG, false);
2693 }
2694 
2695 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2696  unsigned Opcode = N->getOpcode();
2697  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2698  SDNode *N0 = N->getOperand(0).getNode();
2699  SDNode *N1 = N->getOperand(1).getNode();
2700  return N0->hasOneUse() && N1->hasOneUse() &&
2701  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2702  }
2703  return false;
2704 }
2705 
2706 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2707  unsigned Opcode = N->getOpcode();
2708  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2709  SDNode *N0 = N->getOperand(0).getNode();
2710  SDNode *N1 = N->getOperand(1).getNode();
2711  return N0->hasOneUse() && N1->hasOneUse() &&
2712  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2713  }
2714  return false;
2715 }
2716 
2717 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2718  SelectionDAG &DAG) const {
2719  // The rounding mode is in bits 23:22 of the FPCR.
2720  // The rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
2721  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
2722  // so that the shift + and get folded into a bitfield extract.
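 // Illustrative example: FPCR.RMode == 0b11 (round toward zero) gives
 // ((3 + 1) & 3) == 0, which is the FLT_ROUNDS value for "toward zero".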
2723  SDLoc dl(Op);
2724 
2725  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2726  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2727  MVT::i64));
2728  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2729  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2730  DAG.getConstant(1U << 22, dl, MVT::i32));
2731  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2732  DAG.getConstant(22, dl, MVT::i32));
2733  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2734  DAG.getConstant(3, dl, MVT::i32));
2735 }
2736 
2737 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2738  // Multiplications are only custom-lowered for 128-bit vectors so that
2739  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2740  EVT VT = Op.getValueType();
2741  assert(VT.is128BitVector() && VT.isInteger() &&
2742  "unexpected type for custom-lowering ISD::MUL");
2743  SDNode *N0 = Op.getOperand(0).getNode();
2744  SDNode *N1 = Op.getOperand(1).getNode();
2745  unsigned NewOpc = 0;
2746  bool isMLA = false;
2747  bool isN0SExt = isSignExtended(N0, DAG);
2748  bool isN1SExt = isSignExtended(N1, DAG);
2749  if (isN0SExt && isN1SExt)
2750  NewOpc = AArch64ISD::SMULL;
2751  else {
2752  bool isN0ZExt = isZeroExtended(N0, DAG);
2753  bool isN1ZExt = isZeroExtended(N1, DAG);
2754  if (isN0ZExt && isN1ZExt)
2755  NewOpc = AArch64ISD::UMULL;
2756  else if (isN1SExt || isN1ZExt) {
2757  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2758  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2759  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2760  NewOpc = AArch64ISD::SMULL;
2761  isMLA = true;
2762  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2763  NewOpc = AArch64ISD::UMULL;
2764  isMLA = true;
2765  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2766  std::swap(N0, N1);
2767  NewOpc = AArch64ISD::UMULL;
2768  isMLA = true;
2769  }
2770  }
2771 
2772  if (!NewOpc) {
2773  if (VT == MVT::v2i64)
2774  // Fall through to expand this. It is not legal.
2775  return SDValue();
2776  else
2777  // Other vector multiplications are legal.
2778  return Op;
2779  }
2780  }
2781 
2782  // Legalize to a S/UMULL instruction
2783  SDLoc DL(Op);
2784  SDValue Op0;
2785  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2786  if (!isMLA) {
2787  Op0 = skipExtensionForVectorMULL(N0, DAG);
2788  assert(Op0.getValueType().is64BitVector() &&
2789  Op1.getValueType().is64BitVector() &&
2790  "unexpected types for extended operands to VMULL");
2791  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2792  }
2793  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2794  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2795  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
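 // For instance, mul (v8i16 (add (zext v8i8 %a), (zext v8i8 %b))),
 // (v8i16 (zext v8i8 %c)) is rebuilt below as add(umull(%a, %c), umull(%b, %c)).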
2796  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2797  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2798  EVT Op1VT = Op1.getValueType();
2799  return DAG.getNode(N0->getOpcode(), DL, VT,
2800  DAG.getNode(NewOpc, DL, VT,
2801  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2802  DAG.getNode(NewOpc, DL, VT,
2803  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2804 }
2805 
2806 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2807  SelectionDAG &DAG) const {
2808  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2809  SDLoc dl(Op);
2810  switch (IntNo) {
2811  default: return SDValue(); // Don't custom lower most intrinsics.
2812  case Intrinsic::thread_pointer: {
2813  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2814  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2815  }
2816  case Intrinsic::aarch64_neon_abs: {
2817  EVT Ty = Op.getValueType();
2818  if (Ty == MVT::i64) {
2819  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2820  Op.getOperand(1));
2821  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2822  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2823  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2824  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2825  } else {
2826  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2827  }
2828  }
2829  case Intrinsic::aarch64_neon_smax:
2830  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2831  Op.getOperand(1), Op.getOperand(2));
2832  case Intrinsic::aarch64_neon_umax:
2833  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2834  Op.getOperand(1), Op.getOperand(2));
2835  case Intrinsic::aarch64_neon_smin:
2836  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2837  Op.getOperand(1), Op.getOperand(2));
2838  case Intrinsic::aarch64_neon_umin:
2839  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2840  Op.getOperand(1), Op.getOperand(2));
2841 
2842  case Intrinsic::localaddress: {
2843  const auto &MF = DAG.getMachineFunction();
2844  const auto *RegInfo = Subtarget->getRegisterInfo();
2845  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2846  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2847  Op.getSimpleValueType());
2848  }
2849 
2850  case Intrinsic::eh_recoverfp: {
2851  // FIXME: This needs to be implemented to correctly handle highly aligned
2852  // stack objects. For now we simply return the incoming FP. Refer D53541
2853  // for more details.
2854  SDValue FnOp = Op.getOperand(1);
2855  SDValue IncomingFPOp = Op.getOperand(2);
2856  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2857  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2858  if (!Fn)
2859  report_fatal_error(
2860  "llvm.eh.recoverfp must take a function as the first argument");
2861  return IncomingFPOp;
2862  }
2863  }
2864 }
2865 
2866 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2867 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2868  EVT VT, EVT MemVT,
2869  SelectionDAG &DAG) {
2870  assert(VT.isVector() && "VT should be a vector type");
2871  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2872 
2873  SDValue Value = ST->getValue();
2874 
2875  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
2876  // the word lane which represents the v4i8 subvector. It optimizes the store
2877  // to:
2878  //
2879  // xtn v0.8b, v0.8h
2880  // str s0, [x0]
2881 
2882  SDValue Undef = DAG.getUNDEF(MVT::i16);
2883  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2884  {Undef, Undef, Undef, Undef});
2885 
2886  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2887  Value, UndefVec);
2888  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2889 
2890  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2891  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2892  Trunc, DAG.getConstant(0, DL, MVT::i64));
2893 
2894  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2895  ST->getBasePtr(), ST->getMemOperand());
2896 }
2897 
2898 // Custom lowering for any store, vector or scalar, default or truncating.
2899 // Currently only the truncating store from vector v4i16 to v4i8 is custom
2900 // lowered.
2901 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2902  SelectionDAG &DAG) const {
2903  SDLoc Dl(Op);
2904  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2905  assert (StoreNode && "Can only custom lower store nodes");
2906 
2907  SDValue Value = StoreNode->getValue();
2908 
2909  EVT VT = Value.getValueType();
2910  EVT MemVT = StoreNode->getMemoryVT();
2911 
2912  assert (VT.isVector() && "Can only custom lower vector store types");
2913 
2914  unsigned AS = StoreNode->getAddressSpace();
2915  unsigned Align = StoreNode->getAlignment();
2916  if (Align < MemVT.getStoreSize() &&
2917  !allowsMisalignedMemoryAccesses(
2918  MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
2919  return scalarizeVectorStore(StoreNode, DAG);
2920  }
2921 
2922  if (StoreNode->isTruncatingStore()) {
2923  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2924  }
2925 
2926  return SDValue();
2927 }
2928 
2929 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2930  SelectionDAG &DAG) const {
2931  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2932  LLVM_DEBUG(Op.dump());
2933 
2934  switch (Op.getOpcode()) {
2935  default:
2936  llvm_unreachable("unimplemented operand");
2937  return SDValue();
2938  case ISD::BITCAST:
2939  return LowerBITCAST(Op, DAG);
2940  case ISD::GlobalAddress:
2941  return LowerGlobalAddress(Op, DAG);
2942  case ISD::GlobalTLSAddress:
2943  return LowerGlobalTLSAddress(Op, DAG);
2944  case ISD::SETCC:
2945  return LowerSETCC(Op, DAG);
2946  case ISD::BR_CC:
2947  return LowerBR_CC(Op, DAG);
2948  case ISD::SELECT:
2949  return LowerSELECT(Op, DAG);
2950  case ISD::SELECT_CC:
2951  return LowerSELECT_CC(Op, DAG);
2952  case ISD::JumpTable:
2953  return LowerJumpTable(Op, DAG);
2954  case ISD::BR_JT:
2955  return LowerBR_JT(Op, DAG);
2956  case ISD::ConstantPool:
2957  return LowerConstantPool(Op, DAG);
2958  case ISD::BlockAddress:
2959  return LowerBlockAddress(Op, DAG);
2960  case ISD::VASTART:
2961  return LowerVASTART(Op, DAG);
2962  case ISD::VACOPY:
2963  return LowerVACOPY(Op, DAG);
2964  case ISD::VAARG:
2965  return LowerVAARG(Op, DAG);
2966  case ISD::ADDC:
2967  case ISD::ADDE:
2968  case ISD::SUBC:
2969  case ISD::SUBE:
2970  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2971  case ISD::SADDO:
2972  case ISD::UADDO:
2973  case ISD::SSUBO:
2974  case ISD::USUBO:
2975  case ISD::SMULO:
2976  case ISD::UMULO:
2977  return LowerXALUO(Op, DAG);
2978  case ISD::FADD:
2979  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2980  case ISD::FSUB:
2981  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2982  case ISD::FMUL:
2983  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2984  case ISD::FDIV:
2985  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2986  case ISD::FP_ROUND:
2987  return LowerFP_ROUND(Op, DAG);
2988  case ISD::FP_EXTEND:
2989  return LowerFP_EXTEND(Op, DAG);
2990  case ISD::FRAMEADDR:
2991  return LowerFRAMEADDR(Op, DAG);
2992  case ISD::SPONENTRY:
2993  return LowerSPONENTRY(Op, DAG);
2994  case ISD::RETURNADDR:
2995  return LowerRETURNADDR(Op, DAG);
2996  case ISD::ADDROFRETURNADDR:
2997  return LowerADDROFRETURNADDR(Op, DAG);
2998  case ISD::INSERT_VECTOR_ELT:
2999  return LowerINSERT_VECTOR_ELT(Op, DAG);
3000  case ISD::EXTRACT_VECTOR_ELT:
3001  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3002  case ISD::BUILD_VECTOR:
3003  return LowerBUILD_VECTOR(Op, DAG);
3004  case ISD::VECTOR_SHUFFLE:
3005  return LowerVECTOR_SHUFFLE(Op, DAG);
3006  case ISD::EXTRACT_SUBVECTOR:
3007  return LowerEXTRACT_SUBVECTOR(Op, DAG);
3008  case ISD::SRA:
3009  case ISD::SRL:
3010  case ISD::SHL:
3011  return LowerVectorSRA_SRL_SHL(Op, DAG);
3012  case ISD::SHL_PARTS:
3013  return LowerShiftLeftParts(Op, DAG);
3014  case ISD::SRL_PARTS:
3015  case ISD::SRA_PARTS:
3016  return LowerShiftRightParts(Op, DAG);
3017  case ISD::CTPOP:
3018  return LowerCTPOP(Op, DAG);
3019  case ISD::FCOPYSIGN:
3020  return LowerFCOPYSIGN(Op, DAG);
3021  case ISD::OR:
3022  return LowerVectorOR(Op, DAG);
3023  case ISD::XOR:
3024  return LowerXOR(Op, DAG);
3025  case ISD::PREFETCH:
3026  return LowerPREFETCH(Op, DAG);
3027  case ISD::SINT_TO_FP:
3028  case ISD::UINT_TO_FP:
3029  return LowerINT_TO_FP(Op, DAG);
3030  case ISD::FP_TO_SINT:
3031  case ISD::FP_TO_UINT:
3032  return LowerFP_TO_INT(Op, DAG);
3033  case ISD::FSINCOS:
3034  return LowerFSINCOS(Op, DAG);
3035  case ISD::FLT_ROUNDS_:
3036  return LowerFLT_ROUNDS_(Op, DAG);
3037  case ISD::MUL:
3038  return LowerMUL(Op, DAG);
3039  case ISD::INTRINSIC_WO_CHAIN:
3040  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3041  case ISD::STORE:
3042  return LowerSTORE(Op, DAG);
3043  case ISD::VECREDUCE_ADD:
3044  case ISD::VECREDUCE_SMAX:
3045  case ISD::VECREDUCE_SMIN:
3046  case ISD::VECREDUCE_UMAX:
3047  case ISD::VECREDUCE_UMIN:
3048  case ISD::VECREDUCE_FMAX:
3049  case ISD::VECREDUCE_FMIN:
3050  return LowerVECREDUCE(Op, DAG);
3051  case ISD::ATOMIC_LOAD_SUB:
3052  return LowerATOMIC_LOAD_SUB(Op, DAG);
3053  case ISD::ATOMIC_LOAD_AND:
3054  return LowerATOMIC_LOAD_AND(Op, DAG);
3055  case ISD::DYNAMIC_STACKALLOC:
3056  return LowerDYNAMIC_STACKALLOC(Op, DAG);
3057  }
3058 }
3059 
3060 //===----------------------------------------------------------------------===//
3061 // Calling Convention Implementation
3062 //===----------------------------------------------------------------------===//
3063 
3064 /// Selects the correct CCAssignFn for a given CallingConvention value.
3065 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
3066  bool IsVarArg) const {
3067  switch (CC) {
3068  default:
3069  report_fatal_error("Unsupported calling convention.");
3070  case CallingConv::WebKit_JS:
3071  return CC_AArch64_WebKit_JS;
3072  case CallingConv::GHC:
3073  return CC_AArch64_GHC;
3074  case CallingConv::C:
3075  case CallingConv::Fast:
3076  case CallingConv::PreserveMost:
3077  case CallingConv::CXX_FAST_TLS:
3078  case CallingConv::Swift:
3079  if (Subtarget->isTargetWindows() && IsVarArg)
3080  return CC_AArch64_Win64_VarArg;
3081  if (!Subtarget->isTargetDarwin())
3082  return CC_AArch64_AAPCS;
3083  if (!IsVarArg)
3084  return CC_AArch64_DarwinPCS;
3085  return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
3086  : CC_AArch64_DarwinPCS_VarArg;
3087  case CallingConv::Win64:
3088  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3089  case CallingConv::AArch64_VectorCall:
3090  return CC_AArch64_AAPCS;
3091  }
3092 }
3093 
3094 CCAssignFn *
3095 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3096  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3097  : RetCC_AArch64_AAPCS;
3098 }
3099 
3100 SDValue AArch64TargetLowering::LowerFormalArguments(
3101  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3102  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3103  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3104  MachineFunction &MF = DAG.getMachineFunction();
3105  MachineFrameInfo &MFI = MF.getFrameInfo();
3106  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3107 
3108  // Assign locations to all of the incoming arguments.
3109  SmallVector<CCValAssign, 16> ArgLocs;
3110  DenseMap<unsigned, SDValue> CopiedRegs;
3111  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3112  *DAG.getContext());
3113 
3114  // At this point, Ins[].VT may already be promoted to i32. To correctly
3115  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3116  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3117  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3118  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3119  // LocVT.
3120  unsigned NumArgs = Ins.size();
3121  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3122  unsigned CurArgIdx = 0;
3123  for (unsigned i = 0; i != NumArgs; ++i) {
3124  MVT ValVT = Ins[i].VT;
3125  if (Ins[i].isOrigArg()) {
3126  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3127  CurArgIdx = Ins[i].getOrigArgIndex();
3128 
3129  // Get type of the original argument.
3130  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3131  /*AllowUnknown*/ true);
3132  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3133  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3134  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3135  ValVT = MVT::i8;
3136  else if (ActualMVT == MVT::i16)
3137  ValVT = MVT::i16;
3138  }
3139  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3140  bool Res =
3141  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3142  assert(!Res && "Call operand has unhandled type");
3143  (void)Res;
3144  }
3145  assert(ArgLocs.size() == Ins.size());
3146  SmallVector<SDValue, 16> ArgValues;
3147  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3148  CCValAssign &VA = ArgLocs[i];
3149 
3150  if (Ins[i].Flags.isByVal()) {
3151  // Byval is used for HFAs in the PCS, but the system should work in a
3152  // non-compliant manner for larger structs.
3153  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3154  int Size = Ins[i].Flags.getByValSize();
3155  unsigned NumRegs = (Size + 7) / 8;
3156 
3157  // FIXME: This works on big-endian for composite byvals, which are the common
3158  // case. It should also work for fundamental types.
3159  unsigned FrameIdx =
3160  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3161  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3162  InVals.push_back(FrameIdxN);
3163 
3164  continue;
3165  }
3166 
3167  SDValue ArgValue;
3168  if (VA.isRegLoc()) {
3169  // Arguments stored in registers.
3170  EVT RegVT = VA.getLocVT();
3171  const TargetRegisterClass *RC;
3172 
3173  if (RegVT == MVT::i32)
3174  RC = &AArch64::GPR32RegClass;
3175  else if (RegVT == MVT::i64)
3176  RC = &AArch64::GPR64RegClass;
3177  else if (RegVT == MVT::f16)
3178  RC = &AArch64::FPR16RegClass;
3179  else if (RegVT == MVT::f32)
3180  RC = &AArch64::FPR32RegClass;
3181  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3182  RC = &AArch64::FPR64RegClass;
3183  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3184  RC = &AArch64::FPR128RegClass;
3185  else if (RegVT.isScalableVector() &&
3186  RegVT.getVectorElementType() == MVT::i1)
3187  RC = &AArch64::PPRRegClass;
3188  else if (RegVT.isScalableVector())
3189  RC = &AArch64::ZPRRegClass;
3190  else
3191  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3192 
3193  // Transform the arguments in physical registers into virtual ones.
3194  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3195  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3196 
3197  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3198  // to 64 bits. Insert an assert[sz]ext to capture this, then
3199  // truncate to the right size.
3200  switch (VA.getLocInfo()) {
3201  default:
3202  llvm_unreachable("Unknown loc info!");
3203  case CCValAssign::Full:
3204  break;
3205  case CCValAssign::Indirect:
3206  assert(VA.getValVT().isScalableVector() &&
3207  "Only scalable vectors can be passed indirectly");
3208  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3209  case CCValAssign::BCvt:
3210  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3211  break;
3212  case CCValAssign::AExt:
3213  case CCValAssign::SExt:
3214  case CCValAssign::ZExt:
3215  break;
3216  case CCValAssign::AExtUpper:
3217  ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
3218  DAG.getConstant(32, DL, RegVT));
3219  ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
3220  break;
3221  }
3222  } else { // VA.isRegLoc()
3223  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3224  unsigned ArgOffset = VA.getLocMemOffset();
3225  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3226 
3227  uint32_t BEAlign = 0;
3228  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3229  !Ins[i].Flags.isInConsecutiveRegs())
3230  BEAlign = 8 - ArgSize;
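 // Illustrative example: on big-endian, an i32 argument occupies the
 // high-addressed half of its 8-byte stack slot, so BEAlign == 4 is added
 // to the load offset below.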
3231 
3232  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3233 
3234  // Create load nodes to retrieve arguments from the stack.
3235  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3236 
3237  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3238  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3239  MVT MemVT = VA.getValVT();
3240 
3241  switch (VA.getLocInfo()) {
3242  default:
3243  break;
3244  case CCValAssign::Trunc:
3245  case CCValAssign::BCvt:
3246  MemVT = VA.getLocVT();
3247  break;
3248  case CCValAssign::Indirect:
3249  assert(VA.getValVT().isScalableVector() &&
3250  "Only scalable vectors can be passed indirectly");
3251  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3252  case CCValAssign::SExt:
3253  ExtType = ISD::SEXTLOAD;
3254  break;
3255  case CCValAssign::ZExt:
3256  ExtType = ISD::ZEXTLOAD;
3257  break;
3258  case CCValAssign::AExt:
3259  ExtType = ISD::EXTLOAD;
3260  break;
3261  }
3262 
3263  ArgValue = DAG.getExtLoad(
3264  ExtType, DL, VA.getLocVT(), Chain, FIN,
3265  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3266  MemVT);
3267 
3268  }
3269  if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
3270  ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
3271  ArgValue, DAG.getValueType(MVT::i32));
3272  InVals.push_back(ArgValue);
3273  }
3274 
3275  // varargs
3276  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3277  if (isVarArg) {
3278  if (!Subtarget->isTargetDarwin() || IsWin64) {
3279  // The AAPCS variadic function ABI is identical to the non-variadic
3280  // one. As a result there may be more arguments in registers and we should
3281  // save them for future reference.
3282  // Win64 variadic functions also pass arguments in registers, but all float
3283  // arguments are passed in integer registers.
3284  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3285  }
3286 
3287  // This will point to the next argument passed via stack.
3288  unsigned StackOffset = CCInfo.getNextStackOffset();
3289  // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
3290  StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
3291  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3292 
3293  if (MFI.hasMustTailInVarArgFunc()) {
3294  SmallVector<MVT, 2> RegParmTypes;
3295  RegParmTypes.push_back(MVT::i64);
3296  RegParmTypes.push_back(MVT::f128);
3297  // Compute the set of forwarded registers. The rest are scratch.
3298  SmallVectorImpl<ForwardedRegister> &Forwards =
3299  FuncInfo->getForwardedMustTailRegParms();
3300  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3301  CC_AArch64_AAPCS);
3302 
3303  // Conservatively forward X8, since it might be used for aggregate return.
3304  if (!CCInfo.isAllocated(AArch64::X8)) {
3305  unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3306  Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3307  }
3308  }
3309  }
3310 
3311  // On Windows, InReg pointers must be returned, so record the pointer in a
3312  // virtual register at the start of the function so it can be returned in the
3313  // epilogue.
3314  if (IsWin64) {
3315  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3316  if (Ins[I].Flags.isInReg()) {
3317  assert(!FuncInfo->getSRetReturnReg());
3318 
3319  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3320  Register Reg =
3321  MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3322  FuncInfo->setSRetReturnReg(Reg);
3323 
3324  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3325  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3326  break;
3327  }
3328  }
3329  }
3330 
3331  unsigned StackArgSize = CCInfo.getNextStackOffset();
3332  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3333  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3334  // This is a non-standard ABI so by fiat I say we're allowed to make full
3335  // use of the stack area to be popped, which must be aligned to 16 bytes in
3336  // any case:
3337  StackArgSize = alignTo(StackArgSize, 16);
3338 
3339  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3340  // a multiple of 16.
3341  FuncInfo->setArgumentStackToRestore(StackArgSize);
3342 
3343  // This realignment carries over to the available bytes below. Our own
3344  // callers will guarantee the space is free by giving an aligned value to
3345  // CALLSEQ_START.
3346  }
3347  // Even if we're not expected to free up the space, it's useful to know how
3348  // much is there while considering tail calls (because we can reuse it).
3349  FuncInfo->setBytesInStackArgArea(StackArgSize);
3350 
3351  if (Subtarget->hasCustomCallingConv())
3352  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3353 
3354  return Chain;
3355 }
3356 
3357 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3358  SelectionDAG &DAG,
3359  const SDLoc &DL,
3360  SDValue &Chain) const {
3361  MachineFunction &MF = DAG.getMachineFunction();
3362  MachineFrameInfo &MFI = MF.getFrameInfo();
3363  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3364  auto PtrVT = getPointerTy(DAG.getDataLayout());
3365  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3366 
3367  SmallVector<SDValue, 8> MemOps;
3368 
3369  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3370  AArch64::X3, AArch64::X4, AArch64::X5,
3371  AArch64::X6, AArch64::X7 };
3372  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3373  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3374 
3375  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3376  int GPRIdx = 0;
3377  if (GPRSaveSize != 0) {
3378  if (IsWin64) {
3379  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3380  if (GPRSaveSize & 15)
3381  // The extra size here, if triggered, will always be 8.
3382  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3383  } else
3384  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3385 
3386  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3387 
3388  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3389  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3390  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3391  SDValue Store = DAG.getStore(
3392  Val.getValue(1), DL, Val, FIN,
3393  IsWin64
3394  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3395  GPRIdx,
3396  (i - FirstVariadicGPR) * 8)
3397  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3398  MemOps.push_back(Store);
3399  FIN =
3400  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3401  }
3402  }
3403  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3404  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3405 
3406  if (Subtarget->hasFPARMv8() && !IsWin64) {
3407  static const MCPhysReg FPRArgRegs[] = {
3408  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3409  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3410  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3411  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3412 
3413  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3414  int FPRIdx = 0;
3415  if (FPRSaveSize != 0) {
3416  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3417 
3418  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3419 
3420  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3421  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3422  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3423 
3424  SDValue Store = DAG.getStore(
3425  Val.getValue(1), DL, Val, FIN,
3426  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3427  MemOps.push_back(Store);
3428  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3429  DAG.getConstant(16, DL, PtrVT));
3430  }
3431  }
3432  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3433  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3434  }
3435 
3436  if (!MemOps.empty()) {
3437  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3438  }
3439 }
3440 
3441 /// LowerCallResult - Lower the result values of a call into the
3442 /// appropriate copies out of appropriate physical registers.
3443 SDValue AArch64TargetLowering::LowerCallResult(
3444  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3445  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3446  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3447  SDValue ThisVal) const {
3448  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3449  ? RetCC_AArch64_WebKit_JS
3450  : RetCC_AArch64_AAPCS;
3451  // Assign locations to each value returned by this call.
3452  SmallVector<CCValAssign, 16> RVLocs;
3453  DenseMap<unsigned, SDValue> CopiedRegs;
3454  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3455  *DAG.getContext());
3456  CCInfo.AnalyzeCallResult(Ins, RetCC);
3457 
3458  // Copy all of the result registers out of their specified physreg.
3459  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3460  CCValAssign VA = RVLocs[i];
3461 
3462  // Pass 'this' value directly from the argument to return value, to avoid
3463  // reg unit interference
3464  if (i == 0 && isThisReturn) {
3465  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3466  "unexpected return calling convention register assignment");
3467  InVals.push_back(ThisVal);
3468  continue;
3469  }
3470 
3471  // Avoid copying a physreg twice since RegAllocFast is incompetent and only
3472  // allows one use of a physreg per block.
3473  SDValue Val = CopiedRegs.lookup(VA.getLocReg());
3474  if (!Val) {
3475  Val =
3476  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3477  Chain = Val.getValue(1);
3478  InFlag = Val.getValue(2);
3479  CopiedRegs[VA.getLocReg()] = Val;
3480  }
3481 
3482  switch (VA.getLocInfo()) {
3483  default:
3484  llvm_unreachable("Unknown loc info!");
3485  case CCValAssign::Full:
3486  break;
3487  case CCValAssign::BCvt:
3488  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3489  break;
3490  case CCValAssign::AExtUpper:
3491  Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
3492  DAG.getConstant(32, DL, VA.getLocVT()));
3493  LLVM_FALLTHROUGH;
3494  case CCValAssign::AExt:
3495  LLVM_FALLTHROUGH;
3496  case CCValAssign::ZExt:
3497  Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
3498  break;
3499  }
3500 
3501  InVals.push_back(Val);
3502  }
3503 
3504  return Chain;
3505 }
3506 
3507 /// Return true if the calling convention is one that we can guarantee TCO for.
3509  return CC == CallingConv::Fast;
3510 }
3511 
3512 /// Return true if we might ever do TCO for calls with this calling convention.
3514  switch (CC) {
3515  case CallingConv::C:
3517  case CallingConv::Swift:
3518  return true;
3519  default:
3520  return canGuaranteeTCO(CC);
3521  }
3522 }
3523 
3524 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3525  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3526  const SmallVectorImpl<ISD::OutputArg> &Outs,
3527  const SmallVectorImpl<SDValue> &OutVals,
3528  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3529  if (!mayTailCallThisCC(CalleeCC))
3530  return false;
3531 
3532  MachineFunction &MF = DAG.getMachineFunction();
3533  const Function &CallerF = MF.getFunction();
3534  CallingConv::ID CallerCC = CallerF.getCallingConv();
3535  bool CCMatch = CallerCC == CalleeCC;
3536 
3537  // Byval parameters hand the function a pointer directly into the stack area
3538  // we want to reuse during a tail call. Working around this *is* possible (see
3539  // X86) but less efficient and uglier in LowerCall.
3540  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3541  e = CallerF.arg_end();
3542  i != e; ++i) {
3543  if (i->hasByValAttr())
3544  return false;
3545 
3546  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3547  // In this case, it is necessary to save/restore X0 in the callee. Tail
3548  // call opt interferes with this. So we disable tail call opt when the
3549  // caller has an argument with "inreg" attribute.
3550 
3551  // FIXME: Check whether the callee also has an "inreg" argument.
3552  if (i->hasInRegAttr())
3553  return false;
3554  }
3555 
3556  if (MF.getTarget().Options.GuaranteedTailCallOpt)
3557  return canGuaranteeTCO(CalleeCC) && CCMatch;
3558 
3559  // Externally-defined functions with weak linkage should not be
3560  // tail-called on AArch64 when the OS does not support dynamic
3561  // pre-emption of symbols, as the AAELF spec requires normal calls
3562  // to undefined weak functions to be replaced with a NOP or jump to the
3563  // next instruction. The behaviour of branch instructions in this
3564  // situation (as used for tail calls) is implementation-defined, so we
3565  // cannot rely on the linker replacing the tail call with a return.
3566  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3567  const GlobalValue *GV = G->getGlobal();
3568  const Triple &TT = getTargetMachine().getTargetTriple();
3569  if (GV->hasExternalWeakLinkage() &&
3570  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3571  return false;
3572  }
3573 
3574  // Now we search for cases where we can use a tail call without changing the
3575  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3576  // concept.
3577 
3578  // I want anyone implementing a new calling convention to think long and hard
3579  // about this assert.
3580  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3581  "Unexpected variadic calling convention");
3582 
3583  LLVMContext &C = *DAG.getContext();
3584  if (isVarArg && !Outs.empty()) {
3585  // At least two cases here: if caller is fastcc then we can't have any
3586  // memory arguments (we'd be expected to clean up the stack afterwards). If
3587  // caller is C then we could potentially use its argument area.
3588 
3589  // FIXME: for now we take the most conservative of these in both cases:
3590  // disallow all variadic memory operands.
3591  SmallVector<CCValAssign, 16> ArgLocs;
3592  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3593 
3594  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3595  for (const CCValAssign &ArgLoc : ArgLocs)
3596  if (!ArgLoc.isRegLoc())
3597  return false;
3598  }
3599 
3600  // Check that the call results are passed in the same way.
3601  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3602  CCAssignFnForCall(CalleeCC, isVarArg),
3603  CCAssignFnForCall(CallerCC, isVarArg)))
3604  return false;
3605  // The callee has to preserve all registers the caller needs to preserve.
3606  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3607  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3608  if (!CCMatch) {
3609  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3610  if (Subtarget->hasCustomCallingConv()) {
3611  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3612  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3613  }
3614  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3615  return false;
3616  }
3617 
3618  // Nothing more to check if the callee is taking no arguments
3619  if (Outs.empty())
3620  return true;
3621 
3622  SmallVector<CCValAssign, 16> ArgLocs;
3623  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3624 
3625  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3626 
3627  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3628 
3629  // If the stack arguments for this call do not fit into our own save area then
3630  // the call cannot be made tail.
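  // e.g. a caller that itself received 16 bytes of stack arguments cannot
  // tail-call a callee needing 32 bytes of stack arguments, since the extra
  // 16 bytes have nowhere to live once the caller's frame is reused.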
3631  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3632  return false;
3633 
3634  const MachineRegisterInfo &MRI = MF.getRegInfo();
3635  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3636  return false;
3637 
3638  return true;
3639 }
3640 
3641 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3642  SelectionDAG &DAG,
3643  MachineFrameInfo &MFI,
3644  int ClobberedFI) const {
3645  SmallVector<SDValue, 8> ArgChains;
3646  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3647  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3648 
3649  // Include the original chain at the beginning of the list. When this is
3650  // used by target LowerCall hooks, this helps legalize find the
3651  // CALLSEQ_BEGIN node.
3652  ArgChains.push_back(Chain);
3653 
3654  // Add a chain value for each stack-argument load that overlaps the clobbered object.
3655  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3656  UE = DAG.getEntryNode().getNode()->use_end();
3657  U != UE; ++U)
3658  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3659  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3660  if (FI->getIndex() < 0) {
3661  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3662  int64_t InLastByte = InFirstByte;
3663  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3664 
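          // The intervals [FirstByte, LastByte] and [InFirstByte, InLastByte]
          // overlap exactly when one of them contains the other's first byte.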
3665  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3666  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3667  ArgChains.push_back(SDValue(L, 1));
3668  }
3669 
3670  // Build a tokenfactor for all the chains.
3671  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3672 }
3673 
3674 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3675  bool TailCallOpt) const {
3676  return CallCC == CallingConv::Fast && TailCallOpt;
3677 }
3678 
3679 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3680 /// and add input and output parameter nodes.
3681 SDValue
3682 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3683  SmallVectorImpl<SDValue> &InVals) const {
3684  SelectionDAG &DAG = CLI.DAG;
3685  SDLoc &DL = CLI.DL;
3686  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3687  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3688  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3689  SDValue Chain = CLI.Chain;
3690  SDValue Callee = CLI.Callee;
3691  bool &IsTailCall = CLI.IsTailCall;
3692  CallingConv::ID CallConv = CLI.CallConv;
3693  bool IsVarArg = CLI.IsVarArg;
3694 
3695  MachineFunction &MF = DAG.getMachineFunction();
3696  bool IsThisReturn = false;
3697 
3698  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3699  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3700  bool IsSibCall = false;
3701 
3702  if (IsTailCall) {
3703  // Check if it's really possible to do a tail call.
3704  IsTailCall = isEligibleForTailCallOptimization(
3705  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3706  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3707  report_fatal_error("failed to perform tail call elimination on a call "
3708  "site marked musttail");
3709 
3710  // A sibling call is one where we're under the usual C ABI and not planning
3711  // to change that but can still do a tail call:
3712  if (!TailCallOpt && IsTailCall)
3713  IsSibCall = true;
3714 
3715  if (IsTailCall)
3716  ++NumTailCalls;
3717  }
3718 
3719  // Analyze operands of the call, assigning locations to each operand.
3720  SmallVector<CCValAssign, 16> ArgLocs;
3721  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3722  *DAG.getContext());
3723 
3724  if (IsVarArg) {
3725  // Handle fixed and variable vector arguments differently.
3726  // Variable vector arguments always go into memory.
3727  unsigned NumArgs = Outs.size();
3728 
3729  for (unsigned i = 0; i != NumArgs; ++i) {
3730  MVT ArgVT = Outs[i].VT;
3731  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3732  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3733  /*IsVarArg=*/ !Outs[i].IsFixed);
3734  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3735  assert(!Res && "Call operand has unhandled type");
3736  (void)Res;
3737  }
3738  } else {
3739  // At this point, Outs[].VT may already be promoted to i32. To correctly
3740  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3741  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3742  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3743  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3744  // LocVT.
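      // e.g. an i8 argument shows up here with Outs[i].VT == i32; recovering
      // the original IR type below restores ValVT to i8, so conventions that
      // pack small stack arguments (e.g. Darwin's) can give it a 1-byte slot.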
3745  unsigned NumArgs = Outs.size();
3746  for (unsigned i = 0; i != NumArgs; ++i) {
3747  MVT ValVT = Outs[i].VT;
3748  // Get type of the original argument.
3749  EVT ActualVT = getValueType(DAG.getDataLayout(),
3750  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3751  /*AllowUnknown*/ true);
3752  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3753  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3754  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3755  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3756  ValVT = MVT::i8;
3757  else if (ActualMVT == MVT::i16)
3758  ValVT = MVT::i16;
3759 
3760  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3761  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3762  assert(!Res && "Call operand has unhandled type");
3763  (void)Res;
3764  }
3765  }
3766 
3767  // Get a count of how many bytes are to be pushed on the stack.
3768  unsigned NumBytes = CCInfo.getNextStackOffset();
3769 
3770  if (IsSibCall) {
3771  // Since we're not changing the ABI to make this a tail call, the memory
3772  // operands are already available in the caller's incoming argument space.
3773  NumBytes = 0;
3774  }
3775 
3776  // FPDiff is the byte offset of the call's argument area from the callee's.
3777  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3778  // by this amount for a tail call. In a sibling call it must be 0 because the
3779  // caller will deallocate the entire stack and the callee still expects its
3780  // arguments to begin at SP+0. Completely unused for non-tail calls.
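  // For example, if the caller's incoming argument area holds 32 bytes but the
  // callee needs 48 bytes of stack arguments (after 16-byte rounding), FPDiff
  // works out below to 32 - 48 = -16.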
3781  int FPDiff = 0;
3782 
3783  if (IsTailCall && !IsSibCall) {
3784  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3785 
3786  // Since callee will pop argument stack as a tail call, we must keep the
3787  // popped size 16-byte aligned.
3788  NumBytes = alignTo(NumBytes, 16);
3789 
3790  // FPDiff will be negative if this tail call requires more space than we
3791  // would automatically have in our incoming argument space. Positive if we
3792  // can actually shrink the stack.
3793  FPDiff = NumReusableBytes - NumBytes;
3794 
3795  // The stack pointer must be 16-byte aligned at all times it's used for a
3796  // memory operation, which in practice means at *all* times and in
3797  // particular across call boundaries. Therefore our own arguments started at
3798  // a 16-byte aligned SP and the delta applied for the tail call should
3799  // satisfy the same constraint.
3800  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3801  }
3802 
3803  // Adjust the stack pointer for the new arguments...
3804  // These operations are automatically eliminated by the prolog/epilog pass
3805  if (!IsSibCall)
3806  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3807 
3808  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3809  getPointerTy(DAG.getDataLayout()));
3810 
3811  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3812  SmallSet<unsigned, 8> RegsUsed;
3813  SmallVector<SDValue, 8> MemOpChains;
3814  auto PtrVT = getPointerTy(DAG.getDataLayout());
3815 
3816  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3817  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3818  for (const auto &F : Forwards) {
3819  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3820  RegsToPass.emplace_back(F.PReg, Val);
3821  }
3822  }
3823 
3824  // Walk the register/memloc assignments, inserting copies/loads.
3825  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3826  ++i, ++realArgIdx) {
3827  CCValAssign &VA = ArgLocs[i];
3828  SDValue Arg = OutVals[realArgIdx];
3829  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3830 
3831  // Promote the value if needed.
3832  switch (VA.getLocInfo()) {
3833  default:
3834  llvm_unreachable("Unknown loc info!");
3835  case CCValAssign::Full:
3836  break;
3837  case CCValAssign::SExt:
3838  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3839  break;
3840  case CCValAssign::ZExt:
3841  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3842  break;
3843  case CCValAssign::AExt:
3844  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3845  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3846  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3847  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3848  }
3849  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3850  break;
3851  case CCValAssign::AExtUpper:
3852  assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
3853  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3854  Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
3855  DAG.getConstant(32, DL, VA.getLocVT()));
3856  break;
3857  case CCValAssign::BCvt:
3858  Arg = DAG.getBitcast(VA.getLocVT(), Arg);
3859  break;
3860  case CCValAssign::Trunc:
3861  Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
3862  break;
3863  case CCValAssign::FPExt:
3864  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3865  break;
3866  case CCValAssign::Indirect:
3867  assert(VA.getValVT().isScalableVector() &&
3868  "Only scalable vectors can be passed indirectly");
3869  llvm_unreachable("Spilling of SVE vectors not yet implemented");
3870  }
3871 
3872  if (VA.isRegLoc()) {
3873  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3874  Outs[0].VT == MVT::i64) {
3875  assert(VA.getLocVT() == MVT::i64 &&
3876  "unexpected calling convention register assignment");
3877  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3878  "unexpected use of 'returned'");
3879  IsThisReturn = true;
3880  }
3881  if (RegsUsed.count(VA.getLocReg())) {
3882  // If this register has already been used then we're trying to pack
3883  // parts of an [N x i32] into an X-register. The extension type will
3884  // take care of putting the two halves in the right place but we have to
3885  // combine them.
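        // e.g. the two i32 halves of a [2 x i32] argument are assigned to the
        // same X register; the second half reaches this path and is OR'd in.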
3886  SDValue &Bits =
3887  std::find_if(RegsToPass.begin(), RegsToPass.end(),
3888  [=](const std::pair<unsigned, SDValue> &Elt) {
3889  return Elt.first == VA.getLocReg();
3890  })
3891  ->second;
3892  Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
3893  } else {
3894  RegsToPass.emplace_back(VA.getLocReg(), Arg);
3895  RegsUsed.insert(VA.getLocReg());
3896  }
3897  } else {
3898  assert(VA.isMemLoc());
3899 
3900  SDValue DstAddr;
3901  MachinePointerInfo DstInfo;
3902 
3903  // FIXME: This works on big-endian for composite byvals, which are the
3904  // common case. It should also work for fundamental types too.
3905  uint32_t BEAlign = 0;
3906  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3907  : VA.getValVT().getSizeInBits();
3908  OpSize = (OpSize + 7) / 8;
3909  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3910  !Flags.isInConsecutiveRegs()) {
3911  if (OpSize < 8)
3912  BEAlign = 8 - OpSize;
3913  }
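        // e.g. a 1-byte argument on big-endian gets BEAlign = 7, so the store
        // below lands in the last byte of its 8-byte stack slot.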
3914  unsigned LocMemOffset = VA.getLocMemOffset();
3915  int32_t Offset = LocMemOffset + BEAlign;
3916  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3917  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3918 
3919  if (IsTailCall) {
3920  Offset = Offset + FPDiff;
3921  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3922 
3923  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3924  DstInfo =
3925  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3926 
3927  // Make sure any stack arguments overlapping with where we're storing
3928  // are loaded before this eventual operation. Otherwise they'll be
3929  // clobbered.
3930  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3931  } else {
3932  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3933 
3934  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3935  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3936  LocMemOffset);
3937  }
3938 
3939  if (Outs[i].Flags.isByVal()) {
3940  SDValue SizeNode =
3941  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3942  SDValue Cpy = DAG.getMemcpy(
3943  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3944  /*isVol = */ false, /*AlwaysInline = */ false,
3945  /*isTailCall = */ false,
3946  DstInfo, MachinePointerInfo());
3947 
3948  MemOpChains.push_back(Cpy);
3949  } else {
3950  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3951  // promoted to a legal register type i32, we should truncate Arg back to
3952  // i1/i8/i16.
3953  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3954  VA.getValVT() == MVT::i16)
3955  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3956 
3957  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3958  MemOpChains.push_back(Store);
3959  }
3960  }
3961  }
3962 
3963  if (!MemOpChains.empty())
3964  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3965 
3966  // Build a sequence of copy-to-reg nodes chained together with token chain
3967  // and flag operands which copy the outgoing args into the appropriate regs.
3968  SDValue InFlag;
3969  for (auto &RegToPass : RegsToPass) {
3970  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3971  RegToPass.second, InFlag);
3972  InFlag = Chain.getValue(1);
3973  }
3974 
3975  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3976  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3977  // node so that legalize doesn't hack it.
3978  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3979  auto GV = G->getGlobal();
3980  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3981  AArch64II::MO_GOT) {
3982  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3983  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3984  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3985  assert(Subtarget->isTargetWindows() &&
3986  "Windows is the only supported COFF target");
3987  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3988  } else {
3989  const GlobalValue *GV = G->getGlobal();
3990  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3991  }
3992  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3993  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3994  Subtarget->isTargetMachO()) {
3995  const char *Sym = S->getSymbol();
3996  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3997  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3998  } else {
3999  const char *Sym = S->getSymbol();
4000  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
4001  }
4002  }
4003 
4004  // We don't usually want to end the call-sequence here because we would tidy
4005  // the frame up *after* the call, however in the ABI-changing tail-call case
4006  // we've carefully laid out the parameters so that when sp is reset they'll be
4007  // in the correct location.
4008  if (IsTailCall && !IsSibCall) {
4009  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
4010  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
4011  InFlag = Chain.getValue(1);
4012  }
4013 
4014  std::vector<SDValue> Ops;
4015  Ops.push_back(Chain);
4016  Ops.push_back(Callee);
4017 
4018  if (IsTailCall) {
4019  // Each tail call may have to adjust the stack by a different amount, so
4020  // this information must travel along with the operation for eventual
4021  // consumption by emitEpilogue.
4022  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4023  }
4024 
4025  // Add argument registers to the end of the list so that they are known live
4026  // into the call.
4027  for (auto &RegToPass : RegsToPass)
4028  Ops.push_back(DAG.getRegister(RegToPass.first,
4029  RegToPass.second.getValueType()));
4030 
4031  // Check callee args/returns for SVE registers and set calling convention
4032  // accordingly.
4033  if (CallConv == CallingConv::C) {
4034  bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
4035  return Out.VT.isScalableVector();
4036  });
4037  bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
4038  return In.VT.isScalableVector();
4039  });
4040 
4041  if (CalleeInSVE || CalleeOutSVE)
4042  CallConv = CallingConv::AArch64_SVE_VectorCall;
4043  }
4044 
4045  // Add a register mask operand representing the call-preserved registers.
4046  const uint32_t *Mask;
4047  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4048  if (IsThisReturn) {
4049  // For 'this' returns, use the X0-preserving mask if applicable
4050  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
4051  if (!Mask) {
4052  IsThisReturn = false;
4053  Mask = TRI->getCallPreservedMask(MF, CallConv);
4054  }
4055  } else
4056  Mask = TRI->getCallPreservedMask(MF, CallConv);
4057 
4058  if (Subtarget->hasCustomCallingConv())
4059  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
4060 
4061  if (TRI->isAnyArgRegReserved(MF))
4062  TRI->emitReservedArgRegCallError(MF);
4063 
4064  assert(Mask && "Missing call preserved mask for calling convention");
4065  Ops.push_back(DAG.getRegisterMask(Mask));
4066 
4067  if (InFlag.getNode())