1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/PatternMatch.h"
63 #include "llvm/IR/Type.h"
64 #include "llvm/IR/Use.h"
65 #include "llvm/IR/Value.h"
66 #include "llvm/MC/MCRegisterInfo.h"
67 #include "llvm/Support/Casting.h"
68 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/Compiler.h"
71 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/KnownBits.h"
79 #include <algorithm>
80 #include <bitset>
81 #include <cassert>
82 #include <cctype>
83 #include <cstdint>
84 #include <cstdlib>
85 #include <iterator>
86 #include <limits>
87 #include <tuple>
88 #include <utility>
89 #include <vector>
90 
91 using namespace llvm;
92 using namespace llvm::PatternMatch;
93 
94 #define DEBUG_TYPE "aarch64-lower"
95 
96 STATISTIC(NumTailCalls, "Number of tail calls");
97 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
99 
100 static cl::opt<bool>
101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
102  cl::desc("Allow AArch64 SLI/SRI formation"),
103  cl::init(false));
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
108 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 /// Value type used for condition codes.
120 static const MVT MVT_CC = MVT::i32;
121 
122 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
123  const AArch64Subtarget &STI)
124  : TargetLowering(TM), Subtarget(&STI) {
125  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
126  // we have to make something up. Arbitrarily, choose ZeroOrOne.
128  // When comparing vectors the result sets the different elements in the
129  // vector to all-one or all-zero.
131 
132  // Set up the register classes.
133  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
134  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
135 
136  if (Subtarget->hasFPARMv8()) {
137  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
138  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
139  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
140  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
141  }
142 
143  if (Subtarget->hasNEON()) {
144  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
145  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
146  // Someone set us up the NEON.
147  addDRTypeForNEON(MVT::v2f32);
148  addDRTypeForNEON(MVT::v8i8);
149  addDRTypeForNEON(MVT::v4i16);
150  addDRTypeForNEON(MVT::v2i32);
151  addDRTypeForNEON(MVT::v1i64);
152  addDRTypeForNEON(MVT::v1f64);
153  addDRTypeForNEON(MVT::v4f16);
154 
155  addQRTypeForNEON(MVT::v4f32);
156  addQRTypeForNEON(MVT::v2f64);
157  addQRTypeForNEON(MVT::v16i8);
158  addQRTypeForNEON(MVT::v8i16);
159  addQRTypeForNEON(MVT::v4i32);
160  addQRTypeForNEON(MVT::v2i64);
161  addQRTypeForNEON(MVT::v8f16);
162  }
163 
164  // Compute derived properties from the register classes
166 
167  // Provide all sorts of operation actions
195 
199 
203 
205 
206  // Custom lowering hooks are needed for XOR
207  // to fold it into CSINC/CSINV.
210 
211  // Virtually no operation on f128 is legal, but LLVM can't expand them when
212  // there's a valid register class, so we need custom operations in most cases.
234 
235  // Lowering for many of the conversions is actually specified by the non-f128
236  // type. The LowerXXX function will be trivial when f128 isn't involved.
251 
252  // Variable arguments.
257 
258  // Variable-sized objects.
261 
262  if (Subtarget->isTargetWindows())
263  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
264  else
265  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
266 
267  // Constant pool entries
269 
270  // BlockAddress
272 
273  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
282 
283  // AArch64 lacks both left-rotate and popcount instructions.
286  for (MVT VT : MVT::vector_valuetypes()) {
289  }
290 
291  // AArch64 doesn't have {U|S}MUL_LOHI.
294 
297 
300  for (MVT VT : MVT::vector_valuetypes()) {
303  }
310 
311  // Custom lower Add/Sub/Mul with overflow.
324 
333  if (Subtarget->hasFullFP16())
334  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
335  else
336  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
337 
371 
372  if (!Subtarget->hasFullFP16()) {
395 
396  // promote v4f16 to v4f32 when that is known to be safe.
409 
425 
446  }
447 
448  // AArch64 has implementations of a lot of rounding-like FP operations.
449  for (MVT Ty : {MVT::f32, MVT::f64}) {
462  }
463 
464  if (Subtarget->hasFullFP16()) {
475  }
476 
478 
480 
486 
487  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
488  // This requires the Performance Monitors extension.
489  if (Subtarget->hasPerfMon())
490  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
491 
492  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
493  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
494  // Issue __sincos_stret if available.
495  setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
496  setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
497  } else {
498  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
499  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
500  }
501 
502  // Make floating-point constants legal for the large code model, so they don't
503  // become loads from the constant pool.
504  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
505  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
506  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
507  }
508 
509  // AArch64 does not have floating-point extending loads, i1 sign-extending
510  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
511  for (MVT VT : MVT::fp_valuetypes()) {
512  setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
513  setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
514  setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
515  setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
516  }
517  for (MVT VT : MVT::integer_valuetypes())
518  setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
519 
527 
530 
531  // Indexed loads and stores are supported.
532  for (unsigned im = (unsigned)ISD::PRE_INC;
533  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
548  }
549 
550  // Trap.
552 
553  // We combine OR nodes for bitfield operations.
555  // Try to create BICs for vector ANDs.
557 
558  // Vector add and sub nodes may conceal a high-half opportunity.
559  // Also, try to fold ADD into CSINC/CSINV..
566 
570 
572 
579  if (Subtarget->supportsAddressTopByteIgnored())
580  setTargetDAGCombine(ISD::LOAD);
581 
583 
586 
590 
592 
593  // In case of strict alignment, avoid an excessive number of byte wide stores.
597 
602 
604 
606 
608 
609  EnableExtLdPromotion = true;
610 
611  // Set required alignment.
613  // Set preferred alignments.
616 
617  // Only change the limit for entries in a jump table if specified by
618  // the sub target, but not at the command line.
619  unsigned MaxJT = STI.getMaximumJumpTableSize();
620  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
621  setMaximumJumpTableSize(MaxJT);
622 
623  setHasExtractBitsInsn(true);
624 
626 
627  if (Subtarget->hasNEON()) {
628  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
629  // silliness like this:
655 
661 
663 
664  // AArch64 doesn't have a direct vector ->f32 conversion instructions for
665  // elements smaller than i32, so promote the input to i32 first.
668  // i8 vector elements also need promotion to i32 for v8i8
671  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
676  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
677  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
680 
681  if (Subtarget->hasFullFP16()) {
686  } else {
687  // when AArch64 doesn't have fullfp16 support, promote the input
688  // to i32 first.
693  }
694 
697 
698  // AArch64 doesn't have MUL.2d:
700  // Custom handling for some quad-vector types to detect MULL.
704 
705  // Vector reductions
706  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
713  }
714  for (MVT VT : { MVT::v4f16, MVT::v2f32,
718  }
719 
722  // Likewise, narrowing and extending vector loads/stores aren't handled
723  // directly.
724  for (MVT VT : MVT::vector_valuetypes()) {
726 
727  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
730  } else {
733  }
736 
739 
740  for (MVT InnerVT : MVT::vector_valuetypes()) {
741  setTruncStoreAction(VT, InnerVT, Expand);
742  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
743  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
744  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
745  }
746  }
747 
748  // AArch64 has implementations of a lot of rounding-like FP operations.
749  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
756  }
757 
758  if (Subtarget->hasFullFP16()) {
759  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
766  }
767  }
768 
770  }
771 
773 }
774 
775 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
776  assert(VT.isVector() && "VT should be a vector type");
777 
778  if (VT.isFloatingPoint()) {
779  MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
780  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
781  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
782  }
783 
784  // Mark vector float intrinsics as expand.
785  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
794 
795  // But we do support custom-lowering for FCOPYSIGN.
797  }
798 
810 
814  for (MVT InnerVT : MVT::all_valuetypes())
815  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
816 
817  // CNT supports only B element sizes, then use UADDLP to widen.
818  if (VT != MVT::v8i8 && VT != MVT::v16i8)
820 
826 
829 
830  if (!VT.isFloatingPoint())
832 
833  // [SU][MIN|MAX] are available for all NEON types apart from i64.
834  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
835  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
836  setOperationAction(Opcode, VT, Legal);
837 
838  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
839  if (VT.isFloatingPoint() &&
840  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
841  for (unsigned Opcode :
842  {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
843  setOperationAction(Opcode, VT, Legal);
844 
845  if (Subtarget->isLittleEndian()) {
846  for (unsigned im = (unsigned)ISD::PRE_INC;
847  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
848  setIndexedLoadAction(im, VT, Legal);
849  setIndexedStoreAction(im, VT, Legal);
850  }
851  }
852 }
853 
854 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
855  addRegisterClass(VT, &AArch64::FPR64RegClass);
856  addTypeForNEON(VT, MVT::v2i32);
857 }
858 
859 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
860  addRegisterClass(VT, &AArch64::FPR128RegClass);
861  addTypeForNEON(VT, MVT::v4i32);
862 }
863 
864 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
865  EVT VT) const {
866  if (!VT.isVector())
867  return MVT::i32;
868  return VT.changeVectorElementTypeToInteger();
869 }
870 
871 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
872  const APInt &Demanded,
873  TargetLowering::TargetLoweringOpt &TLO,
874  unsigned NewOpc) {
875  uint64_t OldImm = Imm, NewImm, Enc;
876  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
877 
878  // Return if the immediate is already all zeros, all ones, a bimm32 or a
879  // bimm64.
880  if (Imm == 0 || Imm == Mask ||
881  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
882  return false;
883 
884  unsigned EltSize = Size;
885  uint64_t DemandedBits = Demanded.getZExtValue();
886 
887  // Clear bits that are not demanded.
888  Imm &= DemandedBits;
889 
890  while (true) {
891  // The goal here is to set the non-demanded bits in a way that minimizes
892  // the number of transitions between 0 and 1. In order to achieve this goal,
893  // we set the non-demanded bits to the value of the preceding demanded bits.
894  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
895  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
896  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
897  // The final result is 0b11000011.
898  uint64_t NonDemandedBits = ~DemandedBits;
899  uint64_t InvertedImm = ~Imm & DemandedBits;
900  uint64_t RotatedImm =
901  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
902  NonDemandedBits;
903  uint64_t Sum = RotatedImm + NonDemandedBits;
904  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
905  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
906  NewImm = (Imm | Ones) & Mask;
907 
908  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
909  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
910  // we halve the element size and continue the search.
911  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
912  break;
913 
914  // We cannot shrink the element size any further if it is 2-bits.
915  if (EltSize == 2)
916  return false;
917 
918  EltSize /= 2;
919  Mask >>= EltSize;
920  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
921 
922  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
923  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
924  return false;
925 
926  // Merge the upper and lower halves of Imm and DemandedBits.
927  Imm |= Hi;
928  DemandedBits |= DemandedBitsHi;
929  }
930 
931  ++NumOptimizedImms;
932 
933  // Replicate the element across the register width.
934  while (EltSize < Size) {
935  NewImm |= NewImm << EltSize;
936  EltSize *= 2;
937  }
938 
939  (void)OldImm;
940  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
941  "demanded bits should never be altered");
942  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
943 
944  // Create the new constant immediate node.
945  EVT VT = Op.getValueType();
946  SDLoc DL(Op);
947  SDValue New;
948 
949  // If the new constant immediate is all-zeros or all-ones, let the target
950  // independent DAG combine optimize this node.
951  if (NewImm == 0 || NewImm == OrigMask) {
952  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
953  TLO.DAG.getConstant(NewImm, DL, VT));
954  // Otherwise, create a machine node so that target independent DAG combine
955  // doesn't undo this optimization.
956  } else {
957  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
958  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
959  New = SDValue(
960  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
961  }
962 
963  return TLO.CombineTo(Op, New);
964 }
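// For illustration (a made-up input, not taken from the code above): if the
// search settles on an 8-bit element size with NewImm == 0x3c, the loop above
// widens it to 0x3c3c and then to 0x3c3c3c3c for a 32-bit operation. A byte
// pattern replicated across the register, such as 0x3c3c3c3c, is exactly what
// the AArch64 logical-immediate encoding can represent, so the rewritten
// AND/ORR/EOR still needs no extra instructions to materialize its constant.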
965 
966 bool AArch64TargetLowering::targetShrinkDemandedConstant(
967  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
968  // Delay this optimization to as late as possible.
969  if (!TLO.LegalOps)
970  return false;
971 
972  if (!EnableOptimizeLogicalImm)
973  return false;
974 
975  EVT VT = Op.getValueType();
976  if (VT.isVector())
977  return false;
978 
979  unsigned Size = VT.getSizeInBits();
980  assert((Size == 32 || Size == 64) &&
981  "i32 or i64 is expected after legalization.");
982 
983  // Exit early if we demand all bits.
984  if (Demanded.countPopulation() == Size)
985  return false;
986 
987  unsigned NewOpc;
988  switch (Op.getOpcode()) {
989  default:
990  return false;
991  case ISD::AND:
992  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
993  break;
994  case ISD::OR:
995  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
996  break;
997  case ISD::XOR:
998  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
999  break;
1000  }
1001  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1002  if (!C)
1003  return false;
1004  uint64_t Imm = C->getZExtValue();
1005  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1006 }
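// Rough sketch of the effect on generated code (hypothetical mask values):
// an AND whose mask is not a valid logical immediate would otherwise be
// selected as
//   mov  w8, #<lo16 of mask>
//   movk w8, #<hi16 of mask>, lsl #16
//   and  w0, w0, w8
// but if the demanded bits allow an encodable mask with the same effect, the
// node is rewritten above into a single "and w0, w0, #<encodable mask>",
// using a machine node so later DAG combines cannot undo it.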
1007 
1008 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1009 /// Mask are known to be either zero or one and return them Known.
1010 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1011  const SDValue Op, KnownBits &Known,
1012  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1013  switch (Op.getOpcode()) {
1014  default:
1015  break;
1016  case AArch64ISD::CSEL: {
1017  KnownBits Known2;
1018  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1019  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1020  Known.Zero &= Known2.Zero;
1021  Known.One &= Known2.One;
1022  break;
1023  }
1024  case ISD::INTRINSIC_W_CHAIN: {
1025  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1026  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1027  switch (IntID) {
1028  default: return;
1029  case Intrinsic::aarch64_ldaxr:
1030  case Intrinsic::aarch64_ldxr: {
1031  unsigned BitWidth = Known.getBitWidth();
1032  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1033  unsigned MemBits = VT.getScalarSizeInBits();
1034  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1035  return;
1036  }
1037  }
1038  break;
1039  }
1040  case ISD::INTRINSIC_WO_CHAIN:
1041  case ISD::INTRINSIC_VOID: {
1042  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1043  switch (IntNo) {
1044  default:
1045  break;
1046  case Intrinsic::aarch64_neon_umaxv:
1047  case Intrinsic::aarch64_neon_uminv: {
1048  // Figure out the datatype of the vector operand. The UMINV instruction
1049  // will zero extend the result, so we can mark as known zero all the
1050  // bits larger than the element datatype. 32-bit or larger doesn't need
1051  // this as those are legal types and will be handled by isel directly.
1052  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1053  unsigned BitWidth = Known.getBitWidth();
1054  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1055  assert(BitWidth >= 8 && "Unexpected width!");
1056  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1057  Known.Zero |= Mask;
1058  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1059  assert(BitWidth >= 16 && "Unexpected width!");
1060  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1061  Known.Zero |= Mask;
1062  }
1063  break;
1064  } break;
1065  }
1066  }
1067  }
1068 }
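// Example of why the ldxr/ldaxr case is useful (hypothetical IR, not from the
// source tree): for
//   %v = call i64 @llvm.aarch64.ldxr.p0i8(i8* %p)
//   %b = and i64 %v, 255
// the memory VT is i8, so every bit above bit 7 is reported as known zero and
// the AND can be folded away instead of surviving into the final code.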
1069 
1070 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1071  EVT) const {
1072  return MVT::i64;
1073 }
1074 
1075 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1076  unsigned AddrSpace,
1077  unsigned Align,
1078  bool *Fast) const {
1079  if (Subtarget->requiresStrictAlign())
1080  return false;
1081 
1082  if (Fast) {
1083  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1084  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1085  // See comments in performSTORECombine() for more details about
1086  // these conditions.
1087 
1088  // Code that uses clang vector extensions can mark that it
1089  // wants unaligned accesses to be treated as fast by
1090  // underspecifying alignment to be 1 or 2.
1091  Align <= 2 ||
1092 
1093  // Disregard v2i64. Memcpy lowering produces those and splitting
1094  // them regresses performance on micro-benchmarks and olden/bh.
1095  VT == MVT::v2i64;
1096  }
1097  return true;
1098 }
1099 
1100 FastISel *
1101 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1102  const TargetLibraryInfo *libInfo) const {
1103  return AArch64::createFastISel(funcInfo, libInfo);
1104 }
1105 
1106 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1107  switch ((AArch64ISD::NodeType)Opcode) {
1108  case AArch64ISD::FIRST_NUMBER: break;
1109  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1110  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1111  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1112  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1113  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1114  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1115  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1116  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1117  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1118  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1119  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1120  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1121  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1122  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1123  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1124  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1125  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1126  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1127  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1128  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1129  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1130  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1131  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1132  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1133  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1134  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1135  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1136  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1137  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1138  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1139  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1140  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1141  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1142  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1143  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1144  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1145  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1146  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1147  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1148  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1149  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1150  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1151  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1152  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1153  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1154  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1155  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1156  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1157  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1158  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1159  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1160  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1161  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1162  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1163  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1164  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1165  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1166  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1167  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1168  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1169  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1170  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1171  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1172  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1173  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1174  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1175  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1176  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1177  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1178  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1179  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1180  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1181  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1182  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1183  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1184  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1185  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1186  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1187  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1188  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1189  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1190  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1191  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1192  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1193  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1194  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1195  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1196  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1197  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1198  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1199  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1200  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1201  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1202  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1203  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1204  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1205  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1206  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1207  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1208  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1209  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1210  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1211  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1212  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1213  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1214  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1215  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1216  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1217  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1218  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1219  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1220  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1221  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1222  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1223  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1224  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1225  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1226  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1227  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1228  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1229  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1230  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1231  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1232  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1233  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1234  }
1235  return nullptr;
1236 }
1237 
1238 MachineBasicBlock *
1239 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1240  MachineBasicBlock *MBB) const {
1241  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1242  // phi node:
1243 
1244  // OrigBB:
1245  // [... previous instrs leading to comparison ...]
1246  // b.ne TrueBB
1247  // b EndBB
1248  // TrueBB:
1249  // ; Fallthrough
1250  // EndBB:
1251  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1252 
1253  MachineFunction *MF = MBB->getParent();
1254  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1255  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1256  DebugLoc DL = MI.getDebugLoc();
1257  MachineFunction::iterator It = ++MBB->getIterator();
1258 
1259  unsigned DestReg = MI.getOperand(0).getReg();
1260  unsigned IfTrueReg = MI.getOperand(1).getReg();
1261  unsigned IfFalseReg = MI.getOperand(2).getReg();
1262  unsigned CondCode = MI.getOperand(3).getImm();
1263  bool NZCVKilled = MI.getOperand(4).isKill();
1264 
1265  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1266  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1267  MF->insert(It, TrueBB);
1268  MF->insert(It, EndBB);
1269 
1270  // Transfer rest of current basic-block to EndBB
1271  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1272  MBB->end());
1273  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1274 
1275  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1276  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1277  MBB->addSuccessor(TrueBB);
1278  MBB->addSuccessor(EndBB);
1279 
1280  // TrueBB falls through to the end.
1281  TrueBB->addSuccessor(EndBB);
1282 
1283  if (!NZCVKilled) {
1284  TrueBB->addLiveIn(AArch64::NZCV);
1285  EndBB->addLiveIn(AArch64::NZCV);
1286  }
1287 
1288  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1289  .addReg(IfTrueReg)
1290  .addMBB(TrueBB)
1291  .addReg(IfFalseReg)
1292  .addMBB(MBB);
1293 
1294  MI.eraseFromParent();
1295  return EndBB;
1296 }
1297 
1298 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
1299  MachineInstr &MI, MachineBasicBlock *BB) const {
1300  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
1301  BB->getParent()->getFunction().getPersonalityFn())) &&
1302  "SEH does not use catchret!");
1303  return BB;
1304 }
1305 
1306 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
1307  MachineInstr &MI, MachineBasicBlock *BB) const {
1308  MI.eraseFromParent();
1309  return BB;
1310 }
1311 
1312 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1313  MachineInstr &MI, MachineBasicBlock *BB) const {
1314  switch (MI.getOpcode()) {
1315  default:
1316 #ifndef NDEBUG
1317  MI.dump();
1318 #endif
1319  llvm_unreachable("Unexpected instruction for custom inserter!");
1320 
1321  case AArch64::F128CSEL:
1322  return EmitF128CSEL(MI, BB);
1323 
1324  case TargetOpcode::STACKMAP:
1325  case TargetOpcode::PATCHPOINT:
1326  return emitPatchPoint(MI, BB);
1327 
1328  case AArch64::CATCHRET:
1329  return EmitLoweredCatchRet(MI, BB);
1330  case AArch64::CATCHPAD:
1331  return EmitLoweredCatchPad(MI, BB);
1332  }
1333 }
1334 
1335 //===----------------------------------------------------------------------===//
1336 // AArch64 Lowering private implementation.
1337 //===----------------------------------------------------------------------===//
1338 
1339 //===----------------------------------------------------------------------===//
1340 // Lowering Code
1341 //===----------------------------------------------------------------------===//
1342 
1343 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1344 /// CC
1345 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1346  switch (CC) {
1347  default:
1348  llvm_unreachable("Unknown condition code!");
1349  case ISD::SETNE:
1350  return AArch64CC::NE;
1351  case ISD::SETEQ:
1352  return AArch64CC::EQ;
1353  case ISD::SETGT:
1354  return AArch64CC::GT;
1355  case ISD::SETGE:
1356  return AArch64CC::GE;
1357  case ISD::SETLT:
1358  return AArch64CC::LT;
1359  case ISD::SETLE:
1360  return AArch64CC::LE;
1361  case ISD::SETUGT:
1362  return AArch64CC::HI;
1363  case ISD::SETUGE:
1364  return AArch64CC::HS;
1365  case ISD::SETULT:
1366  return AArch64CC::LO;
1367  case ISD::SETULE:
1368  return AArch64CC::LS;
1369  }
1370 }
1371 
1372 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1373 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1374  AArch64CC::CondCode &CondCode,
1375  AArch64CC::CondCode &CondCode2) {
1376  CondCode2 = AArch64CC::AL;
1377  switch (CC) {
1378  default:
1379  llvm_unreachable("Unknown FP condition!");
1380  case ISD::SETEQ:
1381  case ISD::SETOEQ:
1382  CondCode = AArch64CC::EQ;
1383  break;
1384  case ISD::SETGT:
1385  case ISD::SETOGT:
1386  CondCode = AArch64CC::GT;
1387  break;
1388  case ISD::SETGE:
1389  case ISD::SETOGE:
1390  CondCode = AArch64CC::GE;
1391  break;
1392  case ISD::SETOLT:
1393  CondCode = AArch64CC::MI;
1394  break;
1395  case ISD::SETOLE:
1396  CondCode = AArch64CC::LS;
1397  break;
1398  case ISD::SETONE:
1399  CondCode = AArch64CC::MI;
1400  CondCode2 = AArch64CC::GT;
1401  break;
1402  case ISD::SETO:
1403  CondCode = AArch64CC::VC;
1404  break;
1405  case ISD::SETUO:
1406  CondCode = AArch64CC::VS;
1407  break;
1408  case ISD::SETUEQ:
1409  CondCode = AArch64CC::EQ;
1410  CondCode2 = AArch64CC::VS;
1411  break;
1412  case ISD::SETUGT:
1413  CondCode = AArch64CC::HI;
1414  break;
1415  case ISD::SETUGE:
1416  CondCode = AArch64CC::PL;
1417  break;
1418  case ISD::SETLT:
1419  case ISD::SETULT:
1420  CondCode = AArch64CC::LT;
1421  break;
1422  case ISD::SETLE:
1423  case ISD::SETULE:
1424  CondCode = AArch64CC::LE;
1425  break;
1426  case ISD::SETNE:
1427  case ISD::SETUNE:
1428  CondCode = AArch64CC::NE;
1429  break;
1430  }
1431 }
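// Example of the two-condition case (informal; register names invented): for
// SETONE ("ordered and not equal") the mapping above yields MI plus GT, so a
// scalar select on that predicate is emitted roughly as
//   fcmp s0, s1
//   csel w8, w_true, w_false, mi
//   csel w0, w_true, w8, gt
// i.e. the true value is chosen if either condition holds, matching the OR
// semantics of CondCode/CondCode2.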
1432 
1433 /// Convert a DAG fp condition code to an AArch64 CC.
1434 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1435 /// should be AND'ed instead of OR'ed.
1436 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1437  AArch64CC::CondCode &CondCode,
1438  AArch64CC::CondCode &CondCode2) {
1439  CondCode2 = AArch64CC::AL;
1440  switch (CC) {
1441  default:
1442  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1443  assert(CondCode2 == AArch64CC::AL);
1444  break;
1445  case ISD::SETONE:
1446  // (a one b)
1447  // == ((a olt b) || (a ogt b))
1448  // == ((a ord b) && (a une b))
1449  CondCode = AArch64CC::VC;
1450  CondCode2 = AArch64CC::NE;
1451  break;
1452  case ISD::SETUEQ:
1453  // (a ueq b)
1454  // == ((a uno b) || (a oeq b))
1455  // == ((a ule b) && (a uge b))
1456  CondCode = AArch64CC::PL;
1457  CondCode2 = AArch64CC::LE;
1458  break;
1459  }
1460 }
1461 
1462 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1463 /// CC usable with the vector instructions. Fewer operations are available
1464 /// without a real NZCV register, so we have to use less efficient combinations
1465 /// to get the same effect.
1466 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1467  AArch64CC::CondCode &CondCode,
1468  AArch64CC::CondCode &CondCode2,
1469  bool &Invert) {
1470  Invert = false;
1471  switch (CC) {
1472  default:
1473  // Mostly the scalar mappings work fine.
1474  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1475  break;
1476  case ISD::SETUO:
1477  Invert = true;
1478  LLVM_FALLTHROUGH;
1479  case ISD::SETO:
1480  CondCode = AArch64CC::MI;
1481  CondCode2 = AArch64CC::GE;
1482  break;
1483  case ISD::SETUEQ:
1484  case ISD::SETULT:
1485  case ISD::SETULE:
1486  case ISD::SETUGT:
1487  case ISD::SETUGE:
1488  // All of the compare-mask comparisons are ordered, but we can switch
1489  // between the two by a double inversion. E.g. ULE == !OGT.
1490  Invert = true;
1491  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1492  break;
1493  }
1494 }
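// For instance (informal): a v4f32 SETULT has no direct compare-mask
// instruction, so it is handled as NOT(SETOGE): the code above requests the
// inverted, ordered condition (lowered to FCMGE) and sets Invert, and the
// caller then bitwise-NOTs the resulting mask, roughly
//   fcmge v2.4s, v0.4s, v1.4s
//   mvn   v2.16b, v2.16b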
1495 
1496 static bool isLegalArithImmed(uint64_t C) {
1497  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1498  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1499  LLVM_DEBUG(dbgs() << "Is imm " << C
1500  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1501  return IsLegal;
1502 }
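// Quick reference (not exhaustive): 0..4095 encode directly, and multiples of
// 4096 up to 4095<<12 encode via the shifted form, so 0xfff and 0xfff000 are
// both legal here while values such as 0x1001 are not and must either be
// adjusted by the caller below or moved into a register first.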
1503 
1504 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1505 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1506 // can be set differently by this operation. It comes down to whether
1507 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1508 // everything is fine. If not then the optimization is wrong. Thus general
1509 // comparisons are only valid if op2 != 0.
1510 //
1511 // So, finally, the only LLVM-native comparisons that don't mention C and V
1512 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1513 // the absence of information about op2.
1514 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1515  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1516  (CC == ISD::SETEQ || CC == ISD::SETNE);
1517 }
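// Sketch: with this check, "cmp w0, w1" fed by "neg w1, w2" can become
// "cmn w0, w2" when only eq/ne is tested afterwards; for signed or unsigned
// ordered tests the C and V flags of the two forms can disagree for corner
// case values of w2 (see the comment above), so the fold is skipped there.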
1518 
1519 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1520  const SDLoc &dl, SelectionDAG &DAG) {
1521  EVT VT = LHS.getValueType();
1522  const bool FullFP16 =
1523  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1524 
1525  if (VT.isFloatingPoint()) {
1526  assert(VT != MVT::f128);
1527  if (VT == MVT::f16 && !FullFP16) {
1528  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1529  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1530  VT = MVT::f32;
1531  }
1532  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1533  }
1534 
1535  // The CMP instruction is just an alias for SUBS, and representing it as
1536  // SUBS means that it's possible to get CSE with subtract operations.
1537  // A later phase can perform the optimization of setting the destination
1538  // register to WZR/XZR if it ends up being unused.
1539  unsigned Opcode = AArch64ISD::SUBS;
1540 
1541  if (isCMN(RHS, CC)) {
1542  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1543  Opcode = AArch64ISD::ADDS;
1544  RHS = RHS.getOperand(1);
1545  } else if (isCMN(LHS, CC)) {
1546  // As we are looking for EQ/NE compares, the operands can be commuted; can
1547  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1548  Opcode = AArch64ISD::ADDS;
1549  LHS = LHS.getOperand(1);
1550  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1551  !isUnsignedIntSetCC(CC)) {
1552  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1553  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1554  // of the signed comparisons.
1555  Opcode = AArch64ISD::ANDS;
1556  RHS = LHS.getOperand(1);
1557  LHS = LHS.getOperand(0);
1558  }
1559 
1560  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1561  .getValue(1);
1562 }
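// For reference, the three node kinds chosen above correspond informally to:
//   SUBS: cmp w0, w1   ; plain compare
//   ADDS: cmn w0, w1   ; compare against a negated operand
//   ANDS: tst w0, w1   ; test of (and x, y) against zero
// with WZR/XZR picked as the dead destination by a later pass, as noted in
// the comment above.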
1563 
1564 /// \defgroup AArch64CCMP CMP;CCMP matching
1565 ///
1566 /// These functions deal with the formation of CMP;CCMP;... sequences.
1567 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1568 /// a comparison. They set the NZCV flags to a predefined value if their
1569 /// predicate is false. This allows us to express arbitrary conjunctions, for
1570 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1571 /// expressed as:
1572 /// cmp A
1573 /// ccmp B, inv(CB), CA
1574 /// check for CB flags
1575 ///
1576 /// This naturally lets us implement chains of AND operations with SETCC
1577 /// operands. And we can even implement some other situations by transforming
1578 /// them:
1579 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1580 /// negating the flags used in a CCMP/FCCMP operations.
1581 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1582 /// by negating the flags we test for afterwards. i.e.
1583 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1584 /// - Note that we can only ever negate all previously processed results.
1585 /// What we can not implement by flipping the flags to test is a negation
1586 /// of two sub-trees (because the negation affects all sub-trees emitted so
1587 /// far, so the 2nd sub-tree we emit would also affect the first).
1588 /// With those tools we can implement some OR operations:
1589 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1590 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1591 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1592 /// elimination rules from earlier to implement the whole thing as a
1593 /// CCMP/FCCMP chain.
1594 ///
1595 /// As complete example:
1596 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1597 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1598 /// can be reassociated to:
1599 /// or (and (setCC (cmp C)) (setCD (cmp D)))
1600 /// (or (setCA (cmp A)) (setCB (cmp B)))
1601 /// can be transformed to:
1602 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1603 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1604 /// which can be implemented as:
1605 /// cmp C
1606 /// ccmp D, inv(CD), CC
1607 /// ccmp A, CA, inv(CD)
1608 /// ccmp B, CB, inv(CA)
1609 /// check for CB flags
1610 ///
1611 /// A counterexample is "or (and A B) (and C D)" which translates to
1612 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1613 /// can only implement 1 of the inner (not) operations, but not both!
1614 /// @{
1615 
1616 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1617 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1618  ISD::CondCode CC, SDValue CCOp,
1619  AArch64CC::CondCode Predicate,
1620  AArch64CC::CondCode OutCC,
1621  const SDLoc &DL, SelectionDAG &DAG) {
1622  unsigned Opcode = 0;
1623  const bool FullFP16 =
1624  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1625 
1626  if (LHS.getValueType().isFloatingPoint()) {
1627  assert(LHS.getValueType() != MVT::f128);
1628  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1629  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1630  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1631  }
1632  Opcode = AArch64ISD::FCCMP;
1633  } else if (RHS.getOpcode() == ISD::SUB) {
1634  SDValue SubOp0 = RHS.getOperand(0);
1635  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1636  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1637  Opcode = AArch64ISD::CCMN;
1638  RHS = RHS.getOperand(1);
1639  }
1640  }
1641  if (Opcode == 0)
1642  Opcode = AArch64ISD::CCMP;
1643 
1644  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1645  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1646  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1647  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1648  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1649 }
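// One link of such a chain, once selected, looks roughly like this for
// "(a == 0) && (b == c)" (register assignment invented):
//   cmp  x0, #0
//   ccmp x1, x2, #0, eq   ; if eq, compare x1 and x2; else NZCV := 0000 (ne)
//   b.eq taken
// The #0 immediate is the NZCV value computed via
// getNZCVToSatisfyCondCode(InvOutCC) above.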
1650 
1651 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1652 /// expressed as a conjunction. See \ref AArch64CCMP.
1653 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1654 /// changing the conditions on the SETCC tests.
1655 /// (this means we can call emitConjunctionRec() with
1656 /// Negate==true on this sub-tree)
1657 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1658 /// cannot do the negation naturally. We are required to
1659 /// emit the subtree first in this case.
1660 /// \param WillNegate Is true if we are called when the result of this
1661 /// subexpression must be negated. This happens when the
1662 /// outer expression is an OR. We can use this fact to know
1663 /// that we have a double negation (or (or ...) ...) that
1664 /// can be implemented for free.
1665 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1666  bool &MustBeFirst, bool WillNegate,
1667  unsigned Depth = 0) {
1668  if (!Val.hasOneUse())
1669  return false;
1670  unsigned Opcode = Val->getOpcode();
1671  if (Opcode == ISD::SETCC) {
1672  if (Val->getOperand(0).getValueType() == MVT::f128)
1673  return false;
1674  CanNegate = true;
1675  MustBeFirst = false;
1676  return true;
1677  }
1678  // Protect against exponential runtime and stack overflow.
1679  if (Depth > 6)
1680  return false;
1681  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1682  bool IsOR = Opcode == ISD::OR;
1683  SDValue O0 = Val->getOperand(0);
1684  SDValue O1 = Val->getOperand(1);
1685  bool CanNegateL;
1686  bool MustBeFirstL;
1687  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1688  return false;
1689  bool CanNegateR;
1690  bool MustBeFirstR;
1691  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1692  return false;
1693 
1694  if (MustBeFirstL && MustBeFirstR)
1695  return false;
1696 
1697  if (IsOR) {
1698  // For an OR expression we need to be able to naturally negate at least
1699  // one side or we cannot do the transformation at all.
1700  if (!CanNegateL && !CanNegateR)
1701  return false;
1702  // If the result of the OR will be negated and we can naturally negate
1703  // the leaves, then this sub-tree as a whole negates naturally.
1704  CanNegate = WillNegate && CanNegateL && CanNegateR;
1705  // If we cannot naturally negate the whole sub-tree, then this must be
1706  // emitted first.
1707  MustBeFirst = !CanNegate;
1708  } else {
1709  assert(Opcode == ISD::AND && "Must be OR or AND");
1710  // We cannot naturally negate an AND operation.
1711  CanNegate = false;
1712  MustBeFirst = MustBeFirstL || MustBeFirstR;
1713  }
1714  return true;
1715  }
1716  return false;
1717 }
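// Worked example of the classification (hypothetical tree): for
//   or (setcc A) (and (setcc B) (setcc C))
// the SETCC leaf reports CanNegate = true, the AND subtree reports
// CanNegate = false / MustBeFirst = false, and the OR therefore ends up with
// CanNegate = false and MustBeFirst = true: the tree is still emittable as a
// conjunction, but if it were nested inside a larger expression it would have
// to be emitted first.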
1718 
1719 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1720 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1721 /// Tries to transform the given i1 producing node @p Val to a series of compare
1722 /// and conditional compare operations. @returns an NZCV flags producing node
1723 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1724 /// transformation was not possible.
1725 /// \p Negate is true if we want this sub-tree being negated just by changing
1726 /// SETCC conditions.
1727 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
1728  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1729  AArch64CC::CondCode Predicate) {
1730  // We're at a tree leaf, produce a conditional comparison operation.
1731  unsigned Opcode = Val->getOpcode();
1732  if (Opcode == ISD::SETCC) {
1733  SDValue LHS = Val->getOperand(0);
1734  SDValue RHS = Val->getOperand(1);
1735  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1736  bool isInteger = LHS.getValueType().isInteger();
1737  if (Negate)
1738  CC = getSetCCInverse(CC, isInteger);
1739  SDLoc DL(Val);
1740  // Determine OutCC and handle FP special case.
1741  if (isInteger) {
1742  OutCC = changeIntCCToAArch64CC(CC);
1743  } else {
1745  AArch64CC::CondCode ExtraCC;
1746  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1747  // Some floating point conditions can't be tested with a single condition
1748  // code. Construct an additional comparison in this case.
1749  if (ExtraCC != AArch64CC::AL) {
1750  SDValue ExtraCmp;
1751  if (!CCOp.getNode())
1752  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1753  else
1754  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1755  ExtraCC, DL, DAG);
1756  CCOp = ExtraCmp;
1757  Predicate = ExtraCC;
1758  }
1759  }
1760 
1761  // Produce a normal comparison if we are first in the chain
1762  if (!CCOp)
1763  return emitComparison(LHS, RHS, CC, DL, DAG);
1764  // Otherwise produce a ccmp.
1765  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1766  DAG);
1767  }
1768  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1769 
1770  bool IsOR = Opcode == ISD::OR;
1771 
1772  SDValue LHS = Val->getOperand(0);
1773  bool CanNegateL;
1774  bool MustBeFirstL;
1775  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1776  assert(ValidL && "Valid conjunction/disjunction tree");
1777  (void)ValidL;
1778 
1779  SDValue RHS = Val->getOperand(1);
1780  bool CanNegateR;
1781  bool MustBeFirstR;
1782  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1783  assert(ValidR && "Valid conjunction/disjunction tree");
1784  (void)ValidR;
1785 
1786  // Swap sub-tree that must come first to the right side.
1787  if (MustBeFirstL) {
1788  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1789  std::swap(LHS, RHS);
1790  std::swap(CanNegateL, CanNegateR);
1791  std::swap(MustBeFirstL, MustBeFirstR);
1792  }
1793 
1794  bool NegateR;
1795  bool NegateAfterR;
1796  bool NegateL;
1797  bool NegateAfterAll;
1798  if (Opcode == ISD::OR) {
1799  // Swap the sub-tree that we can negate naturally to the left.
1800  if (!CanNegateL) {
1801  assert(CanNegateR && "at least one side must be negatable");
1802  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1803  assert(!Negate);
1804  std::swap(LHS, RHS);
1805  NegateR = false;
1806  NegateAfterR = true;
1807  } else {
1808  // Negate the left sub-tree if possible, otherwise negate the result.
1809  NegateR = CanNegateR;
1810  NegateAfterR = !CanNegateR;
1811  }
1812  NegateL = true;
1813  NegateAfterAll = !Negate;
1814  } else {
1815  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1816  assert(!Negate && "Valid conjunction/disjunction tree");
1817 
1818  NegateL = false;
1819  NegateR = false;
1820  NegateAfterR = false;
1821  NegateAfterAll = false;
1822  }
1823 
1824  // Emit sub-trees.
1825  AArch64CC::CondCode RHSCC;
1826  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1827  if (NegateAfterR)
1828  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1829  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1830  if (NegateAfterAll)
1831  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1832  return CmpL;
1833 }
1834 
1835 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
1836 /// In some cases this is even possible with OR operations in the expression.
1837 /// See \ref AArch64CCMP.
1838 /// \see emitConjunctionRec().
1839 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
1840  AArch64CC::CondCode &OutCC) {
1841  bool DummyCanNegate;
1842  bool DummyMustBeFirst;
1843  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1844  return SDValue();
1845 
1846  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1847 }
1848 
1849 /// @}
1850 
1851 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1852 /// extension operations.
1853 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
1854  auto isSupportedExtend = [&](SDValue V) {
1855  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1856  return true;
1857 
1858  if (V.getOpcode() == ISD::AND)
1859  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1860  uint64_t Mask = MaskCst->getZExtValue();
1861  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1862  }
1863 
1864  return false;
1865  };
1866 
1867  if (!Op.hasOneUse())
1868  return 0;
1869 
1870  if (isSupportedExtend(Op))
1871  return 1;
1872 
1873  unsigned Opc = Op.getOpcode();
1874  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1875  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1876  uint64_t Shift = ShiftCst->getZExtValue();
1877  if (isSupportedExtend(Op.getOperand(0)))
1878  return (Shift <= 4) ? 2 : 1;
1879  EVT VT = Op.getValueType();
1880  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1881  return 1;
1882  }
1883 
1884  return 0;
1885 }
1886 
1887 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1888  SDValue &AArch64cc, SelectionDAG &DAG,
1889  const SDLoc &dl) {
1890  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1891  EVT VT = RHS.getValueType();
1892  uint64_t C = RHSC->getZExtValue();
1893  if (!isLegalArithImmed(C)) {
1894  // Constant does not fit, try adjusting it by one?
1895  switch (CC) {
1896  default:
1897  break;
1898  case ISD::SETLT:
1899  case ISD::SETGE:
1900  if ((VT == MVT::i32 && C != 0x80000000 &&
1901  isLegalArithImmed((uint32_t)(C - 1))) ||
1902  (VT == MVT::i64 && C != 0x80000000ULL &&
1903  isLegalArithImmed(C - 1ULL))) {
1904  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1905  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1906  RHS = DAG.getConstant(C, dl, VT);
1907  }
1908  break;
1909  case ISD::SETULT:
1910  case ISD::SETUGE:
1911  if ((VT == MVT::i32 && C != 0 &&
1912  isLegalArithImmed((uint32_t)(C - 1))) ||
1913  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1914  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1915  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1916  RHS = DAG.getConstant(C, dl, VT);
1917  }
1918  break;
1919  case ISD::SETLE:
1920  case ISD::SETGT:
1921  if ((VT == MVT::i32 && C != INT32_MAX &&
1922  isLegalArithImmed((uint32_t)(C + 1))) ||
1923  (VT == MVT::i64 && C != INT64_MAX &&
1924  isLegalArithImmed(C + 1ULL))) {
1925  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1926  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1927  RHS = DAG.getConstant(C, dl, VT);
1928  }
1929  break;
1930  case ISD::SETULE:
1931  case ISD::SETUGT:
1932  if ((VT == MVT::i32 && C != UINT32_MAX &&
1933  isLegalArithImmed((uint32_t)(C + 1))) ||
1934  (VT == MVT::i64 && C != UINT64_MAX &&
1935  isLegalArithImmed(C + 1ULL))) {
1936  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1937  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1938  RHS = DAG.getConstant(C, dl, VT);
1939  }
1940  break;
1941  }
1942  }
1943  }
1944 
1945  // Comparisons are canonicalized so that the RHS operand is simpler than the
1946  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
1947  // can fold some shift+extend operations on the RHS operand, so swap the
1948  // operands if that can be done.
1949  //
1950  // For example:
1951  // lsl w13, w11, #1
1952  // cmp w13, w12
1953  // can be turned into:
1954  // cmp w12, w11, lsl #1
1955  if (!isa<ConstantSDNode>(RHS) ||
1956  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
1957  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
1958 
1959  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
1960  std::swap(LHS, RHS);
1961  CC = ISD::getSetCCSwappedOperands(CC);
1962  }
1963  }
1964 
1965  SDValue Cmp;
1966  AArch64CC::CondCode AArch64CC;
1967  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1968  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1969 
1970  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1971  // For the i8 operand, the largest immediate is 255, so this can be easily
1972  // encoded in the compare instruction. For the i16 operand, however, the
1973  // largest immediate cannot be encoded in the compare.
1974  // Therefore, use a sign extending load and cmn to avoid materializing the
1975  // -1 constant. For example,
1976  // movz w1, #65535
1977  // ldrh w0, [x0, #0]
1978  // cmp w0, w1
1979  // >
1980  // ldrsh w0, [x0, #0]
1981  // cmn w0, #1
1982  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1983  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1984  // ensure both the LHS and RHS are truly zero extended and to make sure the
1985  // transformation is profitable.
1986  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1987  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1988  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1989  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1990  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1991  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1992  SDValue SExt =
1993  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1994  DAG.getValueType(MVT::i16));
1995  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1996  RHS.getValueType()),
1997  CC, dl, DAG);
1998  AArch64CC = changeIntCCToAArch64CC(CC);
1999  }
2000  }
2001 
2002  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2003  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2004  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2005  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2006  }
2007  }
2008  }
2009 
2010  if (!Cmp) {
2011  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2012  AArch64CC = changeIntCCToAArch64CC(CC);
2013  }
2014  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2015  return Cmp;
2016 }
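// Example of the immediate adjustment above (informal): for "x < 0x1001" the
// constant 0x1001 does not encode, but SETLT can be rewritten as SETLE
// against 0x1000, which the shifted-immediate form handles directly:
//   cmp  w0, #1, lsl #12
//   b.le <target>
// instead of first moving 0x1001 into a scratch register.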
2017 
2018 static std::pair<SDValue, SDValue>
2019 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2020  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2021  "Unsupported value type");
2022  SDValue Value, Overflow;
2023  SDLoc DL(Op);
2024  SDValue LHS = Op.getOperand(0);
2025  SDValue RHS = Op.getOperand(1);
2026  unsigned Opc = 0;
2027  switch (Op.getOpcode()) {
2028  default:
2029  llvm_unreachable("Unknown overflow instruction!");
2030  case ISD::SADDO:
2031  Opc = AArch64ISD::ADDS;
2032  CC = AArch64CC::VS;
2033  break;
2034  case ISD::UADDO:
2035  Opc = AArch64ISD::ADDS;
2036  CC = AArch64CC::HS;
2037  break;
2038  case ISD::SSUBO:
2039  Opc = AArch64ISD::SUBS;
2040  CC = AArch64CC::VS;
2041  break;
2042  case ISD::USUBO:
2043  Opc = AArch64ISD::SUBS;
2044  CC = AArch64CC::LO;
2045  break;
2046  // Multiply needs a little bit extra work.
2047  case ISD::SMULO:
2048  case ISD::UMULO: {
2049  CC = AArch64CC::NE;
2050  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2051  if (Op.getValueType() == MVT::i32) {
2052  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2053  // For a 32 bit multiply with overflow check we want the instruction
2054  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2055  // need to generate the following pattern:
2056  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2057  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2058  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2059  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2060  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2061  DAG.getConstant(0, DL, MVT::i64));
2062  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2063  // operation. We need to clear out the upper 32 bits, because we used a
2064  // widening multiply that wrote all 64 bits. In the end this should be a
2065  // noop.
2066  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2067  if (IsSigned) {
2068  // The signed overflow check requires more than just a simple check for
2069  // any bit set in the upper 32 bits of the result. These bits could be
2070  // just the sign bits of a negative number. To perform the overflow
2071  // check we arithmetic-shift the lower 32 bits of the result right by
2072  // 31 bits (replicating the sign bit) and compare that to the upper 32 bits.
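  // For example, with %a = 0x7fffffff and %b = 2 the widened product is
  // 0x00000000fffffffe: the low half shifted right arithmetically by 31 gives
  // 0xffffffff, the high half is 0x00000000, the SUBS compare sees a mismatch
  // and signed overflow is correctly reported.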
2073  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2074  DAG.getConstant(32, DL, MVT::i64));
2075  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2076  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2077  DAG.getConstant(31, DL, MVT::i64));
2078  // It is important that LowerBits is last, otherwise the arithmetic
2079  // shift will not be folded into the compare (SUBS).
2080  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2081  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2082  .getValue(1);
2083  } else {
2084  // The overflow check for unsigned multiply is easy. We only need to
2085  // check if any of the upper 32 bits are set. This can be done with a
2086  // CMP (shifted register). For that we need to generate the following
2087  // pattern:
2088  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
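  // For example, 0xffffffff * 0xffffffff = 0xfffffffe00000001; the upper 32
  // bits (0xfffffffe) are non-zero, so the SUBS against zero sets NE and
  // unsigned overflow is reported.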
2089  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2090  DAG.getConstant(32, DL, MVT::i64));
2091  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2092  Overflow =
2093  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2094  DAG.getConstant(0, DL, MVT::i64),
2095  UpperBits).getValue(1);
2096  }
2097  break;
2098  }
2099  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2100  // For the 64 bit multiply
2101  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2102  if (IsSigned) {
2103  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2104  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2105  DAG.getConstant(63, DL, MVT::i64));
2106  // It is important that LowerBits is last, otherwise the arithmetic
2107  // shift will not be folded into the compare (SUBS).
2108  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2109  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2110  .getValue(1);
2111  } else {
2112  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2113  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2114  Overflow =
2115  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2116  DAG.getConstant(0, DL, MVT::i64),
2117  UpperBits).getValue(1);
2118  }
2119  break;
2120  }
2121  } // switch (...)
2122 
2123  if (Opc) {
2124  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2125 
2126  // Emit the AArch64 operation with overflow check.
2127  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2128  Overflow = Value.getValue(1);
2129  }
2130  return std::make_pair(Value, Overflow);
2131 }
2132 
2133 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2134  RTLIB::Libcall Call) const {
2135  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2136  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
2137 }
2138 
2139 // Returns true if the given Op is the overflow flag result of an overflow
2140 // intrinsic operation.
2141 static bool isOverflowIntrOpRes(SDValue Op) {
2142  unsigned Opc = Op.getOpcode();
2143  return (Op.getResNo() == 1 &&
2144  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2145  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2146 }
2147 
2148 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2149  SDValue Sel = Op.getOperand(0);
2150  SDValue Other = Op.getOperand(1);
2151  SDLoc dl(Sel);
2152 
2153  // If the operand is an overflow checking operation, invert the condition
2154  // code and kill the Not operation. I.e., transform:
2155  // (xor overflow_op_bool, 1)
2156  // -->
2157  // (csel 1, 0, invert(cc), overflow_op_bool)
2158  // ... which later gets transformed to just a cset instruction with an
2159  // inverted condition code, rather than a cset + eor sequence.
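  // For example, the i1 overflow result of llvm.sadd.with.overflow uses the
  // VS condition (see getAArch64XALUOOp), so "xor %overflow, 1" should become
  // a single "cset w0, vc" instead of "cset w0, vs" followed by an eor.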
2160  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2161  // Only lower legal XALUO ops.
2162  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2163  return SDValue();
2164 
2165  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2166  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2167  AArch64CC::CondCode CC;
2168  SDValue Value, Overflow;
2169  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2170  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2171  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2172  CCVal, Overflow);
2173  }
2174  // If neither operand is a SELECT_CC, give up.
2175  if (Sel.getOpcode() != ISD::SELECT_CC)
2176  std::swap(Sel, Other);
2177  if (Sel.getOpcode() != ISD::SELECT_CC)
2178  return Op;
2179 
2180  // The folding we want to perform is:
2181  // (xor x, (select_cc a, b, cc, 0, -1) )
2182  // -->
2183  // (csel x, (xor x, -1), cc ...)
2184  //
2185  // The latter will get matched to a CSINV instruction.
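  // For example, with TVal == 0 and FVal == -1 the original xor evaluates to
  // either x or ~x depending on the condition, which is exactly what
  // "csinv xD, xN, xN, cc" computes after the compare below.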
2186 
2187  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2188  SDValue LHS = Sel.getOperand(0);
2189  SDValue RHS = Sel.getOperand(1);
2190  SDValue TVal = Sel.getOperand(2);
2191  SDValue FVal = Sel.getOperand(3);
2192 
2193  // FIXME: This could be generalized to non-integer comparisons.
2194  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2195  return Op;
2196 
2197  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2198  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2199 
2200  // The values aren't constants, this isn't the pattern we're looking for.
2201  if (!CFVal || !CTVal)
2202  return Op;
2203 
2204  // We can commute the SELECT_CC by inverting the condition. This
2205  // might be needed to make this fit into a CSINV pattern.
2206  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2207  std::swap(TVal, FVal);
2208  std::swap(CTVal, CFVal);
2209  CC = ISD::getSetCCInverse(CC, true);
2210  }
2211 
2212  // If the constants line up, perform the transform!
2213  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2214  SDValue CCVal;
2215  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2216 
2217  FVal = Other;
2218  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2219  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2220 
2221  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2222  CCVal, Cmp);
2223  }
2224 
2225  return Op;
2226 }
2227 
2228 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2229  EVT VT = Op.getValueType();
2230 
2231  // Let legalize expand this if it isn't a legal type yet.
2232  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2233  return SDValue();
2234 
2235  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2236 
2237  unsigned Opc;
2238  bool ExtraOp = false;
2239  switch (Op.getOpcode()) {
2240  default:
2241  llvm_unreachable("Invalid code");
2242  case ISD::ADDC:
2243  Opc = AArch64ISD::ADDS;
2244  break;
2245  case ISD::SUBC:
2246  Opc = AArch64ISD::SUBS;
2247  break;
2248  case ISD::ADDE:
2249  Opc = AArch64ISD::ADCS;
2250  ExtraOp = true;
2251  break;
2252  case ISD::SUBE:
2253  Opc = AArch64ISD::SBCS;
2254  ExtraOp = true;
2255  break;
2256  }
2257 
2258  if (!ExtraOp)
2259  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2260  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2261  Op.getOperand(2));
2262 }
2263 
2264 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2265  // Let legalize expand this if it isn't a legal type yet.
2266  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2267  return SDValue();
2268 
2269  SDLoc dl(Op);
2270  AArch64CC::CondCode CC;
2271  // The actual operation that sets the overflow or carry flag.
2272  SDValue Value, Overflow;
2273  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2274 
2275  // We use 0 and 1 as false and true values.
2276  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2277  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2278 
2279  // We use an inverted condition, because the conditional select is inverted
2280  // too. This will allow it to be selected to a single instruction:
2281  // CSINC Wd, WZR, WZR, invert(cond).
2282  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2283  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2284  CCVal, Overflow);
2285 
2286  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2287  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2288 }
2289 
2290 // Prefetch operands are:
2291 // 1: Address to prefetch
2292 // 2: bool isWrite
2293 // 3: int locality (0 = no locality ... 3 = extreme locality)
2294 // 4: bool isDataCache
2295 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2296  SDLoc DL(Op);
2297  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2298  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2299  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2300 
2301  bool IsStream = !Locality;
2302  // When the locality number is set
2303  if (Locality) {
2304  // The front-end should have filtered out the out-of-range values
2305  assert(Locality <= 3 && "Prefetch locality out-of-range");
2306  // The locality degree is the inverse of the target cache level: higher
2307  // locality means a closer (lower-numbered) cache, and the encoding starts
2308  // at 0 for L1, so flip the value.
2309  Locality = 3 - Locality;
2310  }
2311 
2312  // Build the mask value encoding the expected behavior.
2313  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2314  (!IsData << 3) | // IsDataCache bit
2315  (Locality << 1) | // Cache level bits
2316  (unsigned)IsStream; // Stream bit
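  // For example, a plain data read prefetch with maximal locality
  // (IsWrite = 0, IsData = 1, Locality = 3) folds to Locality = 0 above,
  // giving PrfOp = 0b00000, i.e. a PLDL1KEEP hint.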
2317  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2318  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2319 }
2320 
2321 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2322  SelectionDAG &DAG) const {
2323  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2324 
2325  RTLIB::Libcall LC;
2326  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2327 
2328  return LowerF128Call(Op, DAG, LC);
2329 }
2330 
2331 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2332  SelectionDAG &DAG) const {
2333  if (Op.getOperand(0).getValueType() != MVT::f128) {
2334  // It's legal except when f128 is involved
2335  return Op;
2336  }
2337 
2338  RTLIB::Libcall LC;
2339  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2340 
2341  // FP_ROUND node has a second operand indicating whether it is known to be
2342  // precise. That doesn't take part in the LibCall so we can't directly use
2343  // LowerF128Call.
2344  SDValue SrcVal = Op.getOperand(0);
2345  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2346  SDLoc(Op)).first;
2347 }
2348 
2349 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2350  SelectionDAG &DAG) const {
2351  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2352  // Any additional optimization in this function should be recorded
2353  // in the cost tables.
2354  EVT InVT = Op.getOperand(0).getValueType();
2355  EVT VT = Op.getValueType();
2356  unsigned NumElts = InVT.getVectorNumElements();
2357 
2358  // f16 conversions are promoted to f32 when full fp16 is not supported.
2359  if (InVT.getVectorElementType() == MVT::f16 &&
2360  !Subtarget->hasFullFP16()) {
2361  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2362  SDLoc dl(Op);
2363  return DAG.getNode(
2364  Op.getOpcode(), dl, Op.getValueType(),
2365  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2366  }
2367 
2368  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2369  SDLoc dl(Op);
2370  SDValue Cv =
2371  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2372  Op.getOperand(0));
2373  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2374  }
2375 
2376  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2377  SDLoc dl(Op);
2378  MVT ExtVT =
2379  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2380  VT.getVectorNumElements());
2381  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2382  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2383  }
2384 
2385  // Type changing conversions are illegal.
2386  return Op;
2387 }
2388 
2389 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2390  SelectionDAG &DAG) const {
2391  if (Op.getOperand(0).getValueType().isVector())
2392  return LowerVectorFP_TO_INT(Op, DAG);
2393 
2394  // f16 conversions are promoted to f32 when full fp16 is not supported.
2395  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2396  !Subtarget->hasFullFP16()) {
2397  SDLoc dl(Op);
2398  return DAG.getNode(
2399  Op.getOpcode(), dl, Op.getValueType(),
2400  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2401  }
2402 
2403  if (Op.getOperand(0).getValueType() != MVT::f128) {
2404  // It's legal except when f128 is involved
2405  return Op;
2406  }
2407 
2408  RTLIB::Libcall LC;
2409  if (Op.getOpcode() == ISD::FP_TO_SINT)
2410  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2411  else
2412  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2413 
2414  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2415  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2416 }
2417 
2419  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2420  // Any additional optimization in this function should be recorded
2421  // in the cost tables.
2422  EVT VT = Op.getValueType();
2423  SDLoc dl(Op);
2424  SDValue In = Op.getOperand(0);
2425  EVT InVT = In.getValueType();
2426 
2427  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2428  MVT CastVT =
2429  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2430  InVT.getVectorNumElements());
2431  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2432  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2433  }
2434 
2435  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2436  unsigned CastOpc =
2437  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2438  EVT CastVT = VT.changeVectorElementTypeToInteger();
2439  In = DAG.getNode(CastOpc, dl, CastVT, In);
2440  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2441  }
2442 
2443  return Op;
2444 }
2445 
2446 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2447  SelectionDAG &DAG) const {
2448  if (Op.getValueType().isVector())
2449  return LowerVectorINT_TO_FP(Op, DAG);
2450 
2451  // f16 conversions are promoted to f32 when full fp16 is not supported.
2452  if (Op.getValueType() == MVT::f16 &&
2453  !Subtarget->hasFullFP16()) {
2454  SDLoc dl(Op);
2455  return DAG.getNode(
2456  ISD::FP_ROUND, dl, MVT::f16,
2457  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2458  DAG.getIntPtrConstant(0, dl));
2459  }
2460 
2461  // i128 conversions are libcalls.
2462  if (Op.getOperand(0).getValueType() == MVT::i128)
2463  return SDValue();
2464 
2465  // Other conversions are legal, unless it's to the completely software-based
2466  // fp128.
2467  if (Op.getValueType() != MVT::f128)
2468  return Op;
2469 
2470  RTLIB::Libcall LC;
2471  if (Op.getOpcode() == ISD::SINT_TO_FP)
2472  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2473  else
2474  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2475 
2476  return LowerF128Call(Op, DAG, LC);
2477 }
2478 
2479 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2480  SelectionDAG &DAG) const {
2481  // For iOS, we want to call an alternative entry point: __sincos_stret,
2482  // which returns the values in two S / D registers.
2483  SDLoc dl(Op);
2484  SDValue Arg = Op.getOperand(0);
2485  EVT ArgVT = Arg.getValueType();
2486  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2487 
2488  ArgListTy Args;
2489  ArgListEntry Entry;
2490 
2491  Entry.Node = Arg;
2492  Entry.Ty = ArgTy;
2493  Entry.IsSExt = false;
2494  Entry.IsZExt = false;
2495  Args.push_back(Entry);
2496 
2497  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2498  : RTLIB::SINCOS_STRET_F32;
2499  const char *LibcallName = getLibcallName(LC);
2500  SDValue Callee =
2501  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2502 
2503  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2504  TargetLowering::CallLoweringInfo CLI(DAG);
2505  CLI.setDebugLoc(dl)
2506  .setChain(DAG.getEntryNode())
2507  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2508 
2509  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2510  return CallResult.first;
2511 }
2512 
2513 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2514  if (Op.getValueType() != MVT::f16)
2515  return SDValue();
2516 
2517  assert(Op.getOperand(0).getValueType() == MVT::i16);
2518  SDLoc DL(Op);
2519 
2520  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2521  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2522  return SDValue(
2523  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2524  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2525  0);
2526 }
2527 
2528 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2529  if (OrigVT.getSizeInBits() >= 64)
2530  return OrigVT;
2531 
2532  assert(OrigVT.isSimple() && "Expecting a simple value type");
2533 
2534  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2535  switch (OrigSimpleTy) {
2536  default: llvm_unreachable("Unexpected Vector Type");
2537  case MVT::v2i8:
2538  case MVT::v2i16:
2539  return MVT::v2i32;
2540  case MVT::v4i8:
2541  return MVT::v4i16;
2542  }
2543 }
2544 
2545 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2546  const EVT &OrigTy,
2547  const EVT &ExtTy,
2548  unsigned ExtOpcode) {
2549  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2550  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2551  // 64-bits we need to insert a new extension so that it will be 64-bits.
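  // For example, when the original operand was v4i8 and the extension
  // produced a 128-bit v4i32, a fresh v4i8 -> v4i16 extension is inserted
  // here so that S/UMULL still gets a 64-bit input vector.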
2552  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2553  if (OrigTy.getSizeInBits() >= 64)
2554  return N;
2555 
2556  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2557  EVT NewVT = getExtensionTo64Bits(OrigTy);
2558 
2559  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2560 }
2561 
2562 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2563  bool isSigned) {
2564  EVT VT = N->getValueType(0);
2565 
2566  if (N->getOpcode() != ISD::BUILD_VECTOR)
2567  return false;
2568 
2569  for (const SDValue &Elt : N->op_values()) {
2570  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2571  unsigned EltSize = VT.getScalarSizeInBits();
2572  unsigned HalfSize = EltSize / 2;
2573  if (isSigned) {
2574  if (!isIntN(HalfSize, C->getSExtValue()))
2575  return false;
2576  } else {
2577  if (!isUIntN(HalfSize, C->getZExtValue()))
2578  return false;
2579  }
2580  continue;
2581  }
2582  return false;
2583  }
2584 
2585  return true;
2586 }
2587 
2588 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2589  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2590  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2591  N->getOperand(0)->getValueType(0),
2592  N->getValueType(0),
2593  N->getOpcode());
2594 
2595  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2596  EVT VT = N->getValueType(0);
2597  SDLoc dl(N);
2598  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2599  unsigned NumElts = VT.getVectorNumElements();
2600  MVT TruncVT = MVT::getIntegerVT(EltSize);
2601  SmallVector<SDValue, 8> Ops;
2602  for (unsigned i = 0; i != NumElts; ++i) {
2603  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2604  const APInt &CInt = C->getAPIntValue();
2605  // Element types smaller than 32 bits are not legal, so use i32 elements.
2606  // The values are implicitly truncated so sext vs. zext doesn't matter.
2607  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2608  }
2609  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2610 }
2611 
2612 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2613  return N->getOpcode() == ISD::SIGN_EXTEND ||
2614  isExtendedBUILD_VECTOR(N, DAG, true);
2615 }
2616 
2617 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2618  return N->getOpcode() == ISD::ZERO_EXTEND ||
2619  isExtendedBUILD_VECTOR(N, DAG, false);
2620 }
2621 
2622 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2623  unsigned Opcode = N->getOpcode();
2624  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2625  SDNode *N0 = N->getOperand(0).getNode();
2626  SDNode *N1 = N->getOperand(1).getNode();
2627  return N0->hasOneUse() && N1->hasOneUse() &&
2628  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2629  }
2630  return false;
2631 }
2632 
2633 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2634  unsigned Opcode = N->getOpcode();
2635  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2636  SDNode *N0 = N->getOperand(0).getNode();
2637  SDNode *N1 = N->getOperand(1).getNode();
2638  return N0->hasOneUse() && N1->hasOneUse() &&
2639  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2640  }
2641  return false;
2642 }
2643 
2644 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2645  SelectionDAG &DAG) const {
2646  // The rounding mode is in bits 23:22 of the FPCR.
2647  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
2648  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
2649  // so that the shift + and get folded into a bitfield extract.
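  // For example, FPCR.RMode == 0b00 (round to nearest) yields
  // ((0 + (1 << 22)) >> 22) & 3 == 1, and RMode == 0b11 (round toward zero)
  // wraps to ((0xc00000 + 0x400000) >> 22) & 3 == 0, matching the mapping
  // above.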
2650  SDLoc dl(Op);
2651 
2652  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2653  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2654  MVT::i64));
2655  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2656  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2657  DAG.getConstant(1U << 22, dl, MVT::i32));
2658  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2659  DAG.getConstant(22, dl, MVT::i32));
2660  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2661  DAG.getConstant(3, dl, MVT::i32));
2662 }
2663 
2664 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2665  // Multiplications are only custom-lowered for 128-bit vectors so that
2666  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2667  EVT VT = Op.getValueType();
2668  assert(VT.is128BitVector() && VT.isInteger() &&
2669  "unexpected type for custom-lowering ISD::MUL");
2670  SDNode *N0 = Op.getOperand(0).getNode();
2671  SDNode *N1 = Op.getOperand(1).getNode();
2672  unsigned NewOpc = 0;
2673  bool isMLA = false;
2674  bool isN0SExt = isSignExtended(N0, DAG);
2675  bool isN1SExt = isSignExtended(N1, DAG);
2676  if (isN0SExt && isN1SExt)
2677  NewOpc = AArch64ISD::SMULL;
2678  else {
2679  bool isN0ZExt = isZeroExtended(N0, DAG);
2680  bool isN1ZExt = isZeroExtended(N1, DAG);
2681  if (isN0ZExt && isN1ZExt)
2682  NewOpc = AArch64ISD::UMULL;
2683  else if (isN1SExt || isN1ZExt) {
2684  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2685  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
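  // For example, (sext v8i8 %a + sext v8i8 %b) * sext v8i8 %c becomes
  // add(smull(%a, %c), smull(%b, %c)); the multiply distributes over the
  // add/sub, so the value is unchanged.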
2686  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2687  NewOpc = AArch64ISD::SMULL;
2688  isMLA = true;
2689  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2690  NewOpc = AArch64ISD::UMULL;
2691  isMLA = true;
2692  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2693  std::swap(N0, N1);
2694  NewOpc = AArch64ISD::UMULL;
2695  isMLA = true;
2696  }
2697  }
2698 
2699  if (!NewOpc) {
2700  if (VT == MVT::v2i64)
2701  // Fall through to expand this. It is not legal.
2702  return SDValue();
2703  else
2704  // Other vector multiplications are legal.
2705  return Op;
2706  }
2707  }
2708 
2709  // Legalize to a S/UMULL instruction
2710  SDLoc DL(Op);
2711  SDValue Op0;
2712  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2713  if (!isMLA) {
2714  Op0 = skipExtensionForVectorMULL(N0, DAG);
2715  assert(Op0.getValueType().is64BitVector() &&
2716  Op1.getValueType().is64BitVector() &&
2717  "unexpected types for extended operands to VMULL");
2718  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2719  }
2720  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2721  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2722  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2723  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2724  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2725  EVT Op1VT = Op1.getValueType();
2726  return DAG.getNode(N0->getOpcode(), DL, VT,
2727  DAG.getNode(NewOpc, DL, VT,
2728  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2729  DAG.getNode(NewOpc, DL, VT,
2730  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2731 }
2732 
2733 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2734  SelectionDAG &DAG) const {
2735  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2736  SDLoc dl(Op);
2737  switch (IntNo) {
2738  default: return SDValue(); // Don't custom lower most intrinsics.
2739  case Intrinsic::thread_pointer: {
2740  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2741  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2742  }
2743  case Intrinsic::aarch64_neon_abs: {
2744  EVT Ty = Op.getValueType();
2745  if (Ty == MVT::i64) {
2746  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2747  Op.getOperand(1));
2748  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2749  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2750  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2751  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2752  } else {
2753  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2754  }
2755  }
2756  case Intrinsic::aarch64_neon_smax:
2757  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2758  Op.getOperand(1), Op.getOperand(2));
2759  case Intrinsic::aarch64_neon_umax:
2760  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2761  Op.getOperand(1), Op.getOperand(2));
2762  case Intrinsic::aarch64_neon_smin:
2763  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2764  Op.getOperand(1), Op.getOperand(2));
2765  case Intrinsic::aarch64_neon_umin:
2766  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2767  Op.getOperand(1), Op.getOperand(2));
2768 
2769  case Intrinsic::localaddress: {
2770  const auto &MF = DAG.getMachineFunction();
2771  const auto *RegInfo = Subtarget->getRegisterInfo();
2772  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2773  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2774  Op.getSimpleValueType());
2775  }
2776 
2777  case Intrinsic::eh_recoverfp: {
2778  // FIXME: This needs to be implemented to correctly handle highly aligned
2779  // stack objects. For now we simply return the incoming FP. Refer D53541
2780  // for more details.
2781  SDValue FnOp = Op.getOperand(1);
2782  SDValue IncomingFPOp = Op.getOperand(2);
2783  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2784  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2785  if (!Fn)
2786  report_fatal_error(
2787  "llvm.eh.recoverfp must take a function as the first argument");
2788  return IncomingFPOp;
2789  }
2790  }
2791 }
2792 
2793 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2794 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2795  EVT VT, EVT MemVT,
2796  SelectionDAG &DAG) {
2797  assert(VT.isVector() && "VT should be a vector type");
2798  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2799 
2800  SDValue Value = ST->getValue();
2801 
2802  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
2803  // extracts the word lane which represents the v4i8 subvector. It optimizes
2804  // the store to:
2805  //
2806  // xtn v0.8b, v0.8h
2807  // str s0, [x0]
2808 
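  // The extra four lanes concatenated below are undef; only word lane 0 of
  // the final v2i32 value, which holds the four truncated bytes, is extracted
  // and stored.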
2809  SDValue Undef = DAG.getUNDEF(MVT::i16);
2810  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2811  {Undef, Undef, Undef, Undef});
2812 
2813  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2814  Value, UndefVec);
2815  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2816 
2817  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2818  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2819  Trunc, DAG.getConstant(0, DL, MVT::i64));
2820 
2821  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2822  ST->getBasePtr(), ST->getMemOperand());
2823 }
2824 
2825 // Custom lowering for any store, vector or scalar, with or without a
2826 // truncating operation. Currently we only custom lower a truncating store
2827 // from vector v4i16 to v4i8.
2828 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2829  SelectionDAG &DAG) const {
2830  SDLoc Dl(Op);
2831  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2832  assert (StoreNode && "Can only custom lower store nodes");
2833 
2834  SDValue Value = StoreNode->getValue();
2835 
2836  EVT VT = Value.getValueType();
2837  EVT MemVT = StoreNode->getMemoryVT();
2838 
2839  assert (VT.isVector() && "Can only custom lower vector store types");
2840 
2841  unsigned AS = StoreNode->getAddressSpace();
2842  unsigned Align = StoreNode->getAlignment();
2843  if (Align < MemVT.getStoreSize() &&
2844  !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
2845  return scalarizeVectorStore(StoreNode, DAG);
2846  }
2847 
2848  if (StoreNode->isTruncatingStore()) {
2849  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2850  }
2851 
2852  return SDValue();
2853 }
2854 
2855 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2856  SelectionDAG &DAG) const {
2857  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2858  LLVM_DEBUG(Op.dump());
2859 
2860  switch (Op.getOpcode()) {
2861  default:
2862  llvm_unreachable("unimplemented operand");
2863  return SDValue();
2864  case ISD::BITCAST:
2865  return LowerBITCAST(Op, DAG);
2866  case ISD::GlobalAddress:
2867  return LowerGlobalAddress(Op, DAG);
2868  case ISD::GlobalTLSAddress:
2869  return LowerGlobalTLSAddress(Op, DAG);
2870  case ISD::SETCC:
2871  return LowerSETCC(Op, DAG);
2872  case ISD::BR_CC:
2873  return LowerBR_CC(Op, DAG);
2874  case ISD::SELECT:
2875  return LowerSELECT(Op, DAG);
2876  case ISD::SELECT_CC:
2877  return LowerSELECT_CC(Op, DAG);
2878  case ISD::JumpTable:
2879  return LowerJumpTable(Op, DAG);
2880  case ISD::BR_JT:
2881  return LowerBR_JT(Op, DAG);
2882  case ISD::ConstantPool:
2883  return LowerConstantPool(Op, DAG);
2884  case ISD::BlockAddress:
2885  return LowerBlockAddress(Op, DAG);
2886  case ISD::VASTART:
2887  return LowerVASTART(Op, DAG);
2888  case ISD::VACOPY:
2889  return LowerVACOPY(Op, DAG);
2890  case ISD::VAARG:
2891  return LowerVAARG(Op, DAG);
2892  case ISD::ADDC:
2893  case ISD::ADDE:
2894  case ISD::SUBC:
2895  case ISD::SUBE:
2896  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2897  case ISD::SADDO:
2898  case ISD::UADDO:
2899  case ISD::SSUBO:
2900  case ISD::USUBO:
2901  case ISD::SMULO:
2902  case ISD::UMULO:
2903  return LowerXALUO(Op, DAG);
2904  case ISD::FADD:
2905  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2906  case ISD::FSUB:
2907  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2908  case ISD::FMUL:
2909  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2910  case ISD::FDIV:
2911  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2912  case ISD::FP_ROUND:
2913  return LowerFP_ROUND(Op, DAG);
2914  case ISD::FP_EXTEND:
2915  return LowerFP_EXTEND(Op, DAG);
2916  case ISD::FRAMEADDR:
2917  return LowerFRAMEADDR(Op, DAG);
2918  case ISD::SPONENTRY:
2919  return LowerSPONENTRY(Op, DAG);
2920  case ISD::RETURNADDR:
2921  return LowerRETURNADDR(Op, DAG);
2922  case ISD::ADDROFRETURNADDR:
2923  return LowerADDROFRETURNADDR(Op, DAG);
2924  case ISD::INSERT_VECTOR_ELT:
2925  return LowerINSERT_VECTOR_ELT(Op, DAG);
2926  case ISD::EXTRACT_VECTOR_ELT:
2927  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2928  case ISD::BUILD_VECTOR:
2929  return LowerBUILD_VECTOR(Op, DAG);
2930  case ISD::VECTOR_SHUFFLE:
2931  return LowerVECTOR_SHUFFLE(Op, DAG);
2932  case ISD::EXTRACT_SUBVECTOR:
2933  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2934  case ISD::SRA:
2935  case ISD::SRL:
2936  case ISD::SHL:
2937  return LowerVectorSRA_SRL_SHL(Op, DAG);
2938  case ISD::SHL_PARTS:
2939  return LowerShiftLeftParts(Op, DAG);
2940  case ISD::SRL_PARTS:
2941  case ISD::SRA_PARTS:
2942  return LowerShiftRightParts(Op, DAG);
2943  case ISD::CTPOP:
2944  return LowerCTPOP(Op, DAG);
2945  case ISD::FCOPYSIGN:
2946  return LowerFCOPYSIGN(Op, DAG);
2947  case ISD::OR:
2948  return LowerVectorOR(Op, DAG);
2949  case ISD::XOR:
2950  return LowerXOR(Op, DAG);
2951  case ISD::PREFETCH:
2952  return LowerPREFETCH(Op, DAG);
2953  case ISD::SINT_TO_FP:
2954  case ISD::UINT_TO_FP:
2955  return LowerINT_TO_FP(Op, DAG);
2956  case ISD::FP_TO_SINT:
2957  case ISD::FP_TO_UINT:
2958  return LowerFP_TO_INT(Op, DAG);
2959  case ISD::FSINCOS:
2960  return LowerFSINCOS(Op, DAG);
2961  case ISD::FLT_ROUNDS_:
2962  return LowerFLT_ROUNDS_(Op, DAG);
2963  case ISD::MUL:
2964  return LowerMUL(Op, DAG);
2965  case ISD::INTRINSIC_WO_CHAIN:
2966  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2967  case ISD::STORE:
2968  return LowerSTORE(Op, DAG);
2969  case ISD::VECREDUCE_ADD:
2970  case ISD::VECREDUCE_SMAX:
2971  case ISD::VECREDUCE_SMIN:
2972  case ISD::VECREDUCE_UMAX:
2973  case ISD::VECREDUCE_UMIN:
2974  case ISD::VECREDUCE_FMAX:
2975  case ISD::VECREDUCE_FMIN:
2976  return LowerVECREDUCE(Op, DAG);
2977  case ISD::ATOMIC_LOAD_SUB:
2978  return LowerATOMIC_LOAD_SUB(Op, DAG);
2979  case ISD::ATOMIC_LOAD_AND:
2980  return LowerATOMIC_LOAD_AND(Op, DAG);
2981  case ISD::DYNAMIC_STACKALLOC:
2982  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2983  }
2984 }
2985 
2986 //===----------------------------------------------------------------------===//
2987 // Calling Convention Implementation
2988 //===----------------------------------------------------------------------===//
2989 
2990 /// Selects the correct CCAssignFn for a given CallingConvention value.
2991 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2992  bool IsVarArg) const {
2993  switch (CC) {
2994  default:
2995  report_fatal_error("Unsupported calling convention.");
2996  case CallingConv::WebKit_JS:
2997  return CC_AArch64_WebKit_JS;
2998  case CallingConv::GHC:
2999  return CC_AArch64_GHC;
3000  case CallingConv::C:
3001  case CallingConv::Fast:
3002  case CallingConv::PreserveMost:
3003  case CallingConv::CXX_FAST_TLS:
3004  case CallingConv::Swift:
3005  if (Subtarget->isTargetWindows() && IsVarArg)
3006  return CC_AArch64_Win64_VarArg;
3007  if (!Subtarget->isTargetDarwin())
3008  return CC_AArch64_AAPCS;
3009  return CC_AArch64_DarwinPCS;
3010  case CallingConv::Win64:
3011  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3012  case CallingConv::AArch64_VectorCall:
3013  return CC_AArch64_AAPCS;
3014  }
3015 }
3016 
3017 CCAssignFn *
3018 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3019  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3020  : RetCC_AArch64_AAPCS;
3021 }
3022 
3023 SDValue AArch64TargetLowering::LowerFormalArguments(
3024  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3025  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3026  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3027  MachineFunction &MF = DAG.getMachineFunction();
3028  MachineFrameInfo &MFI = MF.getFrameInfo();
3029  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3030 
3031  // Assign locations to all of the incoming arguments.
3032  SmallVector<CCValAssign, 16> ArgLocs;
3033  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3034  *DAG.getContext());
3035 
3036  // At this point, Ins[].VT may already be promoted to i32. To correctly
3037  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3038  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3039  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3040  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3041  // LocVT.
3042  unsigned NumArgs = Ins.size();
3043  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3044  unsigned CurArgIdx = 0;
3045  for (unsigned i = 0; i != NumArgs; ++i) {
3046  MVT ValVT = Ins[i].VT;
3047  if (Ins[i].isOrigArg()) {
3048  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3049  CurArgIdx = Ins[i].getOrigArgIndex();
3050 
3051  // Get type of the original argument.
3052  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3053  /*AllowUnknown*/ true);
3054  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3055  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3056  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3057  ValVT = MVT::i8;
3058  else if (ActualMVT == MVT::i16)
3059  ValVT = MVT::i16;
3060  }
3061  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3062  bool Res =
3063  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3064  assert(!Res && "Call operand has unhandled type");
3065  (void)Res;
3066  }
3067  assert(ArgLocs.size() == Ins.size());
3068  SmallVector<SDValue, 16> ArgValues;
3069  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3070  CCValAssign &VA = ArgLocs[i];
3071 
3072  if (Ins[i].Flags.isByVal()) {
3073  // Byval is used for HFAs in the PCS, but the system should work in a
3074  // non-compliant manner for larger structs.
3075  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3076  int Size = Ins[i].Flags.getByValSize();
3077  unsigned NumRegs = (Size + 7) / 8;
3078 
3079  // FIXME: This works on big-endian for composite byvals, which are the common
3080  // case. It should work for fundamental types too.
3081  unsigned FrameIdx =
3082  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3083  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3084  InVals.push_back(FrameIdxN);
3085 
3086  continue;
3087  }
3088 
3089  if (VA.isRegLoc()) {
3090  // Arguments stored in registers.
3091  EVT RegVT = VA.getLocVT();
3092 
3093  SDValue ArgValue;
3094  const TargetRegisterClass *RC;
3095 
3096  if (RegVT == MVT::i32)
3097  RC = &AArch64::GPR32RegClass;
3098  else if (RegVT == MVT::i64)
3099  RC = &AArch64::GPR64RegClass;
3100  else if (RegVT == MVT::f16)
3101  RC = &AArch64::FPR16RegClass;
3102  else if (RegVT == MVT::f32)
3103  RC = &AArch64::FPR32RegClass;
3104  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3105  RC = &AArch64::FPR64RegClass;
3106  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3107  RC = &AArch64::FPR128RegClass;
3108  else
3109  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3110 
3111  // Transform the arguments in physical registers into virtual ones.
3112  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3113  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3114 
3115  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3116  // to 64 bits. Insert an assert[sz]ext to capture this, then
3117  // truncate to the right size.
3118  switch (VA.getLocInfo()) {
3119  default:
3120  llvm_unreachable("Unknown loc info!");
3121  case CCValAssign::Full:
3122  break;
3123  case CCValAssign::BCvt:
3124  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3125  break;
3126  case CCValAssign::AExt:
3127  case CCValAssign::SExt:
3128  case CCValAssign::ZExt:
3129  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3130  // nodes after our lowering.
3131  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3132  break;
3133  }
3134 
3135  InVals.push_back(ArgValue);
3136 
3137  } else { // VA.isRegLoc()
3138  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3139  unsigned ArgOffset = VA.getLocMemOffset();
3140  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3141 
3142  uint32_t BEAlign = 0;
3143  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3144  !Ins[i].Flags.isInConsecutiveRegs())
3145  BEAlign = 8 - ArgSize;
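  // For example, an i32 argument passed in an 8-byte stack slot sits 4 bytes
  // into that slot on a big-endian target, hence the BEAlign adjustment.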
3146 
3147  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3148 
3149  // Create load nodes to retrieve arguments from the stack.
3150  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3151  SDValue ArgValue;
3152 
3153  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3154  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3155  MVT MemVT = VA.getValVT();
3156 
3157  switch (VA.getLocInfo()) {
3158  default:
3159  break;
3160  case CCValAssign::BCvt:
3161  MemVT = VA.getLocVT();
3162  break;
3163  case CCValAssign::SExt:
3164  ExtType = ISD::SEXTLOAD;
3165  break;
3166  case CCValAssign::ZExt:
3167  ExtType = ISD::ZEXTLOAD;
3168  break;
3169  case CCValAssign::AExt:
3170  ExtType = ISD::EXTLOAD;
3171  break;
3172  }
3173 
3174  ArgValue = DAG.getExtLoad(
3175  ExtType, DL, VA.getLocVT(), Chain, FIN,
3176  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3177  MemVT);
3178 
3179  InVals.push_back(ArgValue);
3180  }
3181  }
3182 
3183  // varargs
3184  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3185  if (isVarArg) {
3186  if (!Subtarget->isTargetDarwin() || IsWin64) {
3187  // The AAPCS variadic function ABI is identical to the non-variadic
3188  // one. As a result there may be more arguments in registers and we should
3189  // save them for future reference.
3190  // Win64 variadic functions also pass arguments in registers, but all float
3191  // arguments are passed in integer registers.
3192  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3193  }
3194 
3195  // This will point to the next argument passed via stack.
3196  unsigned StackOffset = CCInfo.getNextStackOffset();
3197  // We currently pass all varargs at 8-byte alignment.
3198  StackOffset = ((StackOffset + 7) & ~7);
3199  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3200 
3201  if (MFI.hasMustTailInVarArgFunc()) {
3202  SmallVector<MVT, 2> RegParmTypes;
3203  RegParmTypes.push_back(MVT::i64);
3204  RegParmTypes.push_back(MVT::f128);
3205  // Compute the set of forwarded registers. The rest are scratch.
3206  SmallVectorImpl<ForwardedRegister> &Forwards =
3207  FuncInfo->getForwardedMustTailRegParms();
3208  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3209  CC_AArch64_AAPCS);
3210  }
3211  }
3212 
3213  // On Windows, InReg pointers must be returned, so record the pointer in a
3214  // virtual register at the start of the function so it can be returned in the
3215  // epilogue.
3216  if (IsWin64) {
3217  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3218  if (Ins[I].Flags.isInReg()) {
3219  assert(!FuncInfo->getSRetReturnReg());
3220 
3221  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3222  unsigned Reg =
3223  MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3224  FuncInfo->setSRetReturnReg(Reg);
3225 
3226  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3227  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3228  break;
3229  }
3230  }
3231  }
3232 
3233  unsigned StackArgSize = CCInfo.getNextStackOffset();
3234  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3235  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3236  // This is a non-standard ABI so by fiat I say we're allowed to make full
3237  // use of the stack area to be popped, which must be aligned to 16 bytes in
3238  // any case:
3239  StackArgSize = alignTo(StackArgSize, 16);
3240 
3241  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3242  // a multiple of 16.
3243  FuncInfo->setArgumentStackToRestore(StackArgSize);
3244 
3245  // This realignment carries over to the available bytes below. Our own
3246  // callers will guarantee the space is free by giving an aligned value to
3247  // CALLSEQ_START.
3248  }
3249  // Even if we're not expected to free up the space, it's useful to know how
3250  // much is there while considering tail calls (because we can reuse it).
3251  FuncInfo->setBytesInStackArgArea(StackArgSize);
3252 
3253  if (Subtarget->hasCustomCallingConv())
3254  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3255 
3256  return Chain;
3257 }
3258 
3259 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3260  SelectionDAG &DAG,
3261  const SDLoc &DL,
3262  SDValue &Chain) const {
3263  MachineFunction &MF = DAG.getMachineFunction();
3264  MachineFrameInfo &MFI = MF.getFrameInfo();
3265  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3266  auto PtrVT = getPointerTy(DAG.getDataLayout());
3267  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3268 
3269  SmallVector<SDValue, 8> MemOps;
3270 
3271  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3272  AArch64::X3, AArch64::X4, AArch64::X5,
3273  AArch64::X6, AArch64::X7 };
3274  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3275  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3276 
3277  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3278  int GPRIdx = 0;
3279  if (GPRSaveSize != 0) {
3280  if (IsWin64) {
3281  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3282  if (GPRSaveSize & 15)
3283  // The extra size here, if triggered, will always be 8.
3284  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3285  } else
3286  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3287 
3288  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3289 
3290  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3291  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3292  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3293  SDValue Store = DAG.getStore(
3294  Val.getValue(1), DL, Val, FIN,
3295  IsWin64
3296  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3297  GPRIdx,
3298  (i - FirstVariadicGPR) * 8)
3299  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3300  MemOps.push_back(Store);
3301  FIN =
3302  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3303  }
3304  }
3305  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3306  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3307 
3308  if (Subtarget->hasFPARMv8() && !IsWin64) {
3309  static const MCPhysReg FPRArgRegs[] = {
3310  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3311  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3312  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3313  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3314 
3315  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3316  int FPRIdx = 0;
3317  if (FPRSaveSize != 0) {
3318  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3319 
3320  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3321 
3322  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3323  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3324  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3325 
3326  SDValue Store = DAG.getStore(
3327  Val.getValue(1), DL, Val, FIN,
3328  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3329  MemOps.push_back(Store);
3330  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3331  DAG.getConstant(16, DL, PtrVT));
3332  }
3333  }
3334  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3335  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3336  }
3337 
3338  if (!MemOps.empty()) {
3339  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3340  }
3341 }
3342 
3343 /// LowerCallResult - Lower the result values of a call into the
3344 /// appropriate copies out of appropriate physical registers.
3345 SDValue AArch64TargetLowering::LowerCallResult(
3346  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3347  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3348  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3349  SDValue ThisVal) const {
3350  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3351  ? RetCC_AArch64_WebKit_JS
3352  : RetCC_AArch64_AAPCS;
3353  // Assign locations to each value returned by this call.
3354  SmallVector<CCValAssign, 16> RVLocs;
3355  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3356  *DAG.getContext());
3357  CCInfo.AnalyzeCallResult(Ins, RetCC);
3358 
3359  // Copy all of the result registers out of their specified physreg.
3360  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3361  CCValAssign VA = RVLocs[i];
3362 
3363  // Pass 'this' value directly from the argument to return value, to avoid
3364  // reg unit interference
3365  if (i == 0 && isThisReturn) {
3366  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3367  "unexpected return calling convention register assignment");
3368  InVals.push_back(ThisVal);
3369  continue;
3370  }
3371 
3372  SDValue Val =
3373  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3374  Chain = Val.getValue(1);
3375  InFlag = Val.getValue(2);
3376 
3377  switch (VA.getLocInfo()) {
3378  default:
3379  llvm_unreachable("Unknown loc info!");
3380  case CCValAssign::Full:
3381  break;
3382  case CCValAssign::BCvt:
3383  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3384  break;
3385  }
3386 
3387  InVals.push_back(Val);
3388  }
3389 
3390  return Chain;
3391 }
3392 
3393 /// Return true if the calling convention is one that we can guarantee TCO for.
3394 static bool canGuaranteeTCO(CallingConv::ID CC) {
3395  return CC == CallingConv::Fast;
3396 }
3397 
3398 /// Return true if we might ever do TCO for calls with this calling convention.
3399 static bool mayTailCallThisCC(CallingConv::ID CC) {
3400  switch (CC) {
3401  case CallingConv::C:
3402  case CallingConv::PreserveMost:
3403  case CallingConv::Swift:
3404  return true;
3405  default:
3406  return canGuaranteeTCO(CC);
3407  }
3408 }
3409 
3410 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3411  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3412  const SmallVectorImpl<ISD::OutputArg> &Outs,
3413  const SmallVectorImpl<SDValue> &OutVals,
3414  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3415  if (!mayTailCallThisCC(CalleeCC))
3416  return false;
3417 
3418  MachineFunction &MF = DAG.getMachineFunction();
3419  const Function &CallerF = MF.getFunction();
3420  CallingConv::ID CallerCC = CallerF.getCallingConv();
3421  bool CCMatch = CallerCC == CalleeCC;
3422 
3423  // Byval parameters hand the function a pointer directly into the stack area
3424  // we want to reuse during a tail call. Working around this *is* possible (see
3425  // X86) but less efficient and uglier in LowerCall.
3426  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3427  e = CallerF.arg_end();
3428  i != e; ++i) {
3429  if (i->hasByValAttr())
3430  return false;
3431 
3432  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3433  // In this case, it is necessary to save/restore X0 in the callee. Tail
3434  // call opt interferes with this. So we disable tail call opt when the
3435  // caller has an argument with "inreg" attribute.
3436 
3437  // FIXME: Check whether the callee also has an "inreg" argument.
3438  if (i->hasInRegAttr())
3439  return false;
3440  }
3441 
3442  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3443  return canGuaranteeTCO(CalleeCC) && CCMatch;
3444 
3445  // Externally-defined functions with weak linkage should not be
3446  // tail-called on AArch64 when the OS does not support dynamic
3447  // pre-emption of symbols, as the AAELF spec requires normal calls
3448  // to undefined weak functions to be replaced with a NOP or jump to the
3449  // next instruction. The behaviour of branch instructions in this
3450  // situation (as used for tail calls) is implementation-defined, so we
3451  // cannot rely on the linker replacing the tail call with a return.
3452  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3453  const GlobalValue *GV = G->getGlobal();
3454  const Triple &TT = getTargetMachine().getTargetTriple();
3455  if (GV->hasExternalWeakLinkage() &&
3456  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3457  return false;
3458  }
3459 
3460  // Now we search for cases where we can use a tail call without changing the
3461  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3462  // concept.
3463 
3464  // I want anyone implementing a new calling convention to think long and hard
3465  // about this assert.
3466  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3467  "Unexpected variadic calling convention");
3468 
3469  LLVMContext &C = *DAG.getContext();
3470  if (isVarArg && !Outs.empty()) {
3471  // At least two cases here: if caller is fastcc then we can't have any
3472  // memory arguments (we'd be expected to clean up the stack afterwards). If
3473  // caller is C then we could potentially use its argument area.
3474 
3475  // FIXME: for now we take the most conservative of these in both cases:
3476  // disallow all variadic memory operands.
3477  SmallVector<CCValAssign, 16> ArgLocs;
3478  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3479 
3480  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3481  for (const CCValAssign &ArgLoc : ArgLocs)
3482  if (!ArgLoc.isRegLoc())
3483  return false;
3484  }
3485 
3486  // Check that the call results are passed in the same way.
3487  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3488  CCAssignFnForCall(CalleeCC, isVarArg),
3489  CCAssignFnForCall(CallerCC, isVarArg)))
3490  return false;
3491  // The callee has to preserve all registers the caller needs to preserve.
3492  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3493  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3494  if (!CCMatch) {
3495  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3496  if (Subtarget->hasCustomCallingConv()) {
3497  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3498  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3499  }
3500  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3501  return false;
3502  }
3503 
3504  // Nothing more to check if the callee is taking no arguments
3505  if (Outs.empty())
3506  return true;
3507 
3508  SmallVector<CCValAssign, 16> ArgLocs;
3509  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3510 
3511  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3512 
3513  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3514 
3515  // If the stack arguments for this call do not fit into our own save area then
3516  // the call cannot be made tail.
3517  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3518  return false;
3519 
3520  const MachineRegisterInfo &MRI = MF.getRegInfo();
3521  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3522  return false;
3523 
3524  return true;
3525 }
3526 
3527 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3528  SelectionDAG &DAG,
3529  MachineFrameInfo &MFI,
3530  int ClobberedFI) const {
3531  SmallVector<SDValue, 8> ArgChains;
3532  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3533  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3534 
3535  // Include the original chain at the beginning of the list. When this is
3536  // used by target LowerCall hooks, this helps legalize find the
3537  // CALLSEQ_BEGIN node.
3538  ArgChains.push_back(Chain);
3539 
3540  // Add a chain value for each stack argument load that overlaps the clobbered slot.
3541  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3542  UE = DAG.getEntryNode().getNode()->use_end();
3543  U != UE; ++U)
3544  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3545  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3546  if (FI->getIndex() < 0) {
3547  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3548  int64_t InLastByte = InFirstByte;
3549  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3550 
3551  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3552  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3553  ArgChains.push_back(SDValue(L, 1));
3554  }
3555 
3556  // Build a tokenfactor for all the chains.
3557  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3558 }
3559 
3560 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3561  bool TailCallOpt) const {
3562  return CallCC == CallingConv::Fast && TailCallOpt;
3563 }
3564 
3565 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3566 /// and add input and output parameter nodes.
3567 SDValue
3568 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3569  SmallVectorImpl<SDValue> &InVals) const {
3570  SelectionDAG &DAG = CLI.DAG;
3571  SDLoc &DL = CLI.DL;
3572  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3573  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3574  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3575  SDValue Chain = CLI.Chain;
3576  SDValue Callee = CLI.Callee;
3577  bool &IsTailCall = CLI.IsTailCall;
3578  CallingConv::ID CallConv = CLI.CallConv;
3579  bool IsVarArg = CLI.IsVarArg;
3580 
3581  MachineFunction &MF = DAG.getMachineFunction();
3582  bool IsThisReturn = false;
3583 
3584  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3585  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3586  bool IsSibCall = false;
3587 
3588  if (IsTailCall) {
3589  // Check if it's really possible to do a tail call.
3590  IsTailCall = isEligibleForTailCallOptimization(
3591  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3592  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3593  report_fatal_error("failed to perform tail call elimination on a call "
3594  "site marked musttail");
3595 
3596  // A sibling call is one where we're under the usual C ABI and not planning
3597  // to change that but can still do a tail call:
3598  if (!TailCallOpt && IsTailCall)
3599  IsSibCall = true;
3600 
3601  if (IsTailCall)
3602  ++NumTailCalls;
3603  }
3604 
3605  // Analyze operands of the call, assigning locations to each operand.
3606  SmallVector<CCValAssign, 16> ArgLocs;
3607  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3608  *DAG.getContext());
3609 
3610  if (IsVarArg) {
3611  // Handle fixed and variable vector arguments differently.
3612  // Variable vector arguments always go into memory.
3613  unsigned NumArgs = Outs.size();
3614 
3615  for (unsigned i = 0; i != NumArgs; ++i) {
3616  MVT ArgVT = Outs[i].VT;
3617  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3618  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3619  /*IsVarArg=*/ !Outs[i].IsFixed);
3620  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3621  assert(!Res && "Call operand has unhandled type");
3622  (void)Res;
3623  }
3624  } else {
3625  // At this point, Outs[].VT may already be promoted to i32. To correctly
3626  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3627  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3628  // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
3629  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3630  // LocVT.
3631  unsigned NumArgs = Outs.size();
3632  for (unsigned i = 0; i != NumArgs; ++i) {
3633  MVT ValVT = Outs[i].VT;
3634  // Get type of the original argument.
3635  EVT ActualVT = getValueType(DAG.getDataLayout(),
3636  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3637  /*AllowUnknown*/ true);
3638  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3639  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3640  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3641  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3642  ValVT = MVT::i8;
3643  else if (ActualMVT == MVT::i16)
3644  ValVT = MVT::i16;
3645 
3646  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3647  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3648  assert(!Res && "Call operand has unhandled type");
3649  (void)Res;
3650  }
3651  }
3652 
3653  // Get a count of how many bytes are to be pushed on the stack.
3654  unsigned NumBytes = CCInfo.getNextStackOffset();
3655 
3656  if (IsSibCall) {
3657  // Since we're not changing the ABI to make this a tail call, the memory
3658  // operands are already available in the caller's incoming argument space.
3659  NumBytes = 0;
3660  }
3661 
3662  // FPDiff is the byte offset of the call's argument area from the callee's.
3663  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3664  // by this amount for a tail call. In a sibling call it must be 0 because the
3665  // caller will deallocate the entire stack and the callee still expects its
3666  // arguments to begin at SP+0. Completely unused for non-tail calls.
3667  int FPDiff = 0;
3668 
3669  if (IsTailCall && !IsSibCall) {
3670  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3671 
3672  // Since callee will pop argument stack as a tail call, we must keep the
3673  // popped size 16-byte aligned.
3674  NumBytes = alignTo(NumBytes, 16);
3675 
3676  // FPDiff will be negative if this tail call requires more space than we
3677  // would automatically have in our incoming argument space. Positive if we
3678  // can actually shrink the stack.
3679  FPDiff = NumReusableBytes - NumBytes;
3680 
3681  // The stack pointer must be 16-byte aligned at all times it's used for a
3682  // memory operation, which in practice means at *all* times and in
3683  // particular across call boundaries. Therefore our own arguments started at
3684  // a 16-byte aligned SP and the delta applied for the tail call should
3685  // satisfy the same constraint.
3686  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3687  }
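// Worked example for FPDiff (assumed sizes, not from a real call site): if
// the caller was entered with 32 bytes of stack argument area
// (NumReusableBytes == 32) and this tail call needs 48 bytes after alignment,
// FPDiff = 32 - 48 = -16, so the callee's stack arguments are created as
// fixed objects 16 bytes below the current incoming argument area, and
// emitEpilogue consumes FPDiff (pushed as a TC_RETURN operand below) to
// adjust SP accordingly before the branch.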
3688 
3689  // Adjust the stack pointer for the new arguments...
3690  // These operations are automatically eliminated by the prolog/epilog pass
3691  if (!IsSibCall)
3692  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3693 
3694  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3695  getPointerTy(DAG.getDataLayout()));
3696 
3697  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3698  SmallVector<SDValue, 8> MemOpChains;
3699  auto PtrVT = getPointerTy(DAG.getDataLayout());
3700 
3701  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3702  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3703  for (const auto &F : Forwards) {
3704  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3705  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3706  }
3707  }
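// Rough intent of the loop above (hedged summary): for a variadic musttail
// call, the entry block saved the remaining argument registers into virtual
// registers (the forwarded-must-tail reg parms); re-reading them here and
// queueing them in RegsToPass lets those registers reach the callee with the
// values the original caller passed in.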
3708 
3709  // Walk the register/memloc assignments, inserting copies/loads.
3710  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3711  ++i, ++realArgIdx) {
3712  CCValAssign &VA = ArgLocs[i];
3713  SDValue Arg = OutVals[realArgIdx];
3714  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3715 
3716  // Promote the value if needed.
3717  switch (VA.getLocInfo()) {
3718  default:
3719  llvm_unreachable("Unknown loc info!");
3720  case CCValAssign::Full:
3721  break;
3722  case CCValAssign::SExt:
3723  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3724  break;
3725  case CCValAssign::ZExt:
3726  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3727  break;
3728  case CCValAssign::AExt:
3729  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3730  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3731  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3732  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3733  }
3734  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3735  break;
3736  case CCValAssign::BCvt:
3737  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3738  break;
3739  case CCValAssign::FPExt:
3740  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3741  break;
3742  }
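// Illustrative trace of the AExt + i1 case above (assumed argument, not from
// a test): an i1 argument that reaches here already any-extended to i32 is
// truncated back to i1, zero-extended to i8 so bits 1-7 are provably zero as
// AAPCS requires, and only then any-extended to the final location type.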
3743 
3744  if (VA.isRegLoc()) {
3745  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3746  Outs[0].VT == MVT::i64) {
3747  assert(VA.getLocVT() == MVT::i64 &&
3748  "unexpected calling convention register assignment");
3749  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3750  "unexpected use of 'returned'");
3751  IsThisReturn = true;
3752  }
3753  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3754  } else {
3755  assert(VA.isMemLoc());
3756 
3757  SDValue DstAddr;
3758  MachinePointerInfo DstInfo;
3759 
3760  // FIXME: This works on big-endian for composite byvals, which are the
3761  // common case. It should also work for fundamental types.
3762  uint32_t BEAlign = 0;
3763  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3764  : VA.getValVT().getSizeInBits();
3765  OpSize = (OpSize + 7) / 8;
3766  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3767  !Flags.isInConsecutiveRegs()) {
3768  if (OpSize < 8)
3769  BEAlign = 8 - OpSize;
3770  }
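// Worked example for BEAlign (big-endian target assumed): a 1-byte argument
// has OpSize == 1, so BEAlign == 7 and the store below is offset by 7 bytes,
// placing the value at the high-address end of its 8-byte slot, which on a
// big-endian machine is where the least significant byte of a full 64-bit
// store would sit and where the callee's narrow load expects it.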
3771  unsigned LocMemOffset = VA.getLocMemOffset();
3772  int32_t Offset = LocMemOffset + BEAlign;
3773  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3774  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3775 
3776  if (IsTailCall) {
3777  Offset = Offset + FPDiff;
3778  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3779 
3780  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3781  DstInfo =
3782  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3783 
3784  // Make sure any stack arguments overlapping with where we're storing
3785  // are loaded before this eventual operation. Otherwise they'll be
3786  // clobbered.
3787  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3788  } else {
3789  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3790 
3791  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3792  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3793  LocMemOffset);
3794  }
3795 
3796  if (Outs[i].Flags.isByVal()) {
3797  SDValue SizeNode =
3798  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3799  SDValue Cpy = DAG.getMemcpy(
3800  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3801  /*isVol = */ false, /*AlwaysInline = */ false,
3802  /*isTailCall = */ false,
3803  DstInfo, MachinePointerInfo());
3804 
3805  MemOpChains.push_back(Cpy);
3806  } else {
3807  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3808  // promoted to a legal register type i32, we should truncate Arg back to
3809  // i1/i8/i16.
3810  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3811  VA.getValVT() == MVT::i16)
3812  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3813 
3814  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3815  MemOpChains.push_back(Store);
3816  }
3817  }
3818  }
3819 
3820  if (!MemOpChains.empty())
3821  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3822 
3823  // Build a sequence of copy-to-reg nodes chained together with token chain
3824  // and flag operands which copy the outgoing args into the appropriate regs.
3825  SDValue InFlag;
3826  for (auto &RegToPass : RegsToPass) {
3827  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3828  RegToPass.second, InFlag);
3829  InFlag = Chain.getValue(1);
3830  }
3831 
3832  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3833  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3834  // node so that legalize doesn't hack it.
3835  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3836  auto GV = G->getGlobal();
3837  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3838  AArch64II::MO_GOT) {
3839  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3840  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3841  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3842  assert(Subtarget->isTargetWindows() &&
3843  "Windows is the only supported COFF target");
3844  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3845  } else {
3846  const GlobalValue *GV = G->getGlobal();
3847  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3848  }
3849  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3850  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3851  Subtarget->isTargetMachO()) {
3852  const char *Sym = S->getSymbol();
3853  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3854  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3855  } else {
3856  const char *Sym = S->getSymbol();
3857  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3858  }
3859  }
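// Rough summary of the callee forms produced above (hedged; the exact choice
// depends on the subtarget and code model): functions that may be preempted,
// or are dllimport'ed on Windows, are reached through a GOT/import-table load
// (LOADgot of an MO_GOT or MO_DLLIMPORT target address); everything else
// becomes a plain TargetGlobalAddress or TargetExternalSymbol that a direct
// BL can reach.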
3860 
3861  // We don't usually want to end the call-sequence here because we would tidy
3862  // the frame up *after* the call, however in the ABI-changing tail-call case
3863  // we've carefully laid out the parameters so that when sp is reset they'll be
3864  // in the correct location.
3865  if (IsTailCall && !IsSibCall) {
3866  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3867  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3868  InFlag = Chain.getValue(1);
3869  }
3870 
3871  std::vector<SDValue> Ops;
3872  Ops.push_back(Chain);
3873  Ops.push_back(Callee);
3874 
3875  if (IsTailCall) {
3876  // Each tail call may have to adjust the stack by a different amount, so
3877  // this information must travel along with the operation for eventual
3878  // consumption by emitEpilogue.
3879  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3880  }
3881 
3882  // Add argument registers to the end of the list so that they are known live
3883  // into the call.
3884  for (auto &RegToPass : RegsToPass)
3885  Ops.push_back(DAG.getRegister(RegToPass.first,
3886  RegToPass.second.getValueType()));
3887 
3888  // Add a register mask operand representing the call-preserved registers.
3889  const uint32_t *Mask;
3890  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3891  if (IsThisReturn) {
3892  // For 'this' returns, use the X0-preserving mask if applicable
3893  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3894  if (!Mask) {
3895  IsThisReturn = false;
3896  Mask = TRI->getCallPreservedMask(MF, CallConv);
3897  }
3898  } else
3899  Mask = TRI->getCallPreservedMask(MF, CallConv);
3900 
3901  if (Subtarget->hasCustomCallingConv())
3902  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
3903 
3904  if (TRI->isAnyArgRegReserved(MF))
3905  TRI->emitReservedArgRegCallError(MF);
3906 
3907  assert(Mask && "Missing call preserved mask for calling convention");
3908  Ops.push_back(DAG.getRegisterMask(Mask));
3909 
3910  if (InFlag.getNode())
3911  Ops.push_back(InFlag);
3912 
3913  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3914 
3915  // If we're doing a tail call, use a TC_RETURN here rather than an
3916  // actual call instruction.
3917  if (IsTailCall) {
3918  MF.getFrameInfo().setHasTailCall();
3919  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3920  }
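// Sketch of what TC_RETURN becomes (hedged; the exact sequence depends on
// frame lowering): the call is emitted as a branch in the epilogue rather
// than a BL, with the FPDiff operand pushed above telling emitEpilogue how
// much to adjust SP before branching to the callee.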
3921 
3922  // Returns a chain and a flag for retval copy to use.
3923  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3924  InFlag = Chain.getValue(1);
3925 
3926  uint64_t CalleePopBytes =
3927  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3928 
3929  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3930  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3931  InFlag, DL);
3932  if (!Ins.empty())
3933  InFlag = Chain.getValue(1);
3934 
3935  // Handle result values, copying them out of physregs into vregs that we
3936  // return.
3937  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3938  InVals, IsThisReturn,
3939  IsThisReturn ? OutVals[0] : SDValue());
3940 }
3941 
3942 bool AArch64TargetLowering::CanLowerReturn(
3943  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3944  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3945  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3946  ? RetCC_AArch64_WebKit_JS
3947  : RetCC_AArch64_AAPCS;
3948  SmallVector<CCValAssign, 16> RVLocs;
3949  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3950  return CCInfo.CheckReturn(Outs, RetCC);
3951 }
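// Usage note (hedged): the SelectionDAG framework queries this hook up front;
// if the chosen RetCC cannot assign every value in Outs to return registers,
// the return is demoted to an sret-style out-parameter instead of reaching
// LowerReturn below.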
3952 
3953 SDValue
3954 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3955  bool isVarArg,
3956  const SmallVectorImpl<ISD::OutputArg> &Outs,
3957  const SmallVectorImpl<SDValue> &OutVals,
3958  const SDLoc &DL, SelectionDAG &DAG) const {
3959  auto &MF = DAG.getMachineFunction();
3960  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3961 
3962  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3963  ? RetCC_AArch64_WebKit_JS
3964  : RetCC_AArch64_AAPCS;
3965  SmallVector<CCValAssign, 16> RVLocs;
3966  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3967  *DAG.getContext());
3968  CCInfo.AnalyzeReturn(Outs, RetCC);
3969 
3970  // Copy the result values into the output registers.
3971  SDValue Flag;
3972  SmallVector<SDValue, 4> RetOps(1, Chain);
3973  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3974  ++i, ++realRVLocIdx) {
3975  CCValAssign &VA = RVLocs[i];
3976  assert(VA.isRegLoc() && "Can only return in registers!");
3977  SDValue Arg = OutVals[realRVLocIdx];
3978 
3979  switch (VA.getLocInfo()) {
3980  default:
3981  llvm_unreachable("Unknown loc info!");
3982  case CCValAssign::Full:
3983  if (Outs[i].ArgVT == MVT::i1) {
3984  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3985  // value. This is strictly redundant on Darwin (which uses "zeroext
3986  // i1"), but will be optimised out before ISel.
3987  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3988  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3989  }
3990  break;
3991  case CCValAssign::BCvt:
3992  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3993  break;
3994  }
3995 
3996  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3997  Flag = Chain.getValue(1);
3998  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3999  }
4000 
4001  // Windows AArch64 ABIs require that for returning structs by value we copy
4002  // the sret argument into X0 for the return.
4003  // We saved the argument into a virtual register in the entry block,
4004  // so now we copy the value out and into X0.
4005  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
4006  SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
4007  getPointerTy(MF.getDataLayout()));
4008 
4009  unsigned RetValReg = AArch64::X0;
4010  Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
4011  Flag = Chain.getValue(1);
4012 
4013  RetOps.push_back(
4014  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
4015  }
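// Illustrative case for the block above (hypothetical function, not from a
// test): a Windows AArch64 function such as 'LargeStruct f()' receives a
// hidden sret pointer; the copy saved in the entry block is returned in X0
// here so the caller can keep using the same buffer address after the call.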
4016 
4017  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4018  const MCPhysReg *I =
4019  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
4020  if (I) {
4021  for (; *I; ++I) {
4022  if (AArch64::GPR64RegClass.contains(*I))
4023  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
4024  else if (AArch64::FPR64RegClass.contains(*I))
4025  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
4026  else
4027  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
4028  }
4029  }
4030 
4031  RetOps[0] = Chain; // Update chain.
4032 
4033  // Add the flag if we have it.
4034  if (Flag.getNode())
4035  RetOps.push_back(Flag);
4036 
4037  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
4038 }
4039 
4040 //===----------------------------------------------------------------------===//
4041 // Other Lowering Code
4042 //===----------------------------------------------------------------------===//
4043 
4044 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
4045  SelectionDAG &DAG,
4046  unsigned Flag) const {
4047  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
4048  N->getOffset(), Flag);
4049 }
4050 
4051 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
4052  SelectionDAG &DAG,
4053  unsigned Flag) const {
4054  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
4055 }
4056 
4057 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
4058  SelectionDAG &DAG,
4059  unsigned Flag) const {
4060  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
4061  N->getOffset(), Flag);
4062 }
4063 
4064 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
4065  SelectionDAG &DAG,
4066  unsigned Flag) const {
4067  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
4068 }
4069 
4070 // (loadGOT sym)
4071 template <class NodeTy>
4072 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
4073  unsigned Flags) const {
4074  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
4075  SDLoc DL(N);
4076  EVT Ty = getPointerTy(DAG.getDataLayout());
4077  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
4078  // FIXME: Once remat is capable of dealing with instructions with register
4079  // operands, expand this into two nodes instead of using a wrapper node.
4080  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
4081 }
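// Illustrative final code for a LOADgot of a symbol 'sym' (assumed small code
// model; the register is allocator-dependent):
//   adrp x8, :got:sym
//   ldr  x8, [x8, :got_lo12:sym]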
4082 
4083 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
4084 template <class NodeTy>
4085 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
4086  unsigned Flags) const {
4087  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
4088  SDLoc DL(N);
4089  EVT Ty = getPointerTy(DAG.getDataLayout());
4090  const unsigned char MO_NC = AArch64II::MO_NC;
4091  return DAG.