1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/PatternMatch.h"
63 #include "llvm/IR/Type.h"
64 #include "llvm/IR/Use.h"
65 #include "llvm/IR/Value.h"
66 #include "llvm/MC/MCRegisterInfo.h"
67 #include "llvm/Support/Casting.h"
68 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/Compiler.h"
71 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/KnownBits.h"
79 #include <algorithm>
80 #include <bitset>
81 #include <cassert>
82 #include <cctype>
83 #include <cstdint>
84 #include <cstdlib>
85 #include <iterator>
86 #include <limits>
87 #include <tuple>
88 #include <utility>
89 #include <vector>
90 
91 using namespace llvm;
92 using namespace llvm::PatternMatch;
93 
94 #define DEBUG_TYPE "aarch64-lower"
95 
96 STATISTIC(NumTailCalls, "Number of tail calls");
97 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
99 
100 static cl::opt<bool>
101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
102  cl::desc("Allow AArch64 SLI/SRI formation"),
103  cl::init(false));
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
108 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 /// Value type used for condition codes.
120 static const MVT MVT_CC = MVT::i32;
121 
122 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
123  const AArch64Subtarget &STI)
124  : TargetLowering(TM), Subtarget(&STI) {
125  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
126  // we have to make something up. Arbitrarily, choose ZeroOrOne.
128  // When comparing vectors the result sets the different elements in the
129  // vector to all-one or all-zero.
131 
132  // Set up the register classes.
133  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
134  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
135 
136  if (Subtarget->hasFPARMv8()) {
137  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
138  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
139  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
140  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
141  }
142 
143  if (Subtarget->hasNEON()) {
144  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
145  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
146  // Someone set us up the NEON.
147  addDRTypeForNEON(MVT::v2f32);
148  addDRTypeForNEON(MVT::v8i8);
149  addDRTypeForNEON(MVT::v4i16);
150  addDRTypeForNEON(MVT::v2i32);
151  addDRTypeForNEON(MVT::v1i64);
152  addDRTypeForNEON(MVT::v1f64);
153  addDRTypeForNEON(MVT::v4f16);
154 
155  addQRTypeForNEON(MVT::v4f32);
156  addQRTypeForNEON(MVT::v2f64);
157  addQRTypeForNEON(MVT::v16i8);
158  addQRTypeForNEON(MVT::v8i16);
159  addQRTypeForNEON(MVT::v4i32);
160  addQRTypeForNEON(MVT::v2i64);
161  addQRTypeForNEON(MVT::v8f16);
162  }
163 
164  // Compute derived properties from the register classes
166 
167  // Provide all sorts of operation actions
195 
199 
203 
205 
206  // Custom lowering hooks are needed for XOR
207  // to fold it into CSINC/CSINV.
210 
211  // Virtually no operation on f128 is legal, but LLVM can't expand them when
212  // there's a valid register class, so we need custom operations in most cases.
234 
235  // Lowering for many of the conversions is actually specified by the non-f128
236  // type. The LowerXXX function will be trivial when f128 isn't involved.
251 
252  // Variable arguments.
257 
258  // Variable-sized objects.
261 
262  if (Subtarget->isTargetWindows())
264  else
266 
267  // Constant pool entries
269 
270  // BlockAddress
272 
273  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
282 
283  // AArch64 lacks both left-rotate and popcount instructions.
286  for (MVT VT : MVT::vector_valuetypes()) {
289  }
290 
291  // AArch64 doesn't have {U|S}MUL_LOHI.
294 
297 
300  for (MVT VT : MVT::vector_valuetypes()) {
303  }
310 
311  // Custom lower Add/Sub/Mul with overflow.
324 
333  if (Subtarget->hasFullFP16())
335  else
337 
371 
372  if (!Subtarget->hasFullFP16()) {
395 
396  // promote v4f16 to v4f32 when that is known to be safe.
409 
425 
446  }
447 
448  // AArch64 has implementations of a lot of rounding-like FP operations.
449  for (MVT Ty : {MVT::f32, MVT::f64}) {
464  }
465 
466  if (Subtarget->hasFullFP16()) {
477  }
478 
480 
482 
488 
489  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
490  // This requires the Performance Monitors extension.
491  if (Subtarget->hasPerfMon())
493 
494  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
495  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
496  // Issue __sincos_stret if available.
499  } else {
502  }
503 
504  // Make floating-point constants legal for the large code model, so they don't
505  // become loads from the constant pool.
506  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
509  }
510 
511  // AArch64 does not have floating-point extending loads, i1 sign-extending
512  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
513  for (MVT VT : MVT::fp_valuetypes()) {
518  }
519  for (MVT VT : MVT::integer_valuetypes())
521 
529 
532 
533  // Indexed loads and stores are supported.
534  for (unsigned im = (unsigned)ISD::PRE_INC;
550  }
551 
552  // Trap.
554  if (Subtarget->isTargetWindows())
556 
557  // We combine OR nodes for bitfield operations.
559  // Try to create BICs for vector ANDs.
561 
562  // Vector add and sub nodes may conceal a high-half opportunity.
563  // Also, try to fold ADD into CSINC/CSINV.
570 
574 
576 
583  if (Subtarget->supportsAddressTopByteIgnored())
585 
587 
590 
594 
596 
597  // In case of strict alignment, avoid an excessive number of byte wide stores.
601 
606 
608 
610 
612 
613  EnableExtLdPromotion = true;
614 
615  // Set required alignment.
617  // Set preferred alignments.
620 
621  // Only change the limit for entries in a jump table if specified by
622  // the sub target, but not at the command line.
623  unsigned MaxJT = STI.getMaximumJumpTableSize();
624  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
626 
627  setHasExtractBitsInsn(true);
628 
630 
631  if (Subtarget->hasNEON()) {
632  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
633  // silliness like this:
659 
665 
667 
668  // AArch64 doesn't have direct vector->f32 conversion instructions for
669  // elements smaller than i32, so promote the input to i32 first.
672  // i8 vector elements also need promotion to i32 for v8i8
675  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
680  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
681  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
684 
685  if (Subtarget->hasFullFP16()) {
690  } else {
691  // when AArch64 doesn't have fullfp16 support, promote the input
692  // to i32 first.
697  }
698 
701 
702  // AArch64 doesn't have MUL.2d:
704  // Custom handling for some quad-vector types to detect MULL.
708 
709  // Vector reductions
710  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
717  }
718  for (MVT VT : { MVT::v4f16, MVT::v2f32,
722  }
723 
726  // Likewise, narrowing and extending vector loads/stores aren't handled
727  // directly.
728  for (MVT VT : MVT::vector_valuetypes()) {
730 
731  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
734  } else {
737  }
740 
743 
744  for (MVT InnerVT : MVT::vector_valuetypes()) {
745  setTruncStoreAction(VT, InnerVT, Expand);
746  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
747  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
748  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
749  }
750  }
751 
752  // AArch64 has implementations of a lot of rounding-like FP operations.
753  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
760  }
761 
762  if (Subtarget->hasFullFP16()) {
763  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
770  }
771  }
772 
774  }
775 
777 }
778 
779 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
780  assert(VT.isVector() && "VT should be a vector type");
781 
782  if (VT.isFloatingPoint()) {
784  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
785  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
786  }
787 
788  // Mark vector float intrinsics as expand.
789  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
798 
799  // But we do support custom-lowering for FCOPYSIGN.
801  }
802 
814 
818  for (MVT InnerVT : MVT::all_valuetypes())
819  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
820 
821  // CNT supports only B element sizes, then use UADDLP to widen.
822  if (VT != MVT::v8i8 && VT != MVT::v16i8)
824 
830 
833 
834  if (!VT.isFloatingPoint())
836 
837  // [SU][MIN|MAX] are available for all NEON types apart from i64.
838  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
839  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
840  setOperationAction(Opcode, VT, Legal);
841 
842  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
843  if (VT.isFloatingPoint() &&
844  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
845  for (unsigned Opcode :
847  setOperationAction(Opcode, VT, Legal);
848 
849  if (Subtarget->isLittleEndian()) {
850  for (unsigned im = (unsigned)ISD::PRE_INC;
854  }
855  }
856 }
857 
858 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
859  addRegisterClass(VT, &AArch64::FPR64RegClass);
860  addTypeForNEON(VT, MVT::v2i32);
861 }
862 
863 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
864  addRegisterClass(VT, &AArch64::FPR128RegClass);
865  addTypeForNEON(VT, MVT::v4i32);
866 }
867 
868 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
869  EVT VT) const {
870  if (!VT.isVector())
871  return MVT::i32;
872  return VT.changeVectorElementTypeToInteger();
873 }
874 
875 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
876  const APInt &Demanded,
877  TargetLowering::TargetLoweringOpt &TLO,
878  unsigned NewOpc) {
879  uint64_t OldImm = Imm, NewImm, Enc;
880  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
881 
882  // Return if the immediate is already all zeros, all ones, a bimm32 or a
883  // bimm64.
884  if (Imm == 0 || Imm == Mask ||
885  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
886  return false;
887 
888  unsigned EltSize = Size;
889  uint64_t DemandedBits = Demanded.getZExtValue();
890 
891  // Clear bits that are not demanded.
892  Imm &= DemandedBits;
893 
894  while (true) {
895  // The goal here is to set the non-demanded bits in a way that minimizes
896  // the number of switching between 0 and 1. In order to achieve this goal,
897  // we set the non-demanded bits to the value of the preceding demanded bits.
898  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
899  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
900  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
901  // The final result is 0b11000011.
902  uint64_t NonDemandedBits = ~DemandedBits;
903  uint64_t InvertedImm = ~Imm & DemandedBits;
904  uint64_t RotatedImm =
905  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
906  NonDemandedBits;
907  uint64_t Sum = RotatedImm + NonDemandedBits;
908  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
909  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
910  NewImm = (Imm | Ones) & Mask;
911 
912  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
913  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
914  // we halve the element size and continue the search.
915  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
916  break;
917 
918  // We cannot shrink the element size any further if it is 2-bits.
919  if (EltSize == 2)
920  return false;
921 
922  EltSize /= 2;
923  Mask >>= EltSize;
924  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
925 
926  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
927  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
928  return false;
929 
930  // Merge the upper and lower halves of Imm and DemandedBits.
931  Imm |= Hi;
932  DemandedBits |= DemandedBitsHi;
933  }
934 
935  ++NumOptimizedImms;
936 
937  // Replicate the element across the register width.
938  while (EltSize < Size) {
939  NewImm |= NewImm << EltSize;
940  EltSize *= 2;
941  }
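  // As a concrete illustration of the replication step: if the search above
  // settled on EltSize == 8 with NewImm == 0xC3 (0b11000011), the loop widens
  // it to 0xC3C3 and then to 0xC3C3C3C3 for a 32-bit operation, which is
  // encodable as a logical immediate.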
942 
943  (void)OldImm;
944  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
945  "demanded bits should never be altered");
946  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
947 
948  // Create the new constant immediate node.
949  EVT VT = Op.getValueType();
950  SDLoc DL(Op);
951  SDValue New;
952 
953  // If the new constant immediate is all-zeros or all-ones, let the target
954  // independent DAG combine optimize this node.
955  if (NewImm == 0 || NewImm == OrigMask) {
956  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
957  TLO.DAG.getConstant(NewImm, DL, VT));
958  // Otherwise, create a machine node so that target independent DAG combine
959  // doesn't undo this optimization.
960  } else {
961  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
962  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
963  New = SDValue(
964  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
965  }
966 
967  return TLO.CombineTo(Op, New);
968 }
969 
970 bool AArch64TargetLowering::targetShrinkDemandedConstant(
971  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
972  // Delay this optimization to as late as possible.
973  if (!TLO.LegalOps)
974  return false;
975 
976  if (!EnableOptimizeLogicalImm)
977  return false;
978 
979  EVT VT = Op.getValueType();
980  if (VT.isVector())
981  return false;
982 
983  unsigned Size = VT.getSizeInBits();
984  assert((Size == 32 || Size == 64) &&
985  "i32 or i64 is expected after legalization.");
986 
987  // Exit early if we demand all bits.
988  if (Demanded.countPopulation() == Size)
989  return false;
990 
991  unsigned NewOpc;
992  switch (Op.getOpcode()) {
993  default:
994  return false;
995  case ISD::AND:
996  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
997  break;
998  case ISD::OR:
999  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1000  break;
1001  case ISD::XOR:
1002  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1003  break;
1004  }
1005  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1006  if (!C)
1007  return false;
1008  uint64_t Imm = C->getZExtValue();
1009  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1010 }
1011 
1012 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1013 /// Mask are known to be either zero or one and return them Known.
1014 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1015  const SDValue Op, KnownBits &Known,
1016  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1017  switch (Op.getOpcode()) {
1018  default:
1019  break;
1020  case AArch64ISD::CSEL: {
1021  KnownBits Known2;
1022  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1023  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1024  Known.Zero &= Known2.Zero;
1025  Known.One &= Known2.One;
1026  break;
1027  }
1028  case ISD::INTRINSIC_W_CHAIN: {
1029  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1030  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1031  switch (IntID) {
1032  default: return;
1033  case Intrinsic::aarch64_ldaxr:
1034  case Intrinsic::aarch64_ldxr: {
1035  unsigned BitWidth = Known.getBitWidth();
1036  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1037  unsigned MemBits = VT.getScalarSizeInBits();
1038  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1039  return;
1040  }
1041  }
1042  break;
1043  }
1044  case ISD::INTRINSIC_WO_CHAIN:
1045  case ISD::INTRINSIC_VOID: {
1046  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1047  switch (IntNo) {
1048  default:
1049  break;
1050  case Intrinsic::aarch64_neon_umaxv:
1051  case Intrinsic::aarch64_neon_uminv: {
1052  // Figure out the datatype of the vector operand. The UMINV instruction
1053  // will zero extend the result, so we can mark as known zero all the
1054  // bits larger than the element datatype. 32-bit or larger doesn't need
1055  // this as those are legal types and will be handled by isel directly.
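  // For example, a umaxv/uminv reduction of a v8i8 vector can only produce a
  // value in the range [0, 255], so bits 8..31 of the i32 result are known
  // to be zero.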
1056  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1057  unsigned BitWidth = Known.getBitWidth();
1058  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1059  assert(BitWidth >= 8 && "Unexpected width!");
1060  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1061  Known.Zero |= Mask;
1062  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1063  assert(BitWidth >= 16 && "Unexpected width!");
1064  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1065  Known.Zero |= Mask;
1066  }
1067  break;
1068  } break;
1069  }
1070  }
1071  }
1072 }
1073 
1074 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1075  EVT) const {
1076  return MVT::i64;
1077 }
1078 
1079 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1080  EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1081  bool *Fast) const {
1082  if (Subtarget->requiresStrictAlign())
1083  return false;
1084 
1085  if (Fast) {
1086  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1087  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1088  // See comments in performSTORECombine() for more details about
1089  // these conditions.
1090 
1091  // Code that uses clang vector extensions can mark that it
1092  // wants unaligned accesses to be treated as fast by
1093  // underspecifying alignment to be 1 or 2.
1094  Align <= 2 ||
1095 
1096  // Disregard v2i64. Memcpy lowering produces those and splitting
1097  // them regresses performance on micro-benchmarks and olden/bh.
1098  VT == MVT::v2i64;
1099  }
1100  return true;
1101 }
1102 
1103 FastISel *
1104 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1105  const TargetLibraryInfo *libInfo) const {
1106  return AArch64::createFastISel(funcInfo, libInfo);
1107 }
1108 
1109 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1110  switch ((AArch64ISD::NodeType)Opcode) {
1111  case AArch64ISD::FIRST_NUMBER: break;
1112  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1113  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1114  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1115  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1116  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1117  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1118  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1119  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1120  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1121  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1122  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1123  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1124  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1125  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1126  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1127  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1128  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1129  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1130  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1131  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1132  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1133  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1134  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1135  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1136  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1137  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1138  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1139  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1140  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1141  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1142  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1143  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1144  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1145  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1146  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1147  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1148  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1149  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1150  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1151  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1152  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1153  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1154  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1155  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1156  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1157  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1158  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1159  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1160  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1161  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1162  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1163  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1164  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1165  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1166  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1167  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1168  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1169  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1170  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1171  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1172  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1173  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1174  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1175  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1176  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1177  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1178  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1179  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1180  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1181  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1182  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1183  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1184  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1185  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1186  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1187  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1188  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1189  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1190  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1191  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1192  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1193  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1194  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1195  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1196  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1197  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1198  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1199  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1200  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1201  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1202  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1203  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1204  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1205  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1206  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1207  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1208  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1209  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1210  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1211  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1212  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1213  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1214  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1215  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1216  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1217  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1218  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1219  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1220  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1221  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1222  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1223  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1224  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1225  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1226  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1227  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1228  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1229  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1230  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1231  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1232  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1233  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1234  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1235  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1236  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1237  case AArch64ISD::STG: return "AArch64ISD::STG";
1238  case AArch64ISD::STZG: return "AArch64ISD::STZG";
1239  case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
1240  case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
1241  }
1242  return nullptr;
1243 }
1244 
1245 MachineBasicBlock *
1246 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1247  MachineBasicBlock *MBB) const {
1248  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1249  // phi node:
1250 
1251  // OrigBB:
1252  // [... previous instrs leading to comparison ...]
1253  // b.ne TrueBB
1254  // b EndBB
1255  // TrueBB:
1256  // ; Fallthrough
1257  // EndBB:
1258  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1259 
1260  MachineFunction *MF = MBB->getParent();
1261  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1262  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1263  DebugLoc DL = MI.getDebugLoc();
1264  MachineFunction::iterator It = ++MBB->getIterator();
1265 
1266  unsigned DestReg = MI.getOperand(0).getReg();
1267  unsigned IfTrueReg = MI.getOperand(1).getReg();
1268  unsigned IfFalseReg = MI.getOperand(2).getReg();
1269  unsigned CondCode = MI.getOperand(3).getImm();
1270  bool NZCVKilled = MI.getOperand(4).isKill();
1271 
1272  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1273  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1274  MF->insert(It, TrueBB);
1275  MF->insert(It, EndBB);
1276 
1277  // Transfer rest of current basic-block to EndBB
1278  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1279  MBB->end());
1280  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1281 
1282  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1283  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1284  MBB->addSuccessor(TrueBB);
1285  MBB->addSuccessor(EndBB);
1286 
1287  // TrueBB falls through to the end.
1288  TrueBB->addSuccessor(EndBB);
1289 
1290  if (!NZCVKilled) {
1291  TrueBB->addLiveIn(AArch64::NZCV);
1292  EndBB->addLiveIn(AArch64::NZCV);
1293  }
1294 
1295  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1296  .addReg(IfTrueReg)
1297  .addMBB(TrueBB)
1298  .addReg(IfFalseReg)
1299  .addMBB(MBB);
1300 
1301  MI.eraseFromParent();
1302  return EndBB;
1303 }
1304 
1305 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
1306  MachineInstr &MI, MachineBasicBlock *BB) const {
1307  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
1308  BB->getParent()->getFunction().getPersonalityFn())) &&
1309  "SEH does not use catchret!");
1310  return BB;
1311 }
1312 
1313 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
1314  MachineInstr &MI, MachineBasicBlock *BB) const {
1315  MI.eraseFromParent();
1316  return BB;
1317 }
1318 
1319 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1320  MachineInstr &MI, MachineBasicBlock *BB) const {
1321  switch (MI.getOpcode()) {
1322  default:
1323 #ifndef NDEBUG
1324  MI.dump();
1325 #endif
1326  llvm_unreachable("Unexpected instruction for custom inserter!");
1327 
1328  case AArch64::F128CSEL:
1329  return EmitF128CSEL(MI, BB);
1330 
1331  case TargetOpcode::STACKMAP:
1332  case TargetOpcode::PATCHPOINT:
1333  return emitPatchPoint(MI, BB);
1334 
1335  case AArch64::CATCHRET:
1336  return EmitLoweredCatchRet(MI, BB);
1337  case AArch64::CATCHPAD:
1338  return EmitLoweredCatchPad(MI, BB);
1339  }
1340 }
1341 
1342 //===----------------------------------------------------------------------===//
1343 // AArch64 Lowering private implementation.
1344 //===----------------------------------------------------------------------===//
1345 
1346 //===----------------------------------------------------------------------===//
1347 // Lowering Code
1348 //===----------------------------------------------------------------------===//
1349 
1350 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1351 /// CC
1352 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1353  switch (CC) {
1354  default:
1355  llvm_unreachable("Unknown condition code!");
1356  case ISD::SETNE:
1357  return AArch64CC::NE;
1358  case ISD::SETEQ:
1359  return AArch64CC::EQ;
1360  case ISD::SETGT:
1361  return AArch64CC::GT;
1362  case ISD::SETGE:
1363  return AArch64CC::GE;
1364  case ISD::SETLT:
1365  return AArch64CC::LT;
1366  case ISD::SETLE:
1367  return AArch64CC::LE;
1368  case ISD::SETUGT:
1369  return AArch64CC::HI;
1370  case ISD::SETUGE:
1371  return AArch64CC::HS;
1372  case ISD::SETULT:
1373  return AArch64CC::LO;
1374  case ISD::SETULE:
1375  return AArch64CC::LS;
1376  }
1377 }
1378 
1379 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1380 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1381  AArch64CC::CondCode &CondCode,
1382  AArch64CC::CondCode &CondCode2) {
1383  CondCode2 = AArch64CC::AL;
1384  switch (CC) {
1385  default:
1386  llvm_unreachable("Unknown FP condition!");
1387  case ISD::SETEQ:
1388  case ISD::SETOEQ:
1389  CondCode = AArch64CC::EQ;
1390  break;
1391  case ISD::SETGT:
1392  case ISD::SETOGT:
1393  CondCode = AArch64CC::GT;
1394  break;
1395  case ISD::SETGE:
1396  case ISD::SETOGE:
1397  CondCode = AArch64CC::GE;
1398  break;
1399  case ISD::SETOLT:
1400  CondCode = AArch64CC::MI;
1401  break;
1402  case ISD::SETOLE:
1403  CondCode = AArch64CC::LS;
1404  break;
1405  case ISD::SETONE:
1406  CondCode = AArch64CC::MI;
1407  CondCode2 = AArch64CC::GT;
1408  break;
1409  case ISD::SETO:
1410  CondCode = AArch64CC::VC;
1411  break;
1412  case ISD::SETUO:
1413  CondCode = AArch64CC::VS;
1414  break;
1415  case ISD::SETUEQ:
1416  CondCode = AArch64CC::EQ;
1417  CondCode2 = AArch64CC::VS;
1418  break;
1419  case ISD::SETUGT:
1420  CondCode = AArch64CC::HI;
1421  break;
1422  case ISD::SETUGE:
1423  CondCode = AArch64CC::PL;
1424  break;
1425  case ISD::SETLT:
1426  case ISD::SETULT:
1427  CondCode = AArch64CC::LT;
1428  break;
1429  case ISD::SETLE:
1430  case ISD::SETULE:
1431  CondCode = AArch64CC::LE;
1432  break;
1433  case ISD::SETNE:
1434  case ISD::SETUNE:
1435  CondCode = AArch64CC::NE;
1436  break;
1437  }
1438 }
1439 
1440 /// Convert a DAG fp condition code to an AArch64 CC.
1441 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1442 /// should be AND'ed instead of OR'ed.
1443 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1444  AArch64CC::CondCode &CondCode,
1445  AArch64CC::CondCode &CondCode2) {
1446  CondCode2 = AArch64CC::AL;
1447  switch (CC) {
1448  default:
1449  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1450  assert(CondCode2 == AArch64CC::AL);
1451  break;
1452  case ISD::SETONE:
1453  // (a one b)
1454  // == ((a olt b) || (a ogt b))
1455  // == ((a ord b) && (a une b))
1456  CondCode = AArch64CC::VC;
1457  CondCode2 = AArch64CC::NE;
1458  break;
1459  case ISD::SETUEQ:
1460  // (a ueq b)
1461  // == ((a uno b) || (a oeq b))
1462  // == ((a ule b) && (a uge b))
1463  CondCode = AArch64CC::PL;
1464  CondCode2 = AArch64CC::LE;
1465  break;
1466  }
1467 }
1468 
1469 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1470 /// CC usable with the vector instructions. Fewer operations are available
1471 /// without a real NZCV register, so we have to use less efficient combinations
1472 /// to get the same effect.
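/// For example, a SETULE vector compare is emitted as an FCMGT (OGT) mask that
/// is then inverted with a vector NOT, using the identity ULE == !OGT.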
1473 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1474  AArch64CC::CondCode &CondCode,
1475  AArch64CC::CondCode &CondCode2,
1476  bool &Invert) {
1477  Invert = false;
1478  switch (CC) {
1479  default:
1480  // Mostly the scalar mappings work fine.
1481  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1482  break;
1483  case ISD::SETUO:
1484  Invert = true;
1485  LLVM_FALLTHROUGH;
1486  case ISD::SETO:
1487  CondCode = AArch64CC::MI;
1488  CondCode2 = AArch64CC::GE;
1489  break;
1490  case ISD::SETUEQ:
1491  case ISD::SETULT:
1492  case ISD::SETULE:
1493  case ISD::SETUGT:
1494  case ISD::SETUGE:
1495  // All of the compare-mask comparisons are ordered, but we can switch
1496  // between the two by a double inversion. E.g. ULE == !OGT.
1497  Invert = true;
1498  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1499  break;
1500  }
1501 }
1502 
1503 static bool isLegalArithImmed(uint64_t C) {
1504  // Matches AArch64DAGToDAGISel::SelectArithImmed().
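  // For example, 0xFFF and 0x7FF000 (a 12-bit value shifted left by 12) are
  // encodable arithmetic immediates, while 0x1001 is not and would have to be
  // materialized in a register first.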
1505  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1506  LLVM_DEBUG(dbgs() << "Is imm " << C
1507  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1508  return IsLegal;
1509 }
1510 
1511 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
1512 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1513 // can be set differently by this operation. It comes down to whether
1514 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1515 // everything is fine. If not then the optimization is wrong. Thus general
1516 // comparisons are only valid if op2 != 0.
1517 //
1518 // So, finally, the only LLVM-native comparisons that don't mention C and V
1519 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1520 // the absence of information about op2.
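// For example, (seteq x, (sub 0, y)) can be selected as "cmn x, y" followed by
// a b.eq/cset, instead of negating y first; for an ordered comparison such as
// setlt the same rewrite is only safe when y is known to be non-zero.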
1521 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1522  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1523  (CC == ISD::SETEQ || CC == ISD::SETNE);
1524 }
1525 
1526 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1527  const SDLoc &dl, SelectionDAG &DAG) {
1528  EVT VT = LHS.getValueType();
1529  const bool FullFP16 =
1530  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1531 
1532  if (VT.isFloatingPoint()) {
1533  assert(VT != MVT::f128);
1534  if (VT == MVT::f16 && !FullFP16) {
1535  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1536  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1537  VT = MVT::f32;
1538  }
1539  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1540  }
1541 
1542  // The CMP instruction is just an alias for SUBS, and representing it as
1543  // SUBS means that it's possible to get CSE with subtract operations.
1544  // A later phase can perform the optimization of setting the destination
1545  // register to WZR/XZR if it ends up being unused.
1546  unsigned Opcode = AArch64ISD::SUBS;
1547 
1548  if (isCMN(RHS, CC)) {
1549  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
1550  Opcode = AArch64ISD::ADDS;
1551  RHS = RHS.getOperand(1);
1552  } else if (isCMN(LHS, CC)) {
1553  // As we are looking for EQ/NE compares, the operands can be commuted; can
1554  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
1555  Opcode = AArch64ISD::ADDS;
1556  LHS = LHS.getOperand(1);
1557  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1558  !isUnsignedIntSetCC(CC)) {
1559  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1560  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1561  // of the signed comparisons.
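  // For example, (seteq (and w0, #0xff), 0) can then be selected roughly as:
  //   tst w0, #0xff
  //   b.eq <target>
  // without materializing the AND result in a register.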
1562  Opcode = AArch64ISD::ANDS;
1563  RHS = LHS.getOperand(1);
1564  LHS = LHS.getOperand(0);
1565  }
1566 
1567  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1568  .getValue(1);
1569 }
1570 
1571 /// \defgroup AArch64CCMP CMP;CCMP matching
1572 ///
1573 /// These functions deal with the formation of CMP;CCMP;... sequences.
1574 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1575 /// a comparison. They set the NZCV flags to a predefined value if their
1576 /// predicate is false. This allows to express arbitrary conjunctions, for
1577 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1578 /// expressed as:
1579 /// cmp A
1580 /// ccmp B, inv(CB), CA
1581 /// check for CB flags
1582 ///
1583 /// This naturally lets us implement chains of AND operations with SETCC
1584 /// operands. And we can even implement some other situations by transforming
1585 /// them:
1586 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1587 /// negating the flags used in a CCMP/FCCMP operations.
1588 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1589 /// by negating the flags we test for afterwards. i.e.
1590 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1591 /// - Note that we can only ever negate all previously processed results.
1592 /// What we can not implement by flipping the flags to test is a negation
1593 /// of two sub-trees (because the negation affects all sub-trees emitted so
1594 /// far, so the 2nd sub-tree we emit would also affect the first).
1595 /// With those tools we can implement some OR operations:
1596 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1597 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1598 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1599 /// elimination rules from earlier to implement the whole thing as a
1600 /// CCMP/FCCMP chain.
1601 ///
1602 /// As complete example:
1603 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1604 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1605 /// can be reassociated to:
1606 /// or (and (setCC (cmp C)) setCD (cmp D))
1607 // (or (setCA (cmp A)) (setCB (cmp B)))
1608 /// can be transformed to:
1609 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1610 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1611 /// which can be implemented as:
1612 /// cmp C
1613 /// ccmp D, inv(CD), CC
1614 /// ccmp A, CA, inv(CD)
1615 /// ccmp B, CB, inv(CA)
1616 /// check for CB flags
1617 ///
1618 /// A counterexample is "or (and A B) (and C D)" which translates to
1619 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1620 /// can only implement 1 of the inner (not) operations, but not both!
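/// As a concrete sketch, "if (a == 0 && b > 5)" can be lowered roughly to:
///     cmp  w0, #0
///     ccmp w1, #5, #4, eq   ; compare b only when a == 0, otherwise set Z
///     b.gt <taken>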
1621 /// @{
1622 
1623 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1624 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1625  ISD::CondCode CC, SDValue CCOp,
1626  AArch64CC::CondCode Predicate,
1627  AArch64CC::CondCode OutCC,
1628  const SDLoc &DL, SelectionDAG &DAG) {
1629  unsigned Opcode = 0;
1630  const bool FullFP16 =
1631  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1632 
1633  if (LHS.getValueType().isFloatingPoint()) {
1634  assert(LHS.getValueType() != MVT::f128);
1635  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1636  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1637  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1638  }
1639  Opcode = AArch64ISD::FCCMP;
1640  } else if (RHS.getOpcode() == ISD::SUB) {
1641  SDValue SubOp0 = RHS.getOperand(0);
1642  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1643  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1644  Opcode = AArch64ISD::CCMN;
1645  RHS = RHS.getOperand(1);
1646  }
1647  }
1648  if (Opcode == 0)
1649  Opcode = AArch64ISD::CCMP;
1650 
1651  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1652  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1653  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1654  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1655  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1656 }
1657 
1658 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1659 /// expressed as a conjunction. See \ref AArch64CCMP.
1660 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1661 /// changing the conditions on the SETCC tests.
1662 /// (this means we can call emitConjunctionRec() with
1663 /// Negate==true on this sub-tree)
1664 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1665 /// cannot do the negation naturally. We are required to
1666 /// emit the subtree first in this case.
1667 /// \param WillNegate Is true if we are called when the result of this
1668 /// subexpression must be negated. This happens when the
1669 /// outer expression is an OR. We can use this fact to know
1670 /// that we have a double negation (or (or ...) ...) that
1671 /// can be implemented for free.
1672 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1673  bool &MustBeFirst, bool WillNegate,
1674  unsigned Depth = 0) {
1675  if (!Val.hasOneUse())
1676  return false;
1677  unsigned Opcode = Val->getOpcode();
1678  if (Opcode == ISD::SETCC) {
1679  if (Val->getOperand(0).getValueType() == MVT::f128)
1680  return false;
1681  CanNegate = true;
1682  MustBeFirst = false;
1683  return true;
1684  }
1685  // Protect against exponential runtime and stack overflow.
1686  if (Depth > 6)
1687  return false;
1688  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1689  bool IsOR = Opcode == ISD::OR;
1690  SDValue O0 = Val->getOperand(0);
1691  SDValue O1 = Val->getOperand(1);
1692  bool CanNegateL;
1693  bool MustBeFirstL;
1694  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1695  return false;
1696  bool CanNegateR;
1697  bool MustBeFirstR;
1698  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1699  return false;
1700 
1701  if (MustBeFirstL && MustBeFirstR)
1702  return false;
1703 
1704  if (IsOR) {
1705  // For an OR expression we need to be able to naturally negate at least
1706  // one side or we cannot do the transformation at all.
1707  if (!CanNegateL && !CanNegateR)
1708  return false;
1709  // If the result of the OR will be negated and we can naturally negate
1710  // the leaves, then this sub-tree as a whole negates naturally.
1711  CanNegate = WillNegate && CanNegateL && CanNegateR;
1712  // If we cannot naturally negate the whole sub-tree, then this must be
1713  // emitted first.
1714  MustBeFirst = !CanNegate;
1715  } else {
1716  assert(Opcode == ISD::AND && "Must be OR or AND");
1717  // We cannot naturally negate an AND operation.
1718  CanNegate = false;
1719  MustBeFirst = MustBeFirstL || MustBeFirstR;
1720  }
1721  return true;
1722  }
1723  return false;
1724 }
1725 
1726 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1727 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1728 /// Tries to transform the given i1 producing node @p Val to a series of compare
1729 /// and conditional compare operations. @returns an NZCV flags producing node
1730 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1731 /// transformation was not possible.
1732 /// \p Negate is true if we want this sub-tree being negated just by changing
1733 /// SETCC conditions.
1734 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
1735  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1736  AArch64CC::CondCode Predicate) {
1737  // We're at a tree leaf, produce a conditional comparison operation.
1738  unsigned Opcode = Val->getOpcode();
1739  if (Opcode == ISD::SETCC) {
1740  SDValue LHS = Val->getOperand(0);
1741  SDValue RHS = Val->getOperand(1);
1742  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1743  bool isInteger = LHS.getValueType().isInteger();
1744  if (Negate)
1745  CC = getSetCCInverse(CC, isInteger);
1746  SDLoc DL(Val);
1747  // Determine OutCC and handle FP special case.
1748  if (isInteger) {
1749  OutCC = changeIntCCToAArch64CC(CC);
1750  } else {
1752  AArch64CC::CondCode ExtraCC;
1753  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1754  // Some floating point conditions can't be tested with a single condition
1755  // code. Construct an additional comparison in this case.
1756  if (ExtraCC != AArch64CC::AL) {
1757  SDValue ExtraCmp;
1758  if (!CCOp.getNode())
1759  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1760  else
1761  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1762  ExtraCC, DL, DAG);
1763  CCOp = ExtraCmp;
1764  Predicate = ExtraCC;
1765  }
1766  }
1767 
1768  // Produce a normal comparison if we are first in the chain
1769  if (!CCOp)
1770  return emitComparison(LHS, RHS, CC, DL, DAG);
1771  // Otherwise produce a ccmp.
1772  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1773  DAG);
1774  }
1775  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1776 
1777  bool IsOR = Opcode == ISD::OR;
1778 
1779  SDValue LHS = Val->getOperand(0);
1780  bool CanNegateL;
1781  bool MustBeFirstL;
1782  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1783  assert(ValidL && "Valid conjunction/disjunction tree");
1784  (void)ValidL;
1785 
1786  SDValue RHS = Val->getOperand(1);
1787  bool CanNegateR;
1788  bool MustBeFirstR;
1789  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1790  assert(ValidR && "Valid conjunction/disjunction tree");
1791  (void)ValidR;
1792 
1793  // Swap sub-tree that must come first to the right side.
1794  if (MustBeFirstL) {
1795  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1796  std::swap(LHS, RHS);
1797  std::swap(CanNegateL, CanNegateR);
1798  std::swap(MustBeFirstL, MustBeFirstR);
1799  }
1800 
1801  bool NegateR;
1802  bool NegateAfterR;
1803  bool NegateL;
1804  bool NegateAfterAll;
1805  if (Opcode == ISD::OR) {
1806  // Swap the sub-tree that we can negate naturally to the left.
1807  if (!CanNegateL) {
1808  assert(CanNegateR && "at least one side must be negatable");
1809  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1810  assert(!Negate);
1811  std::swap(LHS, RHS);
1812  NegateR = false;
1813  NegateAfterR = true;
1814  } else {
1815  // Negate the left sub-tree if possible, otherwise negate the result.
1816  NegateR = CanNegateR;
1817  NegateAfterR = !CanNegateR;
1818  }
1819  NegateL = true;
1820  NegateAfterAll = !Negate;
1821  } else {
1822  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1823  assert(!Negate && "Valid conjunction/disjunction tree");
1824 
1825  NegateL = false;
1826  NegateR = false;
1827  NegateAfterR = false;
1828  NegateAfterAll = false;
1829  }
1830 
1831  // Emit sub-trees.
1832  AArch64CC::CondCode RHSCC;
1833  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1834  if (NegateAfterR)
1835  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1836  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1837  if (NegateAfterAll)
1838  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1839  return CmpL;
1840 }
1841 
1842 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
1843 /// In some cases this is even possible with OR operations in the expression.
1844 /// See \ref AArch64CCMP.
1845 /// \see emitConjunctionRec().
1846 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
1847  AArch64CC::CondCode &OutCC) {
1848  bool DummyCanNegate;
1849  bool DummyMustBeFirst;
1850  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1851  return SDValue();
1852 
1853  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1854 }
1855 
1856 /// @}
1857 
1858 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1859 /// extension operations.
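/// For example, (shl (and x, 0xff), 2) can be folded into the compare as
/// "cmp w0, w1, uxtb #2" and scores 2, while a plain shift by a constant only
/// folds as a shifted register operand and scores 1.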
1860 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
1861  auto isSupportedExtend = [&](SDValue V) {
1862  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1863  return true;
1864 
1865  if (V.getOpcode() == ISD::AND)
1866  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1867  uint64_t Mask = MaskCst->getZExtValue();
1868  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1869  }
1870 
1871  return false;
1872  };
1873 
1874  if (!Op.hasOneUse())
1875  return 0;
1876 
1877  if (isSupportedExtend(Op))
1878  return 1;
1879 
1880  unsigned Opc = Op.getOpcode();
1881  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1882  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1883  uint64_t Shift = ShiftCst->getZExtValue();
1884  if (isSupportedExtend(Op.getOperand(0)))
1885  return (Shift <= 4) ? 2 : 1;
1886  EVT VT = Op.getValueType();
1887  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1888  return 1;
1889  }
1890 
1891  return 0;
1892 }
1893 
1894 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1895  SDValue &AArch64cc, SelectionDAG &DAG,
1896  const SDLoc &dl) {
1897  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1898  EVT VT = RHS.getValueType();
1899  uint64_t C = RHSC->getZExtValue();
1900  if (!isLegalArithImmed(C)) {
1901  // Constant does not fit, try adjusting it by one?
1902  switch (CC) {
1903  default:
1904  break;
1905  case ISD::SETLT:
1906  case ISD::SETGE:
1907  if ((VT == MVT::i32 && C != 0x80000000 &&
1908  isLegalArithImmed((uint32_t)(C - 1))) ||
1909  (VT == MVT::i64 && C != 0x80000000ULL &&
1910  isLegalArithImmed(C - 1ULL))) {
1911  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1912  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1913  RHS = DAG.getConstant(C, dl, VT);
1914  }
1915  break;
1916  case ISD::SETULT:
1917  case ISD::SETUGE:
1918  if ((VT == MVT::i32 && C != 0 &&
1919  isLegalArithImmed((uint32_t)(C - 1))) ||
1920  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1921  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1922  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1923  RHS = DAG.getConstant(C, dl, VT);
1924  }
1925  break;
1926  case ISD::SETLE:
1927  case ISD::SETGT:
1928  if ((VT == MVT::i32 && C != INT32_MAX &&
1929  isLegalArithImmed((uint32_t)(C + 1))) ||
1930  (VT == MVT::i64 && C != INT64_MAX &&
1931  isLegalArithImmed(C + 1ULL))) {
1932  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1933  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1934  RHS = DAG.getConstant(C, dl, VT);
1935  }
1936  break;
1937  case ISD::SETULE:
1938  case ISD::SETUGT:
1939  if ((VT == MVT::i32 && C != UINT32_MAX &&
1940  isLegalArithImmed((uint32_t)(C + 1))) ||
1941  (VT == MVT::i64 && C != UINT64_MAX &&
1942  isLegalArithImmed(C + 1ULL))) {
1943  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1944  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1945  RHS = DAG.getConstant(C, dl, VT);
1946  }
1947  break;
1948  }
1949  }
1950  }
1951 
1952  // Comparisons are canonicalized so that the RHS operand is simpler than the
1953  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
1954  // can fold some shift+extend operations on the RHS operand, so swap the
1955  // operands if that can be done.
1956  //
1957  // For example:
1958  // lsl w13, w11, #1
1959  // cmp w13, w12
1960  // can be turned into:
1961  // cmp w12, w11, lsl #1
1962  if (!isa<ConstantSDNode>(RHS) ||
1963  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
1964  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
1965 
1966  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
1967  std::swap(LHS, RHS);
1968  CC = ISD::getSetCCSwappedOperands(CC);
1969  }
1970  }
1971 
1972  SDValue Cmp;
1973  AArch64CC::CondCode AArch64CC;
1974  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1975  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1976 
1977  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1978  // For the i8 operand, the largest immediate is 255, so this can be easily
1979  // encoded in the compare instruction. For the i16 operand, however, the
1980  // largest immediate cannot be encoded in the compare.
1981  // Therefore, use a sign extending load and cmn to avoid materializing the
1982  // -1 constant. For example,
1983  // movz w1, #65535
1984  // ldrh w0, [x0, #0]
1985  // cmp w0, w1
1986  // >
1987  // ldrsh w0, [x0, #0]
1988  // cmn w0, #1
1989  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1990  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1991  // ensure both the LHS and RHS are truly zero extended and to make sure the
1992  // transformation is profitable.
1993  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1994  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1995  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1996  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1997  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1998  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1999  SDValue SExt =
2000  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2001  DAG.getValueType(MVT::i16));
2002  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2003  RHS.getValueType()),
2004  CC, dl, DAG);
2005  AArch64CC = changeIntCCToAArch64CC(CC);
2006  }
2007  }
2008 
2009  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2010  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2011  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2012  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2013  }
2014  }
2015  }
2016 
2017  if (!Cmp) {
2018  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2019  AArch64CC = changeIntCCToAArch64CC(CC);
2020  }
2021  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2022  return Cmp;
2023 }
2024 
2025 static std::pair<SDValue, SDValue>
2026 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2027  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2028  "Unsupported value type");
2029  SDValue Value, Overflow;
2030  SDLoc DL(Op);
2031  SDValue LHS = Op.getOperand(0);
2032  SDValue RHS = Op.getOperand(1);
2033  unsigned Opc = 0;
2034  switch (Op.getOpcode()) {
2035  default:
2036  llvm_unreachable("Unknown overflow instruction!");
2037  case ISD::SADDO:
2038  Opc = AArch64ISD::ADDS;
2039  CC = AArch64CC::VS;
2040  break;
2041  case ISD::UADDO:
2042  Opc = AArch64ISD::ADDS;
2043  CC = AArch64CC::HS;
2044  break;
2045  case ISD::SSUBO:
2046  Opc = AArch64ISD::SUBS;
2047  CC = AArch64CC::VS;
2048  break;
2049  case ISD::USUBO:
2050  Opc = AArch64ISD::SUBS;
2051  CC = AArch64CC::LO;
2052  break;
2053  // Multiply needs a little bit extra work.
2054  case ISD::SMULO:
2055  case ISD::UMULO: {
2056  CC = AArch64CC::NE;
2057  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2058  if (Op.getValueType() == MVT::i32) {
2059  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2060  // For a 32 bit multiply with overflow check we want the instruction
2061  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2062  // need to generate the following pattern:
2063  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2064  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2065  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2066  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2067  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2068  DAG.getConstant(0, DL, MVT::i64));
2069  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2070  // operation. We need to clear out the upper 32 bits, because we used a
2071  // widening multiply that wrote all 64 bits. In the end this should be a
2072  // noop.
2073  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2074  if (IsSigned) {
2075  // The signed overflow check requires more than just a simple check for
2076  // any bit set in the upper 32 bits of the result. These bits could be
2077  // just the sign bits of a negative number. To perform the overflow
2078  // check we arithmetic-shift the lower 32 bits of the result right by 31,
2079  // replicating the sign bit, and compare that against the upper 32 bits.
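 // (Worked example: for %a = 0x7FFFFFFF, %b = 2 the widened product is
 // 0x00000000FFFFFFFE; UpperBits is 0 while the lower half shifted right
 // arithmetically by 31 is -1, so SUBS sets NE and overflow is reported.)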
2080  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2081  DAG.getConstant(32, DL, MVT::i64));
2082  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2083  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2084  DAG.getConstant(31, DL, MVT::i64));
2085  // It is important that LowerBits is last, otherwise the arithmetic
2086  // shift will not be folded into the compare (SUBS).
2087  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2088  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2089  .getValue(1);
2090  } else {
2091  // The overflow check for unsigned multiply is easy. We only need to
2092  // check if any of the upper 32 bits are set. This can be done with a
2093  // CMP (shifted register). For that we need to generate the following
2094  // pattern:
2095  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
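 // (Worked example: for %a = 0x80000000, %b = 2 the widened product is
 // 0x100000000, so UpperBits is 1 and SUBS(0, 1) sets NE, reporting the
 // unsigned overflow.)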
2096  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2097  DAG.getConstant(32, DL, MVT::i64));
2098  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2099  Overflow =
2100  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2101  DAG.getConstant(0, DL, MVT::i64),
2102  UpperBits).getValue(1);
2103  }
2104  break;
2105  }
2106  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2107  // For the 64 bit multiply
2108  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2109  if (IsSigned) {
2110  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2111  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2112  DAG.getConstant(63, DL, MVT::i64));
2113  // It is important that LowerBits is last, otherwise the arithmetic
2114  // shift will not be folded into the compare (SUBS).
2115  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2116  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2117  .getValue(1);
2118  } else {
2119  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2120  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2121  Overflow =
2122  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2123  DAG.getConstant(0, DL, MVT::i64),
2124  UpperBits).getValue(1);
2125  }
2126  break;
2127  }
2128  } // switch (...)
2129 
2130  if (Opc) {
2131  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2132 
2133  // Emit the AArch64 operation with overflow check.
2134  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2135  Overflow = Value.getValue(1);
2136  }
2137  return std::make_pair(Value, Overflow);
2138 }
2139 
2140 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2141  RTLIB::Libcall Call) const {
2142  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2143  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
2144 }
2145 
2146 // Returns true if the given Op is the overflow flag result of an overflow
2147 // intrinsic operation.
2148 static bool isOverflowIntrOpRes(SDValue Op) {
2149  unsigned Opc = Op.getOpcode();
2150  return (Op.getResNo() == 1 &&
2151  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2152  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2153 }
2154 
2155 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2156  SDValue Sel = Op.getOperand(0);
2157  SDValue Other = Op.getOperand(1);
2158  SDLoc dl(Sel);
2159 
2160  // If the operand is an overflow checking operation, invert the condition
2161  // code and kill the Not operation. I.e., transform:
2162  // (xor (overflow_op_bool, 1))
2163  // -->
2164  // (csel 1, 0, invert(cc), overflow_op_bool)
2165  // ... which later gets transformed to just a cset instruction with an
2166  // inverted condition code, rather than a cset + eor sequence.
2167  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2168  // Only lower legal XALUO ops.
2169  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2170  return SDValue();
2171 
2172  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2173  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2174  AArch64CC::CondCode CC;
2175  SDValue Value, Overflow;
2176  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2177  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2178  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2179  CCVal, Overflow);
2180  }
2181  // If neither operand is a SELECT_CC, give up.
2182  if (Sel.getOpcode() != ISD::SELECT_CC)
2183  std::swap(Sel, Other);
2184  if (Sel.getOpcode() != ISD::SELECT_CC)
2185  return Op;
2186 
2187  // The folding we want to perform is:
2188  // (xor x, (select_cc a, b, cc, 0, -1) )
2189  // -->
2190  // (csel x, (xor x, -1), cc ...)
2191  //
2192  // The latter will get matched to a CSINV instruction.
2193 
2194  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2195  SDValue LHS = Sel.getOperand(0);
2196  SDValue RHS = Sel.getOperand(1);
2197  SDValue TVal = Sel.getOperand(2);
2198  SDValue FVal = Sel.getOperand(3);
2199 
2200  // FIXME: This could be generalized to non-integer comparisons.
2201  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2202  return Op;
2203 
2204  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2205  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2206 
2207  // The values aren't constants, this isn't the pattern we're looking for.
2208  if (!CFVal || !CTVal)
2209  return Op;
2210 
2211  // We can commute the SELECT_CC by inverting the condition. This
2212  // might be needed to make this fit into a CSINV pattern.
2213  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2214  std::swap(TVal, FVal);
2215  std::swap(CTVal, CFVal);
2216  CC = ISD::getSetCCInverse(CC, true);
2217  }
2218 
2219  // If the constants line up, perform the transform!
2220  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2221  SDValue CCVal;
2222  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2223 
2224  FVal = Other;
2225  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2226  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2227 
2228  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2229  CCVal, Cmp);
2230  }
2231 
2232  return Op;
2233 }
2234 
2235 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2236  EVT VT = Op.getValueType();
2237 
2238  // Let legalize expand this if it isn't a legal type yet.
2239  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2240  return SDValue();
2241 
2242  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2243 
2244  unsigned Opc;
2245  bool ExtraOp = false;
2246  switch (Op.getOpcode()) {
2247  default:
2248  llvm_unreachable("Invalid code");
2249  case ISD::ADDC:
2250  Opc = AArch64ISD::ADDS;
2251  break;
2252  case ISD::SUBC:
2253  Opc = AArch64ISD::SUBS;
2254  break;
2255  case ISD::ADDE:
2256  Opc = AArch64ISD::ADCS;
2257  ExtraOp = true;
2258  break;
2259  case ISD::SUBE:
2260  Opc = AArch64ISD::SBCS;
2261  ExtraOp = true;
2262  break;
2263  }
2264 
2265  if (!ExtraOp)
2266  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2267  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2268  Op.getOperand(2));
2269 }
2270 
2271 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2272  // Let legalize expand this if it isn't a legal type yet.
2273  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2274  return SDValue();
2275 
2276  SDLoc dl(Op);
2277  AArch64CC::CondCode CC;
2278  // The actual operation that sets the overflow or carry flag.
2279  SDValue Value, Overflow;
2280  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2281 
2282  // We use 0 and 1 as false and true values.
2283  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2284  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2285 
2286  // We use an inverted condition, because the conditional select is inverted
2287  // too. This will allow it to be selected to a single instruction:
2288  // CSINC Wd, WZR, WZR, invert(cond).
2289  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2290  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2291  CCVal, Overflow);
2292 
2293  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2294  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2295 }
2296 
2297 // Prefetch operands are:
2298 // 1: Address to prefetch
2299 // 2: bool isWrite
2300 // 3: int locality (0 = no locality ... 3 = extreme locality)
2301 // 4: bool isDataCache
2302 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2303  SDLoc DL(Op);
2304  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2305  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2306  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2307 
2308  bool IsStream = !Locality;
2309  // When the locality number is set
2310  if (Locality) {
2311  // The front-end should have filtered out the out-of-range values
2312  assert(Locality <= 3 && "Prefetch locality out-of-range");
2313  // The locality degree is the opposite of the cache speed.
2314  // Put the number the other way around.
2315  // The encoding starts at 0 for level 1
2316  Locality = 3 - Locality;
2317  }
2318 
2319  // Build the mask value encoding the expected behavior.
2320  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2321  (!IsData << 3) | // IsDataCache bit
2322  (Locality << 1) | // Cache level bits
2323  (unsigned)IsStream; // Stream bit
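 // (Illustrative encoding: a data prefetch-for-write with locality 3 gives
 // IsWrite=1, !IsData=0, Locality=0, IsStream=0, so PrfOp = 0b10000,
 // i.e. the PSTL1KEEP hint.)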
2324  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2325  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2326 }
2327 
2328 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2329  SelectionDAG &DAG) const {
2330  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2331 
2332  RTLIB::Libcall LC;
2333  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2334 
2335  return LowerF128Call(Op, DAG, LC);
2336 }
2337 
2338 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2339  SelectionDAG &DAG) const {
2340  if (Op.getOperand(0).getValueType() != MVT::f128) {
2341  // It's legal except when f128 is involved
2342  return Op;
2343  }
2344 
2345  RTLIB::Libcall LC;
2346  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2347 
2348  // FP_ROUND node has a second operand indicating whether it is known to be
2349  // precise. That doesn't take part in the LibCall so we can't directly use
2350  // LowerF128Call.
2351  SDValue SrcVal = Op.getOperand(0);
2352  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2353  SDLoc(Op)).first;
2354 }
2355 
2356 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2357  SelectionDAG &DAG) const {
2358  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2359  // Any additional optimization in this function should be recorded
2360  // in the cost tables.
2361  EVT InVT = Op.getOperand(0).getValueType();
2362  EVT VT = Op.getValueType();
2363  unsigned NumElts = InVT.getVectorNumElements();
2364 
2365  // f16 conversions are promoted to f32 when full fp16 is not supported.
2366  if (InVT.getVectorElementType() == MVT::f16 &&
2367  !Subtarget->hasFullFP16()) {
2368  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2369  SDLoc dl(Op);
2370  return DAG.getNode(
2371  Op.getOpcode(), dl, Op.getValueType(),
2372  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2373  }
2374 
2375  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2376  SDLoc dl(Op);
2377  SDValue Cv =
2378  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2379  Op.getOperand(0));
2380  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2381  }
2382 
2383  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2384  SDLoc dl(Op);
2385  MVT ExtVT =
2386  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2387  VT.getVectorNumElements());
2388  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2389  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2390  }
2391 
2392  // Type changing conversions are illegal.
2393  return Op;
2394 }
2395 
2396 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2397  SelectionDAG &DAG) const {
2398  if (Op.getOperand(0).getValueType().isVector())
2399  return LowerVectorFP_TO_INT(Op, DAG);
2400 
2401  // f16 conversions are promoted to f32 when full fp16 is not supported.
2402  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2403  !Subtarget->hasFullFP16()) {
2404  SDLoc dl(Op);
2405  return DAG.getNode(
2406  Op.getOpcode(), dl, Op.getValueType(),
2407  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2408  }
2409 
2410  if (Op.getOperand(0).getValueType() != MVT::f128) {
2411  // It's legal except when f128 is involved
2412  return Op;
2413  }
2414 
2415  RTLIB::Libcall LC;
2416  if (Op.getOpcode() == ISD::FP_TO_SINT)
2417  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2418  else
2419  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2420 
2421  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2422  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2423 }
2424 
2426  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2427  // Any additional optimization in this function should be recorded
2428  // in the cost tables.
2429  EVT VT = Op.getValueType();
2430  SDLoc dl(Op);
2431  SDValue In = Op.getOperand(0);
2432  EVT InVT = In.getValueType();
2433 
2434  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2435  MVT CastVT =
2436  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2437  InVT.getVectorNumElements());
2438  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2439  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2440  }
2441 
2442  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2443  unsigned CastOpc =
2444  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2445  EVT CastVT = VT.changeVectorElementTypeToInteger();
2446  In = DAG.getNode(CastOpc, dl, CastVT, In);
2447  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2448  }
2449 
2450  return Op;
2451 }
2452 
2453 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2454  SelectionDAG &DAG) const {
2455  if (Op.getValueType().isVector())
2456  return LowerVectorINT_TO_FP(Op, DAG);
2457 
2458  // f16 conversions are promoted to f32 when full fp16 is not supported.
2459  if (Op.getValueType() == MVT::f16 &&
2460  !Subtarget->hasFullFP16()) {
2461  SDLoc dl(Op);
2462  return DAG.getNode(
2463  ISD::FP_ROUND, dl, MVT::f16,
2464  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2465  DAG.getIntPtrConstant(0, dl));
2466  }
2467 
2468  // i128 conversions are libcalls.
2469  if (Op.getOperand(0).getValueType() == MVT::i128)
2470  return SDValue();
2471 
2472  // Other conversions are legal, unless it's to the completely software-based
2473  // fp128.
2474  if (Op.getValueType() != MVT::f128)
2475  return Op;
2476 
2477  RTLIB::Libcall LC;
2478  if (Op.getOpcode() == ISD::SINT_TO_FP)
2479  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2480  else
2481  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2482 
2483  return LowerF128Call(Op, DAG, LC);
2484 }
2485 
2486 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2487  SelectionDAG &DAG) const {
2488  // For iOS, we want to call an alternative entry point: __sincos_stret,
2489  // which returns the values in two S / D registers.
2490  SDLoc dl(Op);
2491  SDValue Arg = Op.getOperand(0);
2492  EVT ArgVT = Arg.getValueType();
2493  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2494 
2495  ArgListTy Args;
2496  ArgListEntry Entry;
2497 
2498  Entry.Node = Arg;
2499  Entry.Ty = ArgTy;
2500  Entry.IsSExt = false;
2501  Entry.IsZExt = false;
2502  Args.push_back(Entry);
2503 
2504  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2505  : RTLIB::SINCOS_STRET_F32;
2506  const char *LibcallName = getLibcallName(LC);
2507  SDValue Callee =
2508  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2509 
2510  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2511  TargetLowering::CallLoweringInfo CLI(DAG);
2512  CLI.setDebugLoc(dl)
2513  .setChain(DAG.getEntryNode())
2514  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2515 
2516  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2517  return CallResult.first;
2518 }
2519 
2521  if (Op.getValueType() != MVT::f16)
2522  return SDValue();
2523 
2524  assert(Op.getOperand(0).getValueType() == MVT::i16);
2525  SDLoc DL(Op);
2526 
2527  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2528  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2529  return SDValue(
2530  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2531  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2532  0);
2533 }
2534 
2535 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2536  if (OrigVT.getSizeInBits() >= 64)
2537  return OrigVT;
2538 
2539  assert(OrigVT.isSimple() && "Expecting a simple value type");
2540 
2541  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2542  switch (OrigSimpleTy) {
2543  default: llvm_unreachable("Unexpected Vector Type");
2544  case MVT::v2i8:
2545  case MVT::v2i16:
2546  return MVT::v2i32;
2547  case MVT::v4i8:
2548  return MVT::v4i16;
2549  }
2550 }
2551 
2552 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2553  const EVT &OrigTy,
2554  const EVT &ExtTy,
2555  unsigned ExtOpcode) {
2556  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2557  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2558  // 64-bits we need to insert a new extension so that it will be 64-bits.
2559  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2560  if (OrigTy.getSizeInBits() >= 64)
2561  return N;
2562 
2563  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2564  EVT NewVT = getExtensionTo64Bits(OrigTy);
2565 
2566  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2567 }
2568 
2569 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2570  bool isSigned) {
2571  EVT VT = N->getValueType(0);
2572 
2573  if (N->getOpcode() != ISD::BUILD_VECTOR)
2574  return false;
2575 
2576  for (const SDValue &Elt : N->op_values()) {
2577  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2578  unsigned EltSize = VT.getScalarSizeInBits();
2579  unsigned HalfSize = EltSize / 2;
2580  if (isSigned) {
2581  if (!isIntN(HalfSize, C->getSExtValue()))
2582  return false;
2583  } else {
2584  if (!isUIntN(HalfSize, C->getZExtValue()))
2585  return false;
2586  }
2587  continue;
2588  }
2589  return false;
2590  }
2591 
2592  return true;
2593 }
2594 
2595 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2596  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2597  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2598  N->getOperand(0)->getValueType(0),
2599  N->getValueType(0),
2600  N->getOpcode());
2601 
2602  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2603  EVT VT = N->getValueType(0);
2604  SDLoc dl(N);
2605  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2606  unsigned NumElts = VT.getVectorNumElements();
2607  MVT TruncVT = MVT::getIntegerVT(EltSize);
2608  SmallVector<SDValue, 8> Ops;
2609  for (unsigned i = 0; i != NumElts; ++i) {
2610  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2611  const APInt &CInt = C->getAPIntValue();
2612  // Element types smaller than 32 bits are not legal, so use i32 elements.
2613  // The values are implicitly truncated so sext vs. zext doesn't matter.
2614  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2615  }
2616  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2617 }
2618 
2619 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2620  return N->getOpcode() == ISD::SIGN_EXTEND ||
2621  isExtendedBUILD_VECTOR(N, DAG, true);
2622 }
2623 
2624 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2625  return N->getOpcode() == ISD::ZERO_EXTEND ||
2626  isExtendedBUILD_VECTOR(N, DAG, false);
2627 }
2628 
2629 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2630  unsigned Opcode = N->getOpcode();
2631  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2632  SDNode *N0 = N->getOperand(0).getNode();
2633  SDNode *N1 = N->getOperand(1).getNode();
2634  return N0->hasOneUse() && N1->hasOneUse() &&
2635  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2636  }
2637  return false;
2638 }
2639 
2640 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2641  unsigned Opcode = N->getOpcode();
2642  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2643  SDNode *N0 = N->getOperand(0).getNode();
2644  SDNode *N1 = N->getOperand(1).getNode();
2645  return N0->hasOneUse() && N1->hasOneUse() &&
2646  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2647  }
2648  return false;
2649 }
2650 
2651 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2652  SelectionDAG &DAG) const {
2653  // The rounding mode is in bits 23:22 of the FPCR.
2654  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
2655  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
2656  // so that the shift + and get folded into a bitfield extract.
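 // (Illustrative: FPCR.RMode = 0 (round to nearest) yields
 // ((0 + (1 << 22)) >> 22) & 3 = 1, and RMode = 3 (round toward zero)
 // yields 4 & 3 = 0, matching the mapping above.)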
2657  SDLoc dl(Op);
2658 
2659  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2660  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2661  MVT::i64));
2662  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2663  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2664  DAG.getConstant(1U << 22, dl, MVT::i32));
2665  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2666  DAG.getConstant(22, dl, MVT::i32));
2667  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2668  DAG.getConstant(3, dl, MVT::i32));
2669 }
2670 
2671 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2672  // Multiplications are only custom-lowered for 128-bit vectors so that
2673  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2674  EVT VT = Op.getValueType();
2675  assert(VT.is128BitVector() && VT.isInteger() &&
2676  "unexpected type for custom-lowering ISD::MUL");
2677  SDNode *N0 = Op.getOperand(0).getNode();
2678  SDNode *N1 = Op.getOperand(1).getNode();
2679  unsigned NewOpc = 0;
2680  bool isMLA = false;
2681  bool isN0SExt = isSignExtended(N0, DAG);
2682  bool isN1SExt = isSignExtended(N1, DAG);
2683  if (isN0SExt && isN1SExt)
2684  NewOpc = AArch64ISD::SMULL;
2685  else {
2686  bool isN0ZExt = isZeroExtended(N0, DAG);
2687  bool isN1ZExt = isZeroExtended(N1, DAG);
2688  if (isN0ZExt && isN1ZExt)
2689  NewOpc = AArch64ISD::UMULL;
2690  else if (isN1SExt || isN1ZExt) {
2691  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2692  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2693  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2694  NewOpc = AArch64ISD::SMULL;
2695  isMLA = true;
2696  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2697  NewOpc = AArch64ISD::UMULL;
2698  isMLA = true;
2699  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2700  std::swap(N0, N1);
2701  NewOpc = AArch64ISD::UMULL;
2702  isMLA = true;
2703  }
2704  }
2705 
2706  if (!NewOpc) {
2707  if (VT == MVT::v2i64)
2708  // Fall through to expand this. It is not legal.
2709  return SDValue();
2710  else
2711  // Other vector multiplications are legal.
2712  return Op;
2713  }
2714  }
2715 
2716  // Legalize to a S/UMULL instruction
2717  SDLoc DL(Op);
2718  SDValue Op0;
2719  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2720  if (!isMLA) {
2721  Op0 = skipExtensionForVectorMULL(N0, DAG);
2722  assert(Op0.getValueType().is64BitVector() &&
2723  Op1.getValueType().is64BitVector() &&
2724  "unexpected types for extended operands to VMULL");
2725  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2726  }
2727  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2728  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2729  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2730  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2731  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2732  EVT Op1VT = Op1.getValueType();
2733  return DAG.getNode(N0->getOpcode(), DL, VT,
2734  DAG.getNode(NewOpc, DL, VT,
2735  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2736  DAG.getNode(NewOpc, DL, VT,
2737  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2738 }
2739 
2740 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2741  SelectionDAG &DAG) const {
2742  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2743  SDLoc dl(Op);
2744  switch (IntNo) {
2745  default: return SDValue(); // Don't custom lower most intrinsics.
2746  case Intrinsic::thread_pointer: {
2747  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2748  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2749  }
2750  case Intrinsic::aarch64_neon_abs: {
2751  EVT Ty = Op.getValueType();
2752  if (Ty == MVT::i64) {
2753  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2754  Op.getOperand(1));
2755  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2756  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2757  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2758  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2759  } else {
2760  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2761  }
2762  }
2763  case Intrinsic::aarch64_neon_smax:
2764  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2765  Op.getOperand(1), Op.getOperand(2));
2766  case Intrinsic::aarch64_neon_umax:
2767  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2768  Op.getOperand(1), Op.getOperand(2));
2769  case Intrinsic::aarch64_neon_smin:
2770  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2771  Op.getOperand(1), Op.getOperand(2));
2772  case Intrinsic::aarch64_neon_umin:
2773  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2774  Op.getOperand(1), Op.getOperand(2));
2775 
2776  case Intrinsic::localaddress: {
2777  const auto &MF = DAG.getMachineFunction();
2778  const auto *RegInfo = Subtarget->getRegisterInfo();
2779  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2780  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2781  Op.getSimpleValueType());
2782  }
2783 
2784  case Intrinsic::eh_recoverfp: {
2785  // FIXME: This needs to be implemented to correctly handle highly aligned
2786  // stack objects. For now we simply return the incoming FP. Refer D53541
2787  // for more details.
2788  SDValue FnOp = Op.getOperand(1);
2789  SDValue IncomingFPOp = Op.getOperand(2);
2790  GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2791  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2792  if (!Fn)
2793  report_fatal_error(
2794  "llvm.eh.recoverfp must take a function as the first argument");
2795  return IncomingFPOp;
2796  }
2797  }
2798 }
2799 
2800 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2801 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
2802  EVT VT, EVT MemVT,
2803  SelectionDAG &DAG) {
2804  assert(VT.isVector() && "VT should be a vector type");
2805  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2806 
2807  SDValue Value = ST->getValue();
2808 
2809  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
2810  // extracts the word lane which represents the v4i8 subvector. It optimizes the store
2811  // to:
2812  //
2813  // xtn v0.8b, v0.8h
2814  // str s0, [x0]
2815 
2816  SDValue Undef = DAG.getUNDEF(MVT::i16);
2817  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2818  {Undef, Undef, Undef, Undef});
2819 
2820  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2821  Value, UndefVec);
2822  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2823 
2824  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2825  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2826  Trunc, DAG.getConstant(0, DL, MVT::i64));
2827 
2828  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2829  ST->getBasePtr(), ST->getMemOperand());
2830 }
2831 
2832 // Custom lowering for any store, vector or scalar, default or truncating.
2833 // Currently we only custom lower a truncating store from vector v4i16 to
2834 // v4i8.
2835 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2836  SelectionDAG &DAG) const {
2837  SDLoc Dl(Op);
2838  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2839  assert (StoreNode && "Can only custom lower store nodes");
2840 
2841  SDValue Value = StoreNode->getValue();
2842 
2843  EVT VT = Value.getValueType();
2844  EVT MemVT = StoreNode->getMemoryVT();
2845 
2846  assert (VT.isVector() && "Can only custom lower vector store types");
2847 
2848  unsigned AS = StoreNode->getAddressSpace();
2849  unsigned Align = StoreNode->getAlignment();
2850  if (Align < MemVT.getStoreSize() &&
2851  !allowsMisalignedMemoryAccesses(
2852  MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
2853  return scalarizeVectorStore(StoreNode, DAG);
2854  }
2855 
2856  if (StoreNode->isTruncatingStore()) {
2857  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2858  }
2859 
2860  return SDValue();
2861 }
2862 
2863 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2864  SelectionDAG &DAG) const {
2865  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2866  LLVM_DEBUG(Op.dump());
2867 
2868  switch (Op.getOpcode()) {
2869  default:
2870  llvm_unreachable("unimplemented operand");
2871  return SDValue();
2872  case ISD::BITCAST:
2873  return LowerBITCAST(Op, DAG);
2874  case ISD::GlobalAddress:
2875  return LowerGlobalAddress(Op, DAG);
2876  case ISD::GlobalTLSAddress:
2877  return LowerGlobalTLSAddress(Op, DAG);
2878  case ISD::SETCC:
2879  return LowerSETCC(Op, DAG);
2880  case ISD::BR_CC:
2881  return LowerBR_CC(Op, DAG);
2882  case ISD::SELECT:
2883  return LowerSELECT(Op, DAG);
2884  case ISD::SELECT_CC:
2885  return LowerSELECT_CC(Op, DAG);
2886  case ISD::JumpTable:
2887  return LowerJumpTable(Op, DAG);
2888  case ISD::BR_JT:
2889  return LowerBR_JT(Op, DAG);
2890  case ISD::ConstantPool:
2891  return LowerConstantPool(Op, DAG);
2892  case ISD::BlockAddress:
2893  return LowerBlockAddress(Op, DAG);
2894  case ISD::VASTART:
2895  return LowerVASTART(Op, DAG);
2896  case ISD::VACOPY:
2897  return LowerVACOPY(Op, DAG);
2898  case ISD::VAARG:
2899  return LowerVAARG(Op, DAG);
2900  case ISD::ADDC:
2901  case ISD::ADDE:
2902  case ISD::SUBC:
2903  case ISD::SUBE:
2904  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2905  case ISD::SADDO:
2906  case ISD::UADDO:
2907  case ISD::SSUBO:
2908  case ISD::USUBO:
2909  case ISD::SMULO:
2910  case ISD::UMULO:
2911  return LowerXALUO(Op, DAG);
2912  case ISD::FADD:
2913  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2914  case ISD::FSUB:
2915  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2916  case ISD::FMUL:
2917  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2918  case ISD::FDIV:
2919  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2920  case ISD::FP_ROUND:
2921  return LowerFP_ROUND(Op, DAG);
2922  case ISD::FP_EXTEND:
2923  return LowerFP_EXTEND(Op, DAG);
2924  case ISD::FRAMEADDR:
2925  return LowerFRAMEADDR(Op, DAG);
2926  case ISD::SPONENTRY:
2927  return LowerSPONENTRY(Op, DAG);
2928  case ISD::RETURNADDR:
2929  return LowerRETURNADDR(Op, DAG);
2930  case ISD::ADDROFRETURNADDR:
2931  return LowerADDROFRETURNADDR(Op, DAG);
2932  case ISD::INSERT_VECTOR_ELT:
2933  return LowerINSERT_VECTOR_ELT(Op, DAG);
2934  case ISD::EXTRACT_VECTOR_ELT:
2935  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2936  case ISD::BUILD_VECTOR:
2937  return LowerBUILD_VECTOR(Op, DAG);
2938  case ISD::VECTOR_SHUFFLE:
2939  return LowerVECTOR_SHUFFLE(Op, DAG);
2940  case ISD::EXTRACT_SUBVECTOR:
2941  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2942  case ISD::SRA:
2943  case ISD::SRL:
2944  case ISD::SHL:
2945  return LowerVectorSRA_SRL_SHL(Op, DAG);
2946  case ISD::SHL_PARTS:
2947  return LowerShiftLeftParts(Op, DAG);
2948  case ISD::SRL_PARTS:
2949  case ISD::SRA_PARTS:
2950  return LowerShiftRightParts(Op, DAG);
2951  case ISD::CTPOP:
2952  return LowerCTPOP(Op, DAG);
2953  case ISD::FCOPYSIGN:
2954  return LowerFCOPYSIGN(Op, DAG);
2955  case ISD::OR:
2956  return LowerVectorOR(Op, DAG);
2957  case ISD::XOR:
2958  return LowerXOR(Op, DAG);
2959  case ISD::PREFETCH:
2960  return LowerPREFETCH(Op, DAG);
2961  case ISD::SINT_TO_FP:
2962  case ISD::UINT_TO_FP:
2963  return LowerINT_TO_FP(Op, DAG);
2964  case ISD::FP_TO_SINT:
2965  case ISD::FP_TO_UINT:
2966  return LowerFP_TO_INT(Op, DAG);
2967  case ISD::FSINCOS:
2968  return LowerFSINCOS(Op, DAG);
2969  case ISD::FLT_ROUNDS_:
2970  return LowerFLT_ROUNDS_(Op, DAG);
2971  case ISD::MUL:
2972  return LowerMUL(Op, DAG);
2973  case ISD::INTRINSIC_WO_CHAIN:
2974  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2975  case ISD::STORE:
2976  return LowerSTORE(Op, DAG);
2977  case ISD::VECREDUCE_ADD:
2978  case ISD::VECREDUCE_SMAX:
2979  case ISD::VECREDUCE_SMIN:
2980  case ISD::VECREDUCE_UMAX:
2981  case ISD::VECREDUCE_UMIN:
2982  case ISD::VECREDUCE_FMAX:
2983  case ISD::VECREDUCE_FMIN:
2984  return LowerVECREDUCE(Op, DAG);
2985  case ISD::ATOMIC_LOAD_SUB:
2986  return LowerATOMIC_LOAD_SUB(Op, DAG);
2987  case ISD::ATOMIC_LOAD_AND:
2988  return LowerATOMIC_LOAD_AND(Op, DAG);
2989  case ISD::DYNAMIC_STACKALLOC:
2990  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2991  }
2992 }
2993 
2994 //===----------------------------------------------------------------------===//
2995 // Calling Convention Implementation
2996 //===----------------------------------------------------------------------===//
2997 
2998 /// Selects the correct CCAssignFn for a given CallingConvention value.
2999 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
3000  bool IsVarArg) const {
3001  switch (CC) {
3002  default:
3003  report_fatal_error("Unsupported calling convention.");
3004  case CallingConv::WebKit_JS:
3005  return CC_AArch64_WebKit_JS;
3006  case CallingConv::GHC:
3007  return CC_AArch64_GHC;
3008  case CallingConv::C:
3009  case CallingConv::Fast:
3010  case CallingConv::PreserveMost:
3011  case CallingConv::CXX_FAST_TLS:
3012  case CallingConv::Swift:
3013  if (Subtarget->isTargetWindows() && IsVarArg)
3014  return CC_AArch64_Win64_VarArg;
3015  if (!Subtarget->isTargetDarwin())
3016  return CC_AArch64_AAPCS;
3017  return CC_AArch64_DarwinPCS;
3018  case CallingConv::Win64:
3019  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3020  case CallingConv::AArch64_VectorCall:
3021  return CC_AArch64_AAPCS;
3022  }
3023 }
3024 
3025 CCAssignFn *
3026 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
3027  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
3028  : RetCC_AArch64_AAPCS;
3029 }
3030 
3031 SDValue AArch64TargetLowering::LowerFormalArguments(
3032  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3033  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3034  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3035  MachineFunction &MF = DAG.getMachineFunction();
3036  MachineFrameInfo &MFI = MF.getFrameInfo();
3037  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3038 
3039  // Assign locations to all of the incoming arguments.
3040  SmallVector<CCValAssign, 16> ArgLocs;
3041  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3042  *DAG.getContext());
3043 
3044  // At this point, Ins[].VT may already be promoted to i32. To correctly
3045  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3046  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3047  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3048  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3049  // LocVT.
3050  unsigned NumArgs = Ins.size();
3051  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3052  unsigned CurArgIdx = 0;
3053  for (unsigned i = 0; i != NumArgs; ++i) {
3054  MVT ValVT = Ins[i].VT;
3055  if (Ins[i].isOrigArg()) {
3056  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3057  CurArgIdx = Ins[i].getOrigArgIndex();
3058 
3059  // Get type of the original argument.
3060  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3061  /*AllowUnknown*/ true);
3062  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3063  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3064  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3065  ValVT = MVT::i8;
3066  else if (ActualMVT == MVT::i16)
3067  ValVT = MVT::i16;
3068  }
3069  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3070  bool Res =
3071  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3072  assert(!Res && "Call operand has unhandled type");
3073  (void)Res;
3074  }
3075  assert(ArgLocs.size() == Ins.size());
3076  SmallVector<SDValue, 16> ArgValues;
3077  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3078  CCValAssign &VA = ArgLocs[i];
3079 
3080  if (Ins[i].Flags.isByVal()) {
3081  // Byval is used for HFAs in the PCS, but the system should work in a
3082  // non-compliant manner for larger structs.
3083  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3084  int Size = Ins[i].Flags.getByValSize();
3085  unsigned NumRegs = (Size + 7) / 8;
3086 
3087  // FIXME: This works on big-endian for composite byvals, which are the common
3088  // case. It should also work for fundamental types too.
3089  unsigned FrameIdx =
3090  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3091  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3092  InVals.push_back(FrameIdxN);
3093 
3094  continue;
3095  }
3096 
3097  if (VA.isRegLoc()) {
3098  // Arguments stored in registers.
3099  EVT RegVT = VA.getLocVT();
3100 
3101  SDValue ArgValue;
3102  const TargetRegisterClass *RC;
3103 
3104  if (RegVT == MVT::i32)
3105  RC = &AArch64::GPR32RegClass;
3106  else if (RegVT == MVT::i64)
3107  RC = &AArch64::GPR64RegClass;
3108  else if (RegVT == MVT::f16)
3109  RC = &AArch64::FPR16RegClass;
3110  else if (RegVT == MVT::f32)
3111  RC = &AArch64::FPR32RegClass;
3112  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3113  RC = &AArch64::FPR64RegClass;
3114  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3115  RC = &AArch64::FPR128RegClass;
3116  else
3117  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3118 
3119  // Transform the arguments in physical registers into virtual ones.
3120  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3121  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3122 
3123  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3124  // to 64 bits. Insert an assert[sz]ext to capture this, then
3125  // truncate to the right size.
3126  switch (VA.getLocInfo()) {
3127  default:
3128  llvm_unreachable("Unknown loc info!");
3129  case CCValAssign::Full:
3130  break;
3131  case CCValAssign::BCvt:
3132  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3133  break;
3134  case CCValAssign::AExt:
3135  case CCValAssign::SExt:
3136  case CCValAssign::ZExt:
3137  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3138  // nodes after our lowering.
3139  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3140  break;
3141  }
3142 
3143  InVals.push_back(ArgValue);
3144 
3145  } else { // VA.isRegLoc()
3146  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3147  unsigned ArgOffset = VA.getLocMemOffset();
3148  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3149 
3150  uint32_t BEAlign = 0;
3151  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3152  !Ins[i].Flags.isInConsecutiveRegs())
3153  BEAlign = 8 - ArgSize;
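 // (Example: a 4-byte stack argument on a big-endian target gets
 // BEAlign = 4, so the fixed object is created at ArgOffset + 4, the half
 // of the 8-byte slot that actually holds the value.)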
3154 
3155  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3156 
3157  // Create load nodes to retrieve arguments from the stack.
3158  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3159  SDValue ArgValue;
3160 
3161  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
3162  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
3163  MVT MemVT = VA.getValVT();
3164 
3165  switch (VA.getLocInfo()) {
3166  default:
3167  break;
3168  case CCValAssign::BCvt:
3169  MemVT = VA.getLocVT();
3170  break;
3171  case CCValAssign::SExt:
3172  ExtType = ISD::SEXTLOAD;
3173  break;
3174  case CCValAssign::ZExt:
3175  ExtType = ISD::ZEXTLOAD;
3176  break;
3177  case CCValAssign::AExt:
3178  ExtType = ISD::EXTLOAD;
3179  break;
3180  }
3181 
3182  ArgValue = DAG.getExtLoad(
3183  ExtType, DL, VA.getLocVT(), Chain, FIN,
3184  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3185  MemVT);
3186 
3187  InVals.push_back(ArgValue);
3188  }
3189  }
3190 
3191  // varargs
3192  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3193  if (isVarArg) {
3194  if (!Subtarget->isTargetDarwin() || IsWin64) {
3195  // The AAPCS variadic function ABI is identical to the non-variadic
3196  // one. As a result there may be more arguments in registers and we should
3197  // save them for future reference.
3198  // Win64 variadic functions also pass arguments in registers, but all float
3199  // arguments are passed in integer registers.
3200  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3201  }
3202 
3203  // This will point to the next argument passed via stack.
3204  unsigned StackOffset = CCInfo.getNextStackOffset();
3205  // We currently pass all varargs at 8-byte alignment.
3206  StackOffset = ((StackOffset + 7) & ~7);
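 // (Example: a next-stack-offset of 20 is rounded up to 24 by
 // (20 + 7) & ~7.)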
3207  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3208 
3209  if (MFI.hasMustTailInVarArgFunc()) {
3210  SmallVector<MVT, 2> RegParmTypes;
3211  RegParmTypes.push_back(MVT::i64);
3212  RegParmTypes.push_back(MVT::f128);
3213  // Compute the set of forwarded registers. The rest are scratch.
3214  SmallVectorImpl<ForwardedRegister> &Forwards =
3215  FuncInfo->getForwardedMustTailRegParms();
3216  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3217  CC_AArch64_AAPCS);
3218 
3219  // Conservatively forward X8, since it might be used for aggregate return.
3220  if (!CCInfo.isAllocated(AArch64::X8)) {
3221  unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3222  Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3223  }
3224  }
3225  }
3226 
3227  // On Windows, InReg pointers must be returned, so record the pointer in a
3228  // virtual register at the start of the function so it can be returned in the
3229  // epilogue.
3230  if (IsWin64) {
3231  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3232  if (Ins[I].Flags.isInReg()) {
3233  assert(!FuncInfo->getSRetReturnReg());
3234 
3235  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3236  unsigned Reg =
3237  MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3238  FuncInfo->setSRetReturnReg(Reg);
3239 
3240  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3241  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3242  break;
3243  }
3244  }
3245  }
3246 
3247  unsigned StackArgSize = CCInfo.getNextStackOffset();
3248  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3249  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3250  // This is a non-standard ABI so by fiat I say we're allowed to make full
3251  // use of the stack area to be popped, which must be aligned to 16 bytes in
3252  // any case:
3253  StackArgSize = alignTo(StackArgSize, 16);
3254 
3255  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3256  // a multiple of 16.
3257  FuncInfo->setArgumentStackToRestore(StackArgSize);
3258 
3259  // This realignment carries over to the available bytes below. Our own
3260  // callers will guarantee the space is free by giving an aligned value to
3261  // CALLSEQ_START.
3262  }
3263  // Even if we're not expected to free up the space, it's useful to know how
3264  // much is there while considering tail calls (because we can reuse it).
3265  FuncInfo->setBytesInStackArgArea(StackArgSize);
3266 
3267  if (Subtarget->hasCustomCallingConv())
3268  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3269 
3270  return Chain;
3271 }
3272 
3273 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3274  SelectionDAG &DAG,
3275  const SDLoc &DL,
3276  SDValue &Chain) const {
3277  MachineFunction &MF = DAG.getMachineFunction();
3278  MachineFrameInfo &MFI = MF.getFrameInfo();
3279  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3280  auto PtrVT = getPointerTy(DAG.getDataLayout());
3281  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3282 
3283  SmallVector<SDValue, 8> MemOps;
3284 
3285  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3286  AArch64::X3, AArch64::X4, AArch64::X5,
3287  AArch64::X6, AArch64::X7 };
3288  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3289  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3290 
3291  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
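 // (Example: if the named arguments consume X0-X2, FirstVariadicGPR is 3
 // and GPRSaveSize is 8 * (8 - 3) = 40 bytes, covering X3-X7.)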
3292  int GPRIdx = 0;
3293  if (GPRSaveSize != 0) {
3294  if (IsWin64) {
3295  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3296  if (GPRSaveSize & 15)
3297  // The extra size here, if triggered, will always be 8.
3298  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3299  } else
3300  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3301 
3302  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3303 
3304  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3305  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3306  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3307  SDValue Store = DAG.getStore(
3308  Val.getValue(1), DL, Val, FIN,
3309  IsWin64
3310  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
3311  GPRIdx,
3312  (i - FirstVariadicGPR) * 8)
3313  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
3314  MemOps.push_back(Store);
3315  FIN =
3316  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3317  }
3318  }
3319  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3320  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3321 
3322  if (Subtarget->hasFPARMv8() && !IsWin64) {
3323  static const MCPhysReg FPRArgRegs[] = {
3324  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3325  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3326  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3327  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3328 
3329  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3330  int FPRIdx = 0;
3331  if (FPRSaveSize != 0) {
3332  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3333 
3334  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3335 
3336  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3337  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3338  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3339 
3340  SDValue Store = DAG.getStore(
3341  Val.getValue(1), DL, Val, FIN,
3342  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3343  MemOps.push_back(Store);
3344  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3345  DAG.getConstant(16, DL, PtrVT));
3346  }
3347  }
3348  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3349  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3350  }
3351 
3352  if (!MemOps.empty()) {
3353  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3354  }
3355 }
3356 
3357 /// LowerCallResult - Lower the result values of a call into the
3358 /// appropriate copies out of appropriate physical registers.
3359 SDValue AArch64TargetLowering::LowerCallResult(
3360  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3361  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3362  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3363  SDValue ThisVal) const {
3364  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3365  ? RetCC_AArch64_WebKit_JS
3366  : RetCC_AArch64_AAPCS;
3367  // Assign locations to each value returned by this call.
3368  SmallVector<CCValAssign, 16> RVLocs;
3369  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3370  *DAG.getContext());
3371  CCInfo.AnalyzeCallResult(Ins, RetCC);
3372 
3373  // Copy all of the result registers out of their specified physreg.
3374  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3375  CCValAssign VA = RVLocs[i];
3376 
3377  // Pass 'this' value directly from the argument to return value, to avoid
3378  // reg unit interference
3379  if (i == 0 && isThisReturn) {
3380  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3381  "unexpected return calling convention register assignment");
3382  InVals.push_back(ThisVal);
3383  continue;
3384  }
3385 
3386  SDValue Val =
3387  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3388  Chain = Val.getValue(1);
3389  InFlag = Val.getValue(2);
3390 
3391  switch (VA.getLocInfo()) {
3392  default:
3393  llvm_unreachable("Unknown loc info!");
3394  case CCValAssign::Full:
3395  break;
3396  case CCValAssign::BCvt:
3397  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3398  break;
3399  }
3400 
3401  InVals.push_back(Val);
3402  }
3403 
3404  return Chain;
3405 }
3406 
3407 /// Return true if the calling convention is one that we can guarantee TCO for.
3409  return CC == CallingConv::Fast;
3410 }
3411 
3412 /// Return true if we might ever do TCO for calls with this calling convention.
3414  switch (CC) {
3415  case CallingConv::C:
3416  case CallingConv::PreserveMost:
3417  case CallingConv::Swift:
3418  return true;
3419  default:
3420  return canGuaranteeTCO(CC);
3421  }
3422 }
3423 
3424 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3425  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3426  const SmallVectorImpl<ISD::OutputArg> &Outs,
3427  const SmallVectorImpl<SDValue> &OutVals,
3428  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3429  if (!mayTailCallThisCC(CalleeCC))
3430  return false;
3431 
3432  MachineFunction &MF = DAG.getMachineFunction();
3433  const Function &CallerF = MF.getFunction();
3434  CallingConv::ID CallerCC = CallerF.getCallingConv();
3435  bool CCMatch = CallerCC == CalleeCC;
3436 
3437  // Byval parameters hand the function a pointer directly into the stack area
3438  // we want to reuse during a tail call. Working around this *is* possible (see
3439  // X86) but less efficient and uglier in LowerCall.
3440  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3441  e = CallerF.arg_end();
3442  i != e; ++i) {
3443  if (i->hasByValAttr())
3444  return false;
3445 
3446  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3447  // In this case, it is necessary to save/restore X0 in the callee. Tail
3448  // call opt interferes with this. So we disable tail call opt when the
3449  // caller has an argument with "inreg" attribute.
3450 
3451  // FIXME: Check whether the callee also has an "inreg" argument.
3452  if (i->hasInRegAttr())
3453  return false;
3454  }
3455 
3456  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3457  return canGuaranteeTCO(CalleeCC) && CCMatch;
3458 
3459  // Externally-defined functions with weak linkage should not be
3460  // tail-called on AArch64 when the OS does not support dynamic
3461  // pre-emption of symbols, as the AAELF spec requires normal calls
3462  // to undefined weak functions to be replaced with a NOP or jump to the
3463  // next instruction. The behaviour of branch instructions in this
3464  // situation (as used for tail calls) is implementation-defined, so we
3465  // cannot rely on the linker replacing the tail call with a return.
3466  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3467  const GlobalValue *GV = G->getGlobal();
3468  const Triple &TT = getTargetMachine().getTargetTriple();
3469  if (GV->hasExternalWeakLinkage() &&
3470  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3471  return false;
3472  }
3473 
3474  // Now we search for cases where we can use a tail call without changing the
3475  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3476  // concept.
3477 
3478  // I want anyone implementing a new calling convention to think long and hard
3479  // about this assert.
3480  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3481  "Unexpected variadic calling convention");
3482 
3483  LLVMContext &C = *DAG.getContext();
3484  if (isVarArg && !Outs.empty()) {
3485  // At least two cases here: if caller is fastcc then we can't have any
3486  // memory arguments (we'd be expected to clean up the stack afterwards). If
3487  // caller is C then we could potentially use its argument area.
3488 
3489  // FIXME: for now we take the most conservative of these in both cases:
3490  // disallow all variadic memory operands.
3491  SmallVector<CCValAssign, 16> ArgLocs;
3492  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3493 
3494  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3495  for (const CCValAssign &ArgLoc : ArgLocs)
3496  if (!ArgLoc.isRegLoc())
3497  return false;
3498  }
3499 
3500  // Check that the call results are passed in the same way.
3501  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3502  CCAssignFnForCall(CalleeCC, isVarArg),
3503  CCAssignFnForCall(CallerCC, isVarArg)))
3504  return false;
3505  // The callee has to preserve all registers the caller needs to preserve.
3506  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3507  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3508  if (!CCMatch) {
3509  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3510  if (Subtarget->hasCustomCallingConv()) {
3511  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3512  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3513  }
3514  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3515  return false;
3516  }
3517 
3518  // Nothing more to check if the callee is taking no arguments
3519  if (Outs.empty())
3520  return true;
3521 
3522  SmallVector<CCValAssign, 16> ArgLocs;
3523  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3524 
3525  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3526 
3527  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3528 
3529  // If the stack arguments for this call do not fit into our own save area then
3530  // the call cannot be made tail.
3531  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3532  return false;
3533 
3534  const MachineRegisterInfo &MRI = MF.getRegInfo();
3535  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3536  return false;
3537 
3538  return true;
3539 }
3540 
3541 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3542  SelectionDAG &DAG,
3543  MachineFrameInfo &MFI,
3544  int ClobberedFI) const {
3545  SmallVector<SDValue, 8> ArgChains;
3546  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3547  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3548 
3549  // Include the original chain at the beginning of the list. When this is
3550  // used by target LowerCall hooks, this helps legalize find the
3551  // CALLSEQ_BEGIN node.
3552  ArgChains.push_back(Chain);
3553 
3554  // Add a chain value for each stack argument load that overlaps the clobbered slot.
3555  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3556  UE = DAG.getEntryNode().getNode()->use_end();
3557  U != UE; ++U)
3558  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3559  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3560  if (FI->getIndex() < 0) {
3561  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3562  int64_t InLastByte = InFirstByte;
3563  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3564 
3565  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3566  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3567  ArgChains.push_back(SDValue(L, 1));
3568  }
3569 
3570  // Build a tokenfactor for all the chains.
3571  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3572 }
3573 
3574 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3575  bool TailCallOpt) const {
3576  return CallCC == CallingConv::Fast && TailCallOpt;
3577 }
3578 
3579 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3580 /// and add input and output parameter nodes.
3581 SDValue
3582 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3583  SmallVectorImpl<SDValue> &InVals) const {
3584  SelectionDAG &DAG = CLI.DAG;
3585  SDLoc &DL = CLI.DL;
3586  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3587  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3588  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3589  SDValue Chain = CLI.Chain;
3590  SDValue Callee = CLI.Callee;
3591  bool &IsTailCall = CLI.IsTailCall;
3592  CallingConv::ID CallConv = CLI.CallConv;
3593  bool IsVarArg = CLI.IsVarArg;
3594 
3595  MachineFunction &MF = DAG.getMachineFunction();
3596  bool IsThisReturn = false;
3597 
3598  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3599  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3600  bool IsSibCall = false;
3601 
3602  if (IsTailCall) {
3603  // Check if it's really possible to do a tail call.
3604  IsTailCall = isEligibleForTailCallOptimization(
3605  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3606  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3607  report_fatal_error("failed to perform tail call elimination on a call "
3608  "site marked musttail");
3609 
3610  // A sibling call is one where we're under the usual C ABI and not planning
3611  // to change that but can still do a tail call:
3612  if (!TailCallOpt && IsTailCall)
3613  IsSibCall = true;
3614 
3615  if (IsTailCall)
3616  ++NumTailCalls;
3617  }
3618 
3619  // Analyze operands of the call, assigning locations to each operand.
3620  SmallVector<CCValAssign, 16> ArgLocs;
3621  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3622  *DAG.getContext());
3623 
3624  if (IsVarArg) {
3625  // Handle fixed and variable vector arguments differently.
3626  // Variable vector arguments always go into memory.
3627  unsigned NumArgs = Outs.size();
3628 
3629  for (unsigned i = 0; i != NumArgs; ++i) {
3630  MVT ArgVT = Outs[i].VT;
3631  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3632  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3633  /*IsVarArg=*/ !Outs[i].IsFixed);
3634  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3635  assert(!Res && "Call operand has unhandled type");
3636  (void)Res;
3637  }
3638  } else {
3639  // At this point, Outs[].VT may already be promoted to i32. To correctly
3640  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3641  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3642  // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
3643  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3644  // LocVT.
3645  unsigned NumArgs = Outs.size();
3646  for (unsigned i = 0; i != NumArgs; ++i) {
3647  MVT ValVT = Outs[i].VT;
3648  // Get type of the original argument.
3649  EVT ActualVT = getValueType(DAG.getDataLayout(),
3650  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3651  /*AllowUnknown*/ true);
3652  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3653  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3654  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3655  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3656  ValVT = MVT::i8;
3657  else if (ActualMVT == MVT::i16)
3658  ValVT = MVT::i16;
3659 
3660  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3661  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3662  assert(!Res && "Call operand has unhandled type");
3663  (void)Res;
3664  }
3665  }
3666 
3667  // Get a count of how many bytes are to be pushed on the stack.
3668  unsigned NumBytes = CCInfo.getNextStackOffset();
3669 
3670  if (IsSibCall) {
3671  // Since we're not changing the ABI to make this a tail call, the memory
3672  // operands are already available in the caller's incoming argument space.
3673  NumBytes = 0;
3674  }
3675 
3676  // FPDiff is the byte offset of the call's argument area from the callee's.
3677  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3678  // by this amount for a tail call. In a sibling call it must be 0 because the
3679  // caller will deallocate the entire stack and the callee still expects its
3680  // arguments to begin at SP+0. Completely unused for non-tail calls.
3681  int FPDiff = 0;
3682 
3683  if (IsTailCall && !IsSibCall) {
3684  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3685 
3686  // Since the callee pops the argument stack as part of a tail call, we must
3687  // keep the popped size 16-byte aligned.
3688  NumBytes = alignTo(NumBytes, 16);
3689 
3690  // FPDiff will be negative if this tail call requires more space than we
3691  // would automatically have in our incoming argument space. Positive if we
3692  // can actually shrink the stack.
3693  FPDiff = NumReusableBytes - NumBytes;
3694 
3695  // The stack pointer must be 16-byte aligned at all times it's used for a
3696  // memory operation, which in practice means at *all* times and in
3697  // particular across call boundaries. Therefore our own arguments started at
3698  // a 16-byte aligned SP and the delta applied for the tail call should
3699  // satisfy the same constraint.
3700  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3701  }
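// Sketch with assumed sizes: if the caller's own incoming argument area is 32
// bytes (NumReusableBytes == 32) and this tail call needs 48 bytes of outgoing
// arguments, NumBytes stays 48 after rounding and FPDiff becomes
// 32 - 48 = -16, i.e. the callee's argument area starts 16 bytes below the
// caller's. Both values are 16-byte multiples, so the assert above holds.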
3702 
3703  // Adjust the stack pointer for the new arguments...
3704  // These operations are automatically eliminated by the prolog/epilog pass
3705  if (!IsSibCall)
3706  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3707 
3708  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3709  getPointerTy(DAG.getDataLayout()));
3710 
3711  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3712  SmallVector<SDValue, 8> MemOpChains;
3713  auto PtrVT = getPointerTy(DAG.getDataLayout());
3714 
3715  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3716  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3717  for (const auto &F : Forwards) {
3718  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3719  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3720  }
3721  }
3722 
3723  // Walk the register/memloc assignments, inserting copies/loads.
3724  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3725  ++i, ++realArgIdx) {
3726  CCValAssign &VA = ArgLocs[i];
3727  SDValue Arg = OutVals[realArgIdx];
3728  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3729 
3730  // Promote the value if needed.
3731  switch (VA.getLocInfo()) {
3732  default:
3733  llvm_unreachable("Unknown loc info!");
3734  case CCValAssign::Full:
3735  break;
3736  case CCValAssign::SExt:
3737  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3738  break;
3739  case CCValAssign::ZExt:
3740  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3741  break;
3742  case CCValAssign::AExt:
3743  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3744  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3745  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3746  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3747  }
3748  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3749  break;
3750  case CCValAssign::BCvt:
3751  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3752  break;
3753  case CCValAssign::FPExt:
3754  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3755  break;
3756  }
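// As a hedged illustration of the AExt case above: an i1 argument whose ArgVT
// is MVT::i1 is truncated back to i1, zero-extended to i8 (so the caller
// provides 0/1 in the low byte as AAPCS requires), and only then any-extended
// to the location type, leaving the bits above bit 7 unspecified.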
3757 
3758  if (VA.isRegLoc()) {
3759  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3760  Outs[0].VT == MVT::i64) {
3761  assert(VA.getLocVT() == MVT::i64 &&
3762  "unexpected calling convention register assignment");
3763  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3764  "unexpected use of 'returned'");
3765  IsThisReturn = true;
3766  }
3767  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3768  } else {
3769  assert(VA.isMemLoc());
3770 
3771  SDValue DstAddr;
3772  MachinePointerInfo DstInfo;
3773 
3774  // FIXME: This works on big-endian for composite byvals, which are the
3775  // common case. It should also work for fundamental types.
3776  uint32_t BEAlign = 0;
3777  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3778  : VA.getValVT().getSizeInBits();
3779  OpSize = (OpSize + 7) / 8;
3780  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3781  !Flags.isInConsecutiveRegs()) {
3782  if (OpSize < 8)
3783  BEAlign = 8 - OpSize;
3784  }
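// Rough example (assumed operand size): on a big-endian subtarget a 2-byte
// non-byval argument gets BEAlign = 8 - 2 = 6, shifting the store towards the
// high-address end of its 8-byte slot so it lines up with the matching
// adjustment made when the callee lowers its formal arguments.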
3785  unsigned LocMemOffset = VA.getLocMemOffset();
3786  int32_t Offset = LocMemOffset + BEAlign;
3787  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3788  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3789 
3790  if (IsTailCall) {
3791  Offset = Offset + FPDiff;
3792  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3793 
3794  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3795  DstInfo =
3796  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3797 
3798  // Make sure any stack arguments overlapping with where we're storing
3799  // are loaded before this eventual operation. Otherwise they'll be
3800  // clobbered.
3801  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3802  } else {
3803  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3804 
3805  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3806  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3807  LocMemOffset);
3808  }
3809 
3810  if (Outs[i].Flags.isByVal()) {
3811  SDValue SizeNode =
3812  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3813  SDValue Cpy = DAG.getMemcpy(
3814  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3815  /*isVol = */ false, /*AlwaysInline = */ false,
3816  /*isTailCall = */ false,
3817  DstInfo, MachinePointerInfo());
3818 
3819  MemOpChains.push_back(Cpy);
3820  } else {
3821  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3822  // promoted to a legal register type i32, we should truncate Arg back to
3823  // i1/i8/i16.
3824  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3825  VA.getValVT() == MVT::i16)
3826  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3827 
3828  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3829  MemOpChains.push_back(Store);
3830  }
3831  }
3832  }
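// Schematic byval case for the loop above (hypothetical IR): an argument such
// as "%struct.S* byval align 8 %s" with getByValSize() == 24 is copied with a
// 24-byte memcpy into its outgoing slot, whereas a plain i8 argument is
// truncated back from its promoted i32 form and stored with a 1-byte store.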
3833 
3834  if (!MemOpChains.empty())
3835  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3836 
3837  // Build a sequence of copy-to-reg nodes chained together with token chain
3838  // and flag operands which copy the outgoing args into the appropriate regs.
3839  SDValue InFlag;
3840  for (auto &RegToPass : RegsToPass) {
3841  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3842  RegToPass.second, InFlag);
3843  InFlag = Chain.getValue(1);
3844  }
3845 
3846  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3847  // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
3848  // node so that legalize doesn't hack it.
3849  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3850  auto GV = G->getGlobal();
3851  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3852  AArch64II::MO_GOT) {
3853  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3854  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3855  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3856  assert(Subtarget->isTargetWindows() &&
3857  "Windows is the only supported COFF target");
3858  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3859  } else {
3860  const GlobalValue *GV = G->getGlobal();
3861  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3862  }
3863  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3864  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3865  Subtarget->isTargetMachO()) {
3866  const char *Sym = S->getSymbol();
3867  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3868  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3869  } else {
3870  const char *Sym = S->getSymbol();
3871  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3872  }
3873  }
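// Hedged summary of the classification above: a function that
// classifyGlobalFunctionReference says must go via the GOT is wrapped in
// AArch64ISD::LOADgot; a dllimport'ed callee on Windows/COFF is reached
// through its __imp_ pointer; external symbols under the large code model on
// MachO also go through the GOT; everything else becomes a plain
// TargetGlobalAddress/TargetExternalSymbol that later folds into a direct BL.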
3874 
3875  // We don't usually want to end the call-sequence here because we would tidy
3876  // the frame up *after* the call. However, in the ABI-changing tail-call case
3877  // we've carefully laid out the parameters so that when sp is reset they'll be
3878  // in the correct location.
3879  if (IsTailCall && !IsSibCall) {
3880  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3881  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3882  InFlag = Chain.getValue(1);
3883  }
3884 
3885  std::vector<SDValue> Ops;
3886  Ops.push_back(Chain);
3887  Ops.push_back(Callee);
3888 
3889  if (IsTailCall) {
3890  // Each tail call may have to adjust the stack by a different amount, so
3891  // this information must travel along with the operation for eventual
3892  // consumption by emitEpilogue.
3893  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3894  }
3895 
3896  // Add argument registers to the end of the list so that they are known live
3897  // into the call.
3898  for (auto &RegToPass : RegsToPass)
3899  Ops.push_back(DAG.getRegister(RegToPass.first,
3900  RegToPass.second.getValueType()));
3901 
3902  // Add a register mask operand representing the call-preserved registers.
3903  const uint32_t *Mask;
3904  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3905  if (IsThisReturn) {
3906  // For 'this' returns, use the X0-preserving mask if applicable
3907  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3908  if (!Mask) {
3909  IsThisReturn = false;
3910  Mask = TRI->getCallPreservedMask(MF, CallConv);
3911  }
3912  } else
3913  Mask = TRI->getCallPreservedMask(MF, CallConv);
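// For example, a C++ 'this'-returning call may use the X0-preserving variant
// of the mask so that later passes can keep using the 'this' pointer in X0
// after the call; when no such mask exists for the calling convention we fall
// back to the ordinary call-preserved mask and drop the IsThisReturn
// optimisation.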
3914 
3915  if (Subtarget->hasCustomCallingConv())
3916  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
3917 
3918  if (TRI->isAnyArgRegReserved(MF))
3919  TRI->emitReservedArgRegCallError(MF);
3920 
3921  assert(Mask && "Missing call preserved mask for calling convention");
3922  Ops.push_back(DAG.getRegisterMask(Mask));
3923 
3924  if (InFlag.getNode())
3925  Ops.push_back(InFlag);
3926 
3927  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3928 
3929  // If we're doing a tail call, use a TC_RETURN here rather than an
3930  // actual call instruction.
3931  if (IsTailCall) {
3932  MF.getFrameInfo().setHasTailCall();
3933  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3934  }
3935 
3936  // Returns a chain and a flag for retval copy to use.
3937  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3938  InFlag = Chain.getValue(1);
3939 
3940  uint64_t CalleePopBytes =
3941  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3942 
3943  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3944  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3945  InFlag, DL);
3946  if (!Ins.empty())
3947  InFlag = Chain.getValue(1);
3948 
3949  // Handle result values, copying them out of physregs into vregs that we
3950  // return.
3951  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3952  InVals, IsThisReturn,
3953  IsThisReturn ? OutVals[0] : SDValue());
3954 }
3955 
3956 bool AArch64TargetLowering::CanLowerReturn(
3957  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3958  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3959  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3960  ? RetCC_AArch64_WebKit_JS
3961  : RetCC_AArch64_AAPCS;
3962  SmallVector<CCValAssign, 16> RVLocs;
3963  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3964  return CCInfo.CheckReturn(Outs, RetCC);
3965 }
3966 
3967 SDValue
3968 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3969  bool isVarArg,
3970  const SmallVectorImpl<ISD::OutputArg> &Outs,
3971  const SmallVectorImpl<SDValue> &OutVals,
3972  const SDLoc &DL, SelectionDAG &DAG) const {
3973  auto &MF = DAG.getMachineFunction();
3974  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3975 
3976  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3977  ? RetCC_AArch64_WebKit_JS
3978  : RetCC_AArch64_AAPCS;
3979  SmallVector<CCValAssign, 16> RVLocs;
3980  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3981  *DAG.getContext());
3982  CCInfo.AnalyzeReturn(Outs, RetCC);
3983 
3984  // Copy the result values into the output registers.
3985  SDValue Flag;
3986  SmallVector<SDValue, 4> RetOps(1, Chain);
3987  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3988  ++i, ++realRVLocIdx) {
3989  CCValAssign &VA = RVLocs[i];
3990  assert(VA.isRegLoc() && "Can only return in registers!");
3991  SDValue Arg = OutVals[realRVLocIdx];
3992 
3993  switch (VA.getLocInfo()) {
3994  default:
3995  llvm_unreachable("Unknown loc info!");
3996  case CCValAssign::Full:
3997  if (Outs[i].ArgVT == MVT::i1) {
3998  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3999  // value. This is strictly redundant on Darwin (which uses "zeroext
4000  // i1"), but will be optimised out before ISel.
4001  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
4002  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4003  }
4004  break;
4005  case CCValAssign::BCvt:
4006  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4007  break;
4008  }
4009 
4010  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
4011  Flag = Chain.getValue(1);
4012  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
4013  }
4014 
4015  // Windows AArch64 ABIs require that for returning structs by value we copy
4016  // the sret argument into X0 for the return.
4017  // We saved the argument into a virtual register in the entry block,
4018  // so now we copy the value out and into X0.
4019  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
4020  SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
4021  getPointerTy(MF.getDataLayout()));
4022 
4023  unsigned RetValReg = AArch64::X0;
4024  Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
4025  Flag = Chain.getValue(1);
4026 
4027  RetOps.push_back(
4028  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
4029  }
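// Minimal illustration (assumed IR) of the block above: for a Windows target
// function like "define void @f(%struct.Big* sret %out)", the sret pointer
// saved into SRetReturnReg at function entry is copied into X0 here so the
// caller finds the returned object's address in X0, as the Windows ABI
// expects; when SRetReturnReg is unset this block is skipped.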
4030 
4031  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4032  const MCPhysReg *I =
4033  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
4034  if (I) {
4035  for (; *I; ++I) {
4036  if (AArch64::GPR64RegClass.contains(*I))
4037  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
4038  else if (AArch64::FPR64RegClass.contains(*I))
4039  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
4040  else
4041  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
4042  }
4043  }
4044 
4045  RetOps[0] = Chain; // Update chain.
4046 
4047  // Add the flag if we have it.
4048  if (Flag.getNode())
4049  RetOps.push_back(Flag);
4050 
4051  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
4052 }
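// Rough end-to-end example (not taken from a test): returning an i1 hits the
// CCValAssign::Full case above, where the value is truncated to i1 and
// zero-extended into its 32-bit location register, so "ret i1 %cmp" hands the
// caller a clean 0 or 1 in w0 rather than leaving the upper bits undefined.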
4053 
4054 //===----------------------------------------------------------------------===//
4055 // Other Lowering Code
4056 //===----------------------------------------------------------------------===//
4057 
4058 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
4059  SelectionDAG &DAG,
4060  unsigned Flag) const {
4061  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
4062  N->getOffset(), Flag);
4063 }
4064 
4065 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
4066  SelectionDAG &DAG,
4067  unsigned Flag) const {
4068  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
4069 }
4070 
4071 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
4072  SelectionDAG &DAG,
4073  unsigned Flag) const {
4074  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
4075  N->getOffset(), Flag);
4076 }
4077 
4078 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
4079  SelectionDAG &DAG,
4080  unsigned Flag) const {
4081  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
4082 }
4083 
4084 // (loadGOT sym)
4085