1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ExpandImm.h"
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/PatternMatch.h"
63 #include "llvm/IR/Type.h"
64 #include "llvm/IR/Use.h"
65 #include "llvm/IR/Value.h"
66 #include "llvm/MC/MCRegisterInfo.h"
67 #include "llvm/Support/Casting.h"
68 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/Compiler.h"
71 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/KnownBits.h"
79 #include <algorithm>
80 #include <bitset>
81 #include <cassert>
82 #include <cctype>
83 #include <cstdint>
84 #include <cstdlib>
85 #include <iterator>
86 #include <limits>
87 #include <tuple>
88 #include <utility>
89 #include <vector>
90 
91 using namespace llvm;
92 using namespace llvm::PatternMatch;
93 
94 #define DEBUG_TYPE "aarch64-lower"
95 
96 STATISTIC(NumTailCalls, "Number of tail calls");
97 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
99 
100 static cl::opt<bool>
101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
102  cl::desc("Allow AArch64 SLI/SRI formation"),
103  cl::init(false));
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 /// Value type used for condition codes.
120 static const MVT MVT_CC = MVT::i32;
121 
122 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
123  const AArch64Subtarget &STI)
124  : TargetLowering(TM), Subtarget(&STI) {
125  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
126  // we have to make something up. Arbitrarily, choose ZeroOrOne.
127  setBooleanContents(ZeroOrOneBooleanContent);
128  // When comparing vectors the result sets the different elements in the
129  // vector to all-one or all-zero.
130  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
131 
132  // Set up the register classes.
133  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
134  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
135 
136  if (Subtarget->hasFPARMv8()) {
137  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
138  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
139  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
140  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
141  }
142 
143  if (Subtarget->hasNEON()) {
144  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
145  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
146  // Someone set us up the NEON.
147  addDRTypeForNEON(MVT::v2f32);
148  addDRTypeForNEON(MVT::v8i8);
149  addDRTypeForNEON(MVT::v4i16);
150  addDRTypeForNEON(MVT::v2i32);
151  addDRTypeForNEON(MVT::v1i64);
152  addDRTypeForNEON(MVT::v1f64);
153  addDRTypeForNEON(MVT::v4f16);
154 
155  addQRTypeForNEON(MVT::v4f32);
156  addQRTypeForNEON(MVT::v2f64);
157  addQRTypeForNEON(MVT::v16i8);
158  addQRTypeForNEON(MVT::v8i16);
159  addQRTypeForNEON(MVT::v4i32);
160  addQRTypeForNEON(MVT::v2i64);
161  addQRTypeForNEON(MVT::v8f16);
162  }
163 
164  // Compute derived properties from the register classes
166 
167  // Provide all sorts of operation actions
195 
199 
203 
205 
206  // Custom lowering hooks are needed for XOR
207  // to fold it into CSINC/CSINV.
210 
211  // Virtually no operations on f128 are legal, but LLVM can't expand them when
212  // there's a valid register class, so we need custom operations in most cases.
234 
235  // Lowering for many of the conversions is actually specified by the non-f128
236  // type. The LowerXXX function will be trivial when f128 isn't involved.
251 
252  // Variable arguments.
257 
258  // Variable-sized objects.
261 
262  if (Subtarget->isTargetWindows())
264  else
266 
267  // Constant pool entries
269 
270  // BlockAddress
272 
273  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
282 
283  // AArch64 lacks both left-rotate and popcount instructions.
286  for (MVT VT : MVT::vector_valuetypes()) {
289  }
290 
291  // AArch64 doesn't have {U|S}MUL_LOHI.
294 
297 
300  for (MVT VT : MVT::vector_valuetypes()) {
303  }
310 
311  // Custom lower Add/Sub/Mul with overflow.
324 
333  if (Subtarget->hasFullFP16())
335  else
337 
371 
372  if (!Subtarget->hasFullFP16()) {
395 
396  // promote v4f16 to v4f32 when that is known to be safe.
409 
425 
446  }
447 
448  // AArch64 has implementations of a lot of rounding-like FP operations.
449  for (MVT Ty : {MVT::f32, MVT::f64}) {
464  }
465 
466  if (Subtarget->hasFullFP16()) {
477  }
478 
480 
482 
488 
489  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
490  // This requires the Performance Monitors extension.
491  if (Subtarget->hasPerfMon())
493 
494  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
495  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
496  // Issue __sincos_stret if available.
499  } else {
502  }
503 
504  // Make floating-point constants legal for the large code model, so they don't
505  // become loads from the constant pool.
506  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
509  }
510 
511  // AArch64 does not have floating-point extending loads, i1 sign-extending
512  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
513  for (MVT VT : MVT::fp_valuetypes()) {
518  }
519  for (MVT VT : MVT::integer_valuetypes())
521 
529 
532 
533  // Indexed loads and stores are supported.
534  for (unsigned im = (unsigned)ISD::PRE_INC;
550  }
551 
552  // Trap.
554 
555  // We combine OR nodes for bitfield operations.
557  // Try to create BICs for vector ANDs.
559 
560  // Vector add and sub nodes may conceal a high-half opportunity.
561  // Also, try to fold ADD into CSINC/CSINV.
568 
572 
574 
581  if (Subtarget->supportsAddressTopByteIgnored())
583 
585 
588 
592 
594 
595  // In case of strict alignment, avoid an excessive number of byte wide stores.
599 
604 
606 
608 
610 
611  EnableExtLdPromotion = true;
612 
613  // Set required alignment.
615  // Set preferred alignments.
618 
619  // Only change the limit for entries in a jump table if specified by
620  // the subtarget, and not at the command line.
621  unsigned MaxJT = STI.getMaximumJumpTableSize();
622  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
624 
625  setHasExtractBitsInsn(true);
626 
628 
629  if (Subtarget->hasNEON()) {
630  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
631  // silliness like this:
657 
663 
665 
666  // AArch64 doesn't have direct vector->f32 conversion instructions for
667  // elements smaller than i32, so promote the input to i32 first.
670  // i8 vector elements also need promotion to i32 for v8i8
673  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
678  // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
679  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
682 
683  if (Subtarget->hasFullFP16()) {
688  } else {
689  // When AArch64 doesn't have full fp16 support, promote the input
690  // to i32 first.
695  }
696 
699 
700  // AArch64 doesn't have MUL.2d:
702  // Custom handling for some quad-vector types to detect MULL.
706 
707  // Vector reductions
708  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
715  }
716  for (MVT VT : { MVT::v4f16, MVT::v2f32,
720  }
721 
724  // Likewise, narrowing and extending vector loads/stores aren't handled
725  // directly.
726  for (MVT VT : MVT::vector_valuetypes()) {
728 
729  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
732  } else {
735  }
738 
741 
742  for (MVT InnerVT : MVT::vector_valuetypes()) {
743  setTruncStoreAction(VT, InnerVT, Expand);
744  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
745  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
746  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
747  }
748  }
749 
750  // AArch64 has implementations of a lot of rounding-like FP operations.
751  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
758  }
759 
760  if (Subtarget->hasFullFP16()) {
761  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
768  }
769  }
770 
772  }
773 
775 }
776 
777 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
778  assert(VT.isVector() && "VT should be a vector type");
779 
780  if (VT.isFloatingPoint()) {
782  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
783  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
784  }
785 
786  // Mark vector float intrinsics as expand.
787  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
796 
797  // But we do support custom-lowering for FCOPYSIGN.
799  }
800 
812 
816  for (MVT InnerVT : MVT::all_valuetypes())
817  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
818 
819  // CNT supports only B element sizes; for larger elements, use UADDLP to widen.
820  if (VT != MVT::v8i8 && VT != MVT::v16i8)
822 
828 
831 
832  if (!VT.isFloatingPoint())
834 
835  // [SU][MIN|MAX] are available for all NEON types apart from i64.
836  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
837  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
838  setOperationAction(Opcode, VT, Legal);
839 
840  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
841  if (VT.isFloatingPoint() &&
842  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
843  for (unsigned Opcode :
845  setOperationAction(Opcode, VT, Legal);
846 
847  if (Subtarget->isLittleEndian()) {
848  for (unsigned im = (unsigned)ISD::PRE_INC;
852  }
853  }
854 }
855 
856 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
857  addRegisterClass(VT, &AArch64::FPR64RegClass);
858  addTypeForNEON(VT, MVT::v2i32);
859 }
860 
861 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
862  addRegisterClass(VT, &AArch64::FPR128RegClass);
863  addTypeForNEON(VT, MVT::v4i32);
864 }
865 
866 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
867  EVT VT) const {
868  if (!VT.isVector())
869  return MVT::i32;
870  return VT.changeVectorElementTypeToInteger();
871 }
872 
873 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
874  const APInt &Demanded,
875  TargetLowering::TargetLoweringOpt &TLO,
876  unsigned NewOpc) {
877  uint64_t OldImm = Imm, NewImm, Enc;
878  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
879 
880  // Return if the immediate is already all zeros, all ones, a bimm32 or a
881  // bimm64.
882  if (Imm == 0 || Imm == Mask ||
883  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
884  return false;
885 
886  unsigned EltSize = Size;
887  uint64_t DemandedBits = Demanded.getZExtValue();
888 
889  // Clear bits that are not demanded.
890  Imm &= DemandedBits;
891 
892  while (true) {
893  // The goal here is to set the non-demanded bits in a way that minimizes
894  // the number of switching between 0 and 1. In order to achieve this goal,
895  // we set the non-demanded bits to the value of the preceding demanded bits.
896  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
897  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
898  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
899  // The final result is 0b11000011.
900  uint64_t NonDemandedBits = ~DemandedBits;
901  uint64_t InvertedImm = ~Imm & DemandedBits;
902  uint64_t RotatedImm =
903  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
904  NonDemandedBits;
905  uint64_t Sum = RotatedImm + NonDemandedBits;
906  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
907  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
908  NewImm = (Imm | Ones) & Mask;
909 
910  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
911  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
912  // we halve the element size and continue the search.
913  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
914  break;
915 
916  // We cannot shrink the element size any further if it is 2-bits.
917  if (EltSize == 2)
918  return false;
919 
920  EltSize /= 2;
921  Mask >>= EltSize;
922  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
923 
924  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
925  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
926  return false;
927 
928  // Merge the upper and lower halves of Imm and DemandedBits.
929  Imm |= Hi;
930  DemandedBits |= DemandedBitsHi;
931  }
932 
933  ++NumOptimizedImms;
934 
935  // Replicate the element across the register width.
936  while (EltSize < Size) {
937  NewImm |= NewImm << EltSize;
938  EltSize *= 2;
939  }
940 
941  (void)OldImm;
942  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
943  "demanded bits should never be altered");
944  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
945 
946  // Create the new constant immediate node.
947  EVT VT = Op.getValueType();
948  SDLoc DL(Op);
949  SDValue New;
950 
951  // If the new constant immediate is all-zeros or all-ones, let the target
952  // independent DAG combine optimize this node.
953  if (NewImm == 0 || NewImm == OrigMask) {
954  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
955  TLO.DAG.getConstant(NewImm, DL, VT));
956  // Otherwise, create a machine node so that target independent DAG combine
957  // doesn't undo this optimization.
958  } else {
959  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
960  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
961  New = SDValue(
962  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
963  }
964 
965  return TLO.CombineTo(Op, New);
966 }
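// Worked example (illustrative, scaled down to 8 bits for readability): with
// demanded bits 0b01100101 and Imm & DemandedBits = 0b01000001 (the
// 0bx10xx0x1 case from the comment above), one pass of the loop computes
//   NonDemandedBits = 0b10011010
//   InvertedImm     = 0b00100100
//   RotatedImm      = 0b00001000
//   Sum             = 0b10100010, Carry = 0, Ones = 0b10000010
//   NewImm          = 0b11000011
// whose complement 0b00111100 is a shifted mask, so the search stops and the
// AND/OR/EOR can use NewImm as an encodable logical immediate.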
967 
968 bool AArch64TargetLowering::targetShrinkDemandedConstant(
969  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
970  // Delay this optimization to as late as possible.
971  if (!TLO.LegalOps)
972  return false;
973 
974  if (!EnableOptimizeLogicalImm)
975  return false;
976 
977  EVT VT = Op.getValueType();
978  if (VT.isVector())
979  return false;
980 
981  unsigned Size = VT.getSizeInBits();
982  assert((Size == 32 || Size == 64) &&
983  "i32 or i64 is expected after legalization.");
984 
985  // Exit early if we demand all bits.
986  if (Demanded.countPopulation() == Size)
987  return false;
988 
989  unsigned NewOpc;
990  switch (Op.getOpcode()) {
991  default:
992  return false;
993  case ISD::AND:
994  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
995  break;
996  case ISD::OR:
997  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
998  break;
999  case ISD::XOR:
1000  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1001  break;
1002  }
1003  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1004  if (!C)
1005  return false;
1006  uint64_t Imm = C->getZExtValue();
1007  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
1008 }
1009 
1010 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1011 /// Mask are known to be either zero or one and return them Known.
1012 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1013  const SDValue Op, KnownBits &Known,
1014  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1015  switch (Op.getOpcode()) {
1016  default:
1017  break;
1018  case AArch64ISD::CSEL: {
1019  KnownBits Known2;
1020  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1021  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1022  Known.Zero &= Known2.Zero;
1023  Known.One &= Known2.One;
1024  break;
1025  }
1026  case ISD::INTRINSIC_W_CHAIN: {
1027  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1028  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1029  switch (IntID) {
1030  default: return;
1031  case Intrinsic::aarch64_ldaxr:
1032  case Intrinsic::aarch64_ldxr: {
1033  unsigned BitWidth = Known.getBitWidth();
1034  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1035  unsigned MemBits = VT.getScalarSizeInBits();
1036  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1037  return;
1038  }
1039  }
1040  break;
1041  }
1042  case ISD::INTRINSIC_WO_CHAIN:
1043  case ISD::INTRINSIC_VOID: {
1044  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1045  switch (IntNo) {
1046  default:
1047  break;
1048  case Intrinsic::aarch64_neon_umaxv:
1049  case Intrinsic::aarch64_neon_uminv: {
1050  // Figure out the datatype of the vector operand. The UMINV instruction
1051  // will zero extend the result, so we can mark as known zero all the
1052  // bits larger than the element datatype. 32-bit or larger doesn't need
1053  // this as those are legal types and will be handled by isel directly.
1054  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1055  unsigned BitWidth = Known.getBitWidth();
1056  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1057  assert(BitWidth >= 8 && "Unexpected width!");
1058  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1059  Known.Zero |= Mask;
1060  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1061  assert(BitWidth >= 16 && "Unexpected width!");
1062  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1063  Known.Zero |= Mask;
1064  }
1065  break;
1066  } break;
1067  }
1068  }
1069  }
1070 }
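// For example (illustrative): an @llvm.aarch64.ldxr load whose memory type is
// i8 returns its result zero-extended to i64, so the code above reports the
// top 56 bits as known zero; for CSEL, only bits known in both inputs survive,
// which is exactly what the Zero/One intersection computes.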
1071 
1072 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1073  EVT) const {
1074  return MVT::i64;
1075 }
1076 
1077 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1078  EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1079  bool *Fast) const {
1080  if (Subtarget->requiresStrictAlign())
1081  return false;
1082 
1083  if (Fast) {
1084  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1085  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1086  // See comments in performSTORECombine() for more details about
1087  // these conditions.
1088 
1089  // Code that uses clang vector extensions can mark that it
1090  // wants unaligned accesses to be treated as fast by
1091  // underspecifying alignment to be 1 or 2.
1092  Align <= 2 ||
1093 
1094  // Disregard v2i64. Memcpy lowering produces those and splitting
1095  // them regresses performance on micro-benchmarks and olden/bh.
1096  VT == MVT::v2i64;
1097  }
1098  return true;
1099 }
1100 
1101 FastISel *
1102 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1103  const TargetLibraryInfo *libInfo) const {
1104  return AArch64::createFastISel(funcInfo, libInfo);
1105 }
1106 
1107 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1108  switch ((AArch64ISD::NodeType)Opcode) {
1109  case AArch64ISD::FIRST_NUMBER: break;
1110  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1111  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1112  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1113  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1114  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1115  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1116  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1117  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1118  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1119  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1120  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1121  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1122  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1123  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1124  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1125  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1126  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1127  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1128  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1129  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1130  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1131  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1132  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1133  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1134  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1135  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1136  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1137  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1138  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1139  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1140  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1141  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1142  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1143  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1144  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1145  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1146  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1147  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1148  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1149  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1150  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1151  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1152  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1153  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1154  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1155  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1156  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1157  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1158  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1159  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1160  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1161  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1162  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1163  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1164  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1165  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1166  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1167  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1168  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1169  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1170  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1171  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1172  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1173  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1174  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1175  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1176  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1177  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1178  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1179  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1180  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1181  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1182  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1183  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1184  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1185  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1186  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1187  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1188  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1189  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1190  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1191  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1192  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1193  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1194  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1195  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1196  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1197  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1198  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1199  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1200  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1201  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1202  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1203  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1204  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1205  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1206  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1207  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1208  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1209  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1210  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1211  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1212  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1213  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1214  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1215  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1216  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1217  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1218  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1219  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1220  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1221  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1222  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1223  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1224  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1225  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1226  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1227  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1228  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1229  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1230  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1231  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1232  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1233  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1234  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1235  }
1236  return nullptr;
1237 }
1238 
1239 MachineBasicBlock *
1240 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1241  MachineBasicBlock *MBB) const {
1242  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1243  // phi node:
1244 
1245  // OrigBB:
1246  // [... previous instrs leading to comparison ...]
1247  // b.ne TrueBB
1248  // b EndBB
1249  // TrueBB:
1250  // ; Fallthrough
1251  // EndBB:
1252  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1253 
1254  MachineFunction *MF = MBB->getParent();
1255  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1256  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1257  DebugLoc DL = MI.getDebugLoc();
1258  MachineFunction::iterator It = ++MBB->getIterator();
1259 
1260  unsigned DestReg = MI.getOperand(0).getReg();
1261  unsigned IfTrueReg = MI.getOperand(1).getReg();
1262  unsigned IfFalseReg = MI.getOperand(2).getReg();
1263  unsigned CondCode = MI.getOperand(3).getImm();
1264  bool NZCVKilled = MI.getOperand(4).isKill();
1265 
1266  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1267  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1268  MF->insert(It, TrueBB);
1269  MF->insert(It, EndBB);
1270 
1271  // Transfer rest of current basic-block to EndBB
1272  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1273  MBB->end());
1274  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1275 
1276  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1277  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1278  MBB->addSuccessor(TrueBB);
1279  MBB->addSuccessor(EndBB);
1280 
1281  // TrueBB falls through to the end.
1282  TrueBB->addSuccessor(EndBB);
1283 
1284  if (!NZCVKilled) {
1285  TrueBB->addLiveIn(AArch64::NZCV);
1286  EndBB->addLiveIn(AArch64::NZCV);
1287  }
1288 
1289  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1290  .addReg(IfTrueReg)
1291  .addMBB(TrueBB)
1292  .addReg(IfFalseReg)
1293  .addMBB(MBB);
1294 
1295  MI.eraseFromParent();
1296  return EndBB;
1297 }
1298 
1299 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
1300  MachineInstr &MI, MachineBasicBlock *BB) const {
1301  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
1302  BB->getParent()->getFunction().getPersonalityFn())) &&
1303  "SEH does not use catchret!");
1304  return BB;
1305 }
1306 
1307 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
1308  MachineInstr &MI, MachineBasicBlock *BB) const {
1309  MI.eraseFromParent();
1310  return BB;
1311 }
1312 
1313 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1314  MachineInstr &MI, MachineBasicBlock *BB) const {
1315  switch (MI.getOpcode()) {
1316  default:
1317 #ifndef NDEBUG
1318  MI.dump();
1319 #endif
1320  llvm_unreachable("Unexpected instruction for custom inserter!");
1321 
1322  case AArch64::F128CSEL:
1323  return EmitF128CSEL(MI, BB);
1324 
1325  case TargetOpcode::STACKMAP:
1326  case TargetOpcode::PATCHPOINT:
1327  return emitPatchPoint(MI, BB);
1328 
1329  case AArch64::CATCHRET:
1330  return EmitLoweredCatchRet(MI, BB);
1331  case AArch64::CATCHPAD:
1332  return EmitLoweredCatchPad(MI, BB);
1333  }
1334 }
1335 
1336 //===----------------------------------------------------------------------===//
1337 // AArch64 Lowering private implementation.
1338 //===----------------------------------------------------------------------===//
1339 
1340 //===----------------------------------------------------------------------===//
1341 // Lowering Code
1342 //===----------------------------------------------------------------------===//
1343 
1344 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1345 /// CC
1346 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1347  switch (CC) {
1348  default:
1349  llvm_unreachable("Unknown condition code!");
1350  case ISD::SETNE:
1351  return AArch64CC::NE;
1352  case ISD::SETEQ:
1353  return AArch64CC::EQ;
1354  case ISD::SETGT:
1355  return AArch64CC::GT;
1356  case ISD::SETGE:
1357  return AArch64CC::GE;
1358  case ISD::SETLT:
1359  return AArch64CC::LT;
1360  case ISD::SETLE:
1361  return AArch64CC::LE;
1362  case ISD::SETUGT:
1363  return AArch64CC::HI;
1364  case ISD::SETUGE:
1365  return AArch64CC::HS;
1366  case ISD::SETULT:
1367  return AArch64CC::LO;
1368  case ISD::SETULE:
1369  return AArch64CC::LS;
1370  }
1371 }
1372 
1373 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1374 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1375  AArch64CC::CondCode &CondCode,
1376  AArch64CC::CondCode &CondCode2) {
1377  CondCode2 = AArch64CC::AL;
1378  switch (CC) {
1379  default:
1380  llvm_unreachable("Unknown FP condition!");
1381  case ISD::SETEQ:
1382  case ISD::SETOEQ:
1383  CondCode = AArch64CC::EQ;
1384  break;
1385  case ISD::SETGT:
1386  case ISD::SETOGT:
1387  CondCode = AArch64CC::GT;
1388  break;
1389  case ISD::SETGE:
1390  case ISD::SETOGE:
1391  CondCode = AArch64CC::GE;
1392  break;
1393  case ISD::SETOLT:
1394  CondCode = AArch64CC::MI;
1395  break;
1396  case ISD::SETOLE:
1397  CondCode = AArch64CC::LS;
1398  break;
1399  case ISD::SETONE:
1400  CondCode = AArch64CC::MI;
1401  CondCode2 = AArch64CC::GT;
1402  break;
1403  case ISD::SETO:
1404  CondCode = AArch64CC::VC;
1405  break;
1406  case ISD::SETUO:
1407  CondCode = AArch64CC::VS;
1408  break;
1409  case ISD::SETUEQ:
1410  CondCode = AArch64CC::EQ;
1411  CondCode2 = AArch64CC::VS;
1412  break;
1413  case ISD::SETUGT:
1414  CondCode = AArch64CC::HI;
1415  break;
1416  case ISD::SETUGE:
1417  CondCode = AArch64CC::PL;
1418  break;
1419  case ISD::SETLT:
1420  case ISD::SETULT:
1421  CondCode = AArch64CC::LT;
1422  break;
1423  case ISD::SETLE:
1424  case ISD::SETULE:
1425  CondCode = AArch64CC::LE;
1426  break;
1427  case ISD::SETNE:
1428  case ISD::SETUNE:
1429  CondCode = AArch64CC::NE;
1430  break;
1431  }
1432 }
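// Example (illustrative): SETOGT maps to the single AArch64 condition GT, but
// SETONE has no single-flag encoding, so it is returned as the pair (MI, GT):
// "a one b" holds iff (a olt b) || (a ogt b), and callers emit two
// conditionally-executed operations (or two branches) whenever CondCode2 != AL.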
1433 
1434 /// Convert a DAG fp condition code to an AArch64 CC.
1435 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1436 /// should be AND'ed instead of OR'ed.
1437 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1438  AArch64CC::CondCode &CondCode,
1439  AArch64CC::CondCode &CondCode2) {
1440  CondCode2 = AArch64CC::AL;
1441  switch (CC) {
1442  default:
1443  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1444  assert(CondCode2 == AArch64CC::AL);
1445  break;
1446  case ISD::SETONE:
1447  // (a one b)
1448  // == ((a olt b) || (a ogt b))
1449  // == ((a ord b) && (a une b))
1450  CondCode = AArch64CC::VC;
1451  CondCode2 = AArch64CC::NE;
1452  break;
1453  case ISD::SETUEQ:
1454  // (a ueq b)
1455  // == ((a uno b) || (a oeq b))
1456  // == ((a ule b) && (a uge b))
1457  CondCode = AArch64CC::PL;
1458  CondCode2 = AArch64CC::LE;
1459  break;
1460  }
1461 }
1462 
1463 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1464 /// CC usable with the vector instructions. Fewer operations are available
1465 /// without a real NZCV register, so we have to use less efficient combinations
1466 /// to get the same effect.
1467 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1468  AArch64CC::CondCode &CondCode,
1469  AArch64CC::CondCode &CondCode2,
1470  bool &Invert) {
1471  Invert = false;
1472  switch (CC) {
1473  default:
1474  // Mostly the scalar mappings work fine.
1475  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1476  break;
1477  case ISD::SETUO:
1478  Invert = true;
1479  LLVM_FALLTHROUGH;
1480  case ISD::SETO:
1481  CondCode = AArch64CC::MI;
1482  CondCode2 = AArch64CC::GE;
1483  break;
1484  case ISD::SETUEQ:
1485  case ISD::SETULT:
1486  case ISD::SETULE:
1487  case ISD::SETUGT:
1488  case ISD::SETUGE:
1489  // All of the compare-mask comparisons are ordered, but we can switch
1490  // between the two by a double inversion. E.g. ULE == !OGT.
1491  Invert = true;
1492  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1493  break;
1494  }
1495 }
1496 
1497 static bool isLegalArithImmed(uint64_t C) {
1498  // Matches AArch64DAGToDAGISel::SelectArithImmed().
1499  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1500  LLVM_DEBUG(dbgs() << "Is imm " << C
1501  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1502  return IsLegal;
1503 }
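// For instance (illustrative values): 0..4095 are legal as a plain imm12,
// 0x1000 and 0xFFF000 are legal as an imm12 shifted left by 12, while 0x1001
// or 0x1000000 are not and must be materialized into a register before the
// comparison.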
1504 
1505 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1506 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1507 // can be set differently by this operation. It comes down to whether
1508 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1509 // everything is fine. If not then the optimization is wrong. Thus general
1510 // comparisons are only valid if op2 != 0.
1511 //
1512 // So, finally, the only LLVM-native comparisons that don't mention C and V
1513 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1514 // the absence of information about op2.
1515 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1516  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1517  (CC == ISD::SETEQ || CC == ISD::SETNE);
1518 }
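// Illustrative example: for "setcc eq x, (sub 0, y)" the SUBS-based compare
// can be replaced by ADDS (i.e. "cmn x, y" instead of materializing 0 - y),
// but only for EQ/NE, since the C and V flags of "x - (-y)" and "x + y" can
// differ (for example when y == 0).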
1519 
1520 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1521  const SDLoc &dl, SelectionDAG &DAG) {
1522  EVT VT = LHS.getValueType();
1523  const bool FullFP16 =
1524  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1525 
1526  if (VT.isFloatingPoint()) {
1527  assert(VT != MVT::f128);
1528  if (VT == MVT::f16 && !FullFP16) {
1529  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1530  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1531  VT = MVT::f32;
1532  }
1533  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1534  }
1535 
1536  // The CMP instruction is just an alias for SUBS, and representing it as
1537  // SUBS means that it's possible to get CSE with subtract operations.
1538  // A later phase can perform the optimization of setting the destination
1539  // register to WZR/XZR if it ends up being unused.
1540  unsigned Opcode = AArch64ISD::SUBS;
1541 
1542  if (isCMN(RHS, CC)) {
1543  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1544  Opcode = AArch64ISD::ADDS;
1545  RHS = RHS.getOperand(1);
1546  } else if (isCMN(LHS, CC)) {
1547  // As we are looking for EQ/NE compares, the operands can be commuted; can
1548  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1549  Opcode = AArch64ISD::ADDS;
1550  LHS = LHS.getOperand(1);
1551  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1552  !isUnsignedIntSetCC(CC)) {
1553  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1554  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1555  // of the signed comparisons.
1556  Opcode = AArch64ISD::ANDS;
1557  RHS = LHS.getOperand(1);
1558  LHS = LHS.getOperand(0);
1559  }
1560 
1561  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1562  .getValue(1);
1563 }
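// Illustrative mapping of the opcodes chosen above to their assembly aliases
// once the value result is unused and only the NZCV result (getValue(1)) is
// consumed:
//   AArch64ISD::SUBS x, y  ->  cmp  x, y   (subs wzr/xzr, x, y)
//   AArch64ISD::ADDS x, y  ->  cmn  x, y   (adds wzr/xzr, x, y)
//   AArch64ISD::ANDS x, y  ->  tst  x, y   (ands wzr/xzr, x, y)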
1564 
1565 /// \defgroup AArch64CCMP CMP;CCMP matching
1566 ///
1567 /// These functions deal with the formation of CMP;CCMP;... sequences.
1568 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1569 /// a comparison. They set the NZCV flags to a predefined value if their
1570 /// predicate is false. This allows expressing arbitrary conjunctions, for
1571 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1572 /// expressed as:
1573 /// cmp A
1574 /// ccmp B, inv(CB), CA
1575 /// check for CB flags
1576 ///
1577 /// This naturally lets us implement chains of AND operations with SETCC
1578 /// operands. And we can even implement some other situations by transforming
1579 /// them:
1580 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1581 /// negating the flags used in CCMP/FCCMP operations.
1582 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1583 /// by negating the flags we test for afterwards. i.e.
1584 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1585 /// - Note that we can only ever negate all previously processed results.
1586 /// What we cannot implement by flipping the flags to test is a negation
1587 /// of two sub-trees (because the negation affects all sub-trees emitted so
1588 /// far, so the 2nd sub-tree we emit would also affect the first).
1589 /// With those tools we can implement some OR operations:
1590 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1591 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1592 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1593 /// elimination rules from earlier to implement the whole thing as a
1594 /// CCMP/FCCMP chain.
1595 ///
1596 /// As complete example:
1597 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1598 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1599 /// can be reassociated to:
1600 /// or (and (setCC (cmp C)) (setCD (cmp D)))
1601 /// (or (setCA (cmp A)) (setCB (cmp B)))
1602 /// can be transformed to:
1603 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1604 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1605 /// which can be implemented as:
1606 /// cmp C
1607 /// ccmp D, inv(CD), CC
1608 /// ccmp A, CA, inv(CD)
1609 /// ccmp B, CB, inv(CA)
1610 /// check for CB flags
1611 ///
1612 /// A counterexample is "or (and A B) (and C D)" which translates to
1613 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
1614 /// can only implement 1 of the inner (not) operations, but not both!
1615 /// @{
1616 
1617 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1618 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1619  ISD::CondCode CC, SDValue CCOp,
1620  AArch64CC::CondCode Predicate,
1621  AArch64CC::CondCode OutCC,
1622  const SDLoc &DL, SelectionDAG &DAG) {
1623  unsigned Opcode = 0;
1624  const bool FullFP16 =
1625  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1626 
1627  if (LHS.getValueType().isFloatingPoint()) {
1628  assert(LHS.getValueType() != MVT::f128);
1629  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1630  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1631  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1632  }
1633  Opcode = AArch64ISD::FCCMP;
1634  } else if (RHS.getOpcode() == ISD::SUB) {
1635  SDValue SubOp0 = RHS.getOperand(0);
1636  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1637  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1638  Opcode = AArch64ISD::CCMN;
1639  RHS = RHS.getOperand(1);
1640  }
1641  }
1642  if (Opcode == 0)
1643  Opcode = AArch64ISD::CCMP;
1644 
1645  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1646  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1647  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1648  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1649  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1650 }
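// Sketch of the resulting semantics (illustrative assembly): for
// "(a == 0) && (b == 1)" we get
//   cmp  a, #0            // sets NZCV from a - 0
//   ccmp b, #1, #0, eq    // if eq: NZCV = flags of b - 1; else NZCV = #0
//   ...test eq...
// i.e. when the predicate from the previous compare fails, the immediate NZCV
// value is chosen (via getNZCVToSatisfyCondCode on the inverted condition) so
// that the final test also fails.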
1651 
1652 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1653 /// expressed as a conjunction. See \ref AArch64CCMP.
1654 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1655 /// changing the conditions on the SETCC tests.
1656 /// (this means we can call emitConjunctionRec() with
1657 /// Negate==true on this sub-tree)
1658 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1659 /// cannot do the negation naturally. We are required to
1660 /// emit the subtree first in this case.
1661 /// \param WillNegate Is true if we are called when the result of this
1662 /// subexpression must be negated. This happens when the
1663 /// outer expression is an OR. We can use this fact to know
1664 /// that we have a double negation (or (or ...) ...) that
1665 /// can be implemented for free.
1666 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1667  bool &MustBeFirst, bool WillNegate,
1668  unsigned Depth = 0) {
1669  if (!Val.hasOneUse())
1670  return false;
1671  unsigned Opcode = Val->getOpcode();
1672  if (Opcode == ISD::SETCC) {
1673  if (Val->getOperand(0).getValueType() == MVT::f128)
1674  return false;
1675  CanNegate = true;
1676  MustBeFirst = false;
1677  return true;
1678  }
1679  // Protect against exponential runtime and stack overflow.
1680  if (Depth > 6)
1681  return false;
1682  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1683  bool IsOR = Opcode == ISD::OR;
1684  SDValue O0 = Val->getOperand(0);
1685  SDValue O1 = Val->getOperand(1);
1686  bool CanNegateL;
1687  bool MustBeFirstL;
1688  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1689  return false;
1690  bool CanNegateR;
1691  bool MustBeFirstR;
1692  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1693  return false;
1694 
1695  if (MustBeFirstL && MustBeFirstR)
1696  return false;
1697 
1698  if (IsOR) {
1699  // For an OR expression we need to be able to naturally negate at least
1700  // one side or we cannot do the transformation at all.
1701  if (!CanNegateL && !CanNegateR)
1702  return false;
1703  // If the result of the OR will be negated and we can naturally negate
1704  // the leaves, then this sub-tree as a whole negates naturally.
1705  CanNegate = WillNegate && CanNegateL && CanNegateR;
1706  // If we cannot naturally negate the whole sub-tree, then this must be
1707  // emitted first.
1708  MustBeFirst = !CanNegate;
1709  } else {
1710  assert(Opcode == ISD::AND && "Must be OR or AND");
1711  // We cannot naturally negate an AND operation.
1712  CanNegate = false;
1713  MustBeFirst = MustBeFirstL || MustBeFirstR;
1714  }
1715  return true;
1716  }
1717  return false;
1718 }
1719 
1720 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1721 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1722 /// Tries to transform the given i1 producing node @p Val to a series of compare
1723 /// and conditional compare operations. @returns an NZCV flags producing node
1724 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1725 /// transformation was not possible.
1726 /// \p Negate is true if we want this sub-tree to be negated just by changing
1727 /// SETCC conditions.
1728 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
1729  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1730  AArch64CC::CondCode Predicate) {
1731  // We're at a tree leaf, produce a conditional comparison operation.
1732  unsigned Opcode = Val->getOpcode();
1733  if (Opcode == ISD::SETCC) {
1734  SDValue LHS = Val->getOperand(0);
1735  SDValue RHS = Val->getOperand(1);
1736  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1737  bool isInteger = LHS.getValueType().isInteger();
1738  if (Negate)
1739  CC = getSetCCInverse(CC, isInteger);
1740  SDLoc DL(Val);
1741  // Determine OutCC and handle FP special case.
1742  if (isInteger) {
1743  OutCC = changeIntCCToAArch64CC(CC);
1744  } else {
1746  AArch64CC::CondCode ExtraCC;
1747  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1748  // Some floating point conditions can't be tested with a single condition
1749  // code. Construct an additional comparison in this case.
1750  if (ExtraCC != AArch64CC::AL) {
1751  SDValue ExtraCmp;
1752  if (!CCOp.getNode())
1753  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1754  else
1755  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1756  ExtraCC, DL, DAG);
1757  CCOp = ExtraCmp;
1758  Predicate = ExtraCC;
1759  }
1760  }
1761 
1762  // Produce a normal comparison if we are first in the chain
1763  if (!CCOp)
1764  return emitComparison(LHS, RHS, CC, DL, DAG);
1765  // Otherwise produce a ccmp.
1766  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1767  DAG);
1768  }
1769  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1770 
1771  bool IsOR = Opcode == ISD::OR;
1772 
1773  SDValue LHS = Val->getOperand(0);
1774  bool CanNegateL;
1775  bool MustBeFirstL;
1776  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1777  assert(ValidL && "Valid conjunction/disjunction tree");
1778  (void)ValidL;
1779 
1780  SDValue RHS = Val->getOperand(1);
1781  bool CanNegateR;
1782  bool MustBeFirstR;
1783  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1784  assert(ValidR && "Valid conjunction/disjunction tree");
1785  (void)ValidR;
1786 
1787  // Swap sub-tree that must come first to the right side.
1788  if (MustBeFirstL) {
1789  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1790  std::swap(LHS, RHS);
1791  std::swap(CanNegateL, CanNegateR);
1792  std::swap(MustBeFirstL, MustBeFirstR);
1793  }
1794 
1795  bool NegateR;
1796  bool NegateAfterR;
1797  bool NegateL;
1798  bool NegateAfterAll;
1799  if (Opcode == ISD::OR) {
1800  // Swap the sub-tree that we can negate naturally to the left.
1801  if (!CanNegateL) {
1802  assert(CanNegateR && "at least one side must be negatable");
1803  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1804  assert(!Negate);
1805  std::swap(LHS, RHS);
1806  NegateR = false;
1807  NegateAfterR = true;
1808  } else {
1809  // Negate the left sub-tree if possible, otherwise negate the result.
1810  NegateR = CanNegateR;
1811  NegateAfterR = !CanNegateR;
1812  }
1813  NegateL = true;
1814  NegateAfterAll = !Negate;
1815  } else {
1816  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1817  assert(!Negate && "Valid conjunction/disjunction tree");
1818 
1819  NegateL = false;
1820  NegateR = false;
1821  NegateAfterR = false;
1822  NegateAfterAll = false;
1823  }
1824 
1825  // Emit sub-trees.
1826  AArch64CC::CondCode RHSCC;
1827  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1828  if (NegateAfterR)
1829  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1830  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1831  if (NegateAfterAll)
1832  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1833  return CmpL;
1834 }
1835 
1836 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
1837 /// In some cases this is even possible with OR operations in the expression.
1838 /// See \ref AArch64CCMP.
1839 /// \see emitConjunctionRec().
1840 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
1841  AArch64CC::CondCode &OutCC) {
1842  bool DummyCanNegate;
1843  bool DummyMustBeFirst;
1844  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1845  return SDValue();
1846 
1847  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1848 }
1849 
1850 /// @}
1851 
1852 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1853 /// extension operations.
1854 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
1855  auto isSupportedExtend = [&](SDValue V) {
1856  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1857  return true;
1858 
1859  if (V.getOpcode() == ISD::AND)
1860  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1861  uint64_t Mask = MaskCst->getZExtValue();
1862  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1863  }
1864 
1865  return false;
1866  };
1867 
1868  if (!Op.hasOneUse())
1869  return 0;
1870 
1871  if (isSupportedExtend(Op))
1872  return 1;
1873 
1874  unsigned Opc = Op.getOpcode();
1875  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1876  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1877  uint64_t Shift = ShiftCst->getZExtValue();
1878  if (isSupportedExtend(Op.getOperand(0)))
1879  return (Shift <= 4) ? 2 : 1;
1880  EVT VT = Op.getValueType();
1881  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1882  return 1;
1883  }
1884 
1885  return 0;
1886 }
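// Rough scoring intent (illustrative): an operand such as (shl (and x, 0xff), 2)
// returns 2 because both the zero-extend and the small shift can fold into the
// compare (e.g. "cmp w1, w0, uxtb #2"); a lone supported extend or an in-range
// plain shift returns 1; anything else returns 0. getAArch64Cmp below only
// swaps the operands when the left-hand side folds strictly better.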
1887 
1888 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1889  SDValue &AArch64cc, SelectionDAG &DAG,
1890  const SDLoc &dl) {
1891  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1892  EVT VT = RHS.getValueType();
1893  uint64_t C = RHSC->getZExtValue();
1894  if (!isLegalArithImmed(C)) {
1895  // Constant does not fit, try adjusting it by one?
1896  switch (CC) {
1897  default:
1898  break;
1899  case ISD::SETLT:
1900  case ISD::SETGE:
1901  if ((VT == MVT::i32 && C != 0x80000000 &&
1902  isLegalArithImmed((uint32_t)(C - 1))) ||
1903  (VT == MVT::i64 && C != 0x80000000ULL &&
1904  isLegalArithImmed(C - 1ULL))) {
1905  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1906  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1907  RHS = DAG.getConstant(C, dl, VT);
1908  }
1909  break;
1910  case ISD::SETULT:
1911  case ISD::SETUGE:
1912  if ((VT == MVT::i32 && C != 0 &&
1913  isLegalArithImmed((uint32_t)(C - 1))) ||
1914  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1915  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1916  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1917  RHS = DAG.getConstant(C, dl, VT);
1918  }
1919  break;
1920  case ISD::SETLE:
1921  case ISD::SETGT:
1922  if ((VT == MVT::i32 && C != INT32_MAX &&
1923  isLegalArithImmed((uint32_t)(C + 1))) ||
1924  (VT == MVT::i64 && C != INT64_MAX &&
1925  isLegalArithImmed(C + 1ULL))) {
1926  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1927  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1928  RHS = DAG.getConstant(C, dl, VT);
1929  }
1930  break;
1931  case ISD::SETULE:
1932  case ISD::SETUGT:
1933  if ((VT == MVT::i32 && C != UINT32_MAX &&
1934  isLegalArithImmed((uint32_t)(C + 1))) ||
1935  (VT == MVT::i64 && C != UINT64_MAX &&
1936  isLegalArithImmed(C + 1ULL))) {
1937  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1938  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1939  RHS = DAG.getConstant(C, dl, VT);
1940  }
1941  break;
1942  }
1943  }
1944  }
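  // Illustrative instance of the adjustment above: "x < 0x1001" (SETLT) uses
  // an immediate that is not encodable, but it is equivalent to "x <= 0x1000"
  // (SETLE), and 0x1000 is a legal shifted imm12, so the constant and the
  // condition are rewritten together rather than loading 0x1001 into a
  // register.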
1945 
1946  // Comparisons are canonicalized so that the RHS operand is simpler than the
1947  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
1948  // can fold some shift+extend operations on the RHS operand, so swap the
1949  // operands if that can be done.
1950  //
1951  // For example:
1952  // lsl w13, w11, #1
1953  // cmp w13, w12
1954  // can be turned into:
1955  // cmp w12, w11, lsl #1
1956  if (!isa<ConstantSDNode>(RHS) ||
1957  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
1958  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
1959 
1961  std::swap(LHS, RHS);
1963  }
1964  }
1965 
1966  SDValue Cmp;
1967  AArch64CC::CondCode AArch64CC;
1968  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1969  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1970 
1971  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1972  // For the i8 operand, the largest immediate is 255, so this can be easily
1973  // encoded in the compare instruction. For the i16 operand, however, the
1974  // largest immediate cannot be encoded in the compare.
1975  // Therefore, use a sign extending load and cmn to avoid materializing the
1976  // -1 constant. For example,
1977  // movz w1, #65535
1978  // ldrh w0, [x0, #0]
1979  // cmp w0, w1
1980  // >
1981  // ldrsh w0, [x0, #0]
1982  // cmn w0, #1
1983  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1984  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1985  // ensure both the LHS and RHS are truly zero extended and to make sure the
1986  // transformation is profitable.
1987  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1988  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1989  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1990  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1991  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1992  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1993  SDValue SExt =
1994  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1995  DAG.getValueType(MVT::i16));
1996  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1997  RHS.getValueType()),
1998  CC, dl, DAG);
1999  AArch64CC = changeIntCCToAArch64CC(CC);
2000  }
2001  }
2002 
2003  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2004  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2005  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2006  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2007  }
2008  }
2009  }
2010 
2011  if (!Cmp) {
2012  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2013  AArch64CC = changeIntCCToAArch64CC(CC);
2014  }
2015  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2016  return Cmp;
2017 }
2018 
2019 static std::pair<SDValue, SDValue>
2020 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2021  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2022  "Unsupported value type");
2023  SDValue Value, Overflow;
2024  SDLoc DL(Op);
2025  SDValue LHS = Op.getOperand(0);
2026  SDValue RHS = Op.getOperand(1);
2027  unsigned Opc = 0;
2028  switch (Op.getOpcode()) {
2029  default:
2030  llvm_unreachable("Unknown overflow instruction!");
2031  case ISD::SADDO:
2032  Opc = AArch64ISD::ADDS;
2033  CC = AArch64CC::VS;
2034  break;
2035  case ISD::UADDO:
2036  Opc = AArch64ISD::ADDS;
2037  CC = AArch64CC::HS;
2038  break;
2039  case ISD::SSUBO:
2040  Opc = AArch64ISD::SUBS;
2041  CC = AArch64CC::VS;
2042  break;
2043  case ISD::USUBO:
2044  Opc = AArch64ISD::SUBS;
2045  CC = AArch64CC::LO;
2046  break;
2047  // Multiply needs a little bit extra work.
2048  case ISD::SMULO:
2049  case ISD::UMULO: {
2050  CC = AArch64CC::NE;
2051  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2052  if (Op.getValueType() == MVT::i32) {
2053  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2054  // For a 32 bit multiply with overflow check we want the instruction
2055  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2056  // need to generate the following pattern:
2057  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
2058  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2059  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2060  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2061  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2062  DAG.getConstant(0, DL, MVT::i64));
2063  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2064  // operation. We need to clear out the upper 32 bits, because we used a
2065  // widening multiply that wrote all 64 bits. In the end this should be a
2066  // noop.
2067  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2068  if (IsSigned) {
2069  // The signed overflow check requires more than just a simple check for
2070  // any bit set in the upper 32 bits of the result. These bits could be
2071  // just the sign bits of a negative number. To perform the overflow
2072  // check we arithmetically shift the lower 32 bits of the result right by
2073  // 31 bits (broadcasting the sign bit) and compare that to the upper 32 bits.
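  // Worked example (illustrative): %a = %b = 0x00010000 gives a widening
  // product of 0x0000000100000000; the upper 32 bits are 1 while the lower 32
  // bits shifted right arithmetically by 31 are 0, so the SUBS below sets NE
  // and signed overflow is reported. For %a = -1, %b = 1 the product is
  // 0xFFFFFFFFFFFFFFFF; the upper 32 bits and the shifted lower 32 bits are
  // both 0xFFFFFFFF, so no overflow is reported.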
2074  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2075  DAG.getConstant(32, DL, MVT::i64));
2076  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2077  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2078  DAG.getConstant(31, DL, MVT::i64));
2079  // It is important that LowerBits is last, otherwise the arithmetic
2080  // shift will not be folded into the compare (SUBS).
2081  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2082  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2083  .getValue(1);
2084  } else {
2085  // The overflow check for unsigned multiply is easy. We only need to
2086  // check if any of the upper 32 bits are set. This can be done with a
2087  // CMP (shifted register). For that we need to generate the following
2088  // pattern:
2089  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
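  // Worked example (illustrative): %a = 0xFFFFFFFF, %b = 2 gives a widening
  // product of 0x1FFFFFFFE; shifting right by 32 leaves 1, which compared
  // against 0 sets NE, so unsigned overflow is reported.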
2090  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2091  DAG.getConstant(32, DL, MVT::i64));
2092  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2093  Overflow =
2094  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2095  DAG.getConstant(0, DL, MVT::i64),
2096  UpperBits).getValue(1);
2097  }
2098  break;
2099  }
2100  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2101  // For the 64-bit multiply, compute the low 64 bits of the product.
2102  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2103  if (IsSigned) {
2104  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2105  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2106  DAG.getConstant(63, DL, MVT::i64));
2107  // It is important that LowerBits is last, otherwise the arithmetic
2108  // shift will not be folded into the compare (SUBS).
2109  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2110  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2111  .getValue(1);
2112  } else {
2113  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2114  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2115  Overflow =
2116  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2117  DAG.getConstant(0, DL, MVT::i64),
2118  UpperBits).getValue(1);
2119  }
2120  break;
2121  }
2122  } // switch (...)
2123 
2124  if (Opc) {
2125  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2126 
2127  // Emit the AArch64 operation with overflow check.
2128  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2129  Overflow = Value.getValue(1);
2130  }
2131  return std::make_pair(Value, Overflow);
2132 }
2133 
2134 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2135  RTLIB::Libcall Call) const {
2136  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2137  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
2138 }
2139 
2140 // Returns true if the given Op is the overflow flag result of an overflow
2141 // intrinsic operation.
2142 static bool isOverflowIntrOpRes(SDValue Op) {
2143  unsigned Opc = Op.getOpcode();
2144  return (Op.getResNo() == 1 &&
2145  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2146  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2147 }
2148 
2149 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2150  SDValue Sel = Op.getOperand(0);
2151  SDValue Other = Op.getOperand(1);
2152  SDLoc dl(Sel);
2153 
2154  // If the operand is an overflow checking operation, invert the condition
2155  // code and kill the Not operation. I.e., transform:
2156  // (xor (overflow_op_bool, 1))
2157  // -->
2158  // (csel 1, 0, invert(cc), overflow_op_bool)
2159  // ... which later gets transformed to just a cset instruction with an
2160  // inverted condition code, rather than a cset + eor sequence.
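  // Illustrative IR for this pattern (hypothetical values):
  //   %res = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  //   %ov  = extractvalue { i32, i1 } %res, 1
  //   %not = xor i1 %ov, true
  // For UADDO the condition is HS, so %not is selected as "cset w0, lo"
  // rather than "cset w0, hs" followed by an eor.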
2161  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2162  // Only lower legal XALUO ops.
2163  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2164  return SDValue();
2165 
2166  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2167  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2168  AArch64CC::CondCode CC;
2169  SDValue Value, Overflow;
2170  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2171  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2172  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2173  CCVal, Overflow);
2174  }
2175  // If neither operand is a SELECT_CC, give up.
2176  if (Sel.getOpcode() != ISD::SELECT_CC)
2177  std::swap(Sel, Other);
2178  if (Sel.getOpcode() != ISD::SELECT_CC)
2179  return Op;
2180 
2181  // The folding we want to perform is:
2182  // (xor x, (select_cc a, b, cc, 0, -1) )
2183  // -->
2184  // (csel x, (xor x, -1), cc ...)
2185  //
2186  // The latter will get matched to a CSINV instruction.
2187 
2188  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2189  SDValue LHS = Sel.getOperand(0);
2190  SDValue RHS = Sel.getOperand(1);
2191  SDValue TVal = Sel.getOperand(2);
2192  SDValue FVal = Sel.getOperand(3);
2193 
2194  // FIXME: This could be generalized to non-integer comparisons.
2195  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2196  return Op;
2197 
2198  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2199  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2200 
2201  // The values aren't constants, this isn't the pattern we're looking for.
2202  if (!CFVal || !CTVal)
2203  return Op;
2204 
2205  // We can commute the SELECT_CC by inverting the condition. This
2206  // might be needed to make this fit into a CSINV pattern.
2207  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2208  std::swap(TVal, FVal);
2209  std::swap(CTVal, CFVal);
2210  CC = ISD::getSetCCInverse(CC, true);
2211  }
2212 
2213  // If the constants line up, perform the transform!
2214  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2215  SDValue CCVal;
2216  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2217 
2218  FVal = Other;
2219  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2220  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2221 
2222  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2223  CCVal, Cmp);
2224  }
2225 
2226  return Op;
2227 }
2228 
2230  EVT VT = Op.getValueType();
2231 
2232  // Let legalize expand this if it isn't a legal type yet.
2233  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2234  return SDValue();
2235 
2236  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2237 
2238  unsigned Opc;
2239  bool ExtraOp = false;
2240  switch (Op.getOpcode()) {
2241  default:
2242  llvm_unreachable("Invalid code");
2243  case ISD::ADDC:
2244  Opc = AArch64ISD::ADDS;
2245  break;
2246  case ISD::SUBC:
2247  Opc = AArch64ISD::SUBS;
2248  break;
2249  case ISD::ADDE:
2250  Opc = AArch64ISD::ADCS;
2251  ExtraOp = true;
2252  break;
2253  case ISD::SUBE:
2254  Opc = AArch64ISD::SBCS;
2255  ExtraOp = true;
2256  break;
2257  }
2258 
2259  if (!ExtraOp)
2260  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2261  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2262  Op.getOperand(2));
2263 }
2264 
2265 SDValue AArch64TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
2266  // Let legalize expand this if it isn't a legal type yet.
2267  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2268  return SDValue();
2269 
2270  SDLoc dl(Op);
2271  AArch64CC::CondCode CC;
2272  // The actual operation that sets the overflow or carry flag.
2273  SDValue Value, Overflow;
2274  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2275 
2276  // We use 0 and 1 as false and true values.
2277  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2278  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2279 
2280  // We use an inverted condition, because the conditional select is inverted
2281  // too. This will allow it to be selected to a single instruction:
2282  // CSINC Wd, WZR, WZR, invert(cond).
2283  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2284  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2285  CCVal, Overflow);
2286 
2287  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2288  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2289 }
2290 
2291 // Prefetch operands are:
2292 // 1: Address to prefetch
2293 // 2: bool isWrite
2294 // 3: int locality (0 = no locality ... 3 = extreme locality)
2295 // 4: bool isDataCache
2296 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2297  SDLoc DL(Op);
2298  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2299  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2300  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2301 
2302  bool IsStream = !Locality;
2303  // When the locality number is set
2304  if (Locality) {
2305  // The front-end should have filtered out the out-of-range values
2306  assert(Locality <= 3 && "Prefetch locality out-of-range");
2307  // The locality degree is the opposite of the cache level: the higher the
2308  // locality, the closer (and faster) the cache. The encoding starts at 0
2309  // for level 1, so reverse the number.
2310  Locality = 3 - Locality;
2311  }
2312 
2313  // Build the mask value encoding the expected behavior.
2314  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2315  (!IsData << 3) | // IsDataCache bit
2316  (Locality << 1) | // Cache level bits
2317  (unsigned)IsStream; // Stream bit
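  // Illustrative encodings: a read prefetch of the data cache with locality 3
  // has IsWrite = 0, IsData = 1, Locality = 3 - 3 = 0, IsStream = 0, so
  // PrfOp = 0b00000 (the PLDL1KEEP hint); a write prefetch with locality 0
  // has IsWrite = 1, IsStream = 1, so PrfOp = 0b10001 (the PSTL1STRM hint).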
2318  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2319  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2320 }
2321 
2322 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2323  SelectionDAG &DAG) const {
2324  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2325 
2326  RTLIB::Libcall LC;
2328 
2329  return LowerF128Call(Op, DAG, LC);
2330 }
2331 
2332 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2333  SelectionDAG &DAG) const {
2334  if (Op.getOperand(0).getValueType() != MVT::f128) {
2335  // It's legal except when f128 is involved
2336  return Op;
2337  }
2338 
2339  RTLIB::Libcall LC;
2341 
2342  // FP_ROUND node has a second operand indicating whether it is known to be
2343  // precise. That doesn't take part in the LibCall so we can't directly use
2344  // LowerF128Call.
2345  SDValue SrcVal = Op.getOperand(0);
2346  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2347  SDLoc(Op)).first;
2348 }
2349 
2350 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
2351  SelectionDAG &DAG) const {
2352  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2353  // Any additional optimization in this function should be recorded
2354  // in the cost tables.
2355  EVT InVT = Op.getOperand(0).getValueType();
2356  EVT VT = Op.getValueType();
2357  unsigned NumElts = InVT.getVectorNumElements();
2358 
2359  // f16 conversions are promoted to f32 when full fp16 is not supported.
2360  if (InVT.getVectorElementType() == MVT::f16 &&
2361  !Subtarget->hasFullFP16()) {
2362  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2363  SDLoc dl(Op);
2364  return DAG.getNode(
2365  Op.getOpcode(), dl, Op.getValueType(),
2366  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2367  }
2368 
2369  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2370  SDLoc dl(Op);
2371  SDValue Cv =
2372  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2373  Op.getOperand(0));
2374  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2375  }
2376 
2377  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2378  SDLoc dl(Op);
2379  MVT ExtVT =
2381  VT.getVectorNumElements());
2382  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2383  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2384  }
2385 
2386  // Type changing conversions are illegal.
2387  return Op;
2388 }
2389 
2390 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2391  SelectionDAG &DAG) const {
2392  if (Op.getOperand(0).getValueType().isVector())
2393  return LowerVectorFP_TO_INT(Op, DAG);
2394 
2395  // f16 conversions are promoted to f32 when full fp16 is not supported.
2396  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2397  !Subtarget->hasFullFP16()) {
2398  SDLoc dl(Op);
2399  return DAG.getNode(
2400  Op.getOpcode(), dl, Op.getValueType(),
2401  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2402  }
2403 
2404  if (Op.getOperand(0).getValueType() != MVT::f128) {
2405  // It's legal except when f128 is involved
2406  return Op;
2407  }
2408 
2409  RTLIB::Libcall LC;
2410  if (Op.getOpcode() == ISD::FP_TO_SINT)
2412  else
2414 
2415  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2416  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2417 }
2418 
2420  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2421  // Any additional optimization in this function should be recorded
2422  // in the cost tables.
2423  EVT VT = Op.getValueType();
2424  SDLoc dl(Op);
2425  SDValue In = Op.getOperand(0);
2426  EVT InVT = In.getValueType();
2427 
2428  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2429  MVT CastVT =
2431  InVT.getVectorNumElements());
2432  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2433  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2434  }
2435 
2436  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2437  unsigned CastOpc =
2439  EVT CastVT = VT.changeVectorElementTypeToInteger();
2440  In = DAG.getNode(CastOpc, dl, CastVT, In);
2441  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2442  }
2443 
2444  return Op;
2445 }
2446 
2447 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2448  SelectionDAG &DAG) const {
2449  if (Op.getValueType().isVector())
2450  return LowerVectorINT_TO_FP(Op, DAG);
2451 
2452  // f16 conversions are promoted to f32 when full fp16 is not supported.
2453  if (Op.getValueType() == MVT::f16 &&
2454  !Subtarget->hasFullFP16()) {
2455  SDLoc dl(Op);
2456  return DAG.getNode(
2457  ISD::FP_ROUND, dl, MVT::f16,
2458  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2459  DAG.getIntPtrConstant(0, dl));
2460  }
2461 
2462  // i128 conversions are libcalls.
2463  if (Op.getOperand(0).getValueType() == MVT::i128)
2464  return SDValue();
2465 
2466  // Other conversions are legal, unless it's to the completely software-based
2467  // fp128.
2468  if (Op.getValueType() != MVT::f128)
2469  return Op;
2470 
2471  RTLIB::Libcall LC;
2472  if (Op.getOpcode() == ISD::SINT_TO_FP)
2474  else
2476 
2477  return LowerF128Call(Op, DAG, LC);
2478 }
2479 
2480 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2481  SelectionDAG &DAG) const {
2482  // For iOS, we want to call an alternative entry point: __sincos_stret,
2483  // which returns the values in two S / D registers.
2484  SDLoc dl(Op);
2485  SDValue Arg = Op.getOperand(0);
2486  EVT ArgVT = Arg.getValueType();
2487  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2488 
2489  ArgListTy Args;
2490  ArgListEntry Entry;
2491 
2492  Entry.Node = Arg;
2493  Entry.Ty = ArgTy;
2494  Entry.IsSExt = false;
2495  Entry.IsZExt = false;
2496  Args.push_back(Entry);
2497 
2498  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2499  : RTLIB::SINCOS_STRET_F32;
2500  const char *LibcallName = getLibcallName(LC);
2501  SDValue Callee =
2502  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2503 
2504  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2506  CLI.setDebugLoc(dl)
2507  .setChain(DAG.getEntryNode())
2508  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2509 
2510  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2511  return CallResult.first;
2512 }
2513 
2515  if (Op.getValueType() != MVT::f16)
2516  return SDValue();
2517 
2518  assert(Op.getOperand(0).getValueType() == MVT::i16);
2519  SDLoc DL(Op);
2520 
2521  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2522  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2523  return SDValue(
2524  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2525  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2526  0);
2527 }
2528 
2529 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2530  if (OrigVT.getSizeInBits() >= 64)
2531  return OrigVT;
2532 
2533  assert(OrigVT.isSimple() && "Expecting a simple value type");
2534 
2535  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2536  switch (OrigSimpleTy) {
2537  default: llvm_unreachable("Unexpected Vector Type");
2538  case MVT::v2i8:
2539  case MVT::v2i16:
2540  return MVT::v2i32;
2541  case MVT::v4i8:
2542  return MVT::v4i16;
2543  }
2544 }
2545 
2547  const EVT &OrigTy,
2548  const EVT &ExtTy,
2549  unsigned ExtOpcode) {
2550  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2551  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2552  // 64-bits we need to insert a new extension so that it will be 64-bits.
2553  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2554  if (OrigTy.getSizeInBits() >= 64)
2555  return N;
2556 
2557  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2558  EVT NewVT = getExtensionTo64Bits(OrigTy);
2559 
2560  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2561 }
2562 
2564  bool isSigned) {
2565  EVT VT = N->getValueType(0);
2566 
2567  if (N->getOpcode() != ISD::BUILD_VECTOR)
2568  return false;
2569 
2570  for (const SDValue &Elt : N->op_values()) {
2571  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2572  unsigned EltSize = VT.getScalarSizeInBits();
2573  unsigned HalfSize = EltSize / 2;
2574  if (isSigned) {
2575  if (!isIntN(HalfSize, C->getSExtValue()))
2576  return false;
2577  } else {
2578  if (!isUIntN(HalfSize, C->getZExtValue()))
2579  return false;
2580  }
2581  continue;
2582  }
2583  return false;
2584  }
2585 
2586  return true;
2587 }
2588 
2590  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2592  N->getOperand(0)->getValueType(0),
2593  N->getValueType(0),
2594  N->getOpcode());
2595 
2596  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2597  EVT VT = N->getValueType(0);
2598  SDLoc dl(N);
2599  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2600  unsigned NumElts = VT.getVectorNumElements();
2601  MVT TruncVT = MVT::getIntegerVT(EltSize);
2603  for (unsigned i = 0; i != NumElts; ++i) {
2604  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2605  const APInt &CInt = C->getAPIntValue();
2606  // Element types smaller than 32 bits are not legal, so use i32 elements.
2607  // The values are implicitly truncated so sext vs. zext doesn't matter.
2608  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2609  }
2610  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2611 }
2612 
2613 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2614  return N->getOpcode() == ISD::SIGN_EXTEND ||
2615  isExtendedBUILD_VECTOR(N, DAG, true);
2616 }
2617 
2618 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2619  return N->getOpcode() == ISD::ZERO_EXTEND ||
2620  isExtendedBUILD_VECTOR(N, DAG, false);
2621 }
2622 
2623 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2624  unsigned Opcode = N->getOpcode();
2625  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2626  SDNode *N0 = N->getOperand(0).getNode();
2627  SDNode *N1 = N->getOperand(1).getNode();
2628  return N0->hasOneUse() && N1->hasOneUse() &&
2629  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2630  }
2631  return false;
2632 }
2633 
2634 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2635  unsigned Opcode = N->getOpcode();
2636  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2637  SDNode *N0 = N->getOperand(0).getNode();
2638  SDNode *N1 = N->getOperand(1).getNode();
2639  return N0->hasOneUse() && N1->hasOneUse() &&
2640  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2641  }
2642  return false;
2643 }
2644 
2645 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2646  SelectionDAG &DAG) const {
2647  // The rounding mode is in bits 23:22 of the FPCR.
2648  // The FPCR rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0,
2649  // and the formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
2650  // so that the shift + and get folded into a bitfield extract.
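  // Worked example (illustrative): with FPCR.RMode == 0b01 (round towards
  // plus infinity) only bit 22 is set, so ((FPCR + (1 << 22)) >> 22) & 3
  // evaluates to (1 + 1) & 3 == 2, matching the 1 -> 2 entry of the mapping.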
2651  SDLoc dl(Op);
2652 
2653  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2654  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2655  MVT::i64));
2656  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2657  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2658  DAG.getConstant(1U << 22, dl, MVT::i32));
2659  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2660  DAG.getConstant(22, dl, MVT::i32));
2661  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2662  DAG.getConstant(3, dl, MVT::i32));
2663 }
2664 
2665 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
2666  // Multiplications are only custom-lowered for 128-bit vectors so that
2667  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2668  EVT VT = Op.getValueType();
2669  assert(VT.is128BitVector() && VT.isInteger() &&
2670  "unexpected type for custom-lowering ISD::MUL");
2671  SDNode *N0 = Op.getOperand(0).getNode();
2672  SDNode *N1 = Op.getOperand(1).getNode();
2673  unsigned NewOpc = 0;
2674  bool isMLA = false;
2675  bool isN0SExt = isSignExtended(N0, DAG);
2676  bool isN1SExt = isSignExtended(N1, DAG);
2677  if (isN0SExt && isN1SExt)
2678  NewOpc = AArch64ISD::SMULL;
2679  else {
2680  bool isN0ZExt = isZeroExtended(N0, DAG);
2681  bool isN1ZExt = isZeroExtended(N1, DAG);
2682  if (isN0ZExt && isN1ZExt)
2683  NewOpc = AArch64ISD::UMULL;
2684  else if (isN1SExt || isN1ZExt) {
2685  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2686  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2687  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2688  NewOpc = AArch64ISD::SMULL;
2689  isMLA = true;
2690  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2691  NewOpc = AArch64ISD::UMULL;
2692  isMLA = true;
2693  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2694  std::swap(N0, N1);
2695  NewOpc = AArch64ISD::UMULL;
2696  isMLA = true;
2697  }
2698  }
2699 
2700  if (!NewOpc) {
2701  if (VT == MVT::v2i64)
2702  // Fall through to expand this. It is not legal.
2703  return SDValue();
2704  else
2705  // Other vector multiplications are legal.
2706  return Op;
2707  }
2708  }
2709 
2710  // Legalize to a S/UMULL instruction
2711  SDLoc DL(Op);
2712  SDValue Op0;
2713  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2714  if (!isMLA) {
2715  Op0 = skipExtensionForVectorMULL(N0, DAG);
2716  assert(Op0.getValueType().is64BitVector() &&
2717  Op1.getValueType().is64BitVector() &&
2718  "unexpected types for extended operands to VMULL");
2719  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2720  }
2721  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
2722  // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
2723  // This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
2724  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2725  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2726  EVT Op1VT = Op1.getValueType();
2727  return DAG.getNode(N0->getOpcode(), DL, VT,
2728  DAG.getNode(NewOpc, DL, VT,
2729  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2730  DAG.getNode(NewOpc, DL, VT,
2731  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2732 }
2733 
2734 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2735  SelectionDAG &DAG) const {
2736  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2737  SDLoc dl(Op);
2738  switch (IntNo) {
2739  default: return SDValue(); // Don't custom lower most intrinsics.
2740  case Intrinsic::thread_pointer: {
2741  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2742  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2743  }
2744  case Intrinsic::aarch64_neon_abs: {
2745  EVT Ty = Op.getValueType();
2746  if (Ty == MVT::i64) {
2747  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2748  Op.getOperand(1));
2749  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2750  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2751  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2752  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2753  } else {
2754  report_fatal_error("Unexpected type for AArch64 NEON intrinic");
2755  }
2756  }
2757  case Intrinsic::aarch64_neon_smax:
2758  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2759  Op.getOperand(1), Op.getOperand(2));
2760  case Intrinsic::aarch64_neon_umax:
2761  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2762  Op.getOperand(1), Op.getOperand(2));
2763  case Intrinsic::aarch64_neon_smin:
2764  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2765  Op.getOperand(1), Op.getOperand(2));
2766  case Intrinsic::aarch64_neon_umin:
2767  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2768  Op.getOperand(1), Op.getOperand(2));
2769 
2770  case Intrinsic::localaddress: {
2771  const auto &MF = DAG.getMachineFunction();
2772  const auto *RegInfo = Subtarget->getRegisterInfo();
2773  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2774  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2775  Op.getSimpleValueType());
2776  }
2777 
2778  case Intrinsic::eh_recoverfp: {
2779  // FIXME: This needs to be implemented to correctly handle highly aligned
2780  // stack objects. For now we simply return the incoming FP. Refer D53541
2781  // for more details.
2782  SDValue FnOp = Op.getOperand(1);
2783  SDValue IncomingFPOp = Op.getOperand(2);
2785  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2786  if (!Fn)
2788  "llvm.eh.recoverfp must take a function as the first argument");
2789  return IncomingFPOp;
2790  }
2791  }
2792 }
2793 
2794 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2796  EVT VT, EVT MemVT,
2797  SelectionDAG &DAG) {
2798  assert(VT.isVector() && "VT should be a vector type");
2799  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2800 
2801  SDValue Value = ST->getValue();
2802 
2803  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
2804  // extracts the word lane which represents the v4i8 subvector. It optimizes
2805  // the store to:
2806  //
2807  // xtn v0.8b, v0.8h
2808  // str s0, [x0]
2809 
2810  SDValue Undef = DAG.getUNDEF(MVT::i16);
2811  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2812  {Undef, Undef, Undef, Undef});
2813 
2814  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2815  Value, UndefVec);
2816  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2817 
2818  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2819  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2820  Trunc, DAG.getConstant(0, DL, MVT::i64));
2821 
2822  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2823  ST->getBasePtr(), ST->getMemOperand());
2824 }
2825 
2826 // Custom lowering for any store, vector or scalar, normal or with a
2827 // truncate operation. Currently we only custom lower the truncating store
2828 // from v4i16 to v4i8.
2829 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2830  SelectionDAG &DAG) const {
2831  SDLoc Dl(Op);
2832  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2833  assert (StoreNode && "Can only custom lower store nodes");
2834 
2835  SDValue Value = StoreNode->getValue();
2836 
2837  EVT VT = Value.getValueType();
2838  EVT MemVT = StoreNode->getMemoryVT();
2839 
2840  assert (VT.isVector() && "Can only custom lower vector store types");
2841 
2842  unsigned AS = StoreNode->getAddressSpace();
2843  unsigned Align = StoreNode->getAlignment();
2844  if (Align < MemVT.getStoreSize() &&
2846  MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
2847  return scalarizeVectorStore(StoreNode, DAG);
2848  }
2849 
2850  if (StoreNode->isTruncatingStore()) {
2851  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2852  }
2853 
2854  return SDValue();
2855 }
2856 
2857 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2858  SelectionDAG &DAG) const {
2859  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2860  LLVM_DEBUG(Op.dump());
2861 
2862  switch (Op.getOpcode()) {
2863  default:
2864  llvm_unreachable("unimplemented operand");
2865  return SDValue();
2866  case ISD::BITCAST:
2867  return LowerBITCAST(Op, DAG);
2868  case ISD::GlobalAddress:
2869  return LowerGlobalAddress(Op, DAG);
2870  case ISD::GlobalTLSAddress:
2871  return LowerGlobalTLSAddress(Op, DAG);
2872  case ISD::SETCC:
2873  return LowerSETCC(Op, DAG);
2874  case ISD::BR_CC:
2875  return LowerBR_CC(Op, DAG);
2876  case ISD::SELECT:
2877  return LowerSELECT(Op, DAG);
2878  case ISD::SELECT_CC:
2879  return LowerSELECT_CC(Op, DAG);
2880  case ISD::JumpTable:
2881  return LowerJumpTable(Op, DAG);
2882  case ISD::BR_JT:
2883  return LowerBR_JT(Op, DAG);
2884  case ISD::ConstantPool:
2885  return LowerConstantPool(Op, DAG);
2886  case ISD::BlockAddress:
2887  return LowerBlockAddress(Op, DAG);
2888  case ISD::VASTART:
2889  return LowerVASTART(Op, DAG);
2890  case ISD::VACOPY:
2891  return LowerVACOPY(Op, DAG);
2892  case ISD::VAARG:
2893  return LowerVAARG(Op, DAG);
2894  case ISD::ADDC:
2895  case ISD::ADDE:
2896  case ISD::SUBC:
2897  case ISD::SUBE:
2898  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2899  case ISD::SADDO:
2900  case ISD::UADDO:
2901  case ISD::SSUBO:
2902  case ISD::USUBO:
2903  case ISD::SMULO:
2904  case ISD::UMULO:
2905  return LowerXALUO(Op, DAG);
2906  case ISD::FADD:
2907  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2908  case ISD::FSUB:
2909  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2910  case ISD::FMUL:
2911  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2912  case ISD::FDIV:
2913  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2914  case ISD::FP_ROUND:
2915  return LowerFP_ROUND(Op, DAG);
2916  case ISD::FP_EXTEND:
2917  return LowerFP_EXTEND(Op, DAG);
2918  case ISD::FRAMEADDR:
2919  return LowerFRAMEADDR(Op, DAG);
2920  case ISD::SPONENTRY:
2921  return LowerSPONENTRY(Op, DAG);
2922  case ISD::RETURNADDR:
2923  return LowerRETURNADDR(Op, DAG);
2924  case ISD::ADDROFRETURNADDR:
2925  return LowerADDROFRETURNADDR(Op, DAG);
2927  return LowerINSERT_VECTOR_ELT(Op, DAG);
2929  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2930  case ISD::BUILD_VECTOR:
2931  return LowerBUILD_VECTOR(Op, DAG);
2932  case ISD::VECTOR_SHUFFLE:
2933  return LowerVECTOR_SHUFFLE(Op, DAG);
2935  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2936  case ISD::SRA:
2937  case ISD::SRL:
2938  case ISD::SHL:
2939  return LowerVectorSRA_SRL_SHL(Op, DAG);
2940  case ISD::SHL_PARTS:
2941  return LowerShiftLeftParts(Op, DAG);
2942  case ISD::SRL_PARTS:
2943  case ISD::SRA_PARTS:
2944  return LowerShiftRightParts(Op, DAG);
2945  case ISD::CTPOP:
2946  return LowerCTPOP(Op, DAG);
2947  case ISD::FCOPYSIGN:
2948  return LowerFCOPYSIGN(Op, DAG);
2949  case ISD::OR:
2950  return LowerVectorOR(Op, DAG);
2951  case ISD::XOR:
2952  return LowerXOR(Op, DAG);
2953  case ISD::PREFETCH:
2954  return LowerPREFETCH(Op, DAG);
2955  case ISD::SINT_TO_FP:
2956  case ISD::UINT_TO_FP:
2957  return LowerINT_TO_FP(Op, DAG);
2958  case ISD::FP_TO_SINT:
2959  case ISD::FP_TO_UINT:
2960  return LowerFP_TO_INT(Op, DAG);
2961  case ISD::FSINCOS:
2962  return LowerFSINCOS(Op, DAG);
2963  case ISD::FLT_ROUNDS_:
2964  return LowerFLT_ROUNDS_(Op, DAG);
2965  case ISD::MUL:
2966  return LowerMUL(Op, DAG);
2968  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2969  case ISD::STORE:
2970  return LowerSTORE(Op, DAG);
2971  case ISD::VECREDUCE_ADD:
2972  case ISD::VECREDUCE_SMAX:
2973  case ISD::VECREDUCE_SMIN:
2974  case ISD::VECREDUCE_UMAX:
2975  case ISD::VECREDUCE_UMIN:
2976  case ISD::VECREDUCE_FMAX:
2977  case ISD::VECREDUCE_FMIN:
2978  return LowerVECREDUCE(Op, DAG);
2979  case ISD::ATOMIC_LOAD_SUB:
2980  return LowerATOMIC_LOAD_SUB(Op, DAG);
2981  case ISD::ATOMIC_LOAD_AND:
2982  return LowerATOMIC_LOAD_AND(Op, DAG);
2984  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2985  }
2986 }
2987 
2988 //===----------------------------------------------------------------------===//
2989 // Calling Convention Implementation
2990 //===----------------------------------------------------------------------===//
2991 
2992 /// Selects the correct CCAssignFn for a given CallingConvention value.
2993 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2994  bool IsVarArg) const {
2995  switch (CC) {
2996  default:
2997  report_fatal_error("Unsupported calling convention.");
2999  return CC_AArch64_WebKit_JS;
3000  case CallingConv::GHC:
3001  return CC_AArch64_GHC;
3002  case CallingConv::C:
3003  case CallingConv::Fast:
3006  case CallingConv::Swift:
3007  if (Subtarget->isTargetWindows() && IsVarArg)
3008  return CC_AArch64_Win64_VarArg;
3009  if (!Subtarget->isTargetDarwin())
3010  return CC_AArch64_AAPCS;
3012  case CallingConv::Win64:
3013  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
3015  return CC_AArch64_AAPCS;
3016  }
3017 }
3018 
3019 CCAssignFn *
3023 }
3024 
3025 SDValue AArch64TargetLowering::LowerFormalArguments(
3026  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3027  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3028  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3029  MachineFunction &MF = DAG.getMachineFunction();
3030  MachineFrameInfo &MFI = MF.getFrameInfo();
3031  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3032 
3033  // Assign locations to all of the incoming arguments.
3035  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3036  *DAG.getContext());
3037 
3038  // At this point, Ins[].VT may already be promoted to i32. To correctly
3039  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3040  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3041  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3042  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3043  // LocVT.
3044  unsigned NumArgs = Ins.size();
3046  unsigned CurArgIdx = 0;
3047  for (unsigned i = 0; i != NumArgs; ++i) {
3048  MVT ValVT = Ins[i].VT;
3049  if (Ins[i].isOrigArg()) {
3050  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3051  CurArgIdx = Ins[i].getOrigArgIndex();
3052 
3053  // Get type of the original argument.
3054  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3055  /*AllowUnknown*/ true);
3056  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3057  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3058  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3059  ValVT = MVT::i8;
3060  else if (ActualMVT == MVT::i16)
3061  ValVT = MVT::i16;
3062  }
3063  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3064  bool Res =
3065  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3066  assert(!Res && "Call operand has unhandled type");
3067  (void)Res;
3068  }
3069  assert(ArgLocs.size() == Ins.size());
3070  SmallVector<SDValue, 16> ArgValues;
3071  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3072  CCValAssign &VA = ArgLocs[i];
3073 
3074  if (Ins[i].Flags.isByVal()) {
3075  // Byval is used for HFAs in the PCS, but the system should work in a
3076  // non-compliant manner for larger structs.
3077  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3078  int Size = Ins[i].Flags.getByValSize();
3079  unsigned NumRegs = (Size + 7) / 8;
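  // For illustration: a 12-byte byval argument gives NumRegs = (12 + 7) / 8 = 2,
  // so a 16-byte fixed stack object is created below and a pointer to it is
  // handed to the function body.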
3080 
3081  // FIXME: This works on big-endian for composite byvals, which are the common
3082  // case. It should also work for fundamental types.
3083  unsigned FrameIdx =
3084  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3085  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3086  InVals.push_back(FrameIdxN);
3087 
3088  continue;
3089  }
3090 
3091  if (VA.isRegLoc()) {
3092  // Arguments stored in registers.
3093  EVT RegVT = VA.getLocVT();
3094 
3095  SDValue ArgValue;
3096  const TargetRegisterClass *RC;
3097 
3098  if (RegVT == MVT::i32)
3099  RC = &AArch64::GPR32RegClass;
3100  else if (RegVT == MVT::i64)
3101  RC = &AArch64::GPR64RegClass;
3102  else if (RegVT == MVT::f16)
3103  RC = &AArch64::FPR16RegClass;
3104  else if (RegVT == MVT::f32)
3105  RC = &AArch64::FPR32RegClass;
3106  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3107  RC = &AArch64::FPR64RegClass;
3108  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3109  RC = &AArch64::FPR128RegClass;
3110  else
3111  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3112 
3113  // Transform the arguments in physical registers into virtual ones.
3114  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3115  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3116 
3117  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3118  // to 64 bits. Insert an assert[sz]ext to capture this, then
3119  // truncate to the right size.
3120  switch (VA.getLocInfo()) {
3121  default:
3122  llvm_unreachable("Unknown loc info!");
3123  case CCValAssign::Full:
3124  break;
3125  case CCValAssign::BCvt:
3126  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3127  break;
3128  case CCValAssign::AExt:
3129  case CCValAssign::SExt:
3130  case CCValAssign::ZExt:
3131  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3132  // nodes after our lowering.
3133  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3134  break;
3135  }
3136 
3137  InVals.push_back(ArgValue);
3138 
3139  } else { // VA.isRegLoc()
3140  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3141  unsigned ArgOffset = VA.getLocMemOffset();
3142  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3143 
3144  uint32_t BEAlign = 0;
3145  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3146  !Ins[i].Flags.isInConsecutiveRegs())
3147  BEAlign = 8 - ArgSize;
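  // For illustration: on big-endian, a 4-byte stack argument lives in the
  // high-address half of its 8-byte slot, so BEAlign = 8 - 4 = 4 is added to
  // the offset of the fixed object created below.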
3148 
3149  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3150 
3151  // Create load nodes to retrieve arguments from the stack.
3152  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3153  SDValue ArgValue;
3154 
3155  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3157  MVT MemVT = VA.getValVT();
3158 
3159  switch (VA.getLocInfo()) {
3160  default:
3161  break;
3162  case CCValAssign::BCvt:
3163  MemVT = VA.getLocVT();
3164  break;
3165  case CCValAssign::SExt:
3166  ExtType = ISD::SEXTLOAD;
3167  break;
3168  case CCValAssign::ZExt:
3169  ExtType = ISD::ZEXTLOAD;
3170  break;
3171  case CCValAssign::AExt:
3172  ExtType = ISD::EXTLOAD;
3173  break;
3174  }
3175 
3176  ArgValue = DAG.getExtLoad(
3177  ExtType, DL, VA.getLocVT(), Chain, FIN,
3179  MemVT);
3180 
3181  InVals.push_back(ArgValue);
3182  }
3183  }
3184 
3185  // varargs
3187  if (isVarArg) {
3188  if (!Subtarget->isTargetDarwin() || IsWin64) {
3189  // The AAPCS variadic function ABI is identical to the non-variadic
3190  // one. As a result there may be more arguments in registers and we should
3191  // save them for future reference.
3192  // Win64 variadic functions also pass arguments in registers, but all float
3193  // arguments are passed in integer registers.
3194  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3195  }
3196 
3197  // This will point to the next argument passed via stack.
3198  unsigned StackOffset = CCInfo.getNextStackOffset();
3199  // We currently pass all varargs at 8-byte alignment.
3200  StackOffset = ((StackOffset + 7) & ~7);
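  // For illustration: a next-stack-offset of 20 bytes is rounded up to 24 here,
  // so the varargs area starts at an 8-byte aligned offset.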
3201  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3202 
3203  if (MFI.hasMustTailInVarArgFunc()) {
3204  SmallVector<MVT, 2> RegParmTypes;
3205  RegParmTypes.push_back(MVT::i64);
3206  RegParmTypes.push_back(MVT::f128);
3207  // Compute the set of forwarded registers. The rest are scratch.
3209  FuncInfo->getForwardedMustTailRegParms();
3210  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3212 
3213  // Conservatively forward X8, since it might be used for aggregate return.
3214  if (!CCInfo.isAllocated(AArch64::X8)) {
3215  unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
3216  Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
3217  }
3218  }
3219  }
3220 
3221  // On Windows, InReg pointers must be returned, so record the pointer in a
3222  // virtual register at the start of the function so it can be returned in the
3223  // epilogue.
3224  if (IsWin64) {
3225  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3226  if (Ins[I].Flags.isInReg()) {
3227  assert(!FuncInfo->getSRetReturnReg());
3228 
3229  MVT PtrTy = getPointerTy(DAG.getDataLayout());
3230  unsigned Reg =
3232  FuncInfo->setSRetReturnReg(Reg);
3233 
3234  SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
3235  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
3236  break;
3237  }
3238  }
3239  }
3240 
3241  unsigned StackArgSize = CCInfo.getNextStackOffset();
3242  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3243  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3244  // This is a non-standard ABI so by fiat I say we're allowed to make full
3245  // use of the stack area to be popped, which must be aligned to 16 bytes in
3246  // any case:
3247  StackArgSize = alignTo(StackArgSize, 16);
3248 
3249  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3250  // a multiple of 16.
3251  FuncInfo->setArgumentStackToRestore(StackArgSize);
3252 
3253  // This realignment carries over to the available bytes below. Our own
3254  // callers will guarantee the space is free by giving an aligned value to
3255  // CALLSEQ_START.
3256  }
3257  // Even if we're not expected to free up the space, it's useful to know how
3258  // much is there while considering tail calls (because we can reuse it).
3259  FuncInfo->setBytesInStackArgArea(StackArgSize);
3260 
3261  if (Subtarget->hasCustomCallingConv())
3262  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3263 
3264  return Chain;
3265 }
3266 
3267 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3268  SelectionDAG &DAG,
3269  const SDLoc &DL,
3270  SDValue &Chain) const {
3271  MachineFunction &MF = DAG.getMachineFunction();
3272  MachineFrameInfo &MFI = MF.getFrameInfo();
3274  auto PtrVT = getPointerTy(DAG.getDataLayout());
3275  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3276 
3277  SmallVector<SDValue, 8> MemOps;
3278 
3279  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3280  AArch64::X3, AArch64::X4, AArch64::X5,
3281  AArch64::X6, AArch64::X7 };
3282  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3283  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3284 
3285  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
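  // For illustration: if the fixed arguments consume X0-X2, FirstVariadicGPR
  // is 3 and GPRSaveSize is 8 * (8 - 3) = 40 bytes, enough to save X3-X7 for
  // later retrieval via va_arg.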
3286  int GPRIdx = 0;
3287  if (GPRSaveSize != 0) {
3288  if (IsWin64) {
3289  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3290  if (GPRSaveSize & 15)
3291  // The extra size here, if triggered, will always be 8.
3292  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3293  } else
3294  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3295 
3296  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3297 
3298  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3299  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3300  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3301  SDValue Store = DAG.getStore(
3302  Val.getValue(1), DL, Val, FIN,
3303  IsWin64
3305  GPRIdx,
3306  (i - FirstVariadicGPR) * 8)
3308  MemOps.push_back(Store);
3309  FIN =
3310  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3311  }
3312  }
3313  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3314  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3315 
3316  if (Subtarget->hasFPARMv8() && !IsWin64) {
3317  static const MCPhysReg FPRArgRegs[] = {
3318  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3319  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3320  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3321  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3322 
3323  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3324  int FPRIdx = 0;
3325  if (FPRSaveSize != 0) {
3326  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3327 
3328  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3329 
3330  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3331  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3332  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3333 
3334  SDValue Store = DAG.getStore(
3335  Val.getValue(1), DL, Val, FIN,
3337  MemOps.push_back(Store);
3338  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3339  DAG.getConstant(16, DL, PtrVT));
3340  }
3341  }
3342  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3343  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3344  }
3345 
3346  if (!MemOps.empty()) {
3347  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3348  }
3349 }
3350 
3351 /// LowerCallResult - Lower the result values of a call into the
3352 /// appropriate copies out of appropriate physical registers.
3353 SDValue AArch64TargetLowering::LowerCallResult(
3354  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3355  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3356  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3357  SDValue ThisVal) const {
3358  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3361  // Assign locations to each value returned by this call.
3363  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3364  *DAG.getContext());
3365  CCInfo.AnalyzeCallResult(Ins, RetCC);
3366 
3367  // Copy all of the result registers out of their specified physreg.
3368  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3369  CCValAssign VA = RVLocs[i];
3370 
3371  // Pass 'this' value directly from the argument to return value, to avoid
3372  // reg unit interference
3373  if (i == 0 && isThisReturn) {
3374  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3375  "unexpected return calling convention register assignment");
3376  InVals.push_back(ThisVal);
3377  continue;
3378  }
3379 
3380  SDValue Val =
3381  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3382  Chain = Val.getValue(1);
3383  InFlag = Val.getValue(2);
3384 
3385  switch (VA.getLocInfo()) {
3386  default:
3387  llvm_unreachable("Unknown loc info!");
3388  case CCValAssign::Full:
3389  break;
3390  case CCValAssign::BCvt:
3391  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3392  break;
3393  }
3394 
3395  InVals.push_back(Val);
3396  }
3397 
3398  return Chain;
3399 }
3400 
3401 /// Return true if the calling convention is one that we can guarantee TCO for.
3403  return CC == CallingConv::Fast;
3404 }
3405 
3406 /// Return true if we might ever do TCO for calls with this calling convention.
3408  switch (CC) {
3409  case CallingConv::C:
3411  case CallingConv::Swift:
3412  return true;
3413  default:
3414  return canGuaranteeTCO(CC);
3415  }
3416 }
3417 
3418 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3419  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3420  const SmallVectorImpl<ISD::OutputArg> &Outs,
3421  const SmallVectorImpl<SDValue> &OutVals,
3422  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3423  if (!mayTailCallThisCC(CalleeCC))
3424  return false;
3425 
3426  MachineFunction &MF = DAG.getMachineFunction();
3427  const Function &CallerF = MF.getFunction();
3428  CallingConv::ID CallerCC = CallerF.getCallingConv();
3429  bool CCMatch = CallerCC == CalleeCC;
3430 
3431  // Byval parameters hand the function a pointer directly into the stack area
3432  // we want to reuse during a tail call. Working around this *is* possible (see
3433  // X86) but less efficient and uglier in LowerCall.
3434  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3435  e = CallerF.arg_end();
3436  i != e; ++i) {
3437  if (i->hasByValAttr())
3438  return false;
3439 
3440  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
3441  // In this case, it is necessary to save/restore X0 in the callee. Tail
3442  // call opt interferes with this. So we disable tail call opt when the
3443  // caller has an argument with "inreg" attribute.
3444 
3445  // FIXME: Check whether the callee also has an "inreg" argument.
3446  if (i->hasInRegAttr())
3447  return false;
3448  }
3449 
3451  return canGuaranteeTCO(CalleeCC) && CCMatch;
3452 
3453  // Externally-defined functions with weak linkage should not be
3454  // tail-called on AArch64 when the OS does not support dynamic
3455  // pre-emption of symbols, as the AAELF spec requires normal calls
3456  // to undefined weak functions to be replaced with a NOP or jump to the
3457  // next instruction. The behaviour of branch instructions in this
3458  // situation (as used for tail calls) is implementation-defined, so we
3459  // cannot rely on the linker replacing the tail call with a return.
3460  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3461  const GlobalValue *GV = G->getGlobal();
3463  if (GV->hasExternalWeakLinkage() &&
3464  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3465  return false;
3466  }
3467 
3468  // Now we search for cases where we can use a tail call without changing the
3469  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3470  // concept.
3471 
3472  // I want anyone implementing a new calling convention to think long and hard
3473  // about this assert.
3474  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3475  "Unexpected variadic calling convention");
3476 
3477  LLVMContext &C = *DAG.getContext();
3478  if (isVarArg && !Outs.empty()) {
3479  // At least two cases here: if caller is fastcc then we can't have any
3480  // memory arguments (we'd be expected to clean up the stack afterwards). If
3481  // caller is C then we could potentially use its argument area.
3482 
3483  // FIXME: for now we take the most conservative of these in both cases:
3484  // disallow all variadic memory operands.
3486  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3487 
3488  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3489  for (const CCValAssign &ArgLoc : ArgLocs)
3490  if (!ArgLoc.isRegLoc())
3491  return false;
3492  }
3493 
3494  // Check that the call results are passed in the same way.
3495  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3496  CCAssignFnForCall(CalleeCC, isVarArg),
3497  CCAssignFnForCall(CallerCC, isVarArg)))
3498  return false;
3499  // The callee has to preserve all registers the caller needs to preserve.
3500  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3501  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3502  if (!CCMatch) {
3503  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3504  if (Subtarget->hasCustomCallingConv()) {
3505  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3506  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3507  }
3508  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3509  return false;
3510  }
3511 
3512  // Nothing more to check if the callee is taking no arguments
3513  if (Outs.empty())
3514  return true;
3515 
3517  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3518 
3519  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3520 
3521  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3522 
3523  // If the stack arguments for this call do not fit into our own save area then
3524  // the call cannot be made tail.
3525  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3526  return false;
3527 
3528  const MachineRegisterInfo &MRI = MF.getRegInfo();
3529  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3530  return false;
3531 
3532  return true;
3533 }
3534 
3535 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3536  SelectionDAG &DAG,
3537  MachineFrameInfo &MFI,
3538  int ClobberedFI) const {
3539  SmallVector<SDValue, 8> ArgChains;
3540  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3541  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3542 
3543  // Include the original chain at the beginning of the list. When this is
3544  // used by target LowerCall hooks, this helps legalize find the
3545  // CALLSEQ_BEGIN node.
3546  ArgChains.push_back(Chain);
3547 
3548  // Add a chain value for each stack argument load that overlaps the clobbered object.
3549  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3550  UE = DAG.getEntryNode().getNode()->use_end();
3551  U != UE; ++U)
3552  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3553  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3554  if (FI->getIndex() < 0) {
3555  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3556  int64_t InLastByte = InFirstByte;
3557  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3558 
3559  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3560  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3561  ArgChains.push_back(SDValue(L, 1));
3562  }
3563 
3564  // Build a tokenfactor for all the chains.
3565  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3566 }
3567 
3568 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3569  bool TailCallOpt) const {
3570  return CallCC == CallingConv::Fast && TailCallOpt;
3571 }
3572 
3573 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3574 /// and add input and output parameter nodes.
3575 SDValue
3576 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3577  SmallVectorImpl<SDValue> &InVals) const {
3578  SelectionDAG &DAG = CLI.DAG;
3579  SDLoc &DL = CLI.DL;
3580  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3581  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3582  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3583  SDValue Chain = CLI.Chain;
3584  SDValue Callee = CLI.Callee;
3585  bool &IsTailCall = CLI.IsTailCall;
3586  CallingConv::ID CallConv = CLI.CallConv;
3587  bool IsVarArg = CLI.IsVarArg;
3588 
3589  MachineFunction &MF = DAG.getMachineFunction();
3590  bool IsThisReturn = false;
3591 
3593  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3594  bool IsSibCall = false;
3595 
3596  if (IsTailCall) {
3597  // Check if it's really possible to do a tail call.
3598  IsTailCall = isEligibleForTailCallOptimization(
3599  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3600  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3601  report_fatal_error("failed to perform tail call elimination on a call "
3602  "site marked musttail");
3603 
3604  // A sibling call is one where we're under the usual C ABI and not planning
3605  // to change that but can still do a tail call:
3606  if (!TailCallOpt && IsTailCall)
3607  IsSibCall = true;
3608 
3609  if (IsTailCall)
3610  ++NumTailCalls;
3611  }
3612 
3613  // Analyze operands of the call, assigning locations to each operand.
3615  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3616  *DAG.getContext());
3617 
3618  if (IsVarArg) {
3619  // Handle fixed and variable vector arguments differently.
3620  // Variable vector arguments always go into memory.
3621  unsigned NumArgs = Outs.size();
3622 
3623  for (unsigned i = 0; i != NumArgs; ++i) {
3624  MVT ArgVT = Outs[i].VT;
3625  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3626  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3627  /*IsVarArg=*/ !Outs[i].IsFixed);
3628  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3629  assert(!Res && "Call operand has unhandled type");
3630  (void)Res;
3631  }
3632  } else {
3633  // At this point, Outs[].VT may already be promoted to i32. To correctly
3634  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3635  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3636  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3637  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3638  // LocVT.
3639  unsigned NumArgs = Outs.size();
3640  for (unsigned i = 0; i != NumArgs; ++i) {
3641  MVT ValVT = Outs[i].VT;
3642  // Get type of the original argument.
3643  EVT ActualVT = getValueType(DAG.getDataLayout(),
3644  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3645  /*AllowUnknown*/ true);
3646  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3647  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3648  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3649  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3650  ValVT = MVT::i8;
3651  else if (ActualMVT == MVT::i16)
3652  ValVT = MVT::i16;
3653 
3654  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3655  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3656  assert(!Res && "Call operand has unhandled type");
3657  (void)Res;
3658  }
3659  }
3660 
3661  // Get a count of how many bytes are to be pushed on the stack.
3662  unsigned NumBytes = CCInfo.getNextStackOffset();
3663 
3664  if (IsSibCall) {
3665  // Since we're not changing the ABI to make this a tail call, the memory
3666  // operands are already available in the caller's incoming argument space.
3667  NumBytes = 0;
3668  }
3669 
3670  // FPDiff is the byte offset of the call's argument area from the callee's.
3671  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3672  // by this amount for a tail call. In a sibling call it must be 0 because the
3673  // caller will deallocate the entire stack and the callee still expects its
3674  // arguments to begin at SP+0. Completely unused for non-tail calls.
3675  int FPDiff = 0;
3676 
3677  if (IsTailCall && !IsSibCall) {
3678  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3679 
3680  // Since callee will pop argument stack as a tail call, we must keep the
3681  // popped size 16-byte aligned.
3682  NumBytes = alignTo(NumBytes, 16);
3683 
3684  // FPDiff will be negative if this tail call requires more space than we
3685  // would automatically have in our incoming argument space. Positive if we
3686  // can actually shrink the stack.
3687  FPDiff = NumReusableBytes - NumBytes;
3688 
3689  // The stack pointer must be 16-byte aligned at all times it's used for a
3690  // memory operation, which in practice means at *all* times and in
3691  // particular across call boundaries. Therefore our own arguments started at
3692  // a 16-byte aligned SP and the delta applied for the tail call should
3693  // satisfy the same constraint.
3694  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3695  }
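// Worked example (illustrative numbers only): if the caller reserved 32 bytes
// of incoming argument space (NumReusableBytes == 32) and this tail call
// needs 48 bytes after rounding (NumBytes == 48), then
// FPDiff == 32 - 48 == -16: the callee's argument area starts 16 bytes lower
// than the caller's, and the delta stays 16-byte aligned as asserted above.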
3696 
3697  // Adjust the stack pointer for the new arguments...
3698  // These operations are automatically eliminated by the prolog/epilog pass
3699  if (!IsSibCall)
3700  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3701 
3702  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3703  getPointerTy(DAG.getDataLayout()));
3704 
3705  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3706  SmallVector<SDValue, 8> MemOpChains;
3707  auto PtrVT = getPointerTy(DAG.getDataLayout());
3708 
3709  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3710  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3711  for (const auto &F : Forwards) {
3712  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3713  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3714  }
3715  }
3716 
3717  // Walk the register/memloc assignments, inserting copies/loads.
3718  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3719  ++i, ++realArgIdx) {
3720  CCValAssign &VA = ArgLocs[i];
3721  SDValue Arg = OutVals[realArgIdx];
3722  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3723 
3724  // Promote the value if needed.
3725  switch (VA.getLocInfo()) {
3726  default:
3727  llvm_unreachable("Unknown loc info!");
3728  case CCValAssign::Full:
3729  break;
3730  case CCValAssign::SExt:
3731  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3732  break;
3733  case CCValAssign::ZExt:
3734  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3735  break;
3736  case CCValAssign::AExt:
3737  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3738  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3739  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3740  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3741  }
3742  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3743  break;
3744  case CCValAssign::BCvt:
3745  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3746  break;
3747  case CCValAssign::FPExt:
3748  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3749  break;
3750  }
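// Example (illustrative only): for an i1 argument whose location info is
// AExt, the switch above first truncates back to i1, zero-extends to i8 (the
// AAPCS requirement quoted in the comment), and only then any-extends to the
// final location type, so the low 8 bits the callee observes are well
// defined.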
3751 
3752  if (VA.isRegLoc()) {
3753  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3754  Outs[0].VT == MVT::i64) {
3755  assert(VA.getLocVT() == MVT::i64 &&
3756  "unexpected calling convention register assignment");
3757  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3758  "unexpected use of 'returned'");
3759  IsThisReturn = true;
3760  }
3761  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3762  } else {
3763  assert(VA.isMemLoc());
3764 
3765  SDValue DstAddr;
3766  MachinePointerInfo DstInfo;
3767 
3768  // FIXME: This works on big-endian for composite byvals, which are the
3769  // common case. It should also work for fundamental types too.
3770  uint32_t BEAlign = 0;
3771  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3772  : VA.getValVT().getSizeInBits();
3773  OpSize = (OpSize + 7) / 8;
3774  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3775  !Flags.isInConsecutiveRegs()) {
3776  if (OpSize < 8)
3777  BEAlign = 8 - OpSize;
3778  }
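// Example (illustrative only): a 2-byte scalar stack argument on a big-endian
// target that is neither byval nor part of a consecutive-register block gives
// OpSize == 2 and BEAlign == 8 - 2 == 6, so the store below lands in the
// most-significant end of the 8-byte slot, where a big-endian callee expects
// to find it.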
3779  unsigned LocMemOffset = VA.getLocMemOffset();
3780  int32_t Offset = LocMemOffset + BEAlign;
3781  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3782  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3783 
3784  if (IsTailCall) {
3785  Offset = Offset + FPDiff;
3786  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3787 
3788  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3789  DstInfo =
3790  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3791 
3792  // Make sure any stack arguments overlapping with where we're storing
3793  // are loaded before this eventual operation. Otherwise they'll be
3794  // clobbered.
3795  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3796  } else {
3797  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3798 
3799  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3800  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3801  LocMemOffset);
3802  }
3803 
3804  if (Outs[i].Flags.isByVal()) {
3805  SDValue SizeNode =
3806  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3807  SDValue Cpy = DAG.getMemcpy(
3808  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3809  /*isVol = */ false, /*AlwaysInline = */ false,
3810  /*isTailCall = */ false,
3811  DstInfo, MachinePointerInfo());
3812 
3813  MemOpChains.push_back(Cpy);
3814  } else {
3815  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3816  // promoted to a legal register type i32, we should truncate Arg back to
3817  // i1/i8/i16.
3818  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3819  VA.getValVT() == MVT::i16)
3820  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3821 
3822  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3823  MemOpChains.push_back(Store);
3824  }
3825  }
3826  }
3827 
3828  if (!MemOpChains.empty())
3829  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3830 
3831  // Build a sequence of copy-to-reg nodes chained together with token chain
3832  // and flag operands which copy the outgoing args into the appropriate regs.
3833  SDValue InFlag;
3834  for (auto &RegToPass : RegsToPass) {
3835  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3836  RegToPass.second, InFlag);
3837  InFlag = Chain.getValue(1);
3838  }
3839 
3840  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3841  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3842  // node so that legalize doesn't hack it.
3843  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3844  auto GV = G->getGlobal();
3845  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3846  AArch64II::MO_GOT) {
3847  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3848  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3849  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3850  assert(Subtarget->isTargetWindows() &&
3851  "Windows is the only supported COFF target");
3852  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3853  } else {
3854  const GlobalValue *GV = G->getGlobal();
3855  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3856  }
3857  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3858  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3859  Subtarget->isTargetMachO()) {
3860  const char *Sym = S->getSymbol();
3861  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3862  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3863  } else {
3864  const char *Sym = S->getSymbol();
3865  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3866  }
3867  }
3868 
3869  // We don't usually want to end the call-sequence here because we would tidy
3870  // the frame up *after* the call, however in the ABI-changing tail-call case
3871  // we've carefully laid out the parameters so that when sp is reset they'll be
3872  // in the correct location.
3873  if (IsTailCall && !IsSibCall) {
3874  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3875  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3876  InFlag = Chain.getValue(1);
3877  }
3878 
3879  std::vector<SDValue> Ops;
3880  Ops.push_back(Chain);
3881  Ops.push_back(Callee);
3882 
3883  if (IsTailCall) {
3884  // Each tail call may have to adjust the stack by a different amount, so
3885  // this information must travel along with the operation for eventual
3886  // consumption by emitEpilogue.
3887  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3888  }
3889 
3890  // Add argument registers to the end of the list so that they are known live
3891  // into the call.
3892  for (auto &RegToPass : RegsToPass)
3893  Ops.push_back(DAG.getRegister(RegToPass.first,
3894  RegToPass.second.getValueType()));
3895 
3896  // Add a register mask operand representing the call-preserved registers.
3897  const uint32_t *Mask;
3898  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3899  if (IsThisReturn) {
3900  // For 'this' returns, use the X0-preserving mask if applicable
3901  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3902  if (!Mask) {
3903  IsThisReturn = false;
3904  Mask = TRI->getCallPreservedMask(MF, CallConv);
3905  }
3906  } else
3907  Mask = TRI->getCallPreservedMask(MF, CallConv);
3908 
3909  if (Subtarget->hasCustomCallingConv())
3910  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
3911 
3912  if (TRI->isAnyArgRegReserved(MF))
3913  TRI->emitReservedArgRegCallError(MF);
3914 
3915  assert(Mask && "Missing call preserved mask for calling convention");
3916  Ops.push_back(DAG.getRegisterMask(Mask));
3917 
3918  if (InFlag.getNode())
3919  Ops.push_back(InFlag);
3920 
3921  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3922 
3923  // If we're doing a tail call, use a TC_RETURN here rather than an
3924  // actual call instruction.
3925  if (IsTailCall) {
3926  MF.getFrameInfo().setHasTailCall();
3927  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3928  }
3929 
3930  // Returns a chain and a flag for retval copy to use.
3931  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3932  InFlag = Chain.getValue(1);
3933 
3934  uint64_t CalleePopBytes =
3935  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3936 
3937  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3938  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3939  InFlag, DL);
3940  if (!Ins.empty())
3941  InFlag = Chain.getValue(1);
3942 
3943  // Handle result values, copying them out of physregs into vregs that we
3944  // return.
3945  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3946  InVals, IsThisReturn,
3947  IsThisReturn ? OutVals[0] : SDValue());
3948 }
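// Illustration (hypothetical IR, not from this file): under the default C
// calling convention with GuaranteedTailCallOpt off, a call such as
//
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
//
// that passes isEligibleForTailCallOptimization takes the IsSibCall path:
// NumBytes is forced to 0, no CALLSEQ nodes are emitted, and the call is
// lowered to an AArch64ISD::TC_RETURN node instead of AArch64ISD::CALL.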
3949 
3950 bool AArch64TargetLowering::CanLowerReturn(
3951  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3952  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3953  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3954  ? RetCC_AArch64_WebKit_JS
3955  : RetCC_AArch64_AAPCS;
3956  SmallVector<CCValAssign, 16> RVLocs;
3957  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3958  return CCInfo.CheckReturn(Outs, RetCC);
3959 }
3960 
3961 SDValue
3962 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3963  bool isVarArg,
3964  const SmallVectorImpl<ISD::OutputArg> &Outs,
3965  const SmallVectorImpl<SDValue> &OutVals,
3966  const SDLoc &DL, SelectionDAG &DAG) const {
3967  auto &MF = DAG.getMachineFunction();
3968  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3969 
3970  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3971  ? RetCC_AArch64_WebKit_JS
3972  : RetCC_AArch64_AAPCS;
3973  SmallVector<CCValAssign, 16> RVLocs;
3974  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3975  *DAG.getContext());
3976  CCInfo.AnalyzeReturn(Outs, RetCC);
3977 
3978  // Copy the result values into the output registers.
3979  SDValue Flag;
3980  SmallVector<SDValue, 4> RetOps(1, Chain);
3981  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3982  ++i, ++realRVLocIdx) {
3983  CCValAssign &VA = RVLocs[i];
3984  assert(VA.isRegLoc() && "Can only return in registers!");
3985  SDValue Arg = OutVals[realRVLocIdx];
3986 
3987  switch (VA.getLocInfo()) {
3988  default:
3989  llvm_unreachable("Unknown loc info!");
3990  case CCValAssign::Full:
3991  if (Outs[i].ArgVT == MVT::i1) {
3992  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3993  // value. This is strictly redundant on Darwin (which uses "zeroext
3994  // i1"), but will be optimised out before ISel.
3995  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3996  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3997  }
3998  break;
3999  case CCValAssign::BCvt:
4000  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4001  break;
4002  }
4003 
4004  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
4005  Flag = Chain.getValue(1);
4006  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
4007  }
4008 
4009  // Windows AArch64 ABIs require that for returning structs by value we copy
4010  // the sret argument into X0 for the return.
4011  // We saved the argument into a virtual register in the entry block,
4012  // so now we copy the value out and into X0.
4013  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
4014  SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
4015  getPointerTy(MF.getDataLayout()));
4016 
4017  unsigned RetValReg = AArch64::X0;
4018  Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
4019  Flag = Chain.getValue(1);
4020 
4021  RetOps.push_back(
4022  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
4023  }
4024 
4025  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
4026  const MCPhysReg *I =
4027  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
4028  if (I) {
4029  for (; *I; ++I) {
4030  if (AArch64::GPR64RegClass.contains(*I))
4031  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
4032  else if (AArch64::FPR64RegClass.contains(*I))
4033  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
4034  else
4035  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
4036  }
4037  }
4038 
4039  RetOps[0] = Chain; // Update chain.
4040 
4041  // Add the flag if we have it.
4042  if (Flag.getNode())
4043  RetOps.push_back(Flag);
4044 
4045  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
4046 }
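// Illustration (hypothetical IR, not from this file): on a Windows target, a
// function taking an sret argument, e.g.
//
//   define void @f(%struct.S* sret %out) { ... }
//
// hits the SRetReturnReg block above: the pointer saved into a virtual
// register in the entry block is copied back out and added to the return as
// X0, as the comment on that block describes for the Windows AArch64 ABI.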
4047 
4048 //===----------------------------------------------------------------------===//
4049 // Other Lowering Code
4050 //===----------------------------------------------------------------------===//
4051 
4052 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
4053  SelectionDAG &DAG,
4054  unsigned Flag) const {
4055  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
4056  N->getOffset(), Flag);
4057 }
4058 
4059 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
4060  SelectionDAG &DAG,
4061  unsigned Flag) const {
4062  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
4063 }
4064 
4065 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
4066  SelectionDAG &DAG,
4067  unsigned Flag) const {
4068  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
4069  N->getOffset(), Flag);
4070 }
4071 
4072 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
4073  SelectionDAG &DAG,
4074  unsigned Flag) const {
4075  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
4076 }
4077 
4078 // (loadGOT sym)
4079 template <class NodeTy>
4080 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
4081  unsigned Flags) const {
4082  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
4083  SDLoc DL(N);
4084  EVT Ty = getPointerTy(DAG.getDataLayout());
4085  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
4086  // FIXME: Once remat is capable of dealing with instructions with register
4087  // operands, expand this into two nodes instead of using a wrapper node.
4088  return DAG.getNode(