1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
16 #include "AArch64PerfectShuffle.h"
17 #include "AArch64RegisterInfo.h"
18 #include "AArch64Subtarget.h"
20 #include "Utils/AArch64BaseInfo.h"
21 #include "llvm/ADT/APFloat.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/ArrayRef.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/ADT/Triple.h"
30 #include "llvm/ADT/Twine.h"
46 #include "llvm/IR/Attributes.h"
47 #include "llvm/IR/Constants.h"
48 #include "llvm/IR/DataLayout.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/DerivedTypes.h"
51 #include "llvm/IR/Function.h"
53 #include "llvm/IR/GlobalValue.h"
54 #include "llvm/IR/IRBuilder.h"
55 #include "llvm/IR/Instruction.h"
56 #include "llvm/IR/Instructions.h"
57 #include "llvm/IR/IntrinsicInst.h"
58 #include "llvm/IR/Intrinsics.h"
59 #include "llvm/IR/Module.h"
60 #include "llvm/IR/OperandTraits.h"
61 #include "llvm/IR/PatternMatch.h"
62 #include "llvm/IR/Type.h"
63 #include "llvm/IR/Use.h"
64 #include "llvm/IR/Value.h"
65 #include "llvm/MC/MCRegisterInfo.h"
66 #include "llvm/Support/Casting.h"
67 #include "llvm/Support/CodeGen.h"
69 #include "llvm/Support/Compiler.h"
70 #include "llvm/Support/Debug.h"
72 #include "llvm/Support/KnownBits.h"
78 #include <algorithm>
79 #include <bitset>
80 #include <cassert>
81 #include <cctype>
82 #include <cstdint>
83 #include <cstdlib>
84 #include <iterator>
85 #include <limits>
86 #include <tuple>
87 #include <utility>
88 #include <vector>
89 
90 using namespace llvm;
91 using namespace llvm::PatternMatch;
92 
93 #define DEBUG_TYPE "aarch64-lower"
94 
95 STATISTIC(NumTailCalls, "Number of tail calls");
96 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
97 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
98 
99 static cl::opt<bool>
100 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
101  cl::desc("Allow AArch64 SLI/SRI formation"),
102  cl::init(false));
103 
104 // FIXME: The necessary dtprel relocations don't seem to be supported
105 // well in the GNU bfd and gold linkers at the moment. Therefore, by
106 // default, for now, fall back to GeneralDynamic code generation.
108  "aarch64-elf-ldtls-generation", cl::Hidden,
109  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
110  cl::init(false));
111 
112 static cl::opt<bool>
113 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
114  cl::desc("Enable AArch64 logical imm instruction "
115  "optimization"),
116  cl::init(true));
117 
118 /// Value type used for condition codes.
119 static const MVT MVT_CC = MVT::i32;
120 
122  const AArch64Subtarget &STI)
123  : TargetLowering(TM), Subtarget(&STI) {
124  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
125  // we have to make something up. Arbitrarily, choose ZeroOrOne.
127  // When comparing vectors the result sets the different elements in the
128  // vector to all-one or all-zero.
130 
131  // Set up the register classes.
132  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
133  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
134 
135  if (Subtarget->hasFPARMv8()) {
136  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
137  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
138  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
139  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
140  }
141 
142  if (Subtarget->hasNEON()) {
143  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
144  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
145  // Someone set us up the NEON.
146  addDRTypeForNEON(MVT::v2f32);
147  addDRTypeForNEON(MVT::v8i8);
148  addDRTypeForNEON(MVT::v4i16);
149  addDRTypeForNEON(MVT::v2i32);
150  addDRTypeForNEON(MVT::v1i64);
151  addDRTypeForNEON(MVT::v1f64);
152  addDRTypeForNEON(MVT::v4f16);
153 
154  addQRTypeForNEON(MVT::v4f32);
155  addQRTypeForNEON(MVT::v2f64);
156  addQRTypeForNEON(MVT::v16i8);
157  addQRTypeForNEON(MVT::v8i16);
158  addQRTypeForNEON(MVT::v4i32);
159  addQRTypeForNEON(MVT::v2i64);
160  addQRTypeForNEON(MVT::v8f16);
161  }
162 
163  // Compute derived properties from the register classes
165 
166  // Provide all sorts of operation actions
194 
198 
202 
204 
205  // Custom lowering hooks are needed for XOR
206  // to fold it into CSINC/CSINV.
209 
210  // Virtually no operation on f128 is legal, but LLVM can't expand them when
211  // there's a valid register class, so we need custom operations in most cases.
233 
234  // Lowering for many of the conversions is actually specified by the non-f128
235  // type. The LowerXXX function will be trivial when f128 isn't involved.
250 
251  // Variable arguments.
256 
257  // Variable-sized objects.
260 
261  if (Subtarget->isTargetWindows())
263  else
265 
266  // Constant pool entries
268 
269  // BlockAddress
271 
272  // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
281 
282  // AArch64 lacks both left-rotate and popcount instructions.
285  for (MVT VT : MVT::vector_valuetypes()) {
288  }
289 
290  // AArch64 doesn't have {U|S}MUL_LOHI.
293 
296 
299  for (MVT VT : MVT::vector_valuetypes()) {
302  }
309 
310  // Custom lower Add/Sub/Mul with overflow.
323 
332  if (Subtarget->hasFullFP16())
334  else
336 
370 
371  if (!Subtarget->hasFullFP16()) {
394 
395  // promote v4f16 to v4f32 when that is known to be safe.
408 
424 
445  }
446 
447  // AArch64 has implementations of a lot of rounding-like FP operations.
448  for (MVT Ty : {MVT::f32, MVT::f64}) {
459  }
460 
461  if (Subtarget->hasFullFP16()) {
472  }
473 
475 
477 
483 
484  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
485  // This requires the Performance Monitors extension.
486  if (Subtarget->hasPerfMon())
488 
489  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
490  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
491  // Issue __sincos_stret if available.
494  } else {
497  }
498 
499  // Make floating-point constants legal for the large code model, so they don't
500  // become loads from the constant pool.
501  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
504  }
505 
506  // AArch64 does not have floating-point extending loads, i1 sign-extending
507  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
508  for (MVT VT : MVT::fp_valuetypes()) {
513  }
514  for (MVT VT : MVT::integer_valuetypes())
516 
524 
527 
528  // Indexed loads and stores are supported.
529  for (unsigned im = (unsigned)ISD::PRE_INC;
545  }
546 
547  // Trap.
549 
550  // We combine OR nodes for bitfield operations.
552 
553  // Vector add and sub nodes may conceal a high-half opportunity.
554  // Also, try to fold ADD into CSINC/CSINV.
561 
565 
567 
574  if (Subtarget->supportsAddressTopByteIgnored())
576 
578 
581 
585 
587 
588  // In case of strict alignment, avoid an excessive number of byte wide stores.
592 
597 
599 
601 
603 
604  EnableExtLdPromotion = true;
605 
606  // Set required alignment.
608  // Set preferred alignments.
611 
612  // Only change the limit for entries in a jump table if specified by
613  // the subtarget, but not at the command line.
614  unsigned MaxJT = STI.getMaximumJumpTableSize();
615  if (MaxJT && getMaximumJumpTableSize() == 0)
617 
618  setHasExtractBitsInsn(true);
619 
621 
622  if (Subtarget->hasNEON()) {
623  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
624  // silliness like this:
650 
656 
658 
659  // AArch64 doesn't have direct vector ->f32 conversion instructions for
660  // elements smaller than i32, so promote the input to i32 first.
665  // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
666  // -> v8f16 conversions.
671  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
676  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
677  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
680 
683 
684  // AArch64 doesn't have MUL.2d:
686  // Custom handling for some quad-vector types to detect MULL.
690 
691  // Vector reductions
692  for (MVT VT : MVT::integer_valuetypes()) {
698  }
699  for (MVT VT : MVT::fp_valuetypes()) {
702  }
703 
706  // Likewise, narrowing and extending vector loads/stores aren't handled
707  // directly.
708  for (MVT VT : MVT::vector_valuetypes()) {
710 
711  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
714  } else {
717  }
720 
723 
724  for (MVT InnerVT : MVT::vector_valuetypes()) {
725  setTruncStoreAction(VT, InnerVT, Expand);
726  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
727  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
728  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
729  }
730  }
731 
732  // AArch64 has implementations of a lot of rounding-like FP operations.
733  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
740  }
741 
743  }
744 
746 }
747 
748 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
749  assert(VT.isVector() && "VT should be a vector type");
750 
751  if (VT.isFloatingPoint()) {
753  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
754  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
755  }
756 
757  // Mark vector float intrinsics as expand.
758  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
767 
768  // But we do support custom-lowering for FCOPYSIGN.
770  }
771 
784 
788  for (MVT InnerVT : MVT::all_valuetypes())
789  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
790 
791  // CNT supports only B element sizes; wider element types use UADDLP to widen.
792  if (VT != MVT::v8i8 && VT != MVT::v16i8)
794 
800 
803 
804  if (!VT.isFloatingPoint())
806 
807  // [SU][MIN|MAX] are available for all NEON types apart from i64.
808  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
809  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
810  setOperationAction(Opcode, VT, Legal);
811 
812  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
813  if (VT.isFloatingPoint() &&
814  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
815  for (unsigned Opcode :
817  setOperationAction(Opcode, VT, Legal);
818 
819  if (Subtarget->isLittleEndian()) {
820  for (unsigned im = (unsigned)ISD::PRE_INC;
824  }
825  }
826 }
827 
828 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
829  addRegisterClass(VT, &AArch64::FPR64RegClass);
830  addTypeForNEON(VT, MVT::v2i32);
831 }
832 
833 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
834  addRegisterClass(VT, &AArch64::FPR128RegClass);
835  addTypeForNEON(VT, MVT::v4i32);
836 }
837 
839  EVT VT) const {
840  if (!VT.isVector())
841  return MVT::i32;
843 }
844 
845 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
846  const APInt &Demanded,
848  unsigned NewOpc) {
849  uint64_t OldImm = Imm, NewImm, Enc;
850  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
851 
852  // Return if the immediate is already all zeros, all ones, a bimm32 or a
853  // bimm64.
854  if (Imm == 0 || Imm == Mask ||
855  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
856  return false;
857 
858  unsigned EltSize = Size;
859  uint64_t DemandedBits = Demanded.getZExtValue();
860 
861  // Clear bits that are not demanded.
862  Imm &= DemandedBits;
863 
864  while (true) {
865  // The goal here is to set the non-demanded bits in a way that minimizes
866  // the number of transitions between 0 and 1. In order to achieve this goal,
867  // we set the non-demanded bits to the value of the preceding demanded bits.
868  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
869  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
870  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
871  // The final result is 0b11000011.
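  // For illustration (treating the element as 8 bits, as in the example above):
  //   DemandedBits = 0b01100101, Imm with the 'x' bits cleared = 0b01000001
  //   InvertedImm  = 0b00100100  (demanded bits that are 0)
  //   RotatedImm   = 0b00001000  (non-demanded bits just above a demanded 0)
  //   Sum          = 0b10100010, Ones = 0b10000010
  //   NewImm       = (Imm | Ones) & Mask = 0b11000011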
872  uint64_t NonDemandedBits = ~DemandedBits;
873  uint64_t InvertedImm = ~Imm & DemandedBits;
874  uint64_t RotatedImm =
875  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
876  NonDemandedBits;
877  uint64_t Sum = RotatedImm + NonDemandedBits;
878  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
879  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
880  NewImm = (Imm | Ones) & Mask;
881 
882  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
883  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
884  // we halve the element size and continue the search.
885  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
886  break;
887 
888  // We cannot shrink the element size any further if it is 2 bits.
889  if (EltSize == 2)
890  return false;
891 
892  EltSize /= 2;
893  Mask >>= EltSize;
894  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
895 
896  // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
897  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
898  return false;
899 
900  // Merge the upper and lower halves of Imm and DemandedBits.
901  Imm |= Hi;
902  DemandedBits |= DemandedBitsHi;
903  }
904 
905  ++NumOptimizedImms;
906 
907  // Replicate the element across the register width.
908  while (EltSize < Size) {
909  NewImm |= NewImm << EltSize;
910  EltSize *= 2;
911  }
912 
913  (void)OldImm;
914  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
915  "demanded bits should never be altered");
916  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
917 
918  // Create the new constant immediate node.
919  EVT VT = Op.getValueType();
920  SDLoc DL(Op);
921  SDValue New;
922 
923  // If the new constant immediate is all-zeros or all-ones, let the target
924  // independent DAG combine optimize this node.
925  if (NewImm == 0 || NewImm == OrigMask) {
926  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
927  TLO.DAG.getConstant(NewImm, DL, VT));
928  // Otherwise, create a machine node so that target independent DAG combine
929  // doesn't undo this optimization.
930  } else {
931  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
932  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
933  New = SDValue(
934  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
935  }
936 
937  return TLO.CombineTo(Op, New);
938 }
939 
941  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
942  // Delay this optimization until as late as possible.
943  if (!TLO.LegalOps)
944  return false;
945 
947  return false;
948 
949  EVT VT = Op.getValueType();
950  if (VT.isVector())
951  return false;
952 
953  unsigned Size = VT.getSizeInBits();
954  assert((Size == 32 || Size == 64) &&
955  "i32 or i64 is expected after legalization.");
956 
957  // Exit early if we demand all bits.
958  if (Demanded.countPopulation() == Size)
959  return false;
960 
961  unsigned NewOpc;
962  switch (Op.getOpcode()) {
963  default:
964  return false;
965  case ISD::AND:
966  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
967  break;
968  case ISD::OR:
969  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
970  break;
971  case ISD::XOR:
972  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
973  break;
974  }
976  if (!C)
977  return false;
978  uint64_t Imm = C->getZExtValue();
979  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
980 }
981 
982 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
983 /// Mask are known to be either zero or one and return them in Known.
985  const SDValue Op, KnownBits &Known,
986  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
987  switch (Op.getOpcode()) {
988  default:
989  break;
990  case AArch64ISD::CSEL: {
991  KnownBits Known2;
992  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
993  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
994  Known.Zero &= Known2.Zero;
995  Known.One &= Known2.One;
996  break;
997  }
998  case ISD::INTRINSIC_W_CHAIN: {
999  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1000  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1001  switch (IntID) {
1002  default: return;
1003  case Intrinsic::aarch64_ldaxr:
1004  case Intrinsic::aarch64_ldxr: {
1005  unsigned BitWidth = Known.getBitWidth();
1006  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1007  unsigned MemBits = VT.getScalarSizeInBits();
1008  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1009  return;
1010  }
1011  }
1012  break;
1013  }
1015  case ISD::INTRINSIC_VOID: {
1016  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1017  switch (IntNo) {
1018  default:
1019  break;
1020  case Intrinsic::aarch64_neon_umaxv:
1021  case Intrinsic::aarch64_neon_uminv: {
1022  // Figure out the datatype of the vector operand. The UMINV instruction
1023  // will zero extend the result, so we can mark as known zero all the
1024  // bits larger than the element datatype. 32-bit or larger doesn't need
1025  // this as those are legal types and will be handled by isel directly.
1026  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1027  unsigned BitWidth = Known.getBitWidth();
1028  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1029  assert(BitWidth >= 8 && "Unexpected width!");
1030  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1031  Known.Zero |= Mask;
1032  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1033  assert(BitWidth >= 16 && "Unexpected width!");
1034  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1035  Known.Zero |= Mask;
1036  }
1037  break;
1038  } break;
1039  }
1040  }
1041  }
1042 }
1043 
1045  EVT) const {
1046  return MVT::i64;
1047 }
1048 
1050  unsigned AddrSpace,
1051  unsigned Align,
1052  bool *Fast) const {
1053  if (Subtarget->requiresStrictAlign())
1054  return false;
1055 
1056  if (Fast) {
1057  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1058  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1059  // See comments in performSTORECombine() for more details about
1060  // these conditions.
1061 
1062  // Code that uses clang vector extensions can mark that it
1063  // wants unaligned accesses to be treated as fast by
1064  // underspecifying alignment to be 1 or 2.
1065  Align <= 2 ||
1066 
1067  // Disregard v2i64. Memcpy lowering produces those and splitting
1068  // them regresses performance on micro-benchmarks and olden/bh.
1069  VT == MVT::v2i64;
1070  }
1071  return true;
1072 }
1073 
1074 FastISel *
1076  const TargetLibraryInfo *libInfo) const {
1077  return AArch64::createFastISel(funcInfo, libInfo);
1078 }
1079 
1080 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1081  switch ((AArch64ISD::NodeType)Opcode) {
1082  case AArch64ISD::FIRST_NUMBER: break;
1083  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1084  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1085  case AArch64ISD::ADR: return "AArch64ISD::ADR";
1086  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1087  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1088  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1089  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1090  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1091  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1092  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1093  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1094  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1095  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1096  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1097  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1098  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1099  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1100  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1101  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1102  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1103  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1104  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1105  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1106  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1107  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1108  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1109  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1110  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1111  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1112  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1113  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1114  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1115  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1116  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1117  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1118  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1119  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1120  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1121  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1122  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1123  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1124  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1125  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1126  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1127  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1128  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1129  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1130  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1131  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1132  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1133  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1134  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1135  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1136  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1137  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1138  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1139  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1140  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1141  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1142  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1143  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1144  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1145  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1146  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1147  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1148  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1149  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1150  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1151  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1152  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1153  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1154  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1155  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1156  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1157  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1158  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1159  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1160  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1161  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1162  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1163  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1164  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1165  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1166  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1167  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1168  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1169  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1170  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1171  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1172  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1173  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1174  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1175  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1176  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1177  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1178  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1179  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1180  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1181  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1182  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1183  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1184  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1185  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1186  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1187  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1188  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1189  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1190  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1191  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1192  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1193  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1194  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1195  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1196  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1197  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1198  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1199  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1200  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1201  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1202  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1203  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1204  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1205  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1206  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1207  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1208  }
1209  return nullptr;
1210 }
1211 
1214  MachineBasicBlock *MBB) const {
1215  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1216  // phi node:
1217 
1218  // OrigBB:
1219  // [... previous instrs leading to comparison ...]
1220  // b.ne TrueBB
1221  // b EndBB
1222  // TrueBB:
1223  // ; Fallthrough
1224  // EndBB:
1225  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1226 
1227  MachineFunction *MF = MBB->getParent();
1228  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1229  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1230  DebugLoc DL = MI.getDebugLoc();
1231  MachineFunction::iterator It = ++MBB->getIterator();
1232 
1233  unsigned DestReg = MI.getOperand(0).getReg();
1234  unsigned IfTrueReg = MI.getOperand(1).getReg();
1235  unsigned IfFalseReg = MI.getOperand(2).getReg();
1236  unsigned CondCode = MI.getOperand(3).getImm();
1237  bool NZCVKilled = MI.getOperand(4).isKill();
1238 
1239  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1240  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1241  MF->insert(It, TrueBB);
1242  MF->insert(It, EndBB);
1243 
1244  // Transfer the rest of the current basic block to EndBB.
1245  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1246  MBB->end());
1247  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1248 
1249  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1250  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1251  MBB->addSuccessor(TrueBB);
1252  MBB->addSuccessor(EndBB);
1253 
1254  // TrueBB falls through to the end.
1255  TrueBB->addSuccessor(EndBB);
1256 
1257  if (!NZCVKilled) {
1258  TrueBB->addLiveIn(AArch64::NZCV);
1259  EndBB->addLiveIn(AArch64::NZCV);
1260  }
1261 
1262  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1263  .addReg(IfTrueReg)
1264  .addMBB(TrueBB)
1265  .addReg(IfFalseReg)
1266  .addMBB(MBB);
1267 
1268  MI.eraseFromParent();
1269  return EndBB;
1270 }
1271 
1273  MachineInstr &MI, MachineBasicBlock *BB) const {
1275  BB->getParent()->getFunction().getPersonalityFn())) &&
1276  "SEH does not use catchret!");
1277  return BB;
1278 }
1279 
1281  MachineInstr &MI, MachineBasicBlock *BB) const {
1282  MI.eraseFromParent();
1283  return BB;
1284 }
1285 
1287  MachineInstr &MI, MachineBasicBlock *BB) const {
1288  switch (MI.getOpcode()) {
1289  default:
1290 #ifndef NDEBUG
1291  MI.dump();
1292 #endif
1293  llvm_unreachable("Unexpected instruction for custom inserter!");
1294 
1295  case AArch64::F128CSEL:
1296  return EmitF128CSEL(MI, BB);
1297 
1298  case TargetOpcode::STACKMAP:
1299  case TargetOpcode::PATCHPOINT:
1300  return emitPatchPoint(MI, BB);
1301 
1302  case AArch64::CATCHRET:
1303  return EmitLoweredCatchRet(MI, BB);
1304  case AArch64::CATCHPAD:
1305  return EmitLoweredCatchPad(MI, BB);
1306  }
1307 }
1308 
1309 //===----------------------------------------------------------------------===//
1310 // AArch64 Lowering private implementation.
1311 //===----------------------------------------------------------------------===//
1312 
1313 //===----------------------------------------------------------------------===//
1314 // Lowering Code
1315 //===----------------------------------------------------------------------===//
1316 
1317 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1318 /// CC
1320  switch (CC) {
1321  default:
1322  llvm_unreachable("Unknown condition code!");
1323  case ISD::SETNE:
1324  return AArch64CC::NE;
1325  case ISD::SETEQ:
1326  return AArch64CC::EQ;
1327  case ISD::SETGT:
1328  return AArch64CC::GT;
1329  case ISD::SETGE:
1330  return AArch64CC::GE;
1331  case ISD::SETLT:
1332  return AArch64CC::LT;
1333  case ISD::SETLE:
1334  return AArch64CC::LE;
1335  case ISD::SETUGT:
1336  return AArch64CC::HI;
1337  case ISD::SETUGE:
1338  return AArch64CC::HS;
1339  case ISD::SETULT:
1340  return AArch64CC::LO;
1341  case ISD::SETULE:
1342  return AArch64CC::LS;
1343  }
1344 }
1345 
1346 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1349  AArch64CC::CondCode &CondCode2) {
1350  CondCode2 = AArch64CC::AL;
1351  switch (CC) {
1352  default:
1353  llvm_unreachable("Unknown FP condition!");
1354  case ISD::SETEQ:
1355  case ISD::SETOEQ:
1356  CondCode = AArch64CC::EQ;
1357  break;
1358  case ISD::SETGT:
1359  case ISD::SETOGT:
1360  CondCode = AArch64CC::GT;
1361  break;
1362  case ISD::SETGE:
1363  case ISD::SETOGE:
1364  CondCode = AArch64CC::GE;
1365  break;
1366  case ISD::SETOLT:
1367  CondCode = AArch64CC::MI;
1368  break;
1369  case ISD::SETOLE:
1370  CondCode = AArch64CC::LS;
1371  break;
1372  case ISD::SETONE:
1373  CondCode = AArch64CC::MI;
1374  CondCode2 = AArch64CC::GT;
1375  break;
1376  case ISD::SETO:
1377  CondCode = AArch64CC::VC;
1378  break;
1379  case ISD::SETUO:
1380  CondCode = AArch64CC::VS;
1381  break;
1382  case ISD::SETUEQ:
1383  CondCode = AArch64CC::EQ;
1384  CondCode2 = AArch64CC::VS;
1385  break;
1386  case ISD::SETUGT:
1387  CondCode = AArch64CC::HI;
1388  break;
1389  case ISD::SETUGE:
1390  CondCode = AArch64CC::PL;
1391  break;
1392  case ISD::SETLT:
1393  case ISD::SETULT:
1394  CondCode = AArch64CC::LT;
1395  break;
1396  case ISD::SETLE:
1397  case ISD::SETULE:
1398  CondCode = AArch64CC::LE;
1399  break;
1400  case ISD::SETNE:
1401  case ISD::SETUNE:
1402  CondCode = AArch64CC::NE;
1403  break;
1404  }
1405 }
1406 
1407 /// Convert a DAG fp condition code to an AArch64 CC.
1408 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1409 /// should be AND'ed instead of OR'ed.
1412  AArch64CC::CondCode &CondCode2) {
1413  CondCode2 = AArch64CC::AL;
1414  switch (CC) {
1415  default:
1416  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1417  assert(CondCode2 == AArch64CC::AL);
1418  break;
1419  case ISD::SETONE:
1420  // (a one b)
1421  // == ((a olt b) || (a ogt b))
1422  // == ((a ord b) && (a une b))
1423  CondCode = AArch64CC::VC;
1424  CondCode2 = AArch64CC::NE;
1425  break;
1426  case ISD::SETUEQ:
1427  // (a ueq b)
1428  // == ((a uno b) || (a oeq b))
1429  // == ((a ule b) && (a uge b))
1430  CondCode = AArch64CC::PL;
1431  CondCode2 = AArch64CC::LE;
1432  break;
1433  }
1434 }
1435 
1436 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1437 /// CC usable with the vector instructions. Fewer operations are available
1438 /// without a real NZCV register, so we have to use less efficient combinations
1439 /// to get the same effect.
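/// For example, an ordered compare (SETO) is emitted as two mask compares whose
/// condition codes are MI and GE (i.e. OLT || OGE), and SETUO is obtained by
/// inverting that combined result.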
1442  AArch64CC::CondCode &CondCode2,
1443  bool &Invert) {
1444  Invert = false;
1445  switch (CC) {
1446  default:
1447  // Mostly the scalar mappings work fine.
1448  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1449  break;
1450  case ISD::SETUO:
1451  Invert = true;
1453  case ISD::SETO:
1454  CondCode = AArch64CC::MI;
1455  CondCode2 = AArch64CC::GE;
1456  break;
1457  case ISD::SETUEQ:
1458  case ISD::SETULT:
1459  case ISD::SETULE:
1460  case ISD::SETUGT:
1461  case ISD::SETUGE:
1462  // All of the compare-mask comparisons are ordered, but we can switch
1463  // between the two by a double inversion. E.g. ULE == !OGT.
1464  Invert = true;
1465  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1466  break;
1467  }
1468 }
1469 
1470 static bool isLegalArithImmed(uint64_t C) {
1471  // Matches AArch64DAGToDAGISel::SelectArithImmed().
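 // A legal arithmetic immediate is a 12-bit value, optionally shifted left by
 // 12: e.g. 0xfff and 0xabc000 are legal, while 0x1001 is not.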
1472  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1473  LLVM_DEBUG(dbgs() << "Is imm " << C
1474  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1475  return IsLegal;
1476 }
1477 
1478 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
1479 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
1480 // can be set differently by this operation. It comes down to whether
1481 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1482 // everything is fine. If not then the optimization is wrong. Thus general
1483 // comparisons are only valid if op2 != 0.
1484 //
1485 // So, finally, the only LLVM-native comparisons that don't mention C and V
1486 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1487 // the absence of information about op2.
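1488 // For instance, with 32-bit op1 == 1 and op2 == 0, "cmp w0, #0" computes
1488 // 1 - 0 and sets C = 1 (no borrow), whereas "cmn w0, #0" computes 1 + 0 and
1488 // leaves C = 0, so unsigned conditions such as HS/LO would read different
1488 // flags from the two forms.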
1488 static bool isCMN(SDValue Op, ISD::CondCode CC) {
1489  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
1490  (CC == ISD::SETEQ || CC == ISD::SETNE);
1491 }
1492 
1494  const SDLoc &dl, SelectionDAG &DAG) {
1495  EVT VT = LHS.getValueType();
1496  const bool FullFP16 =
1497  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1498 
1499  if (VT.isFloatingPoint()) {
1500  assert(VT != MVT::f128);
1501  if (VT == MVT::f16 && !FullFP16) {
1502  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1503  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1504  VT = MVT::f32;
1505  }
1506  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1507  }
1508 
1509  // The CMP instruction is just an alias for SUBS, and representing it as
1510  // SUBS means that it's possible to get CSE with subtract operations.
1511  // A later phase can perform the optimization of setting the destination
1512  // register to WZR/XZR if it ends up being unused.
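  // (For example, "cmp w0, w1" is just "subs wzr, w0, w1".)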
1513  unsigned Opcode = AArch64ISD::SUBS;
1514 
1515  if (isCMN(RHS, CC)) {
1516  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
1517  Opcode = AArch64ISD::ADDS;
1518  RHS = RHS.getOperand(1);
1519  } else if (isCMN(LHS, CC)) {
1520  // As we are looking for EQ/NE compares, the operands can be commuted; can
1521  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
1522  Opcode = AArch64ISD::ADDS;
1523  LHS = LHS.getOperand(1);
1524  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1525  !isUnsignedIntSetCC(CC)) {
1526  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1527  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1528  // of the signed comparisons.
1529  Opcode = AArch64ISD::ANDS;
1530  RHS = LHS.getOperand(1);
1531  LHS = LHS.getOperand(0);
1532  }
1533 
1534  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1535  .getValue(1);
1536 }
1537 
1538 /// \defgroup AArch64CCMP CMP;CCMP matching
1539 ///
1540 /// These functions deal with the formation of CMP;CCMP;... sequences.
1541 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1542 /// a comparison. They set the NZCV flags to a predefined value if their
1543 /// predicate is false. This allows expressing arbitrary conjunctions, for
1544 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1545 /// expressed as:
1546 /// cmp A
1547 /// ccmp B, inv(CB), CA
1548 /// check for CB flags
1549 ///
1550 /// This naturally lets us implement chains of AND operations with SETCC
1551 /// operands. And we can even implement some other situations by transforming
1552 /// them:
1553 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
1554 /// negating the flags used in a CCMP/FCCMP operations.
1555 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
1556 /// by negating the flags we test for afterwards. i.e.
1557 /// NEG (CMP CCMP CCCMP ...) can be implemented.
1558 /// - Note that we can only ever negate all previously processed results.
1559 /// What we cannot implement by flipping the flags to test is a negation
1560 /// of two sub-trees (because the negation affects all sub-trees emitted so
1561 /// far, so the 2nd sub-tree we emit would also affect the first).
1562 /// With those tools we can implement some OR operations:
1563 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
1564 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
1565 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
1566 /// elimination rules from earlier to implement the whole thing as a
1567 /// CCMP/FCCMP chain.
1568 ///
1569 /// As complete example:
1570 /// or (or (setCA (cmp A)) (setCB (cmp B)))
1571 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1572 /// can be reassociated to:
1573 /// or (and (setCC (cmp C)) setCD (cmp D))
1574 /// (or (setCA (cmp A)) (setCB (cmp B)))
1575 /// can be transformed to:
1576 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1577 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1578 /// which can be implemented as:
1579 /// cmp C
1580 /// ccmp D, inv(CD), CC
1581 /// ccmp A, CA, inv(CD)
1582 /// ccmp B, CB, inv(CA)
1583 /// check for CB flags
1584 ///
1585 /// A counterexample is "or (and A B) (and C D)" which translates to
1586 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); here we
1587 /// can only implement one of the inner (not) operations, but not both!
1588 /// @{
1589 
1590 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1592  ISD::CondCode CC, SDValue CCOp,
1594  AArch64CC::CondCode OutCC,
1595  const SDLoc &DL, SelectionDAG &DAG) {
1596  unsigned Opcode = 0;
1597  const bool FullFP16 =
1598  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1599 
1600  if (LHS.getValueType().isFloatingPoint()) {
1601  assert(LHS.getValueType() != MVT::f128);
1602  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1603  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1604  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1605  }
1606  Opcode = AArch64ISD::FCCMP;
1607  } else if (RHS.getOpcode() == ISD::SUB) {
1608  SDValue SubOp0 = RHS.getOperand(0);
1609  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1610  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1611  Opcode = AArch64ISD::CCMN;
1612  RHS = RHS.getOperand(1);
1613  }
1614  }
1615  if (Opcode == 0)
1616  Opcode = AArch64ISD::CCMP;
1617 
1618  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1620  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1621  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1622  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1623 }
1624 
1625 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
1626 /// expressed as a conjunction. See \ref AArch64CCMP.
1627 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
1628 /// changing the conditions on the SETCC tests.
1629 /// (this means we can call emitConjunctionRec() with
1630 /// Negate==true on this sub-tree)
1631 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
1632 /// cannot do the negation naturally. We are required to
1633 /// emit the subtree first in this case.
1634 /// \param WillNegate Is true if we are called when the result of this
1635 /// subexpression must be negated. This happens when the
1636 /// outer expression is an OR. We can use this fact to know
1637 /// that we have a double negation (or (or ...) ...) that
1638 /// can be implemented for free.
1639 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
1640  bool &MustBeFirst, bool WillNegate,
1641  unsigned Depth = 0) {
1642  if (!Val.hasOneUse())
1643  return false;
1644  unsigned Opcode = Val->getOpcode();
1645  if (Opcode == ISD::SETCC) {
1646  if (Val->getOperand(0).getValueType() == MVT::f128)
1647  return false;
1648  CanNegate = true;
1649  MustBeFirst = false;
1650  return true;
1651  }
1652  // Protect against exponential runtime and stack overflow.
1653  if (Depth > 6)
1654  return false;
1655  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1656  bool IsOR = Opcode == ISD::OR;
1657  SDValue O0 = Val->getOperand(0);
1658  SDValue O1 = Val->getOperand(1);
1659  bool CanNegateL;
1660  bool MustBeFirstL;
1661  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
1662  return false;
1663  bool CanNegateR;
1664  bool MustBeFirstR;
1665  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
1666  return false;
1667 
1668  if (MustBeFirstL && MustBeFirstR)
1669  return false;
1670 
1671  if (IsOR) {
1672  // For an OR expression we need to be able to naturally negate at least
1673  // one side or we cannot do the transformation at all.
1674  if (!CanNegateL && !CanNegateR)
1675  return false;
1676  // If the result of the OR will be negated and we can naturally negate
1677  // the leaves, then this sub-tree as a whole negates naturally.
1678  CanNegate = WillNegate && CanNegateL && CanNegateR;
1679  // If we cannot naturally negate the whole sub-tree, then this must be
1680  // emitted first.
1681  MustBeFirst = !CanNegate;
1682  } else {
1683  assert(Opcode == ISD::AND && "Must be OR or AND");
1684  // We cannot naturally negate an AND operation.
1685  CanNegate = false;
1686  MustBeFirst = MustBeFirstL || MustBeFirstR;
1687  }
1688  return true;
1689  }
1690  return false;
1691 }
1692 
1693 /// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a chain
1694 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
1695 /// Tries to transform the given i1 producing node @p Val to a series of compare
1696 /// and conditional compare operations. @returns an NZCV flags producing node
1697 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1698 /// the transformation was not possible.
1699 /// \p Negate is true if we want this sub-tree to be negated just by changing
1700 /// SETCC conditions.
1702  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1704  // We're at a tree leaf, produce a conditional comparison operation.
1705  unsigned Opcode = Val->getOpcode();
1706  if (Opcode == ISD::SETCC) {
1707  SDValue LHS = Val->getOperand(0);
1708  SDValue RHS = Val->getOperand(1);
1709  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1710  bool isInteger = LHS.getValueType().isInteger();
1711  if (Negate)
1712  CC = getSetCCInverse(CC, isInteger);
1713  SDLoc DL(Val);
1714  // Determine OutCC and handle FP special case.
1715  if (isInteger) {
1716  OutCC = changeIntCCToAArch64CC(CC);
1717  } else {
1719  AArch64CC::CondCode ExtraCC;
1720  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1721  // Some floating point conditions can't be tested with a single condition
1722  // code. Construct an additional comparison in this case.
1723  if (ExtraCC != AArch64CC::AL) {
1724  SDValue ExtraCmp;
1725  if (!CCOp.getNode())
1726  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1727  else
1728  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1729  ExtraCC, DL, DAG);
1730  CCOp = ExtraCmp;
1731  Predicate = ExtraCC;
1732  }
1733  }
1734 
1735  // Produce a normal comparison if we are first in the chain
1736  if (!CCOp)
1737  return emitComparison(LHS, RHS, CC, DL, DAG);
1738  // Otherwise produce a ccmp.
1739  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1740  DAG);
1741  }
1742  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
1743 
1744  bool IsOR = Opcode == ISD::OR;
1745 
1746  SDValue LHS = Val->getOperand(0);
1747  bool CanNegateL;
1748  bool MustBeFirstL;
1749  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
1750  assert(ValidL && "Valid conjunction/disjunction tree");
1751  (void)ValidL;
1752 
1753  SDValue RHS = Val->getOperand(1);
1754  bool CanNegateR;
1755  bool MustBeFirstR;
1756  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
1757  assert(ValidR && "Valid conjunction/disjunction tree");
1758  (void)ValidR;
1759 
1760  // Swap sub-tree that must come first to the right side.
1761  if (MustBeFirstL) {
1762  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
1763  std::swap(LHS, RHS);
1764  std::swap(CanNegateL, CanNegateR);
1765  std::swap(MustBeFirstL, MustBeFirstR);
1766  }
1767 
1768  bool NegateR;
1769  bool NegateAfterR;
1770  bool NegateL;
1771  bool NegateAfterAll;
1772  if (Opcode == ISD::OR) {
1773  // Swap the sub-tree that we can negate naturally to the left.
1774  if (!CanNegateL) {
1775  assert(CanNegateR && "at least one side must be negatable");
1776  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
1777  assert(!Negate);
1778  std::swap(LHS, RHS);
1779  NegateR = false;
1780  NegateAfterR = true;
1781  } else {
1782  // Negate the right sub-tree if possible, otherwise negate its result afterwards.
1783  NegateR = CanNegateR;
1784  NegateAfterR = !CanNegateR;
1785  }
1786  NegateL = true;
1787  NegateAfterAll = !Negate;
1788  } else {
1789  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
1790  assert(!Negate && "Valid conjunction/disjunction tree");
1791 
1792  NegateL = false;
1793  NegateR = false;
1794  NegateAfterR = false;
1795  NegateAfterAll = false;
1796  }
1797 
1798  // Emit sub-trees.
1799  AArch64CC::CondCode RHSCC;
1800  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
1801  if (NegateAfterR)
1802  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1803  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
1804  if (NegateAfterAll)
1805  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1806  return CmpL;
1807 }
1808 
1809 /// Emit an expression as a conjunction (a series of CCMP/FCCMP ops).
1810 /// In some cases this is even possible with OR operations in the expression.
1811 /// See \ref AArch64CCMP.
1812 /// \see emitConjunctionRec().
1814  AArch64CC::CondCode &OutCC) {
1815  bool DummyCanNegate;
1816  bool DummyMustBeFirst;
1817  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
1818  return SDValue();
1819 
1820  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
1821 }
1822 
1823 /// @}
1824 
1825 /// Returns how profitable it is to fold a comparison's operand's shift and/or
1826 /// extension operations.
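/// For example, comparing against (shl (sign_extend_inreg X, i8), #2) scores 2,
/// since both the extend and a shift of at most 4 can be folded into the
/// compare operand, while a plain in-range shift alone scores 1.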
1828  auto isSupportedExtend = [&](SDValue V) {
1829  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
1830  return true;
1831 
1832  if (V.getOpcode() == ISD::AND)
1833  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
1834  uint64_t Mask = MaskCst->getZExtValue();
1835  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
1836  }
1837 
1838  return false;
1839  };
1840 
1841  if (!Op.hasOneUse())
1842  return 0;
1843 
1844  if (isSupportedExtend(Op))
1845  return 1;
1846 
1847  unsigned Opc = Op.getOpcode();
1848  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
1849  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
1850  uint64_t Shift = ShiftCst->getZExtValue();
1851  if (isSupportedExtend(Op.getOperand(0)))
1852  return (Shift <= 4) ? 2 : 1;
1853  EVT VT = Op.getValueType();
1854  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
1855  return 1;
1856  }
1857 
1858  return 0;
1859 }
1860 
1862  SDValue &AArch64cc, SelectionDAG &DAG,
1863  const SDLoc &dl) {
1864  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1865  EVT VT = RHS.getValueType();
1866  uint64_t C = RHSC->getZExtValue();
1867  if (!isLegalArithImmed(C)) {
1868  // Constant does not fit, try adjusting it by one?
1869  switch (CC) {
1870  default:
1871  break;
1872  case ISD::SETLT:
1873  case ISD::SETGE:
1874  if ((VT == MVT::i32 && C != 0x80000000 &&
1875  isLegalArithImmed((uint32_t)(C - 1))) ||
1876  (VT == MVT::i64 && C != 0x80000000ULL &&
1877  isLegalArithImmed(C - 1ULL))) {
1878  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1879  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1880  RHS = DAG.getConstant(C, dl, VT);
1881  }
1882  break;
1883  case ISD::SETULT:
1884  case ISD::SETUGE:
1885  if ((VT == MVT::i32 && C != 0 &&
1886  isLegalArithImmed((uint32_t)(C - 1))) ||
1887  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1888  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1889  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1890  RHS = DAG.getConstant(C, dl, VT);
1891  }
1892  break;
1893  case ISD::SETLE:
1894  case ISD::SETGT:
1895  if ((VT == MVT::i32 && C != INT32_MAX &&
1896  isLegalArithImmed((uint32_t)(C + 1))) ||
1897  (VT == MVT::i64 && C != INT64_MAX &&
1898  isLegalArithImmed(C + 1ULL))) {
1899  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1900  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1901  RHS = DAG.getConstant(C, dl, VT);
1902  }
1903  break;
1904  case ISD::SETULE:
1905  case ISD::SETUGT:
1906  if ((VT == MVT::i32 && C != UINT32_MAX &&
1907  isLegalArithImmed((uint32_t)(C + 1))) ||
1908  (VT == MVT::i64 && C != UINT64_MAX &&
1909  isLegalArithImmed(C + 1ULL))) {
1910  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1911  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1912  RHS = DAG.getConstant(C, dl, VT);
1913  }
1914  break;
1915  }
1916  }
1917  }
1918 
1919  // Comparisons are canonicalized so that the RHS operand is simpler than the
1920  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
1921  // can fold some shift+extend operations on the RHS operand, so swap the
1922  // operands if that can be done.
1923  //
1924  // For example:
1925  // lsl w13, w11, #1
1926  // cmp w13, w12
1927  // can be turned into:
1928  // cmp w12, w11, lsl #1
1929  if (!isa<ConstantSDNode>(RHS) ||
1930  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
1931  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
1932 
1934  std::swap(LHS, RHS);
1936  }
1937  }
1938 
1939  SDValue Cmp;
1940  AArch64CC::CondCode AArch64CC;
1941  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1942  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1943 
1944  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1945  // For the i8 operand, the largest immediate is 255, so this can be easily
1946  // encoded in the compare instruction. For the i16 operand, however, the
1947  // largest immediate cannot be encoded in the compare.
1948  // Therefore, use a sign extending load and cmn to avoid materializing the
1949  // -1 constant. For example,
1950  // movz w1, #65535
1951  // ldrh w0, [x0, #0]
1952  // cmp w0, w1
1953  // >
1954  // ldrsh w0, [x0, #0]
1955  // cmn w0, #1
1956  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1957  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1958  // ensure both the LHS and RHS are truly zero extended and to make sure the
1959  // transformation is profitable.
1960  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1961  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1962  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1963  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1964  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1965  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1966  SDValue SExt =
1967  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1968  DAG.getValueType(MVT::i16));
1969  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1970  RHS.getValueType()),
1971  CC, dl, DAG);
1972  AArch64CC = changeIntCCToAArch64CC(CC);
1973  }
1974  }
1975 
1976  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
1977  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
1978  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
1979  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
1980  }
1981  }
1982  }
1983 
1984  if (!Cmp) {
1985  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1986  AArch64CC = changeIntCCToAArch64CC(CC);
1987  }
1988  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
1989  return Cmp;
1990 }
1991 
1992 static std::pair<SDValue, SDValue>
1994  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1995  "Unsupported value type");
1996  SDValue Value, Overflow;
1997  SDLoc DL(Op);
1998  SDValue LHS = Op.getOperand(0);
1999  SDValue RHS = Op.getOperand(1);
2000  unsigned Opc = 0;
2001  switch (Op.getOpcode()) {
2002  default:
2003  llvm_unreachable("Unknown overflow instruction!");
2004  case ISD::SADDO:
2005  Opc = AArch64ISD::ADDS;
2006  CC = AArch64CC::VS;
2007  break;
2008  case ISD::UADDO:
2009  Opc = AArch64ISD::ADDS;
2010  CC = AArch64CC::HS;
2011  break;
2012  case ISD::SSUBO:
2013  Opc = AArch64ISD::SUBS;
2014  CC = AArch64CC::VS;
2015  break;
2016  case ISD::USUBO:
2017  Opc = AArch64ISD::SUBS;
2018  CC = AArch64CC::LO;
2019  break;
2020  // Multiply needs a little bit of extra work.
2021  case ISD::SMULO:
2022  case ISD::UMULO: {
2023  CC = AArch64CC::NE;
2024  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2025  if (Op.getValueType() == MVT::i32) {
2026  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2027  // For a 32 bit multiply with overflow check we want the instruction
2028  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2029  // need to generate the following pattern:
2030  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
2031  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2032  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2033  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2034  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2035  DAG.getConstant(0, DL, MVT::i64));
2036  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2037  // operation. We need to clear out the upper 32 bits, because we used a
2038  // widening multiply that wrote all 64 bits. In the end this should be a
2039  // noop.
2040  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2041  if (IsSigned) {
2042  // The signed overflow check requires more than just a simple check for
2043  // any bit set in the upper 32 bits of the result. These bits could be
2044  // just the sign bits of a negative number. To perform the overflow
2045  // check we arithmetic-shift-right the low 32 bits of the result by 31
2046  // (replicating their sign bit) and compare that against the upper 32 bits.
2047  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2048  DAG.getConstant(32, DL, MVT::i64));
2049  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2050  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2051  DAG.getConstant(31, DL, MVT::i64));
2052  // It is important that LowerBits is last, otherwise the arithmetic
2053  // shift will not be folded into the compare (SUBS).
2054  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2055  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2056  .getValue(1);
2057  } else {
2058  // The overflow check for unsigned multiply is easy. We only need to
2059  // check if any of the upper 32 bits are set. This can be done with a
2060  // CMP (shifted register). For that we need to generate the following
2061  // pattern:
2062  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
2063  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2064  DAG.getConstant(32, DL, MVT::i64));
2065  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2066  Overflow =
2067  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2068  DAG.getConstant(0, DL, MVT::i64),
2069  UpperBits).getValue(1);
2070  }
2071  break;
2072  }
2073  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2074  // For the 64-bit multiply, compute the full product and check its high half.
2075  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2076  if (IsSigned) {
2077  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2078  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2079  DAG.getConstant(63, DL, MVT::i64));
2080  // It is important that LowerBits is last, otherwise the arithmetic
2081  // shift will not be folded into the compare (SUBS).
2082  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2083  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2084  .getValue(1);
2085  } else {
2086  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2087  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2088  Overflow =
2089  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2090  DAG.getConstant(0, DL, MVT::i64),
2091  UpperBits).getValue(1);
2092  }
2093  break;
2094  }
2095  } // switch (...)
2096 
2097  if (Opc) {
2098  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2099 
2100  // Emit the AArch64 operation with overflow check.
2101  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2102  Overflow = Value.getValue(1);
2103  }
2104  return std::make_pair(Value, Overflow);
2105 }
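// A hedged illustration of the 32-bit SMULO path above (the GCC/Clang
// overflow builtin is an assumption, not part of this file):
//
//   bool mul_ovf(int a, int b, int *r) {
//     return __builtin_mul_overflow(a, b, r);  // llvm.smul.with.overflow.i32
//   }
//
// Both operands are sign-extended to i64 and multiplied once; adding a zero
// lets the selector form a widening SMADDL. Overflow is then detected by
// comparing the high 32 bits of the 64-bit product against the low 32 bits
// arithmetically shifted right by 31, which is exactly the SUBS emitted above.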
2106 
2107 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
2108  RTLIB::Libcall Call) const {
2109  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2110  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
2111 }
2112 
2113 // Returns true if the given Op is the overflow flag result of an overflow
2114 // intrinsic operation.
2115 static bool isOverflowIntrOpRes(SDValue Op) {
2116  unsigned Opc = Op.getOpcode();
2117  return (Op.getResNo() == 1 &&
2118  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2119  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2120 }
2121 
2122 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
2123  SDValue Sel = Op.getOperand(0);
2124  SDValue Other = Op.getOperand(1);
2125  SDLoc dl(Sel);
2126 
2127  // If the operand is an overflow checking operation, invert the condition
2128  // code and kill the Not operation. I.e., transform:
2129  // (xor (overflow_op_bool, 1))
2130  // -->
2131  // (csel 1, 0, invert(cc), overflow_op_bool)
2132  // ... which later gets transformed to just a cset instruction with an
2133  // inverted condition code, rather than a cset + eor sequence.
2134  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
2135  // Only lower legal XALUO ops.
2136  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2137  return SDValue();
2138 
2139  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2140  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2141  AArch64CC::CondCode CC;
2142  SDValue Value, Overflow;
2143  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2144  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2145  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2146  CCVal, Overflow);
2147  }
2148  // If neither operand is a SELECT_CC, give up.
2149  if (Sel.getOpcode() != ISD::SELECT_CC)
2150  std::swap(Sel, Other);
2151  if (Sel.getOpcode() != ISD::SELECT_CC)
2152  return Op;
2153 
2154  // The folding we want to perform is:
2155  // (xor x, (select_cc a, b, cc, 0, -1) )
2156  // -->
2157  // (csel x, (xor x, -1), cc ...)
2158  //
2159  // The latter will get matched to a CSINV instruction.
2160 
2161  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2162  SDValue LHS = Sel.getOperand(0);
2163  SDValue RHS = Sel.getOperand(1);
2164  SDValue TVal = Sel.getOperand(2);
2165  SDValue FVal = Sel.getOperand(3);
2166 
2167  // FIXME: This could be generalized to non-integer comparisons.
2168  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2169  return Op;
2170 
2171  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2172  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2173 
2174  // The values aren't constants, this isn't the pattern we're looking for.
2175  if (!CFVal || !CTVal)
2176  return Op;
2177 
2178  // We can commute the SELECT_CC by inverting the condition. This
2179  // might be needed to make this fit into a CSINV pattern.
2180  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2181  std::swap(TVal, FVal);
2182  std::swap(CTVal, CFVal);
2183  CC = ISD::getSetCCInverse(CC, true);
2184  }
2185 
2186  // If the constants line up, perform the transform!
2187  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2188  SDValue CCVal;
2189  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2190 
2191  FVal = Other;
2192  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2193  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2194 
2195  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2196  CCVal, Cmp);
2197  }
2198 
2199  return Op;
2200 }
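// A hedged example of the overflow-flag case handled above (the builtin name
// is an assumption, not part of this file):
//
//   bool no_overflow(int a, int b) {
//     int r;
//     return !__builtin_sadd_overflow(a, b, &r);  // xor of the i1 overflow bit
//   }
//
// The XOR-with-1 of the SADDO overflow result becomes a CSEL on the inverted
// condition, so the final code can be ADDS followed by a single CSET with
// condition VC rather than CSET (VS) plus an EOR.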
2201 
2203  EVT VT = Op.getValueType();
2204 
2205  // Let legalize expand this if it isn't a legal type yet.
2206  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2207  return SDValue();
2208 
2209  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2210 
2211  unsigned Opc;
2212  bool ExtraOp = false;
2213  switch (Op.getOpcode()) {
2214  default:
2215  llvm_unreachable("Invalid code");
2216  case ISD::ADDC:
2217  Opc = AArch64ISD::ADDS;
2218  break;
2219  case ISD::SUBC:
2220  Opc = AArch64ISD::SUBS;
2221  break;
2222  case ISD::ADDE:
2223  Opc = AArch64ISD::ADCS;
2224  ExtraOp = true;
2225  break;
2226  case ISD::SUBE:
2227  Opc = AArch64ISD::SBCS;
2228  ExtraOp = true;
2229  break;
2230  }
2231 
2232  if (!ExtraOp)
2233  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2234  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2235  Op.getOperand(2));
2236 }
2237 
2239  // Let legalize expand this if it isn't a legal type yet.
2240  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2241  return SDValue();
2242 
2243  SDLoc dl(Op);
2244  AArch64CC::CondCode CC;
2245  // The actual operation that sets the overflow or carry flag.
2246  SDValue Value, Overflow;
2247  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2248 
2249  // We use 0 and 1 as false and true values.
2250  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2251  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2252 
2253  // We use an inverted condition, because the conditional select is inverted
2254  // too. This will allow it to be selected to a single instruction:
2255  // CSINC Wd, WZR, WZR, invert(cond).
2256  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2257  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2258  CCVal, Overflow);
2259 
2260  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2261  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2262 }
2263 
2264 // Prefetch operands are:
2265 // 1: Address to prefetch
2266 // 2: bool isWrite
2267 // 3: int locality (0 = no locality ... 3 = extreme locality)
2268 // 4: bool isDataCache
2269 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2270  SDLoc DL(Op);
2271  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2272  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2273  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2274 
2275  bool IsStream = !Locality;
2276  // When a nonzero locality hint is given:
2277  if (Locality) {
2278  // The front-end should have filtered out the out-of-range values
2279  assert(Locality <= 3 && "Prefetch locality out-of-range");
2280  // The locality hint and the PRFM cache-level encoding run in opposite
2281  // directions: the encoding starts at 0 for the closest cache (level 1),
2282  // so flip the number around.
2283  Locality = 3 - Locality;
2284  }
2285 
2286  // Build the mask value encoding the expected behavior.
2287  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2288  (!IsData << 3) | // IsDataCache bit
2289  (Locality << 1) | // Cache level bits
2290  (unsigned)IsStream; // Stream bit
2291  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2292  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2293 }
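// A worked example of the PrfOp encoding above (the PLD/PST hint mnemonics
// are the usual AArch64 PRFM operands and are given only for illustration):
//
//   __builtin_prefetch(p, /*rw=*/0, /*locality=*/3);
//     IsWrite = 0, IsData = 1, Locality = 3 -> 3 - 3 = 0, IsStream = 0
//     PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0        // PLDL1KEEP
//
//   __builtin_prefetch(p, /*rw=*/1, /*locality=*/0);
//     IsWrite = 1, IsData = 1, Locality = 0, IsStream = 1
//     PrfOp = (1 << 4) | (0 << 3) | (0 << 1) | 1 = 17       // PSTL1STRM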
2294 
2295 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2296  SelectionDAG &DAG) const {
2297  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2298 
2299  RTLIB::Libcall LC;
2300  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(),
2301  Op.getValueType());
2302  return LowerF128Call(Op, DAG, LC);
2303 }
2304 
2305 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2306  SelectionDAG &DAG) const {
2307  if (Op.getOperand(0).getValueType() != MVT::f128) {
2308  // It's legal except when f128 is involved
2309  return Op;
2310  }
2311 
2312  RTLIB::Libcall LC;
2313  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(),
2314  Op.getValueType());
2315  // FP_ROUND node has a second operand indicating whether it is known to be
2316  // precise. That doesn't take part in the LibCall so we can't directly use
2317  // LowerF128Call.
2318  SDValue SrcVal = Op.getOperand(0);
2319  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2320  SDLoc(Op)).first;
2321 }
2322 
2324  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2325  // Any additional optimization in this function should be recorded
2326  // in the cost tables.
2327  EVT InVT = Op.getOperand(0).getValueType();
2328  EVT VT = Op.getValueType();
2329  unsigned NumElts = InVT.getVectorNumElements();
2330 
2331  // f16 vectors are promoted to f32 before a conversion.
2332  if (InVT.getVectorElementType() == MVT::f16) {
2333  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2334  SDLoc dl(Op);
2335  return DAG.getNode(
2336  Op.getOpcode(), dl, Op.getValueType(),
2337  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2338  }
2339 
2340  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2341  SDLoc dl(Op);
2342  SDValue Cv =
2343  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2344  Op.getOperand(0));
2345  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2346  }
2347 
2348  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2349  SDLoc dl(Op);
2350  MVT ExtVT =
2352  VT.getVectorNumElements());
2353  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2354  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2355  }
2356 
2357  // Type changing conversions are illegal.
2358  return Op;
2359 }
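// A hedged sketch of the promotion above: converting v4f16 to v4i16 first
// FP_EXTENDs the source to v4f32, then, because v4i16 is narrower than
// v4f32, converts to v4i32 and truncates, roughly
//
//   fcvtl  v0.4s, v0.4h
//   fcvtzs v0.4s, v0.4s
//   xtn    v0.4h, v0.4s
//
// The assembly is only an assumption about what the selector typically
// produces; later combines may change the exact sequence.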
2360 
2361 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2362  SelectionDAG &DAG) const {
2363  if (Op.getOperand(0).getValueType().isVector())
2364  return LowerVectorFP_TO_INT(Op, DAG);
2365 
2366  // f16 conversions are promoted to f32 when full fp16 is not supported.
2367  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2368  !Subtarget->hasFullFP16()) {
2369  SDLoc dl(Op);
2370  return DAG.getNode(
2371  Op.getOpcode(), dl, Op.getValueType(),
2372  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2373  }
2374 
2375  if (Op.getOperand(0).getValueType() != MVT::f128) {
2376  // It's legal except when f128 is involved
2377  return Op;
2378  }
2379 
2380  RTLIB::Libcall LC;
2381  if (Op.getOpcode() == ISD::FP_TO_SINT)
2382  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
2383  Op.getValueType());
2384  else
2385  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2386  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2387  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2388 }
2389 
2391  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2392  // Any additional optimization in this function should be recorded
2393  // in the cost tables.
2394  EVT VT = Op.getValueType();
2395  SDLoc dl(Op);
2396  SDValue In = Op.getOperand(0);
2397  EVT InVT = In.getValueType();
2398 
2399  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2400  MVT CastVT =
2402  InVT.getVectorNumElements());
2403  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2404  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2405  }
2406 
2407  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2408  unsigned CastOpc =
2410  EVT CastVT = VT.changeVectorElementTypeToInteger();
2411  In = DAG.getNode(CastOpc, dl, CastVT, In);
2412  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2413  }
2414 
2415  return Op;
2416 }
2417 
2418 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2419  SelectionDAG &DAG) const {
2420  if (Op.getValueType().isVector())
2421  return LowerVectorINT_TO_FP(Op, DAG);
2422 
2423  // f16 conversions are promoted to f32 when full fp16 is not supported.
2424  if (Op.getValueType() == MVT::f16 &&
2425  !Subtarget->hasFullFP16()) {
2426  SDLoc dl(Op);
2427  return DAG.getNode(
2428  ISD::FP_ROUND, dl, MVT::f16,
2429  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2430  DAG.getIntPtrConstant(0, dl));
2431  }
2432 
2433  // i128 conversions are libcalls.
2434  if (Op.getOperand(0).getValueType() == MVT::i128)
2435  return SDValue();
2436 
2437  // Other conversions are legal, unless it's to the completely software-based
2438  // fp128.
2439  if (Op.getValueType() != MVT::f128)
2440  return Op;
2441 
2442  RTLIB::Libcall LC;
2443  if (Op.getOpcode() == ISD::SINT_TO_FP)
2444  LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2445  else
2446  LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2447 
2448  return LowerF128Call(Op, DAG, LC);
2449 }
2450 
2451 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2452  SelectionDAG &DAG) const {
2453  // For iOS, we want to call an alternative entry point: __sincos_stret,
2454  // which returns the values in two S / D registers.
2455  SDLoc dl(Op);
2456  SDValue Arg = Op.getOperand(0);
2457  EVT ArgVT = Arg.getValueType();
2458  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2459 
2460  ArgListTy Args;
2461  ArgListEntry Entry;
2462 
2463  Entry.Node = Arg;
2464  Entry.Ty = ArgTy;
2465  Entry.IsSExt = false;
2466  Entry.IsZExt = false;
2467  Args.push_back(Entry);
2468 
2469  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2470  : RTLIB::SINCOS_STRET_F32;
2471  const char *LibcallName = getLibcallName(LC);
2472  SDValue Callee =
2473  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2474 
2475  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2477  CLI.setDebugLoc(dl)
2478  .setChain(DAG.getEntryNode())
2479  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2480 
2481  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2482  return CallResult.first;
2483 }
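// A hedged usage note: on Darwin targets, code such as
//
//   float s, c;
//   sincosf(x, &s, &c);      // where available, or a sin/cos pair merged earlier
//
// can reach this hook as ISD::FSINCOS and is re-emitted as a call to
// __sincos_stret, whose { float, float } (or { double, double }) result comes
// back in s0/s1 (d0/d1), avoiding the two pointer stores. The entry point and
// register details are assumptions about Apple's runtime, not guarantees.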
2484 
2486  if (Op.getValueType() != MVT::f16)
2487  return SDValue();
2488 
2489  assert(Op.getOperand(0).getValueType() == MVT::i16);
2490  SDLoc DL(Op);
2491 
2492  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2493  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2494  return SDValue(
2495  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2496  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2497  0);
2498 }
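// A minimal sketch of the input handled above: an integer-to-half bitcast in
// IR,
//
//   %h = bitcast i16 %x to half
//
// has no direct i16 <-> f16 register copy, so the i16 is any-extended to i32,
// bitcast as f32, and the H subregister is then extracted. The IR line is an
// illustration; only the i16 -> f16 BITCAST shape matters here.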
2499 
2500 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2501  if (OrigVT.getSizeInBits() >= 64)
2502  return OrigVT;
2503 
2504  assert(OrigVT.isSimple() && "Expecting a simple value type");
2505 
2506  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2507  switch (OrigSimpleTy) {
2508  default: llvm_unreachable("Unexpected Vector Type");
2509  case MVT::v2i8:
2510  case MVT::v2i16:
2511  return MVT::v2i32;
2512  case MVT::v4i8:
2513  return MVT::v4i16;
2514  }
2515 }
2516 
2517 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2518  const EVT &OrigTy,
2519  const EVT &ExtTy,
2520  unsigned ExtOpcode) {
2521  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2522  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2523  // 64-bits we need to insert a new extension so that it will be 64-bits.
2524  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2525  if (OrigTy.getSizeInBits() >= 64)
2526  return N;
2527 
2528  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2529  EVT NewVT = getExtensionTo64Bits(OrigTy);
2530 
2531  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2532 }
2533 
2534 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2535  bool isSigned) {
2536  EVT VT = N->getValueType(0);
2537 
2538  if (N->getOpcode() != ISD::BUILD_VECTOR)
2539  return false;
2540 
2541  for (const SDValue &Elt : N->op_values()) {
2542  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2543  unsigned EltSize = VT.getScalarSizeInBits();
2544  unsigned HalfSize = EltSize / 2;
2545  if (isSigned) {
2546  if (!isIntN(HalfSize, C->getSExtValue()))
2547  return false;
2548  } else {
2549  if (!isUIntN(HalfSize, C->getZExtValue()))
2550  return false;
2551  }
2552  continue;
2553  }
2554  return false;
2555  }
2556 
2557  return true;
2558 }
2559 
2560 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2561  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2562  return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2563  N->getOperand(0)->getValueType(0),
2564  N->getValueType(0),
2565  N->getOpcode());
2566 
2567  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2568  EVT VT = N->getValueType(0);
2569  SDLoc dl(N);
2570  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2571  unsigned NumElts = VT.getVectorNumElements();
2572  MVT TruncVT = MVT::getIntegerVT(EltSize);
2573  SmallVector<SDValue, 8> Ops;
2574  for (unsigned i = 0; i != NumElts; ++i) {
2575  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2576  const APInt &CInt = C->getAPIntValue();
2577  // Element types smaller than 32 bits are not legal, so use i32 elements.
2578  // The values are implicitly truncated so sext vs. zext doesn't matter.
2579  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2580  }
2581  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2582 }
2583 
2584 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2585  return N->getOpcode() == ISD::SIGN_EXTEND ||
2586  isExtendedBUILD_VECTOR(N, DAG, true);
2587 }
2588 
2589 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2590  return N->getOpcode() == ISD::ZERO_EXTEND ||
2591  isExtendedBUILD_VECTOR(N, DAG, false);
2592 }
2593 
2594 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2595  unsigned Opcode = N->getOpcode();
2596  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2597  SDNode *N0 = N->getOperand(0).getNode();
2598  SDNode *N1 = N->getOperand(1).getNode();
2599  return N0->hasOneUse() && N1->hasOneUse() &&
2600  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2601  }
2602  return false;
2603 }
2604 
2605 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2606  unsigned Opcode = N->getOpcode();
2607  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2608  SDNode *N0 = N->getOperand(0).getNode();
2609  SDNode *N1 = N->getOperand(1).getNode();
2610  return N0->hasOneUse() && N1->hasOneUse() &&
2611  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2612  }
2613  return false;
2614 }
2615 
2616 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2617  SelectionDAG &DAG) const {
2618  // The rounding mode is in bits 23:22 of the FPCR.
2619  // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
2620  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
2621  // so that the shift + and get folded into a bitfield extract.
2622  SDLoc dl(Op);
2623 
2624  SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
2625  DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
2626  MVT::i64));
2627  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
2628  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
2629  DAG.getConstant(1U << 22, dl, MVT::i32));
2630  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
2631  DAG.getConstant(22, dl, MVT::i32));
2632  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
2633  DAG.getConstant(3, dl, MVT::i32));
2634 }
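// A worked example of the formula above (RMode and FLT_ROUNDS value meanings
// stated here as assumptions): FPCR.RMode uses 0=nearest, 1=+inf, 2=-inf,
// 3=zero, while FLT_ROUNDS uses 1, 2, 3, 0 for the same modes, i.e. a
// rotation by one:
//
//   RMode = 0: (((0 << 22) + (1 << 22)) >> 22) & 3 == 1   // to nearest
//   RMode = 3: (((3 << 22) + (1 << 22)) >> 22) & 3 == 0   // toward zero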
2635 
2637  // Multiplications are only custom-lowered for 128-bit vectors so that
2638  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2639  EVT VT = Op.getValueType();
2640  assert(VT.is128BitVector() && VT.isInteger() &&
2641  "unexpected type for custom-lowering ISD::MUL");
2642  SDNode *N0 = Op.getOperand(0).getNode();
2643  SDNode *N1 = Op.getOperand(1).getNode();
2644  unsigned NewOpc = 0;
2645  bool isMLA = false;
2646  bool isN0SExt = isSignExtended(N0, DAG);
2647  bool isN1SExt = isSignExtended(N1, DAG);
2648  if (isN0SExt && isN1SExt)
2649  NewOpc = AArch64ISD::SMULL;
2650  else {
2651  bool isN0ZExt = isZeroExtended(N0, DAG);
2652  bool isN1ZExt = isZeroExtended(N1, DAG);
2653  if (isN0ZExt && isN1ZExt)
2654  NewOpc = AArch64ISD::UMULL;
2655  else if (isN1SExt || isN1ZExt) {
2656  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2657  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2658  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2659  NewOpc = AArch64ISD::SMULL;
2660  isMLA = true;
2661  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2662  NewOpc = AArch64ISD::UMULL;
2663  isMLA = true;
2664  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2665  std::swap(N0, N1);
2666  NewOpc = AArch64ISD::UMULL;
2667  isMLA = true;
2668  }
2669  }
2670 
2671  if (!NewOpc) {
2672  if (VT == MVT::v2i64)
2673  // Fall through to expand this. It is not legal.
2674  return SDValue();
2675  else
2676  // Other vector multiplications are legal.
2677  return Op;
2678  }
2679  }
2680 
2681  // Legalize to a S/UMULL instruction
2682  SDLoc DL(Op);
2683  SDValue Op0;
2684  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2685  if (!isMLA) {
2686  Op0 = skipExtensionForVectorMULL(N0, DAG);
2687  assert(Op0.getValueType().is64BitVector() &&
2688  Op1.getValueType().is64BitVector() &&
2689  "unexpected types for extended operands to VMULL");
2690  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2691  }
2692  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2693  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2694  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2695  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2696  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2697  EVT Op1VT = Op1.getValueType();
2698  return DAG.getNode(N0->getOpcode(), DL, VT,
2699  DAG.getNode(NewOpc, DL, VT,
2700  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2701  DAG.getNode(NewOpc, DL, VT,
2702  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2703 }
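// A hedged example of the SMULL formation above (the IR and assembly are
// illustrative, not taken from this file):
//
//   %a = sext <2 x i32> %x to <2 x i64>
//   %b = sext <2 x i32> %y to <2 x i64>
//   %m = mul <2 x i64> %a, %b
//
// Both multiplicands are sign-extended, so the v2i64 multiply is rewritten to
// AArch64ISD::SMULL on the original narrow vectors and can be selected as a
// single "smull v0.2d, v1.2s, v2.2s" instead of being expanded.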
2704 
2705 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2706  SelectionDAG &DAG) const {
2707  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2708  SDLoc dl(Op);
2709  switch (IntNo) {
2710  default: return SDValue(); // Don't custom lower most intrinsics.
2711  case Intrinsic::thread_pointer: {
2712  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2713  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2714  }
2715  case Intrinsic::aarch64_neon_abs: {
2716  EVT Ty = Op.getValueType();
2717  if (Ty == MVT::i64) {
2718  SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
2719  Op.getOperand(1));
2720  Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
2721  return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
2722  } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
2723  return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
2724  } else {
2725  report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
2726  }
2727  }
2728  case Intrinsic::aarch64_neon_smax:
2729  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2730  Op.getOperand(1), Op.getOperand(2));
2731  case Intrinsic::aarch64_neon_umax:
2732  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2733  Op.getOperand(1), Op.getOperand(2));
2734  case Intrinsic::aarch64_neon_smin:
2735  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2736  Op.getOperand(1), Op.getOperand(2));
2737  case Intrinsic::aarch64_neon_umin:
2738  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2739  Op.getOperand(1), Op.getOperand(2));
2740 
2741  case Intrinsic::localaddress: {
2742  const auto &MF = DAG.getMachineFunction();
2743  const auto *RegInfo = Subtarget->getRegisterInfo();
2744  unsigned Reg = RegInfo->getLocalAddressRegister(MF);
2745  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
2746  Op.getSimpleValueType());
2747  }
2748 
2749  case Intrinsic::eh_recoverfp: {
2750  // FIXME: This needs to be implemented to correctly handle highly aligned
2751  // stack objects. For now we simply return the incoming FP. Refer D53541
2752  // for more details.
2753  SDValue FnOp = Op.getOperand(1);
2754  SDValue IncomingFPOp = Op.getOperand(2);
2755  auto *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
2756  auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
2757  if (!Fn)
2758  report_fatal_error(
2759  "llvm.eh.recoverfp must take a function as the first argument");
2760  return IncomingFPOp;
2761  }
2762  }
2763 }
2764 
2765 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
2767  EVT VT, EVT MemVT,
2768  SelectionDAG &DAG) {
2769  assert(VT.isVector() && "VT should be a vector type");
2770  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
2771 
2772  SDValue Value = ST->getValue();
2773 
2774  // We first extend the promoted v4i16 to v8i16, truncate it to v8i8, and
2775  // extract the word lane which represents the v4i8 subvector. This optimizes
2776  // the store to:
2777  //
2778  // xtn v0.8b, v0.8h
2779  // str s0, [x0]
2780 
2781  SDValue Undef = DAG.getUNDEF(MVT::i16);
2782  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
2783  {Undef, Undef, Undef, Undef});
2784 
2785  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
2786  Value, UndefVec);
2787  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
2788 
2789  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
2790  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
2791  Trunc, DAG.getConstant(0, DL, MVT::i64));
2792 
2793  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
2794  ST->getBasePtr(), ST->getMemOperand());
2795 }
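// A hedged example of the truncating store handled above:
//
//   %t = trunc <4 x i16> %v to <4 x i8>
//   store <4 x i8> %t, <4 x i8>* %p
//
// Because v4i8 is promoted to v4i16, the value is widened to v8i16 with undef
// lanes, narrowed with XTN to v8i8, and the low 32-bit lane is stored, giving
// the xtn + str s0 sequence shown in the comment above.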
2796 
2797 // Custom lowering for any store, vector or scalar, normal or truncating.
2798 // Currently we only custom lower truncating stores from vector v4i16 to
2799 // v4i8.
2800 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
2801  SelectionDAG &DAG) const {
2802  SDLoc Dl(Op);
2803  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
2804  assert (StoreNode && "Can only custom lower store nodes");
2805 
2806  SDValue Value = StoreNode->getValue();
2807 
2808  EVT VT = Value.getValueType();
2809  EVT MemVT = StoreNode->getMemoryVT();
2810 
2811  assert (VT.isVector() && "Can only custom lower vector store types");
2812 
2813  unsigned AS = StoreNode->getAddressSpace();
2814  unsigned Align = StoreNode->getAlignment();
2815  if (Align < MemVT.getStoreSize() &&
2816  !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
2817  return scalarizeVectorStore(StoreNode, DAG);
2818  }
2819 
2820  if (StoreNode->isTruncatingStore()) {
2821  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
2822  }
2823 
2824  return SDValue();
2825 }
2826 
2828  SelectionDAG &DAG) const {
2829  LLVM_DEBUG(dbgs() << "Custom lowering: ");
2830  LLVM_DEBUG(Op.dump());
2831 
2832  switch (Op.getOpcode()) {
2833  default:
2834  llvm_unreachable("unimplemented operand");
2835  return SDValue();
2836  case ISD::BITCAST:
2837  return LowerBITCAST(Op, DAG);
2838  case ISD::GlobalAddress:
2839  return LowerGlobalAddress(Op, DAG);
2840  case ISD::GlobalTLSAddress:
2841  return LowerGlobalTLSAddress(Op, DAG);
2842  case ISD::SETCC:
2843  return LowerSETCC(Op, DAG);
2844  case ISD::BR_CC:
2845  return LowerBR_CC(Op, DAG);
2846  case ISD::SELECT:
2847  return LowerSELECT(Op, DAG);
2848  case ISD::SELECT_CC:
2849  return LowerSELECT_CC(Op, DAG);
2850  case ISD::JumpTable:
2851  return LowerJumpTable(Op, DAG);
2852  case ISD::BR_JT:
2853  return LowerBR_JT(Op, DAG);
2854  case ISD::ConstantPool:
2855  return LowerConstantPool(Op, DAG);
2856  case ISD::BlockAddress:
2857  return LowerBlockAddress(Op, DAG);
2858  case ISD::VASTART:
2859  return LowerVASTART(Op, DAG);
2860  case ISD::VACOPY:
2861  return LowerVACOPY(Op, DAG);
2862  case ISD::VAARG:
2863  return LowerVAARG(Op, DAG);
2864  case ISD::ADDC:
2865  case ISD::ADDE:
2866  case ISD::SUBC:
2867  case ISD::SUBE:
2868  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2869  case ISD::SADDO:
2870  case ISD::UADDO:
2871  case ISD::SSUBO:
2872  case ISD::USUBO:
2873  case ISD::SMULO:
2874  case ISD::UMULO:
2875  return LowerXALUO(Op, DAG);
2876  case ISD::FADD:
2877  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2878  case ISD::FSUB:
2879  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2880  case ISD::FMUL:
2881  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2882  case ISD::FDIV:
2883  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2884  case ISD::FP_ROUND:
2885  return LowerFP_ROUND(Op, DAG);
2886  case ISD::FP_EXTEND:
2887  return LowerFP_EXTEND(Op, DAG);
2888  case ISD::FRAMEADDR:
2889  return LowerFRAMEADDR(Op, DAG);
2890  case ISD::SPONENTRY:
2891  return LowerSPONENTRY(Op, DAG);
2892  case ISD::RETURNADDR:
2893  return LowerRETURNADDR(Op, DAG);
2894  case ISD::ADDROFRETURNADDR:
2895  return LowerADDROFRETURNADDR(Op, DAG);
2896  case ISD::INSERT_VECTOR_ELT:
2897  return LowerINSERT_VECTOR_ELT(Op, DAG);
2898  case ISD::EXTRACT_VECTOR_ELT:
2899  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2900  case ISD::BUILD_VECTOR:
2901  return LowerBUILD_VECTOR(Op, DAG);
2902  case ISD::VECTOR_SHUFFLE:
2903  return LowerVECTOR_SHUFFLE(Op, DAG);
2904  case ISD::EXTRACT_SUBVECTOR:
2905  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2906  case ISD::SRA:
2907  case ISD::SRL:
2908  case ISD::SHL:
2909  return LowerVectorSRA_SRL_SHL(Op, DAG);
2910  case ISD::SHL_PARTS:
2911  return LowerShiftLeftParts(Op, DAG);
2912  case ISD::SRL_PARTS:
2913  case ISD::SRA_PARTS:
2914  return LowerShiftRightParts(Op, DAG);
2915  case ISD::CTPOP:
2916  return LowerCTPOP(Op, DAG);
2917  case ISD::FCOPYSIGN:
2918  return LowerFCOPYSIGN(Op, DAG);
2919  case ISD::AND:
2920  return LowerVectorAND(Op, DAG);
2921  case ISD::OR:
2922  return LowerVectorOR(Op, DAG);
2923  case ISD::XOR:
2924  return LowerXOR(Op, DAG);
2925  case ISD::PREFETCH:
2926  return LowerPREFETCH(Op, DAG);
2927  case ISD::SINT_TO_FP:
2928  case ISD::UINT_TO_FP:
2929  return LowerINT_TO_FP(Op, DAG);
2930  case ISD::FP_TO_SINT:
2931  case ISD::FP_TO_UINT:
2932  return LowerFP_TO_INT(Op, DAG);
2933  case ISD::FSINCOS:
2934  return LowerFSINCOS(Op, DAG);
2935  case ISD::FLT_ROUNDS_:
2936  return LowerFLT_ROUNDS_(Op, DAG);
2937  case ISD::MUL:
2938  return LowerMUL(Op, DAG);
2939  case ISD::INTRINSIC_WO_CHAIN:
2940  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2941  case ISD::STORE:
2942  return LowerSTORE(Op, DAG);
2943  case ISD::VECREDUCE_ADD:
2944  case ISD::VECREDUCE_SMAX:
2945  case ISD::VECREDUCE_SMIN:
2946  case ISD::VECREDUCE_UMAX:
2947  case ISD::VECREDUCE_UMIN:
2948  case ISD::VECREDUCE_FMAX:
2949  case ISD::VECREDUCE_FMIN:
2950  return LowerVECREDUCE(Op, DAG);
2951  case ISD::ATOMIC_LOAD_SUB:
2952  return LowerATOMIC_LOAD_SUB(Op, DAG);
2953  case ISD::ATOMIC_LOAD_AND:
2954  return LowerATOMIC_LOAD_AND(Op, DAG);
2955  case ISD::DYNAMIC_STACKALLOC:
2956  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2957  }
2958 }
2959 
2960 //===----------------------------------------------------------------------===//
2961 // Calling Convention Implementation
2962 //===----------------------------------------------------------------------===//
2963 
2964 /// Selects the correct CCAssignFn for a given CallingConvention value.
2966  bool IsVarArg) const {
2967  switch (CC) {
2968  default:
2969  report_fatal_error("Unsupported calling convention.");
2970  case CallingConv::WebKit_JS:
2971  return CC_AArch64_WebKit_JS;
2972  case CallingConv::GHC:
2973  return CC_AArch64_GHC;
2974  case CallingConv::C:
2975  case CallingConv::Fast:
2976  case CallingConv::PreserveMost:
2977  case CallingConv::CXX_FAST_TLS:
2978  case CallingConv::Swift:
2979  if (Subtarget->isTargetWindows() && IsVarArg)
2980  return CC_AArch64_Win64_VarArg;
2981  if (!Subtarget->isTargetDarwin())
2982  return CC_AArch64_AAPCS;
2983  return CC_AArch64_DarwinPCS;
2984  case CallingConv::Win64:
2985  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
2986  case CallingConv::AArch64_VectorCall:
2987  return CC_AArch64_AAPCS;
2988  }
2989 }
2990 
2991 CCAssignFn *
2992 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
2993  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
2994  : RetCC_AArch64_AAPCS;
2995 }
2996 
2997 SDValue AArch64TargetLowering::LowerFormalArguments(
2998  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2999  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3000  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3001  MachineFunction &MF = DAG.getMachineFunction();
3002  MachineFrameInfo &MFI = MF.getFrameInfo();
3003  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3004 
3005  // Assign locations to all of the incoming arguments.
3007  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3008  *DAG.getContext());
3009 
3010  // At this point, Ins[].VT may already be promoted to i32. To correctly
3011  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3012  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3013  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
3014  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
3015  // LocVT.
3016  unsigned NumArgs = Ins.size();
3018  unsigned CurArgIdx = 0;
3019  for (unsigned i = 0; i != NumArgs; ++i) {
3020  MVT ValVT = Ins[i].VT;
3021  if (Ins[i].isOrigArg()) {
3022  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
3023  CurArgIdx = Ins[i].getOrigArgIndex();
3024 
3025  // Get type of the original argument.
3026  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
3027  /*AllowUnknown*/ true);
3028  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
3029  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3030  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3031  ValVT = MVT::i8;
3032  else if (ActualMVT == MVT::i16)
3033  ValVT = MVT::i16;
3034  }
3035  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3036  bool Res =
3037  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
3038  assert(!Res && "Call operand has unhandled type");
3039  (void)Res;
3040  }
3041  assert(ArgLocs.size() == Ins.size());
3042  SmallVector<SDValue, 16> ArgValues;
3043  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3044  CCValAssign &VA = ArgLocs[i];
3045 
3046  if (Ins[i].Flags.isByVal()) {
3047  // Byval is used for HFAs in the PCS, but the system should work in a
3048  // non-compliant manner for larger structs.
3049  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3050  int Size = Ins[i].Flags.getByValSize();
3051  unsigned NumRegs = (Size + 7) / 8;
3052 
3053  // FIXME: This works on big-endian for composite byvals, which are the
3054  // common case. It should also work for fundamental types.
3055  unsigned FrameIdx =
3056  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
3057  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
3058  InVals.push_back(FrameIdxN);
3059 
3060  continue;
3061  }
3062 
3063  if (VA.isRegLoc()) {
3064  // Arguments stored in registers.
3065  EVT RegVT = VA.getLocVT();
3066 
3067  SDValue ArgValue;
3068  const TargetRegisterClass *RC;
3069 
3070  if (RegVT == MVT::i32)
3071  RC = &AArch64::GPR32RegClass;
3072  else if (RegVT == MVT::i64)
3073  RC = &AArch64::GPR64RegClass;
3074  else if (RegVT == MVT::f16)
3075  RC = &AArch64::FPR16RegClass;
3076  else if (RegVT == MVT::f32)
3077  RC = &AArch64::FPR32RegClass;
3078  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
3079  RC = &AArch64::FPR64RegClass;
3080  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
3081  RC = &AArch64::FPR128RegClass;
3082  else
3083  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3084 
3085  // Transform the arguments in physical registers into virtual ones.
3086  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3087  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
3088 
3089  // If this is an 8, 16 or 32-bit value, it is really passed promoted
3090  // to 64 bits. Insert an assert[sz]ext to capture this, then
3091  // truncate to the right size.
3092  switch (VA.getLocInfo()) {
3093  default:
3094  llvm_unreachable("Unknown loc info!");
3095  case CCValAssign::Full:
3096  break;
3097  case CCValAssign::BCvt:
3098  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
3099  break;
3100  case CCValAssign::AExt:
3101  case CCValAssign::SExt:
3102  case CCValAssign::ZExt:
3103  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
3104  // nodes after our lowering.
3105  assert(RegVT == Ins[i].VT && "incorrect register location selected");
3106  break;
3107  }
3108 
3109  InVals.push_back(ArgValue);
3110 
3111  } else { // VA.isRegLoc()
3112  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
3113  unsigned ArgOffset = VA.getLocMemOffset();
3114  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
3115 
3116  uint32_t BEAlign = 0;
3117  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
3118  !Ins[i].Flags.isInConsecutiveRegs())
3119  BEAlign = 8 - ArgSize;
3120 
3121  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
3122 
3123  // Create load nodes to retrieve arguments from the stack.
3124  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3125  SDValue ArgValue;
3126 
3127  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
3129  MVT MemVT = VA.getValVT();
3130 
3131  switch (VA.getLocInfo()) {
3132  default:
3133  break;
3134  case CCValAssign::BCvt:
3135  MemVT = VA.getLocVT();
3136  break;
3137  case CCValAssign::SExt:
3138  ExtType = ISD::SEXTLOAD;
3139  break;
3140  case CCValAssign::ZExt:
3141  ExtType = ISD::ZEXTLOAD;
3142  break;
3143  case CCValAssign::AExt:
3144  ExtType = ISD::EXTLOAD;
3145  break;
3146  }
3147 
3148  ArgValue = DAG.getExtLoad(
3149  ExtType, DL, VA.getLocVT(), Chain, FIN,
3151  MemVT);
3152 
3153  InVals.push_back(ArgValue);
3154  }
3155  }
3156 
3157  // varargs
3159  if (isVarArg) {
3160  if (!Subtarget->isTargetDarwin() || IsWin64) {
3161  // The AAPCS variadic function ABI is identical to the non-variadic
3162  // one. As a result there may be more arguments in registers and we should
3163  // save them for future reference.
3164  // Win64 variadic functions also pass arguments in registers, but all float
3165  // arguments are passed in integer registers.
3166  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
3167  }
3168 
3169  // This will point to the next argument passed via stack.
3170  unsigned StackOffset = CCInfo.getNextStackOffset();
3171  // We currently pass all varargs at 8-byte alignment.
3172  StackOffset = ((StackOffset + 7) & ~7);
3173  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
3174 
3175  if (MFI.hasMustTailInVarArgFunc()) {
3176  SmallVector<MVT, 2> RegParmTypes;
3177  RegParmTypes.push_back(MVT::i64);
3178  RegParmTypes.push_back(MVT::f128);
3179  // Compute the set of forwarded registers. The rest are scratch.
3181  FuncInfo->getForwardedMustTailRegParms();
3182  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
3184  }
3185  }
3186 
3187  unsigned StackArgSize = CCInfo.getNextStackOffset();
3188  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3189  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
3190  // This is a non-standard ABI so by fiat I say we're allowed to make full
3191  // use of the stack area to be popped, which must be aligned to 16 bytes in
3192  // any case:
3193  StackArgSize = alignTo(StackArgSize, 16);
3194 
3195  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
3196  // a multiple of 16.
3197  FuncInfo->setArgumentStackToRestore(StackArgSize);
3198 
3199  // This realignment carries over to the available bytes below. Our own
3200  // callers will guarantee the space is free by giving an aligned value to
3201  // CALLSEQ_START.
3202  }
3203  // Even if we're not expected to free up the space, it's useful to know how
3204  // much is there while considering tail calls (because we can reuse it).
3205  FuncInfo->setBytesInStackArgArea(StackArgSize);
3206 
3207  if (Subtarget->hasCustomCallingConv())
3208  Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
3209 
3210  return Chain;
3211 }
3212 
3213 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
3214  SelectionDAG &DAG,
3215  const SDLoc &DL,
3216  SDValue &Chain) const {
3217  MachineFunction &MF = DAG.getMachineFunction();
3218  MachineFrameInfo &MFI = MF.getFrameInfo();
3220  auto PtrVT = getPointerTy(DAG.getDataLayout());
3221  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
3222 
3223  SmallVector<SDValue, 8> MemOps;
3224 
3225  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
3226  AArch64::X3, AArch64::X4, AArch64::X5,
3227  AArch64::X6, AArch64::X7 };
3228  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
3229  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
3230 
3231  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
3232  int GPRIdx = 0;
3233  if (GPRSaveSize != 0) {
3234  if (IsWin64) {
3235  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
3236  if (GPRSaveSize & 15)
3237  // The extra size here, if triggered, will always be 8.
3238  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
3239  } else
3240  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
3241 
3242  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
3243 
3244  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
3245  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
3246  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
3247  SDValue Store = DAG.getStore(
3248  Val.getValue(1), DL, Val, FIN,
3249  IsWin64
3251  GPRIdx,
3252  (i - FirstVariadicGPR) * 8)
3254  MemOps.push_back(Store);
3255  FIN =
3256  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
3257  }
3258  }
3259  FuncInfo->setVarArgsGPRIndex(GPRIdx);
3260  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
3261 
3262  if (Subtarget->hasFPARMv8() && !IsWin64) {
3263  static const MCPhysReg FPRArgRegs[] = {
3264  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
3265  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
3266  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
3267  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
3268 
3269  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
3270  int FPRIdx = 0;
3271  if (FPRSaveSize != 0) {
3272  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
3273 
3274  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3275 
3276  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3277  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3278  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3279 
3280  SDValue Store = DAG.getStore(
3281  Val.getValue(1), DL, Val, FIN,
3283  MemOps.push_back(Store);
3284  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3285  DAG.getConstant(16, DL, PtrVT));
3286  }
3287  }
3288  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3289  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3290  }
3291 
3292  if (!MemOps.empty()) {
3293  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3294  }
3295 }
3296 
3297 /// LowerCallResult - Lower the result values of a call into the
3298 /// appropriate copies out of appropriate physical registers.
3299 SDValue AArch64TargetLowering::LowerCallResult(
3300  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3301  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3302  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3303  SDValue ThisVal) const {
3304  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3307  // Assign locations to each value returned by this call.
3309  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3310  *DAG.getContext());
3311  CCInfo.AnalyzeCallResult(Ins, RetCC);
3312 
3313  // Copy all of the result registers out of their specified physreg.
3314  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3315  CCValAssign VA = RVLocs[i];
3316 
3317  // Pass 'this' value directly from the argument to return value, to avoid
3318  // reg unit interference
3319  if (i == 0 && isThisReturn) {
3320  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3321  "unexpected return calling convention register assignment");
3322  InVals.push_back(ThisVal);
3323  continue;
3324  }
3325 
3326  SDValue Val =
3327  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3328  Chain = Val.getValue(1);
3329  InFlag = Val.getValue(2);
3330 
3331  switch (VA.getLocInfo()) {
3332  default:
3333  llvm_unreachable("Unknown loc info!");
3334  case CCValAssign::Full:
3335  break;
3336  case CCValAssign::BCvt:
3337  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3338  break;
3339  }
3340 
3341  InVals.push_back(Val);
3342  }
3343 
3344  return Chain;
3345 }
3346 
3347 /// Return true if the calling convention is one that we can guarantee TCO for.
3349  return CC == CallingConv::Fast;
3350 }
3351 
3352 /// Return true if we might ever do TCO for calls with this calling convention.
3354  switch (CC) {
3355  case CallingConv::C:
3357  case CallingConv::Swift:
3358  return true;
3359  default:
3360  return canGuaranteeTCO(CC);
3361  }
3362 }
3363 
3364 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3365  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3366  const SmallVectorImpl<ISD::OutputArg> &Outs,
3367  const SmallVectorImpl<SDValue> &OutVals,
3368  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3369  if (!mayTailCallThisCC(CalleeCC))
3370  return false;
3371 
3372  MachineFunction &MF = DAG.getMachineFunction();
3373  const Function &CallerF = MF.getFunction();
3374  CallingConv::ID CallerCC = CallerF.getCallingConv();
3375  bool CCMatch = CallerCC == CalleeCC;
3376 
3377  // Byval parameters hand the function a pointer directly into the stack area
3378  // we want to reuse during a tail call. Working around this *is* possible (see
3379  // X86) but less efficient and uglier in LowerCall.
3380  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3381  e = CallerF.arg_end();
3382  i != e; ++i)
3383  if (i->hasByValAttr())
3384  return false;
3385 
3387  return canGuaranteeTCO(CalleeCC) && CCMatch;
3388 
3389  // Externally-defined functions with weak linkage should not be
3390  // tail-called on AArch64 when the OS does not support dynamic
3391  // pre-emption of symbols, as the AAELF spec requires normal calls
3392  // to undefined weak functions to be replaced with a NOP or jump to the
3393  // next instruction. The behaviour of branch instructions in this
3394  // situation (as used for tail calls) is implementation-defined, so we
3395  // cannot rely on the linker replacing the tail call with a return.
3396  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3397  const GlobalValue *GV = G->getGlobal();
3398  const Triple &TT = getTargetMachine().getTargetTriple();
3399  if (GV->hasExternalWeakLinkage() &&
3400  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3401  return false;
3402  }
3403 
3404  // Now we search for cases where we can use a tail call without changing the
3405  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3406  // concept.
3407 
3408  // I want anyone implementing a new calling convention to think long and hard
3409  // about this assert.
3410  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3411  "Unexpected variadic calling convention");
3412 
3413  LLVMContext &C = *DAG.getContext();
3414  if (isVarArg && !Outs.empty()) {
3415  // At least two cases here: if caller is fastcc then we can't have any
3416  // memory arguments (we'd be expected to clean up the stack afterwards). If
3417  // caller is C then we could potentially use its argument area.
3418 
3419  // FIXME: for now we take the most conservative of these in both cases:
3420  // disallow all variadic memory operands.
3422  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3423 
3424  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3425  for (const CCValAssign &ArgLoc : ArgLocs)
3426  if (!ArgLoc.isRegLoc())
3427  return false;
3428  }
3429 
3430  // Check that the call results are passed in the same way.
3431  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3432  CCAssignFnForCall(CalleeCC, isVarArg),
3433  CCAssignFnForCall(CallerCC, isVarArg)))
3434  return false;
3435  // The callee has to preserve all registers the caller needs to preserve.
3436  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3437  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3438  if (!CCMatch) {
3439  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3440  if (Subtarget->hasCustomCallingConv()) {
3441  TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
3442  TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
3443  }
3444  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3445  return false;
3446  }
3447 
3448  // Nothing more to check if the callee is taking no arguments
3449  if (Outs.empty())
3450  return true;
3451 
3453  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3454 
3455  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3456 
3457  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3458 
3459  // If the stack arguments for this call do not fit into our own save area then
3460  // the call cannot be made tail.
3461  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3462  return false;
3463 
3464  const MachineRegisterInfo &MRI = MF.getRegInfo();
3465  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3466  return false;
3467 
3468  return true;
3469 }
3470 
3471 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3472  SelectionDAG &DAG,
3473  MachineFrameInfo &MFI,
3474  int ClobberedFI) const {
3475  SmallVector<SDValue, 8> ArgChains;
3476  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3477  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3478 
3479  // Include the original chain at the beginning of the list. When this is
3480  // used by target LowerCall hooks, this helps legalize find the
3481  // CALLSEQ_BEGIN node.
3482  ArgChains.push_back(Chain);
3483 
3484  // Add a chain value for each stack-argument load that overlaps the clobbered object.
3485  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3486  UE = DAG.getEntryNode().getNode()->use_end();
3487  U != UE; ++U)
3488  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3489  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3490  if (FI->getIndex() < 0) {
3491  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3492  int64_t InLastByte = InFirstByte;
3493  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3494 
3495  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3496  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3497  ArgChains.push_back(SDValue(L, 1));
3498  }
3499 
3500  // Build a tokenfactor for all the chains.
3501  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3502 }
3503 
3504 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3505  bool TailCallOpt) const {
3506  return CallCC == CallingConv::Fast && TailCallOpt;
3507 }
3508 
3509 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3510 /// and add input and output parameter nodes.
3511 SDValue
3512 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3513  SmallVectorImpl<SDValue> &InVals) const {
3514  SelectionDAG &DAG = CLI.DAG;
3515  SDLoc &DL = CLI.DL;
3516  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3517  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3518  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3519  SDValue Chain = CLI.Chain;
3520  SDValue Callee = CLI.Callee;
3521  bool &IsTailCall = CLI.IsTailCall;
3522  CallingConv::ID CallConv = CLI.CallConv;
3523  bool IsVarArg = CLI.IsVarArg;
3524 
3525  MachineFunction &MF = DAG.getMachineFunction();
3526  bool IsThisReturn = false;
3527 
3529  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3530  bool IsSibCall = false;
3531 
3532  if (IsTailCall) {
3533  // Check if it's really possible to do a tail call.
3534  IsTailCall = isEligibleForTailCallOptimization(
3535  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3536  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3537  report_fatal_error("failed to perform tail call elimination on a call "
3538  "site marked musttail");
3539 
3540  // A sibling call is one where we're under the usual C ABI and not planning
3541  // to change that but can still do a tail call:
3542  if (!TailCallOpt && IsTailCall)
3543  IsSibCall = true;
3544 
3545  if (IsTailCall)
3546  ++NumTailCalls;
3547  }
3548 
3549  // Analyze operands of the call, assigning locations to each operand.
3551  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3552  *DAG.getContext());
3553 
3554  if (IsVarArg) {
3555  // Handle fixed and variable vector arguments differently.
3556  // Variable vector arguments always go into memory.
3557  unsigned NumArgs = Outs.size();
3558 
3559  for (unsigned i = 0; i != NumArgs; ++i) {
3560  MVT ArgVT = Outs[i].VT;
3561  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3562  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3563  /*IsVarArg=*/ !Outs[i].IsFixed);
3564  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3565  assert(!Res && "Call operand has unhandled type");
3566  (void)Res;
3567  }
3568  } else {
3569  // At this point, Outs[].VT may already be promoted to i32. To correctly
3570  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3571  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3572  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3573  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3574  // LocVT.
3575  unsigned NumArgs = Outs.size();
3576  for (unsigned i = 0; i != NumArgs; ++i) {
3577  MVT ValVT = Outs[i].VT;
3578  // Get type of the original argument.
3579  EVT ActualVT = getValueType(DAG.getDataLayout(),
3580  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3581  /*AllowUnknown*/ true);
3582  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3583  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3584  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3585  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3586  ValVT = MVT::i8;
3587  else if (ActualMVT == MVT::i16)
3588  ValVT = MVT::i16;
3589 
3590  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3591  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3592  assert(!Res && "Call operand has unhandled type");
3593  (void)Res;
3594  }
3595  }
3596 
3597  // Get a count of how many bytes are to be pushed on the stack.
3598  unsigned NumBytes = CCInfo.getNextStackOffset();
3599 
3600  if (IsSibCall) {
3601  // Since we're not changing the ABI to make this a tail call, the memory
3602  // operands are already available in the caller's incoming argument space.
3603  NumBytes = 0;
3604  }
3605 
3606  // FPDiff is the byte offset of the call's argument area from the callee's.
3607  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3608  // by this amount for a tail call. In a sibling call it must be 0 because the
3609  // caller will deallocate the entire stack and the callee still expects its
3610  // arguments to begin at SP+0. Completely unused for non-tail calls.
3611  int FPDiff = 0;
3612 
3613  if (IsTailCall && !IsSibCall) {
3614  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3615 
3616  // Since callee will pop argument stack as a tail call, we must keep the
3617  // popped size 16-byte aligned.
3618  NumBytes = alignTo(NumBytes, 16);
3619 
3620  // FPDiff will be negative if this tail call requires more space than we
3621  // would automatically have in our incoming argument space. Positive if we
3622  // can actually shrink the stack.
3623  FPDiff = NumReusableBytes - NumBytes;
3624 
3625  // The stack pointer must be 16-byte aligned at all times it's used for a
3626  // memory operation, which in practice means at *all* times and in
3627  // particular across call boundaries. Therefore our own arguments started at
3628  // a 16-byte aligned SP and the delta applied for the tail call should
3629  // satisfy the same constraint.
3630  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3631  }
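 // Worked example (illustrative numbers): if the caller's incoming stack
 // argument area is 16 bytes (NumReusableBytes == 16) and this tail call
 // needs 32 bytes of outgoing arguments after alignment (NumBytes == 32),
 // then FPDiff == 16 - 32 == -16: the callee's argument area starts 16 bytes
 // below where the caller's own arguments began, and the fixed stack objects
 // created below are offset by that amount.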
3632 
3633  // Adjust the stack pointer for the new arguments...
3634  // These operations are automatically eliminated by the prolog/epilog pass
3635  if (!IsSibCall)
3636  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3637 
3638  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3639  getPointerTy(DAG.getDataLayout()));
3640 
3642  SmallVector<SDValue, 8> MemOpChains;
3643  auto PtrVT = getPointerTy(DAG.getDataLayout());
3644 
3645  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
3646  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
3647  for (const auto &F : Forwards) {
3648  SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
3649  RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3650  }
3651  }
3652 
3653  // Walk the register/memloc assignments, inserting copies/loads.
3654  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3655  ++i, ++realArgIdx) {
3656  CCValAssign &VA = ArgLocs[i];
3657  SDValue Arg = OutVals[realArgIdx];
3658  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3659 
3660  // Promote the value if needed.
3661  switch (VA.getLocInfo()) {
3662  default:
3663  llvm_unreachable("Unknown loc info!");
3664  case CCValAssign::Full:
3665  break;
3666  case CCValAssign::SExt:
3667  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3668  break;
3669  case CCValAssign::ZExt:
3670  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3671  break;
3672  case CCValAssign::AExt:
3673  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3674  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3675  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3676  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3677  }
3678  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3679  break;
3680  case CCValAssign::BCvt:
3681  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3682  break;
3683  case CCValAssign::FPExt:
3684  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3685  break;
3686  }
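 // Illustrative DAG shape for the i1 case above (schematic, not actual
 // output):
 //   t1 = truncate Arg to i1
 //   t2 = zero_extend t1 to i8          ; caller-side zext required by AAPCS
 //   t3 = any_extend t2 to VA.getLocVT()
 // Only bit 0 carries the value; bits 1-7 are guaranteed to be zero.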
3687 
3688  if (VA.isRegLoc()) {
3689  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3690  Outs[0].VT == MVT::i64) {
3691  assert(VA.getLocVT() == MVT::i64 &&
3692  "unexpected calling convention register assignment");
3693  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3694  "unexpected use of 'returned'");
3695  IsThisReturn = true;
3696  }
3697  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3698  } else {
3699  assert(VA.isMemLoc());
3700 
3701  SDValue DstAddr;
3702  MachinePointerInfo DstInfo;
3703 
3704  // FIXME: This works on big-endian for composite byvals, which are the
3705  // common case. It should also work for fundamental types.
3706  uint32_t BEAlign = 0;
3707  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3708  : VA.getValVT().getSizeInBits();
3709  OpSize = (OpSize + 7) / 8;
3710  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3711  !Flags.isInConsecutiveRegs()) {
3712  if (OpSize < 8)
3713  BEAlign = 8 - OpSize;
3714  }
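 // Worked example (illustrative, assuming an 8-byte stack slot): on a
 // big-endian target a 2-byte argument gets BEAlign == 8 - 2 == 6, so the
 // store below lands in the highest-addressed two bytes of the slot, where a
 // big-endian callee expects to find a sub-xword value.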
3715  unsigned LocMemOffset = VA.getLocMemOffset();
3716  int32_t Offset = LocMemOffset + BEAlign;
3717  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3718  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3719 
3720  if (IsTailCall) {
3721  Offset = Offset + FPDiff;
3722  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3723 
3724  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3725  DstInfo =
3726  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3727 
3728  // Make sure any stack arguments overlapping with where we're storing
3729  // are loaded before this eventual operation. Otherwise they'll be
3730  // clobbered.
3731  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3732  } else {
3733  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3734 
3735  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3736  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3737  LocMemOffset);
3738  }
3739 
3740  if (Outs[i].Flags.isByVal()) {
3741  SDValue SizeNode =
3742  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3743  SDValue Cpy = DAG.getMemcpy(
3744  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3745  /*isVol = */ false, /*AlwaysInline = */ false,
3746  /*isTailCall = */ false,
3747  DstInfo, MachinePointerInfo());
3748 
3749  MemOpChains.push_back(Cpy);
3750  } else {
3751  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3752  // promoted to a legal register type i32, we should truncate Arg back to
3753  // i1/i8/i16.
3754  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3755  VA.getValVT() == MVT::i16)
3756  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3757 
3758  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3759  MemOpChains.push_back(Store);
3760  }
3761  }
3762  }
3763 
3764  if (!MemOpChains.empty())
3765  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3766 
3767  // Build a sequence of copy-to-reg nodes chained together with token chain
3768  // and flag operands which copy the outgoing args into the appropriate regs.
3769  SDValue InFlag;
3770  for (auto &RegToPass : RegsToPass) {
3771  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3772  RegToPass.second, InFlag);
3773  InFlag = Chain.getValue(1);
3774  }
3775 
3776  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3777  // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
3778  // node so that legalize doesn't hack it.
3779  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3780  auto GV = G->getGlobal();
3781  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3782  AArch64II::MO_GOT) {
3783  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3784  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3785  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3786  assert(Subtarget->isTargetWindows() &&
3787  "Windows is the only supported COFF target");
3788  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3789  } else {
3790  const GlobalValue *GV = G->getGlobal();
3791  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3792  }
3793  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3794  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3795  Subtarget->isTargetMachO()) {
3796  const char *Sym = S->getSymbol();
3797  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3798  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3799  } else {
3800  const char *Sym = S->getSymbol();
3801  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3802  }
3803  }
3804 
3805  // We don't usually want to end the call-sequence here because we would tidy
3806  // the frame up *after* the call; however, in the ABI-changing tail-call case
3807  // we've carefully laid out the parameters so that when sp is reset they'll be
3808  // in the correct location.
3809  if (IsTailCall && !IsSibCall) {
3810  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3811  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3812  InFlag = Chain.getValue(1);
3813  }
3814 
3815  std::vector<SDValue> Ops;
3816  Ops.push_back(Chain);
3817  Ops.push_back(Callee);
3818 
3819  if (IsTailCall) {
3820  // Each tail call may have to adjust the stack by a different amount, so
3821  // this information must travel along with the operation for eventual
3822  // consumption by emitEpilogue.
3823  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3824  }
3825 
3826  // Add argument registers to the end of the list so that they are known live
3827  // into the call.
3828  for (auto &RegToPass : RegsToPass)
3829  Ops.push_back(DAG.getRegister(RegToPass.first,
3830  RegToPass.second.getValueType()));
3831 
3832  // Add a register mask operand representing the call-preserved registers.
3833  const uint32_t *Mask;
3834  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3835  if (IsThisReturn) {
3836  // For 'this' returns, use the X0-preserving mask if applicable
3837  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3838  if (!Mask) {
3839  IsThisReturn = false;
3840  Mask = TRI->getCallPreservedMask(MF, CallConv);
3841  }
3842  } else
3843  Mask = TRI->getCallPreservedMask(MF, CallConv);
3844 
3845  if (Subtarget->hasCustomCallingConv())
3846  TRI->UpdateCustomCallPreservedMask(MF, &Mask);
3847 
3848  if (TRI->isAnyArgRegReserved(MF))
3849  TRI->emitReservedArgRegCallError(MF);
3850 
3851  assert(Mask && "Missing call preserved mask for calling convention");
3852  Ops.push_back(DAG.getRegisterMask(Mask));
3853 
3854  if (InFlag.getNode())
3855  Ops.push_back(InFlag);
3856 
3857  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3858 
3859  // If we're doing a tail call, use a TC_RETURN here rather than an
3860  // actual call instruction.
3861  if (IsTailCall) {
3862  MF.getFrameInfo().setHasTailCall();
3863  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3864  }
3865 
3866  // Returns a chain and a flag for retval copy to use.
3867  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3868  InFlag = Chain.getValue(1);
3869 
3870  uint64_t CalleePopBytes =
3871  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3872 
3873  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3874  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3875  InFlag, DL);
3876  if (!Ins.empty())
3877  InFlag = Chain.getValue(1);
3878 
3879  // Handle result values, copying them out of physregs into vregs that we
3880  // return.
3881  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3882  InVals, IsThisReturn,
3883  IsThisReturn ? OutVals[0] : SDValue());
3884 }
3885 
3886 bool AArch64TargetLowering::CanLowerReturn(
3887  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3888  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3889  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3890  ? RetCC_AArch64_WebKit_JS
3891  : RetCC_AArch64_AAPCS;
3892  SmallVector<CCValAssign, 16> RVLocs;
3893  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3894  return CCInfo.CheckReturn(Outs, RetCC);
3895 }
3896 
3897 SDValue
3898 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3899  bool isVarArg,
3900  const SmallVectorImpl<ISD::OutputArg> &Outs,
3901  const SmallVectorImpl<SDValue> &OutVals,
3902  const SDLoc &DL, SelectionDAG &DAG) const {
3903  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3904  ? RetCC_AArch64_WebKit_JS
3905  : RetCC_AArch64_AAPCS;
3906  SmallVector<CCValAssign, 16> RVLocs;
3907  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3908  *DAG.getContext());
3909  CCInfo.AnalyzeReturn(Outs, RetCC);
3910 
3911  // Copy the result values into the output registers.
3912  SDValue Flag;
3913  SmallVector<SDValue, 4> RetOps(1, Chain);
3914  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3915  ++i, ++realRVLocIdx) {
3916  CCValAssign &VA = RVLocs[i];
3917  assert(VA.isRegLoc() && "Can only return in registers!");
3918  SDValue Arg = OutVals[realRVLocIdx];
3919 
3920  switch (VA.getLocInfo()) {
3921  default:
3922  llvm_unreachable("Unknown loc info!");
3923  case CCValAssign::Full:
3924  if (Outs[i].ArgVT == MVT::i1) {
3925  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3926  // value. This is strictly redundant on Darwin (which uses "zeroext
3927  // i1"), but will be optimised out before ISel.
3928  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3929  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3930  }
3931  break;
3932  case CCValAssign::BCvt:
3933  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3934  break;
3935  }
3936 
3937  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3938  Flag = Chain.getValue(1);
3939  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3940  }
3941  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3942  const MCPhysReg *I =
3943  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3944  if (I) {
3945  for (; *I; ++I) {
3946  if (AArch64::GPR64RegClass.contains(*I))
3947  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3948  else if (AArch64::FPR64RegClass.contains(*I))
3949  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3950  else
3951  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3952  }
3953  }
3954 
3955  RetOps[0] = Chain; // Update chain.
3956 
3957  // Add the flag if we have it.
3958  if (Flag.getNode())
3959  RetOps.push_back(Flag);
3960 
3961  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3962 }
3963 
3964 //===----------------------------------------------------------------------===//
3965 // Other Lowering Code
3966 //===----------------------------------------------------------------------===//
3967 
3968 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3969  SelectionDAG &DAG,
3970  unsigned Flag) const {
3971  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
3972  N->getOffset(), Flag);
3973 }
3974 
3975 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
3976  SelectionDAG &DAG,
3977  unsigned Flag) const {
3978  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
3979 }
3980 
3981 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
3982  SelectionDAG &DAG,
3983  unsigned Flag) const {
3984  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
3985  N->getOffset(), Flag);
3986 }
3987 
3988 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
3989  SelectionDAG &DAG,
3990  unsigned Flag) const {
3991  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
3992 }
3993 
3994 // (loadGOT sym)
3995 template <class NodeTy>
3996 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
3997  unsigned Flags) const {
3998  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
3999  SDLoc DL(N);
4000  EVT Ty = getPointerTy(DAG.getDataLayout());
4001  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
4002  // FIXME: Once remat is capable of dealing with instructions with register
4003  // operands, expand this into two nodes instead of using a wrapper node.
4004  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
4005 }
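 // Typical code for the GOT access modeled above (illustrative; registers and
 // relocation syntax vary by target):
 //   adrp x0, :got:sym
 //   ldr  x0, [x0, :got_lo12:sym]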
4006 
4007 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
4008 template <class NodeTy>
4009 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
4010  unsigned Flags) const {
4011  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
4012  SDLoc DL(N);
4013  EVT Ty = getPointerTy(DAG.getDataLayout());
4014  const unsigned char MO_NC = AArch64II::MO_NC;
4015  return DAG.getNode(
4016  AArch64ISD::WrapperLarge, DL, Ty,
4017  getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
4018  getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
4019  getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
4020  getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
4021 }
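 // Typical code for the large-code-model wrapper above (illustrative, ELF
 // syntax):
 //   movz x0, #:abs_g3:sym
 //   movk x0, #:abs_g2_nc:sym
 //   movk x0, #:abs_g1_nc:sym
 //   movk x0, #:abs_g0_nc:sym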
4022 
4023 // (addlow (adrp %hi(sym)) %lo(sym))
4024 template <class NodeTy>
4025 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
4026  unsigned Flags) const {
4027  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
4028  SDLoc DL(N);
4029  EVT Ty = getPointerTy(DAG.getDataLayout());
4030  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
4031  SDValue Lo = getTargetNode(N, Ty, DAG,
4032  AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
4033  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
4034  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
4035 }
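 // Typical code for the ADRP/ADDlow pair above (illustrative):
 //   adrp x0, sym            ; address of the 4KiB page containing sym
 //   add  x0, x0, :lo12:sym  ; low 12 bits of sym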
4036 
4037 // (adr sym)
4038 template <class NodeTy>
4039 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
4040  unsigned Flags) const {
4041  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
4042  SDLoc DL(N);
4043  EVT Ty = getPointerTy(DAG.getDataLayout());
4044  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
4045  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
4046 }
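 // Typical code for the tiny code model above (illustrative): a single
 // PC-relative
 //   adr x0, sym
 // which only reaches symbols within +/-1MiB of the instruction.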
4047 
4048 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
4049  SelectionDAG &DAG) const {
4050  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
4051  const GlobalValue *GV = GN->getGlobal();
4052  unsigned char OpFlags =
4053  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
4054 
4055  if (OpFlags != AArch64II::MO_NO_FLAG)
4056  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
4057  "unexpected offset in global node");
4058 
4059  // This also catches the large code model case for Darwin, and tiny code
4060  // model with got relocations.
4061  if ((OpFlags & AArch64II::MO_GOT) != 0) {
4062  return getGOT(GN, DAG, OpFlags);
4063  }
4064 
4065  SDValue Result;
4066  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
4067  Result = getAddrLarge(GN, DAG, OpFlags);
4068  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
4069  Result = getAddrTiny(GN, DAG, OpFlags);
4070  } else {
4071  Result = getAddr(GN, DAG, OpFlags);
4072  }
4073  EVT PtrVT = getPointerTy(DAG.getDataLayout());
4074  SDLoc DL(GN);
4075  if (GV->hasDLLImportStorageClass())
4076  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
4077  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
4078  return Result;
4079 }
4080 
4081 /// Convert a TLS address reference into the correct sequence of loads
4082 /// and calls to compute the variable's address (for Darwin, currently) and
4083 /// return an SDValue containing the final node.
4084 
4085 /// Darwin only has one TLS scheme which must be capable of dealing with the
4086 /// fully general situation, in the worst case. This means:
4087 /// + "extern __thread" declaration.
4088 /// + Defined in a possibly unknown dynamic library.
4089 ///
4090 /// The general system is that each __thread variable has a [3 x i64] descriptor
4091 /// which contains information used by the runtime to calculate the address. The
4092 /// only part of this descriptor that the compiler needs to know about is the first xword, which
4093 /// contains a function pointer that must be called with the address of the
4094 /// entire descriptor in "x0".
4095 ///
4096 /// Since this descriptor may be in a different unit, in general even the
4097 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
4098 /// is:
4099 /// adrp x0, _var@TLVPPAGE
4100 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
4101 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
4102 /// ; the function pointer
4103 /// blr x1 ; Uses descriptor address in x0
4104 /// ; Address of _var is now in x0.
4105 ///
4106 /// If the address of _var's descriptor *is* known to the linker, then it can
4107 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
4108 /// a slight efficiency gain.
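/// For illustration, source such as the following (hypothetical) produces the
/// sequence above when tls_var may be defined in another image:
///   extern __thread int tls_var;
///   int read_tls(void) { return tls_var; }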
4109 SDValue
4110 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,