LLVM 7.0.0svn
AArch64ISelLowering.cpp
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the AArch64TargetLowering class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64ISelLowering.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
48 #include "llvm/IR/Attributes.h"
49 #include "llvm/IR/Constants.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GlobalValue.h"
56 #include "llvm/IR/IRBuilder.h"
57 #include "llvm/IR/Instruction.h"
58 #include "llvm/IR/Instructions.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/Module.h"
61 #include "llvm/IR/OperandTraits.h"
62 #include "llvm/IR/Type.h"
63 #include "llvm/IR/Use.h"
64 #include "llvm/IR/Value.h"
65 #include "llvm/MC/MCRegisterInfo.h"
66 #include "llvm/Support/Casting.h"
67 #include "llvm/Support/CodeGen.h"
69 #include "llvm/Support/Compiler.h"
70 #include "llvm/Support/Debug.h"
72 #include "llvm/Support/KnownBits.h"
77 #include <algorithm>
78 #include <bitset>
79 #include <cassert>
80 #include <cctype>
81 #include <cstdint>
82 #include <cstdlib>
83 #include <iterator>
84 #include <limits>
85 #include <tuple>
86 #include <utility>
87 #include <vector>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "aarch64-lower"
92 
93 STATISTIC(NumTailCalls, "Number of tail calls");
94 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
95 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
96 
97 static cl::opt<bool>
98 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
99  cl::desc("Allow AArch64 SLI/SRI formation"),
100  cl::init(false));
101 
102 // FIXME: The necessary dtprel relocations don't seem to be supported
103 // well in the GNU bfd and gold linkers at the moment. Therefore, by
104 // default, for now, fall back to GeneralDynamic code generation.
106  "aarch64-elf-ldtls-generation", cl::Hidden,
107  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
108  cl::init(false));
109 
110 static cl::opt<bool>
111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
112  cl::desc("Enable AArch64 logical imm instruction "
113  "optimization"),
114  cl::init(true));
115 
116 /// Value type used for condition codes.
117 static const MVT MVT_CC = MVT::i32;
118 
120  const AArch64Subtarget &STI)
121  : TargetLowering(TM), Subtarget(&STI) {
122  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
123  // we have to make something up. Arbitrarily, choose ZeroOrOne.
125  // When comparing vectors the result sets the different elements in the
126  // vector to all-one or all-zero.
128 
129  // Set up the register classes.
130  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
131  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
132 
133  if (Subtarget->hasFPARMv8()) {
134  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
135  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
136  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
137  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
138  }
139 
140  if (Subtarget->hasNEON()) {
141  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
142  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
143  // Someone set us up the NEON.
144  addDRTypeForNEON(MVT::v2f32);
145  addDRTypeForNEON(MVT::v8i8);
146  addDRTypeForNEON(MVT::v4i16);
147  addDRTypeForNEON(MVT::v2i32);
148  addDRTypeForNEON(MVT::v1i64);
149  addDRTypeForNEON(MVT::v1f64);
150  addDRTypeForNEON(MVT::v4f16);
151 
152  addQRTypeForNEON(MVT::v4f32);
153  addQRTypeForNEON(MVT::v2f64);
154  addQRTypeForNEON(MVT::v16i8);
155  addQRTypeForNEON(MVT::v8i16);
156  addQRTypeForNEON(MVT::v4i32);
157  addQRTypeForNEON(MVT::v2i64);
158  addQRTypeForNEON(MVT::v8f16);
159  }
160 
161  // Compute derived properties from the register classes
163 
164  // Provide all sorts of operation actions
192 
196 
200 
201  // Custom lowering hooks are needed for XOR
202  // to fold it into CSINC/CSINV.
205 
206  // Virtually no operation on f128 is legal, but LLVM can't expand them when
207  // there's a valid register class, so we need custom operations in most cases.
229 
230  // Lowering for many of the conversions is actually specified by the non-f128
231  // type. The LowerXXX function will be trivial when f128 isn't involved.
246 
247  // Variable arguments.
252 
253  // Variable-sized objects.
256 
257  if (Subtarget->isTargetWindows())
259  else
261 
262  // Constant pool entries
264 
265  // BlockAddress
267 
268  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
277 
278  // AArch64 lacks both left-rotate and popcount instructions.
281  for (MVT VT : MVT::vector_valuetypes()) {
284  }
285 
286  // AArch64 doesn't have {U|S}MUL_LOHI.
289 
292 
295  for (MVT VT : MVT::vector_valuetypes()) {
298  }
305 
306  // Custom lower Add/Sub/Mul with overflow.
319 
328  if (Subtarget->hasFullFP16())
330  else
332 
364 
365  if (!Subtarget->hasFullFP16()) {
388 
389  // promote v4f16 to v4f32 when that is known to be safe.
402 
418 
439  }
440 
441  // AArch64 has implementations of a lot of rounding-like FP operations.
442  for (MVT Ty : {MVT::f32, MVT::f64}) {
453  }
454 
455  if (Subtarget->hasFullFP16()) {
466  }
467 
469 
475 
476  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
477  // This requires the Performance Monitors extension.
478  if (Subtarget->hasPerfMon())
480 
481  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
482  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
483  // Issue __sincos_stret if available.
486  } else {
489  }
490 
491  // Make floating-point constants legal for the large code model, so they don't
492  // become loads from the constant pool.
493  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
496  }
497 
498  // AArch64 does not have floating-point extending loads, i1 sign-extending
499  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
500  for (MVT VT : MVT::fp_valuetypes()) {
505  }
506  for (MVT VT : MVT::integer_valuetypes())
508 
516 
519 
520  // Indexed loads and stores are supported.
521  for (unsigned im = (unsigned)ISD::PRE_INC;
537  }
538 
539  // Trap.
541 
542  // We combine OR nodes for bitfield operations.
544 
545  // Vector add and sub nodes may conceal a high-half opportunity.
546  // Also, try to fold ADD into CSINC/CSINV..
553 
557 
559 
566  if (Subtarget->supportsAddressTopByteIgnored())
568 
570 
573 
577 
581 
583 
585 
586  EnableExtLdPromotion = true;
587 
588  // Set required alignment.
590  // Set preferred alignments.
593 
594  // Only change the limit for entries in a jump table if specified by
595  // the subtarget, but not at the command line.
596  unsigned MaxJT = STI.getMaximumJumpTableSize();
597  if (MaxJT && getMaximumJumpTableSize() == 0)
599 
600  setHasExtractBitsInsn(true);
601 
603 
604  if (Subtarget->hasNEON()) {
605  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
606  // silliness like this:
632 
638 
640 
641  // AArch64 doesn't have a direct vector ->f32 conversion instructions for
642  // elements smaller than i32, so promote the input to i32 first.
647  // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
648  // -> v8f16 conversions.
653  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
658  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
659  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
662 
665 
674 
675  // AArch64 doesn't have MUL.2d:
677  // Custom handling for some quad-vector types to detect MULL.
681 
682  // Vector reductions
683  for (MVT VT : MVT::integer_valuetypes()) {
689  }
690  for (MVT VT : MVT::fp_valuetypes()) {
693  }
694 
697  // Likewise, narrowing and extending vector loads/stores aren't handled
698  // directly.
699  for (MVT VT : MVT::vector_valuetypes()) {
701 
706 
708 
709  for (MVT InnerVT : MVT::vector_valuetypes()) {
710  setTruncStoreAction(VT, InnerVT, Expand);
711  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
712  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
713  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
714  }
715  }
716 
717  // AArch64 has implementations of a lot of rounding-like FP operations.
718  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
725  }
726  }
727 
729 }
730 
731 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
732  assert(VT.isVector() && "VT should be a vector type");
733 
734  if (VT.isFloatingPoint()) {
736  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
737  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
738  }
739 
740  // Mark vector float intrinsics as expand.
741  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
750 
751  // But we do support custom-lowering for FCOPYSIGN.
753  }
754 
767 
771  for (MVT InnerVT : MVT::all_valuetypes())
772  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
773 
774  // CNT supports only B element sizes.
775  if (VT != MVT::v8i8 && VT != MVT::v16i8)
777 
783 
786 
787  if (!VT.isFloatingPoint())
789 
790  // [SU][MIN|MAX] are available for all NEON types apart from i64.
791  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
792  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
793  setOperationAction(Opcode, VT, Legal);
794 
795  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
796  if (VT.isFloatingPoint() &&
797  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
798  for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
800  setOperationAction(Opcode, VT, Legal);
801 
802  if (Subtarget->isLittleEndian()) {
803  for (unsigned im = (unsigned)ISD::PRE_INC;
807  }
808  }
809 }
810 
811 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
812  addRegisterClass(VT, &AArch64::FPR64RegClass);
813  addTypeForNEON(VT, MVT::v2i32);
814 }
815 
816 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
817  addRegisterClass(VT, &AArch64::FPR128RegClass);
818  addTypeForNEON(VT, MVT::v4i32);
819 }
820 
822  EVT VT) const {
823  if (!VT.isVector())
824  return MVT::i32;
826 }
827 
828 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
829  const APInt &Demanded,
831  unsigned NewOpc) {
832  uint64_t OldImm = Imm, NewImm, Enc;
833  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
834 
835  // Return if the immediate is already all zeros, all ones, a bimm32 or a
836  // bimm64.
837  if (Imm == 0 || Imm == Mask ||
838  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
839  return false;
840 
841  unsigned EltSize = Size;
842  uint64_t DemandedBits = Demanded.getZExtValue();
843 
844  // Clear bits that are not demanded.
845  Imm &= DemandedBits;
846 
847  while (true) {
848  // The goal here is to set the non-demanded bits in a way that minimizes
849  // the number of transitions between 0 and 1. In order to achieve this goal,
850  // we set the non-demanded bits to the value of the preceding demanded bits.
851  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
852  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
853  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
854  // The final result is 0b11000011.
855  uint64_t NonDemandedBits = ~DemandedBits;
856  uint64_t InvertedImm = ~Imm & DemandedBits;
857  uint64_t RotatedImm =
858  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
859  NonDemandedBits;
860  uint64_t Sum = RotatedImm + NonDemandedBits;
861  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
862  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
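  // Note: adding RotatedImm to NonDemandedBits ripples a carry through each
  // run of non-demanded bits that sits just above a demanded 0, clearing that
  // run in Sum, while runs above a demanded 1 stay set. Ones therefore copies
  // the preceding demanded bit into every non-demanded position; the Carry
  // term re-injects at bit 0 a carry lost past the top bit, handling runs
  // that wrap around the element boundary.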
863  NewImm = (Imm | Ones) & Mask;
864 
865  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
866  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
867  // we halve the element size and continue the search.
868  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
869  break;
870 
871  // We cannot shrink the element size any further if it is 2-bits.
872  if (EltSize == 2)
873  return false;
874 
875  EltSize /= 2;
876  Mask >>= EltSize;
877  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
878 
879  // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
880  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
881  return false;
882 
883  // Merge the upper and lower halves of Imm and DemandedBits.
884  Imm |= Hi;
885  DemandedBits |= DemandedBitsHi;
886  }
887 
888  ++NumOptimizedImms;
889 
890  // Replicate the element across the register width.
891  while (EltSize < Size) {
892  NewImm |= NewImm << EltSize;
893  EltSize *= 2;
894  }
895 
896  (void)OldImm;
897  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
898  "demanded bits should never be altered");
899  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
900 
901  // Create the new constant immediate node.
902  EVT VT = Op.getValueType();
903  SDLoc DL(Op);
904  SDValue New;
905 
906  // If the new constant immediate is all-zeros or all-ones, let the target
907  // independent DAG combine optimize this node.
908  if (NewImm == 0 || NewImm == OrigMask) {
909  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
910  TLO.DAG.getConstant(NewImm, DL, VT));
911  // Otherwise, create a machine node so that target independent DAG combine
912  // doesn't undo this optimization.
913  } else {
914  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
915  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
916  New = SDValue(
917  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
918  }
919 
920  return TLO.CombineTo(Op, New);
921 }
922 
924  SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
925  // Delay this optimization as late as possible.
926  if (!TLO.LegalOps)
927  return false;
928 
930  return false;
931 
932  EVT VT = Op.getValueType();
933  if (VT.isVector())
934  return false;
935 
936  unsigned Size = VT.getSizeInBits();
937  assert((Size == 32 || Size == 64) &&
938  "i32 or i64 is expected after legalization.");
939 
940  // Exit early if we demand all bits.
941  if (Demanded.countPopulation() == Size)
942  return false;
943 
944  unsigned NewOpc;
945  switch (Op.getOpcode()) {
946  default:
947  return false;
948  case ISD::AND:
949  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
950  break;
951  case ISD::OR:
952  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
953  break;
954  case ISD::XOR:
955  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
956  break;
957  }
959  if (!C)
960  return false;
961  uint64_t Imm = C->getZExtValue();
962  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
963 }
964 
965 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
966 /// Mask are known to be either zero or one and return them in Known.
968  const SDValue Op, KnownBits &Known,
969  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
970  switch (Op.getOpcode()) {
971  default:
972  break;
973  case AArch64ISD::CSEL: {
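  // A CSEL selects one of its two operands, so a bit is known zero (or one)
  // only if it is known to have that value in both operands.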
974  KnownBits Known2;
975  DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
976  DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
977  Known.Zero &= Known2.Zero;
978  Known.One &= Known2.One;
979  break;
980  }
981  case ISD::INTRINSIC_W_CHAIN: {
982  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
983  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
984  switch (IntID) {
985  default: return;
986  case Intrinsic::aarch64_ldaxr:
987  case Intrinsic::aarch64_ldxr: {
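  // LDXR/LDAXR zero-extend the loaded value to the register width, so all
  // bits above the memory width are known to be zero.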
988  unsigned BitWidth = Known.getBitWidth();
989  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
990  unsigned MemBits = VT.getScalarSizeInBits();
991  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
992  return;
993  }
994  }
995  break;
996  }
998  case ISD::INTRINSIC_VOID: {
999  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1000  switch (IntNo) {
1001  default:
1002  break;
1003  case Intrinsic::aarch64_neon_umaxv:
1004  case Intrinsic::aarch64_neon_uminv: {
1005  // Figure out the datatype of the vector operand. The UMINV instruction
1006  // will zero extend the result, so we can mark as known zero all the
1007  // bits larger than the element datatype. 32-bit or larger doesn't need
1008  // this as those are legal types and will be handled by isel directly.
1009  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1010  unsigned BitWidth = Known.getBitWidth();
1011  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1012  assert(BitWidth >= 8 && "Unexpected width!");
1013  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1014  Known.Zero |= Mask;
1015  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1016  assert(BitWidth >= 16 && "Unexpected width!");
1017  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1018  Known.Zero |= Mask;
1019  }
1020  break;
1021  }
1022  }
1023  }
1024  }
1025 }
1026 
1028  EVT) const {
1029  return MVT::i64;
1030 }
1031 
1033  unsigned AddrSpace,
1034  unsigned Align,
1035  bool *Fast) const {
1036  if (Subtarget->requiresStrictAlign())
1037  return false;
1038 
1039  if (Fast) {
1040  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1041  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1042  // See comments in performSTORECombine() for more details about
1043  // these conditions.
1044 
1045  // Code that uses clang vector extensions can mark that it
1046  // wants unaligned accesses to be treated as fast by
1047  // underspecifying alignment to be 1 or 2.
1048  Align <= 2 ||
1049 
1050  // Disregard v2i64. Memcpy lowering produces those and splitting
1051  // them regresses performance on micro-benchmarks and olden/bh.
1052  VT == MVT::v2i64;
1053  }
1054  return true;
1055 }
1056 
1057 FastISel *
1059  const TargetLibraryInfo *libInfo) const {
1060  return AArch64::createFastISel(funcInfo, libInfo);
1061 }
1062 
1063 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1064  switch ((AArch64ISD::NodeType)Opcode) {
1065  case AArch64ISD::FIRST_NUMBER: break;
1066  case AArch64ISD::CALL: return "AArch64ISD::CALL";
1067  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
1068  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
1069  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
1070  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
1071  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
1072  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
1073  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
1074  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
1075  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
1076  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
1077  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
1078  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
1079  case AArch64ISD::ADC: return "AArch64ISD::ADC";
1080  case AArch64ISD::SBC: return "AArch64ISD::SBC";
1081  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
1082  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
1083  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
1084  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
1085  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
1086  case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
1087  case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
1088  case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
1089  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
1090  case AArch64ISD::DUP: return "AArch64ISD::DUP";
1091  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
1092  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
1093  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
1094  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
1095  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
1096  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
1097  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
1098  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
1099  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
1100  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
1101  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
1102  case AArch64ISD::BICi: return "AArch64ISD::BICi";
1103  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
1104  case AArch64ISD::BSL: return "AArch64ISD::BSL";
1105  case AArch64ISD::NEG: return "AArch64ISD::NEG";
1106  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
1107  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
1108  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
1109  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
1110  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
1111  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
1112  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
1113  case AArch64ISD::REV16: return "AArch64ISD::REV16";
1114  case AArch64ISD::REV32: return "AArch64ISD::REV32";
1115  case AArch64ISD::REV64: return "AArch64ISD::REV64";
1116  case AArch64ISD::EXT: return "AArch64ISD::EXT";
1117  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
1118  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
1119  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
1120  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
1121  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
1122  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
1123  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
1124  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
1125  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
1126  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
1127  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
1128  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
1129  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
1130  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
1131  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
1132  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
1133  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
1134  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
1135  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
1136  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
1137  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
1138  case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
1139  case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
1140  case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
1141  case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
1142  case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
1143  case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
1144  case AArch64ISD::NOT: return "AArch64ISD::NOT";
1145  case AArch64ISD::BIT: return "AArch64ISD::BIT";
1146  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
1147  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
1148  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
1149  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
1150  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
1151  case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
1152  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
1153  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
1154  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
1155  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
1156  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
1157  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
1158  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
1159  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
1160  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
1161  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
1162  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
1163  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
1164  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
1165  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
1166  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
1167  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
1168  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
1169  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
1170  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
1171  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
1172  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
1173  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
1174  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
1175  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
1176  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
1177  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
1178  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
1179  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
1180  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
1181  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
1182  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
1183  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
1184  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
1185  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
1186  case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
1187  case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
1188  case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
1189  case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
1190  }
1191  return nullptr;
1192 }
1193 
1196  MachineBasicBlock *MBB) const {
1197  // We materialise the F128CSEL pseudo-instruction as some control flow and a
1198  // phi node:
1199 
1200  // OrigBB:
1201  // [... previous instrs leading to comparison ...]
1202  // b.ne TrueBB
1203  // b EndBB
1204  // TrueBB:
1205  // ; Fallthrough
1206  // EndBB:
1207  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1208 
1209  MachineFunction *MF = MBB->getParent();
1210  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1211  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1212  DebugLoc DL = MI.getDebugLoc();
1213  MachineFunction::iterator It = ++MBB->getIterator();
1214 
1215  unsigned DestReg = MI.getOperand(0).getReg();
1216  unsigned IfTrueReg = MI.getOperand(1).getReg();
1217  unsigned IfFalseReg = MI.getOperand(2).getReg();
1218  unsigned CondCode = MI.getOperand(3).getImm();
1219  bool NZCVKilled = MI.getOperand(4).isKill();
1220 
1221  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1222  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1223  MF->insert(It, TrueBB);
1224  MF->insert(It, EndBB);
1225 
1226  // Transfer the rest of the current basic block to EndBB.
1227  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1228  MBB->end());
1229  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1230 
1231  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1232  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1233  MBB->addSuccessor(TrueBB);
1234  MBB->addSuccessor(EndBB);
1235 
1236  // TrueBB falls through to the end.
1237  TrueBB->addSuccessor(EndBB);
1238 
1239  if (!NZCVKilled) {
1240  TrueBB->addLiveIn(AArch64::NZCV);
1241  EndBB->addLiveIn(AArch64::NZCV);
1242  }
1243 
1244  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1245  .addReg(IfTrueReg)
1246  .addMBB(TrueBB)
1247  .addReg(IfFalseReg)
1248  .addMBB(MBB);
1249 
1250  MI.eraseFromParent();
1251  return EndBB;
1252 }
1253 
1255  MachineInstr &MI, MachineBasicBlock *BB) const {
1256  switch (MI.getOpcode()) {
1257  default:
1258 #ifndef NDEBUG
1259  MI.dump();
1260 #endif
1261  llvm_unreachable("Unexpected instruction for custom inserter!");
1262 
1263  case AArch64::F128CSEL:
1264  return EmitF128CSEL(MI, BB);
1265 
1266  case TargetOpcode::STACKMAP:
1267  case TargetOpcode::PATCHPOINT:
1268  return emitPatchPoint(MI, BB);
1269  }
1270 }
1271 
1272 //===----------------------------------------------------------------------===//
1273 // AArch64 Lowering private implementation.
1274 //===----------------------------------------------------------------------===//
1275 
1276 //===----------------------------------------------------------------------===//
1277 // Lowering Code
1278 //===----------------------------------------------------------------------===//
1279 
1280 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1281 /// CC
1283  switch (CC) {
1284  default:
1285  llvm_unreachable("Unknown condition code!");
1286  case ISD::SETNE:
1287  return AArch64CC::NE;
1288  case ISD::SETEQ:
1289  return AArch64CC::EQ;
1290  case ISD::SETGT:
1291  return AArch64CC::GT;
1292  case ISD::SETGE:
1293  return AArch64CC::GE;
1294  case ISD::SETLT:
1295  return AArch64CC::LT;
1296  case ISD::SETLE:
1297  return AArch64CC::LE;
1298  case ISD::SETUGT:
1299  return AArch64CC::HI;
1300  case ISD::SETUGE:
1301  return AArch64CC::HS;
1302  case ISD::SETULT:
1303  return AArch64CC::LO;
1304  case ISD::SETULE:
1305  return AArch64CC::LS;
1306  }
1307 }
1308 
1309 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1312  AArch64CC::CondCode &CondCode2) {
1313  CondCode2 = AArch64CC::AL;
1314  switch (CC) {
1315  default:
1316  llvm_unreachable("Unknown FP condition!");
1317  case ISD::SETEQ:
1318  case ISD::SETOEQ:
1319  CondCode = AArch64CC::EQ;
1320  break;
1321  case ISD::SETGT:
1322  case ISD::SETOGT:
1323  CondCode = AArch64CC::GT;
1324  break;
1325  case ISD::SETGE:
1326  case ISD::SETOGE:
1327  CondCode = AArch64CC::GE;
1328  break;
1329  case ISD::SETOLT:
1330  CondCode = AArch64CC::MI;
1331  break;
1332  case ISD::SETOLE:
1333  CondCode = AArch64CC::LS;
1334  break;
1335  case ISD::SETONE:
1336  CondCode = AArch64CC::MI;
1337  CondCode2 = AArch64CC::GT;
1338  break;
1339  case ISD::SETO:
1340  CondCode = AArch64CC::VC;
1341  break;
1342  case ISD::SETUO:
1343  CondCode = AArch64CC::VS;
1344  break;
1345  case ISD::SETUEQ:
1346  CondCode = AArch64CC::EQ;
1347  CondCode2 = AArch64CC::VS;
1348  break;
1349  case ISD::SETUGT:
1350  CondCode = AArch64CC::HI;
1351  break;
1352  case ISD::SETUGE:
1353  CondCode = AArch64CC::PL;
1354  break;
1355  case ISD::SETLT:
1356  case ISD::SETULT:
1357  CondCode = AArch64CC::LT;
1358  break;
1359  case ISD::SETLE:
1360  case ISD::SETULE:
1361  CondCode = AArch64CC::LE;
1362  break;
1363  case ISD::SETNE:
1364  case ISD::SETUNE:
1365  CondCode = AArch64CC::NE;
1366  break;
1367  }
1368 }
1369 
1370 /// Convert a DAG fp condition code to an AArch64 CC.
1371 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1372 /// should be AND'ed instead of OR'ed.
1375  AArch64CC::CondCode &CondCode2) {
1376  CondCode2 = AArch64CC::AL;
1377  switch (CC) {
1378  default:
1379  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1380  assert(CondCode2 == AArch64CC::AL);
1381  break;
1382  case ISD::SETONE:
1383  // (a one b)
1384  // == ((a olt b) || (a ogt b))
1385  // == ((a ord b) && (a une b))
1386  CondCode = AArch64CC::VC;
1387  CondCode2 = AArch64CC::NE;
1388  break;
1389  case ISD::SETUEQ:
1390  // (a ueq b)
1391  // == ((a uno b) || (a oeq b))
1392  // == ((a ule b) && (a uge b))
1393  CondCode = AArch64CC::PL;
1394  CondCode2 = AArch64CC::LE;
1395  break;
1396  }
1397 }
1398 
1399 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1400 /// CC usable with the vector instructions. Fewer operations are available
1401 /// without a real NZCV register, so we have to use less efficient combinations
1402 /// to get the same effect.
1405  AArch64CC::CondCode &CondCode2,
1406  bool &Invert) {
1407  Invert = false;
1408  switch (CC) {
1409  default:
1410  // Mostly the scalar mappings work fine.
1411  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1412  break;
1413  case ISD::SETUO:
1414  Invert = true;
1416  case ISD::SETO:
1417  CondCode = AArch64CC::MI;
1418  CondCode2 = AArch64CC::GE;
1419  break;
1420  case ISD::SETUEQ:
1421  case ISD::SETULT:
1422  case ISD::SETULE:
1423  case ISD::SETUGT:
1424  case ISD::SETUGE:
1425  // All of the compare-mask comparisons are ordered, but we can switch
1426  // between the two by a double inversion. E.g. ULE == !OGT.
1427  Invert = true;
1428  changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1429  break;
1430  }
1431 }
1432 
1433 static bool isLegalArithImmed(uint64_t C) {
1434  // Matches AArch64DAGToDAGISel::SelectArithImmed().
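  // An arithmetic immediate is a 12-bit value, optionally shifted left by 12:
  // e.g. 0xFFF and 0x7FF000 are legal here, while 0x1001 is not.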
1435  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1436  DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1437  return IsLegal;
1438 }
1439 
1441  const SDLoc &dl, SelectionDAG &DAG) {
1442  EVT VT = LHS.getValueType();
1443  const bool FullFP16 =
1444  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1445 
1446  if (VT.isFloatingPoint()) {
1447  assert(VT != MVT::f128);
1448  if (VT == MVT::f16 && !FullFP16) {
1449  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1450  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1451  VT = MVT::f32;
1452  }
1453  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1454  }
1455 
1456  // The CMP instruction is just an alias for SUBS, and representing it as
1457  // SUBS means that it's possible to get CSE with subtract operations.
1458  // A later phase can perform the optimization of setting the destination
1459  // register to WZR/XZR if it ends up being unused.
1460  unsigned Opcode = AArch64ISD::SUBS;
1461 
1462  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
1463  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1464  // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
1465  // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
1466  // can be set differently by this operation. It comes down to whether
1467  // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1468  // everything is fine. If not then the optimization is wrong. Thus general
1469  // comparisons are only valid if op2 != 0.
1470 
1471  // So, finally, the only LLVM-native comparisons that don't mention C and V
1472  // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1473  // the absence of information about op2.
1474  Opcode = AArch64ISD::ADDS;
1475  RHS = RHS.getOperand(1);
1476  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1477  !isUnsignedIntSetCC(CC)) {
1478  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1479  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1480  // of the signed comparisons.
1481  Opcode = AArch64ISD::ANDS;
1482  RHS = LHS.getOperand(1);
1483  LHS = LHS.getOperand(0);
1484  }
1485 
1486  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1487  .getValue(1);
1488 }
1489 
1490 /// \defgroup AArch64CCMP CMP;CCMP matching
1491 ///
1492 /// These functions deal with the formation of CMP;CCMP;... sequences.
1493 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1494 /// a comparison. They set the NZCV flags to a predefined value if their
1495 /// predicate is false. This makes it possible to express arbitrary conjunctions,
1496 /// for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
1497 /// expressed as:
1498 /// cmp A
1499 /// ccmp B, inv(CB), CA
1500 /// check for CB flags
1501 ///
1502 /// In general we can create code for arbitrary "... (and (and A B) C)"
1503 /// sequences. We can also implement some "or" expressions, because "(or A B)"
1504 /// is equivalent to "not (and (not A) (not B))" and we can implement some
1505 /// negation operations:
1506 /// We can negate the results of a single comparison by inverting the flags
1507 /// used when the predicate fails and inverting the flags tested in the next
1508 /// instruction; We can also negate the results of the whole previous
1509 /// conditional compare sequence by inverting the flags tested in the next
1510 /// instruction. However there is no way to negate the result of a partial
1511 /// sequence.
1512 ///
1513 /// Therefore on encountering an "or" expression we can negate the subtree on
1514 /// one side and have to be able to push the negate to the leaves of the subtree
1515 /// on the other side (see also the comments in code). As complete example:
1516 /// "or (or (setCA (cmp A)) (setCB (cmp B)))
1517 /// (and (setCC (cmp C)) (setCD (cmp D)))"
1518 /// is transformed to
1519 /// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
1520 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
1521 /// and implemented as:
1522 /// cmp C
1523 /// ccmp D, inv(CD), CC
1524 /// ccmp A, CA, inv(CD)
1525 /// ccmp B, CB, inv(CA)
1526 /// check for CB flags
1527 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented
1528 /// by conditional compare sequences.
1529 /// @{
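///
/// As a concrete illustration (exact instruction selection aside), a source
/// condition such as "a == 0 && b > 5" can be checked with one such chain:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq
///   b.gt taken
/// where the ccmp compares b with 5 only if the first compare set EQ, and
/// otherwise writes NZCV = #4 (Z set) so that the final GT test fails.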
1530 
1531 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1533  ISD::CondCode CC, SDValue CCOp,
1535  AArch64CC::CondCode OutCC,
1536  const SDLoc &DL, SelectionDAG &DAG) {
1537  unsigned Opcode = 0;
1538  const bool FullFP16 =
1539  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1540 
1541  if (LHS.getValueType().isFloatingPoint()) {
1542  assert(LHS.getValueType() != MVT::f128);
1543  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1544  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1545  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1546  }
1547  Opcode = AArch64ISD::FCCMP;
1548  } else if (RHS.getOpcode() == ISD::SUB) {
1549  SDValue SubOp0 = RHS.getOperand(0);
1550  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1551  // See emitComparison() on why we can only do this for SETEQ and SETNE.
1552  Opcode = AArch64ISD::CCMN;
1553  RHS = RHS.getOperand(1);
1554  }
1555  }
1556  if (Opcode == 0)
1557  Opcode = AArch64ISD::CCMP;
1558 
1559  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1561  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1562  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1563  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1564 }
1565 
1566 /// Returns true if @p Val is a tree of AND/OR/SETCC operations.
1567 /// CanNegate is set to true if we can push a negate operation through
1568 /// the tree in a way that we are left with AND operations and negate operations
1569 /// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
1570 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
1571 /// brought into such a form.
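/// For example, (or (setcc A) (setcc B)) is such a tree and can be negated as
/// a whole, whereas (and (setcc A) (or (setcc B) (setcc C))) is also a valid
/// tree but cannot (CanNegate is set to false).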
1572 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
1573  unsigned Depth = 0) {
1574  if (!Val.hasOneUse())
1575  return false;
1576  unsigned Opcode = Val->getOpcode();
1577  if (Opcode == ISD::SETCC) {
1578  if (Val->getOperand(0).getValueType() == MVT::f128)
1579  return false;
1580  CanNegate = true;
1581  return true;
1582  }
1583  // Protect against exponential runtime and stack overflow.
1584  if (Depth > 6)
1585  return false;
1586  if (Opcode == ISD::AND || Opcode == ISD::OR) {
1587  SDValue O0 = Val->getOperand(0);
1588  SDValue O1 = Val->getOperand(1);
1589  bool CanNegateL;
1590  if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
1591  return false;
1592  bool CanNegateR;
1593  if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
1594  return false;
1595 
1596  if (Opcode == ISD::OR) {
1597  // For an OR expression we need to be able to negate at least one side or
1598  // we cannot do the transformation at all.
1599  if (!CanNegateL && !CanNegateR)
1600  return false;
1601  // We can however change a (not (or x y)) to (and (not x) (not y)) if we
1602  // can negate the x and y subtrees.
1603  CanNegate = CanNegateL && CanNegateR;
1604  } else {
1605  // If the operands are OR expressions then we finally need to negate their
1606  // outputs; we can only do that for the operand emitted last, by
1607  // negating OutCC, not for both operands.
1608  bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
1609  bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
1610  if (NeedsNegOutL && NeedsNegOutR)
1611  return false;
1612  // We cannot negate an AND operation (it would become an OR),
1613  CanNegate = false;
1614  }
1615  return true;
1616  }
1617  return false;
1618 }
1619 
1620 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1621 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1622 /// Tries to transform the given i1-producing node @p Val into a series of
1623 /// compare and conditional compare operations. @returns an NZCV-flags-producing
1624 /// node and sets @p OutCC to the flags that should be tested, or returns
1625 /// SDValue() if the transformation was not possible.
1626 /// On recursive invocations @p PushNegate may be set to true to have negation
1627 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
1628 /// for the comparisons in the current subtree; @p Depth limits the search
1629 /// depth to avoid stack overflow.
1631  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1633  // We're at a tree leaf, produce a conditional comparison operation.
1634  unsigned Opcode = Val->getOpcode();
1635  if (Opcode == ISD::SETCC) {
1636  SDValue LHS = Val->getOperand(0);
1637  SDValue RHS = Val->getOperand(1);
1638  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1639  bool isInteger = LHS.getValueType().isInteger();
1640  if (Negate)
1641  CC = getSetCCInverse(CC, isInteger);
1642  SDLoc DL(Val);
1643  // Determine OutCC and handle FP special case.
1644  if (isInteger) {
1645  OutCC = changeIntCCToAArch64CC(CC);
1646  } else {
1648  AArch64CC::CondCode ExtraCC;
1649  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1650  // Some floating point conditions can't be tested with a single condition
1651  // code. Construct an additional comparison in this case.
1652  if (ExtraCC != AArch64CC::AL) {
1653  SDValue ExtraCmp;
1654  if (!CCOp.getNode())
1655  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1656  else
1657  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1658  ExtraCC, DL, DAG);
1659  CCOp = ExtraCmp;
1660  Predicate = ExtraCC;
1661  }
1662  }
1663 
1664  // Produce a normal comparison if we are first in the chain
1665  if (!CCOp)
1666  return emitComparison(LHS, RHS, CC, DL, DAG);
1667  // Otherwise produce a ccmp.
1668  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1669  DAG);
1670  }
1671  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
1672  "Valid conjunction/disjunction tree");
1673 
1674  // Check if both sides can be transformed.
1675  SDValue LHS = Val->getOperand(0);
1676  SDValue RHS = Val->getOperand(1);
1677 
1678  // In case of an OR we need to negate our operands and the result.
1679  // (A v B) <=> not(not(A) ^ not(B))
1680  bool NegateOpsAndResult = Opcode == ISD::OR;
1681  // We can negate the results of all previous operations by inverting the
1682  // predicate flags giving us a free negation for one side. The other side
1683  // must be negatable by itself.
1684  if (NegateOpsAndResult) {
1685  // See which side we can negate.
1686  bool CanNegateL;
1687  bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
1688  assert(isValidL && "Valid conjunction/disjunction tree");
1689  (void)isValidL;
1690 
1691 #ifndef NDEBUG
1692  bool CanNegateR;
1693  bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
1694  assert(isValidR && "Valid conjunction/disjunction tree");
1695  assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
1696 #endif
1697 
1698  // Order the side which we cannot negate to RHS so we can emit it first.
1699  if (!CanNegateL)
1700  std::swap(LHS, RHS);
1701  } else {
1702  bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
1703  assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
1704  "Valid conjunction/disjunction tree");
1705  // Order the side where we need to negate the output flags to RHS so it
1706  // gets emitted first.
1707  if (NeedsNegOutL)
1708  std::swap(LHS, RHS);
1709  }
1710 
1711  // Emit RHS. If we want to negate the tree we only need to push a negate
1712  // through if we are already in a PushNegate case, otherwise we can negate
1713  // the "flags to test" afterwards.
1714  AArch64CC::CondCode RHSCC;
1715  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
1716  CCOp, Predicate);
1717  if (NegateOpsAndResult && !Negate)
1718  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1719  // Emit LHS. We may need to negate it.
1720  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
1721  NegateOpsAndResult, CmpR,
1722  RHSCC);
1723  // If we transformed an OR into an AND then we have to negate the result
1724  // (or absorb the Negate parameter).
1725  if (NegateOpsAndResult && !Negate)
1726  OutCC = AArch64CC::getInvertedCondCode(OutCC);
1727  return CmpL;
1728 }
1729 
1730 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1731 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1732 /// \see emitConjunctionDisjunctionTreeRec().
1734  AArch64CC::CondCode &OutCC) {
1735  bool CanNegate;
1736  if (!isConjunctionDisjunctionTree(Val, CanNegate))
1737  return SDValue();
1738 
1739  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
1740  AArch64CC::AL);
1741 }
1742 
1743 /// @}
1744 
1746  SDValue &AArch64cc, SelectionDAG &DAG,
1747  const SDLoc &dl) {
1748  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1749  EVT VT = RHS.getValueType();
1750  uint64_t C = RHSC->getZExtValue();
1751  if (!isLegalArithImmed(C)) {
1752  // Constant does not fit, try adjusting it by one?
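  // For example, "x < 0x1001" cannot encode 0x1001, but is equivalent to
  // "x <= 0x1000", whose immediate fits.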
1753  switch (CC) {
1754  default:
1755  break;
1756  case ISD::SETLT:
1757  case ISD::SETGE:
1758  if ((VT == MVT::i32 && C != 0x80000000 &&
1759  isLegalArithImmed((uint32_t)(C - 1))) ||
1760  (VT == MVT::i64 && C != 0x80000000ULL &&
1761  isLegalArithImmed(C - 1ULL))) {
1762  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1763  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1764  RHS = DAG.getConstant(C, dl, VT);
1765  }
1766  break;
1767  case ISD::SETULT:
1768  case ISD::SETUGE:
1769  if ((VT == MVT::i32 && C != 0 &&
1770  isLegalArithImmed((uint32_t)(C - 1))) ||
1771  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1772  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1773  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1774  RHS = DAG.getConstant(C, dl, VT);
1775  }
1776  break;
1777  case ISD::SETLE:
1778  case ISD::SETGT:
1779  if ((VT == MVT::i32 && C != INT32_MAX &&
1780  isLegalArithImmed((uint32_t)(C + 1))) ||
1781  (VT == MVT::i64 && C != INT64_MAX &&
1782  isLegalArithImmed(C + 1ULL))) {
1783  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1784  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1785  RHS = DAG.getConstant(C, dl, VT);
1786  }
1787  break;
1788  case ISD::SETULE:
1789  case ISD::SETUGT:
1790  if ((VT == MVT::i32 && C != UINT32_MAX &&
1791  isLegalArithImmed((uint32_t)(C + 1))) ||
1792  (VT == MVT::i64 && C != UINT64_MAX &&
1793  isLegalArithImmed(C + 1ULL))) {
1794  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1795  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1796  RHS = DAG.getConstant(C, dl, VT);
1797  }
1798  break;
1799  }
1800  }
1801  }
1802  SDValue Cmp;
1803  AArch64CC::CondCode AArch64CC;
1804  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1805  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1806 
1807  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1808  // For the i8 operand, the largest immediate is 255, so this can be easily
1809  // encoded in the compare instruction. For the i16 operand, however, the
1810  // largest immediate cannot be encoded in the compare.
1811  // Therefore, use a sign extending load and cmn to avoid materializing the
1812  // -1 constant. For example,
1813  // movz w1, #65535
1814  // ldrh w0, [x0, #0]
1815  // cmp w0, w1
1816  // >
1817  // ldrsh w0, [x0, #0]
1818  // cmn w0, #1
1819  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
1820  // if and only if (sext LHS) == (sext RHS). The checks are in place to
1821  // ensure both the LHS and RHS are truly zero extended and to make sure the
1822  // transformation is profitable.
1823  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1824  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1825  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1826  LHS.getNode()->hasNUsesOfValue(1, 0)) {
1827  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1828  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1829  SDValue SExt =
1830  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1831  DAG.getValueType(MVT::i16));
1832  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1833  RHS.getValueType()),
1834  CC, dl, DAG);
1835  AArch64CC = changeIntCCToAArch64CC(CC);
1836  }
1837  }
1838 
1839  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
1840  if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
1841  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
1842  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
1843  }
1844  }
1845  }
1846 
1847  if (!Cmp) {
1848  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1849  AArch64CC = changeIntCCToAArch64CC(CC);
1850  }
1851  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
1852  return Cmp;
1853 }
1854 
1855 static std::pair<SDValue, SDValue>
1857  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1858  "Unsupported value type");
1859  SDValue Value, Overflow;
1860  SDLoc DL(Op);
1861  SDValue LHS = Op.getOperand(0);
1862  SDValue RHS = Op.getOperand(1);
1863  unsigned Opc = 0;
1864  switch (Op.getOpcode()) {
1865  default:
1866  llvm_unreachable("Unknown overflow instruction!");
1867  case ISD::SADDO:
1868  Opc = AArch64ISD::ADDS;
1869  CC = AArch64CC::VS;
1870  break;
1871  case ISD::UADDO:
1872  Opc = AArch64ISD::ADDS;
1873  CC = AArch64CC::HS;
1874  break;
1875  case ISD::SSUBO:
1876  Opc = AArch64ISD::SUBS;
1877  CC = AArch64CC::VS;
1878  break;
1879  case ISD::USUBO:
1880  Opc = AArch64ISD::SUBS;
1881  CC = AArch64CC::LO;
1882  break;
1883  // Multiply needs a little bit of extra work.
1884  case ISD::SMULO:
1885  case ISD::UMULO: {
1886  CC = AArch64CC::NE;
1887  bool IsSigned = Op.getOpcode() == ISD::SMULO;
1888  if (Op.getValueType() == MVT::i32) {
1889  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1890  // For a 32 bit multiply with overflow check we want the instruction
1891  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1892  // need to generate the following pattern:
1893  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
1894  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1895  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1896  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1897  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1898  DAG.getConstant(0, DL, MVT::i64));
1899  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1900  // operation. We need to clear out the upper 32 bits, because we used a
1901  // widening multiply that wrote all 64 bits. In the end this should be a
1902  // noop.
1903  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1904  if (IsSigned) {
1905  // The signed overflow check requires more than just a simple check for
1906  // any bit set in the upper 32 bits of the result. These bits could be
1907  // just the sign bits of a negative number. To perform the overflow
1908  // check we arithmetically shift the lower 32 bits of the result right by
1909  // 31 bits (replicating the sign bit) and compare them with the upper 32 bits.
1910  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1911  DAG.getConstant(32, DL, MVT::i64));
1912  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1913  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1914  DAG.getConstant(31, DL, MVT::i64));
1915  // It is important that LowerBits is last, otherwise the arithmetic
1916  // shift will not be folded into the compare (SUBS).
1917  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1918  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1919  .getValue(1);
1920  } else {
1921  // The overflow check for unsigned multiply is easy. We only need to
1922  // check if any of the upper 32 bits are set. This can be done with a
1923  // CMP (shifted register). For that we need to generate the following
1924  // pattern:
1925  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
1926  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1927  DAG.getConstant(32, DL, MVT::i64));
1928  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1929  Overflow =
1930  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1931  DAG.getConstant(0, DL, MVT::i64),
1932  UpperBits).getValue(1);
1933  }
1934  break;
1935  }
1936  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1937  // For the 64 bit multiply
1938  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1939  if (IsSigned) {
1940  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1941  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1942  DAG.getConstant(63, DL, MVT::i64));
1943  // It is important that LowerBits is last, otherwise the arithmetic
1944  // shift will not be folded into the compare (SUBS).
1945  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1946  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1947  .getValue(1);
1948  } else {
1949  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1950  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1951  Overflow =
1952  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1953  DAG.getConstant(0, DL, MVT::i64),
1954  UpperBits).getValue(1);
1955  }
1956  break;
1957  }
1958  } // switch (...)
1959 
1960  if (Opc) {
1961  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1962 
1963  // Emit the AArch64 operation with overflow check.
1964  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1965  Overflow = Value.getValue(1);
1966  }
1967  return std::make_pair(Value, Overflow);
1968 }
1969 
1970 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1971  RTLIB::Libcall Call) const {
1972  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1973  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
1974 }
1975 
1976 // Returns true if the given Op is the overflow flag result of an overflow
1977 // intrinsic operation.
1978 static bool isOverflowIntrOpRes(SDValue Op) {
1979  unsigned Opc = Op.getOpcode();
1980  return (Op.getResNo() == 1 &&
1981  (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
1982  Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
1983 }
1984 
1986  SDValue Sel = Op.getOperand(0);
1987  SDValue Other = Op.getOperand(1);
1988  SDLoc dl(Sel);
1989 
1990  // If the operand is an overflow checking operation, invert the condition
1991  // code and kill the Not operation. I.e., transform:
1992  // (xor overflow_op_bool, 1)
1993  // -->
1994  // (csel 1, 0, invert(cc), overflow_op_bool)
1995  // ... which later gets transformed to just a cset instruction with an
1996  // inverted condition code, rather than a cset + eor sequence.
1997  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
1998  // Only lower legal XALUO ops.
1999  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2000  return SDValue();
2001 
2002  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2003  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2004  AArch64CC::CondCode CC;
2005  SDValue Value, Overflow;
2006  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2007  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2008  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2009  CCVal, Overflow);
2010  }
2011  // If neither operand is a SELECT_CC, give up.
2012  if (Sel.getOpcode() != ISD::SELECT_CC)
2013  std::swap(Sel, Other);
2014  if (Sel.getOpcode() != ISD::SELECT_CC)
2015  return Op;
2016 
2017  // The folding we want to perform is:
2018  // (xor x, (select_cc a, b, cc, 0, -1) )
2019  // -->
2020  // (csel x, (xor x, -1), cc ...)
2021  //
2022  // The latter will get matched to a CSINV instruction.
2023 
2024  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2025  SDValue LHS = Sel.getOperand(0);
2026  SDValue RHS = Sel.getOperand(1);
2027  SDValue TVal = Sel.getOperand(2);
2028  SDValue FVal = Sel.getOperand(3);
2029 
2030  // FIXME: This could be generalized to non-integer comparisons.
2031  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2032  return Op;
2033 
2034  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2035  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2036 
2037  // The values aren't constants, this isn't the pattern we're looking for.
2038  if (!CFVal || !CTVal)
2039  return Op;
2040 
2041  // We can commute the SELECT_CC by inverting the condition. This
2042  // might be needed to make this fit into a CSINV pattern.
2043  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2044  std::swap(TVal, FVal);
2045  std::swap(CTVal, CFVal);
2046  CC = ISD::getSetCCInverse(CC, true);
2047  }
2048 
2049  // If the constants line up, perform the transform!
2050  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2051  SDValue CCVal;
2052  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2053 
2054  FVal = Other;
2055  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2056  DAG.getConstant(-1ULL, dl, Other.getValueType()));
2057 
2058  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2059  CCVal, Cmp);
2060  }
2061 
2062  return Op;
2063 }
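// Illustrative example (an assumption, not taken from the source tree): C
// code such as
//
//   long f(long x, long a, long b) { return x ^ (a < b ? 0 : -1); }
//
// matches the select_cc fold above and should come out as
//
//   cmp   x1, x2
//   csinv x0, x0, x0, lt      ; x when a < b, otherwise ~x
//
// with the XOR against all-ones absorbed into the CSINV instead of a
// separate MVN/EOR.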
2064 
2065 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2066  EVT VT = Op.getValueType();
2067 
2068  // Let legalize expand this if it isn't a legal type yet.
2069  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2070  return SDValue();
2071 
2072  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2073 
2074  unsigned Opc;
2075  bool ExtraOp = false;
2076  switch (Op.getOpcode()) {
2077  default:
2078  llvm_unreachable("Invalid code");
2079  case ISD::ADDC:
2080  Opc = AArch64ISD::ADDS;
2081  break;
2082  case ISD::SUBC:
2083  Opc = AArch64ISD::SUBS;
2084  break;
2085  case ISD::ADDE:
2086  Opc = AArch64ISD::ADCS;
2087  ExtraOp = true;
2088  break;
2089  case ISD::SUBE:
2090  Opc = AArch64ISD::SBCS;
2091  ExtraOp = true;
2092  break;
2093  }
2094 
2095  if (!ExtraOp)
2096  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2097  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2098  Op.getOperand(2));
2099 }
2100 
2101 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2102  // Let legalize expand this if it isn't a legal type yet.
2103  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2104  return SDValue();
2105 
2106  SDLoc dl(Op);
2107  AArch64CC::CondCode CC;
2108  // The actual operation that sets the overflow or carry flag.
2109  SDValue Value, Overflow;
2110  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2111 
2112  // We use 0 and 1 as false and true values.
2113  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2114  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2115 
2116  // We use an inverted condition, because the conditional select is inverted
2117  // too. This will allow it to be selected to a single instruction:
2118  // CSINC Wd, WZR, WZR, invert(cond).
2119  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2120  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2121  CCVal, Overflow);
2122 
2123  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2124  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2125 }
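// Illustrative example (assumption): for
//   %r = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
// the lowering above is expected to produce
//
//   adds w0, w0, w1           ; value plus NZCV update
//   cset w1, hs               ; overflow bit; CSET is CSINC Wd, WZR, WZR, lo
//
// matching the CSINC-with-inverted-condition pattern described in the
// comment above.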
2126 
2127 // Prefetch operands are:
2128 // 1: Address to prefetch
2129 // 2: bool isWrite
2130 // 3: int locality (0 = no locality ... 3 = extreme locality)
2131 // 4: bool isDataCache
2132 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2133  SDLoc DL(Op);
2134  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2135  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2136  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2137 
2138  bool IsStream = !Locality;
2139  // When the locality number is set
2140  if (Locality) {
2141  // The front-end should have filtered out the out-of-range values
2142  assert(Locality <= 3 && "Prefetch locality out-of-range");
2143  // The locality argument is the inverse of the cache level the data should
2144  // be kept in, and the PRFM encoding starts at 0 for L1, so flip the
2145  // number around (locality 3 -> L1, locality 1 -> L3).
2146  Locality = 3 - Locality;
2147  }
2148 
2149  // Build the mask value encoding the expected behavior.
2150  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
2151  (!IsData << 3) | // IsDataCache bit
2152  (Locality << 1) | // Cache level bits
2153  (unsigned)IsStream; // Stream bit
2154  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2155  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2156 }
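// Worked example of the PrfOp encoding above (illustrative only):
// __builtin_prefetch(p, /*rw=*/0, /*locality=*/3) is a data read prefetch
// with maximum locality, so IsWrite = 0, IsData = 1, Locality = 3 - 3 = 0,
// IsStream = 0 and
//
//   PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0b00000   // PLDL1KEEP
//
// whereas a streaming write prefetch (rw = 1, locality = 0) gives
// (1 << 4) | 1 = 0b10001, i.e. PSTL1STRM.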
2157 
2158 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2159  SelectionDAG &DAG) const {
2160  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2161 
2162  RTLIB::Libcall LC;
2163  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), MVT::f128);
2164 
2165  return LowerF128Call(Op, DAG, LC);
2166 }
2167 
2168 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2169  SelectionDAG &DAG) const {
2170  if (Op.getOperand(0).getValueType() != MVT::f128) {
2171  // It's legal except when f128 is involved
2172  return Op;
2173  }
2174 
2175  RTLIB::Libcall LC;
2176  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2177 
2178  // FP_ROUND node has a second operand indicating whether it is known to be
2179  // precise. That doesn't take part in the LibCall so we can't directly use
2180  // LowerF128Call.
2181  SDValue SrcVal = Op.getOperand(0);
2182  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2183  SDLoc(Op)).first;
2184 }
2185 
2186 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
2187  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2188  // Any additional optimization in this function should be recorded
2189  // in the cost tables.
2190  EVT InVT = Op.getOperand(0).getValueType();
2191  EVT VT = Op.getValueType();
2192  unsigned NumElts = InVT.getVectorNumElements();
2193 
2194  // f16 vectors are promoted to f32 before a conversion.
2195  if (InVT.getVectorElementType() == MVT::f16) {
2196  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2197  SDLoc dl(Op);
2198  return DAG.getNode(
2199  Op.getOpcode(), dl, Op.getValueType(),
2200  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2201  }
2202 
2203  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2204  SDLoc dl(Op);
2205  SDValue Cv =
2206  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2207  Op.getOperand(0));
2208  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2209  }
2210 
2211  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2212  SDLoc dl(Op);
2213  MVT ExtVT =
2214  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2215  VT.getVectorNumElements());
2216  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2217  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2218  }
2219 
2220  // Type changing conversions are illegal.
2221  return Op;
2222 }
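// Illustrative example (assumption): "fptosi <2 x double> %v to <2 x i32>"
// narrows the element type, so the code above first converts at the input
// width and then truncates:
//
//   %wide = fptosi <2 x double> %v to <2 x i64>    ; fcvtzs v0.2d, v0.2d
//   %res  = trunc  <2 x i64> %wide to <2 x i32>    ; xtn    v0.2s, v0.2d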
2223 
2224 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2225  SelectionDAG &DAG) const {
2226  if (Op.getOperand(0).getValueType().isVector())
2227  return LowerVectorFP_TO_INT(Op, DAG);
2228 
2229  // f16 conversions are promoted to f32 when full fp16 is not supported.
2230  if (Op.getOperand(0).getValueType() == MVT::f16 &&
2231  !Subtarget->hasFullFP16()) {
2232  SDLoc dl(Op);
2233  return DAG.getNode(
2234  Op.getOpcode(), dl, Op.getValueType(),
2235  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2236  }
2237 
2238  if (Op.getOperand(0).getValueType() != MVT::f128) {
2239  // It's legal except when f128 is involved
2240  return Op;
2241  }
2242 
2243  RTLIB::Libcall LC;
2244  if (Op.getOpcode() == ISD::FP_TO_SINT)
2245  LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2246  else
2247  LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2248 
2249  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2250  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2251 }
2252 
2253 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2254  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2255  // Any additional optimization in this function should be recorded
2256  // in the cost tables.
2257  EVT VT = Op.getValueType();
2258  SDLoc dl(Op);
2259  SDValue In = Op.getOperand(0);
2260  EVT InVT = In.getValueType();
2261 
2262  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2263  MVT CastVT =
2264  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2265  InVT.getVectorNumElements());
2266  In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2267  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2268  }
2269 
2270  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2271  unsigned CastOpc =
2272  Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2273  EVT CastVT = VT.changeVectorElementTypeToInteger();
2274  In = DAG.getNode(CastOpc, dl, CastVT, In);
2275  return DAG.getNode(Op.getOpcode(), dl, VT, In);
2276  }
2277 
2278  return Op;
2279 }
2280 
2281 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2282  SelectionDAG &DAG) const {
2283  if (Op.getValueType().isVector())
2284  return LowerVectorINT_TO_FP(Op, DAG);
2285 
2286  // f16 conversions are promoted to f32 when full fp16 is not supported.
2287  if (Op.getValueType() == MVT::f16 &&
2288  !Subtarget->hasFullFP16()) {
2289  SDLoc dl(Op);
2290  return DAG.getNode(
2291  ISD::FP_ROUND, dl, MVT::f16,
2292  DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2293  DAG.getIntPtrConstant(0, dl));
2294  }
2295 
2296  // i128 conversions are libcalls.
2297  if (Op.getOperand(0).getValueType() == MVT::i128)
2298  return SDValue();
2299 
2300  // Other conversions are legal, unless it's to the completely software-based
2301  // fp128.
2302  if (Op.getValueType() != MVT::f128)
2303  return Op;
2304 
2305  RTLIB::Libcall LC;
2306  if (Op.getOpcode() == ISD::SINT_TO_FP)
2308  else
2310 
2311  return LowerF128Call(Op, DAG, LC);
2312 }
2313 
2314 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2315  SelectionDAG &DAG) const {
2316  // For iOS, we want to call an alternative entry point: __sincos_stret,
2317  // which returns the values in two S / D registers.
2318  SDLoc dl(Op);
2319  SDValue Arg = Op.getOperand(0);
2320  EVT ArgVT = Arg.getValueType();
2321  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2322 
2323  ArgListTy Args;
2324  ArgListEntry Entry;
2325 
2326  Entry.Node = Arg;
2327  Entry.Ty = ArgTy;
2328  Entry.IsSExt = false;
2329  Entry.IsZExt = false;
2330  Args.push_back(Entry);
2331 
2332  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2333  : RTLIB::SINCOS_STRET_F32;
2334  const char *LibcallName = getLibcallName(LC);
2335  SDValue Callee =
2336  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2337 
2338  StructType *RetTy = StructType::get(ArgTy, ArgTy);
2339  TargetLowering::CallLoweringInfo CLI(DAG);
2340  CLI.setDebugLoc(dl)
2341  .setChain(DAG.getEntryNode())
2342  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2343 
2344  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2345  return CallResult.first;
2346 }
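// Illustrative sketch (assumption): on an iOS target, an FSINCOS node formed
// from a sin/cos pair on the same double operand becomes a single fastcc
// call
//
//   { double, double } @__sincos_stret(double %x)
//
// with the two results coming back in d0/d1, rather than two separate
// libcalls to sin() and cos().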
2347 
2348 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2349  if (Op.getValueType() != MVT::f16)
2350  return SDValue();
2351 
2352  assert(Op.getOperand(0).getValueType() == MVT::i16);
2353  SDLoc DL(Op);
2354 
2355  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2356  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2357  return SDValue(
2358  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2359  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2360  0);
2361 }
2362 
2363 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2364  if (OrigVT.getSizeInBits() >= 64)
2365  return OrigVT;
2366 
2367  assert(OrigVT.isSimple() && "Expecting a simple value type");
2368 
2369  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2370  switch (OrigSimpleTy) {
2371  default: llvm_unreachable("Unexpected Vector Type");
2372  case MVT::v2i8:
2373  case MVT::v2i16:
2374  return MVT::v2i32;
2375  case MVT::v4i8:
2376  return MVT::v4i16;
2377  }
2378 }
2379 
2380 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2381  const EVT &OrigTy,
2382  const EVT &ExtTy,
2383  unsigned ExtOpcode) {
2384  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2385  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2386  // 64-bits we need to insert a new extension so that it will be 64-bits.
2387  assert(ExtTy.is128BitVector() && "Unexpected extension size");
2388  if (OrigTy.getSizeInBits() >= 64)
2389  return N;
2390 
2391  // Must extend size to at least 64 bits to be used as an operand for VMULL.
2392  EVT NewVT = getExtensionTo64Bits(OrigTy);
2393 
2394  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2395 }
2396 
2397 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2398  bool isSigned) {
2399  EVT VT = N->getValueType(0);
2400 
2401  if (N->getOpcode() != ISD::BUILD_VECTOR)
2402  return false;
2403 
2404  for (const SDValue &Elt : N->op_values()) {
2405  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2406  unsigned EltSize = VT.getScalarSizeInBits();
2407  unsigned HalfSize = EltSize / 2;
2408  if (isSigned) {
2409  if (!isIntN(HalfSize, C->getSExtValue()))
2410  return false;
2411  } else {
2412  if (!isUIntN(HalfSize, C->getZExtValue()))
2413  return false;
2414  }
2415  continue;
2416  }
2417  return false;
2418  }
2419 
2420  return true;
2421 }
2422 
2423 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2424  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2425  return addRequiredExtensionForVectorMULL(N, DAG,
2426  N->getOperand(0)->getValueType(0),
2427  N->getValueType(0),
2428  N->getOpcode());
2429 
2430  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2431  EVT VT = N->getValueType(0);
2432  SDLoc dl(N);
2433  unsigned EltSize = VT.getScalarSizeInBits() / 2;
2434  unsigned NumElts = VT.getVectorNumElements();
2435  MVT TruncVT = MVT::getIntegerVT(EltSize);
2436  SmallVector<SDValue, 8> Ops;
2437  for (unsigned i = 0; i != NumElts; ++i) {
2438  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2439  const APInt &CInt = C->getAPIntValue();
2440  // Element types smaller than 32 bits are not legal, so use i32 elements.
2441  // The values are implicitly truncated so sext vs. zext doesn't matter.
2442  Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2443  }
2444  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2445 }
2446 
2447 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2448  return N->getOpcode() == ISD::SIGN_EXTEND ||
2449  isExtendedBUILD_VECTOR(N, DAG, true);
2450 }
2451 
2452 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2453  return N->getOpcode() == ISD::ZERO_EXTEND ||
2454  isExtendedBUILD_VECTOR(N, DAG, false);
2455 }
2456 
2457 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2458  unsigned Opcode = N->getOpcode();
2459  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2460  SDNode *N0 = N->getOperand(0).getNode();
2461  SDNode *N1 = N->getOperand(1).getNode();
2462  return N0->hasOneUse() && N1->hasOneUse() &&
2463  isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2464  }
2465  return false;
2466 }
2467 
2468 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2469  unsigned Opcode = N->getOpcode();
2470  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2471  SDNode *N0 = N->getOperand(0).getNode();
2472  SDNode *N1 = N->getOperand(1).getNode();
2473  return N0->hasOneUse() && N1->hasOneUse() &&
2474  isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2475  }
2476  return false;
2477 }
2478 
2479 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
2480  // Multiplications are only custom-lowered for 128-bit vectors so that
2481  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
2482  EVT VT = Op.getValueType();
2483  assert(VT.is128BitVector() && VT.isInteger() &&
2484  "unexpected type for custom-lowering ISD::MUL");
2485  SDNode *N0 = Op.getOperand(0).getNode();
2486  SDNode *N1 = Op.getOperand(1).getNode();
2487  unsigned NewOpc = 0;
2488  bool isMLA = false;
2489  bool isN0SExt = isSignExtended(N0, DAG);
2490  bool isN1SExt = isSignExtended(N1, DAG);
2491  if (isN0SExt && isN1SExt)
2492  NewOpc = AArch64ISD::SMULL;
2493  else {
2494  bool isN0ZExt = isZeroExtended(N0, DAG);
2495  bool isN1ZExt = isZeroExtended(N1, DAG);
2496  if (isN0ZExt && isN1ZExt)
2497  NewOpc = AArch64ISD::UMULL;
2498  else if (isN1SExt || isN1ZExt) {
2499  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2500  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2501  if (isN1SExt && isAddSubSExt(N0, DAG)) {
2502  NewOpc = AArch64ISD::SMULL;
2503  isMLA = true;
2504  } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2505  NewOpc = AArch64ISD::UMULL;
2506  isMLA = true;
2507  } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2508  std::swap(N0, N1);
2509  NewOpc = AArch64ISD::UMULL;
2510  isMLA = true;
2511  }
2512  }
2513 
2514  if (!NewOpc) {
2515  if (VT == MVT::v2i64)
2516  // Fall through to expand this. It is not legal.
2517  return SDValue();
2518  else
2519  // Other vector multiplications are legal.
2520  return Op;
2521  }
2522  }
2523 
2524  // Legalize to a S/UMULL instruction
2525  SDLoc DL(Op);
2526  SDValue Op0;
2527  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2528  if (!isMLA) {
2529  Op0 = skipExtensionForVectorMULL(N0, DAG);
2530  assert(Op0.getValueType().is64BitVector() &&
2531  Op1.getValueType().is64BitVector() &&
2532  "unexpected types for extended operands to VMULL");
2533  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2534  }
2535  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2536  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2537  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2538  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2539  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2540  EVT Op1VT = Op1.getValueType();
2541  return DAG.getNode(N0->getOpcode(), DL, VT,
2542  DAG.getNode(NewOpc, DL, VT,
2543  DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2544  DAG.getNode(NewOpc, DL, VT,
2545  DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2546 }
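// Illustrative example (assumption): for
//
//   %a64 = sext <2 x i32> %a to <2 x i64>
//   %b64 = sext <2 x i32> %b to <2 x i64>
//   %m   = mul <2 x i64> %a64, %b64
//
// both operands are sign-extended, so the v2i64 multiply is rewritten above
// as AArch64ISD::SMULL on the original v2i32 operands and selects to a
// single "smull v0.2d, v0.2s, v1.2s" instead of being expanded (plain v2i64
// MUL is not legal).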
2547 
2548 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2549  SelectionDAG &DAG) const {
2550  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2551  SDLoc dl(Op);
2552  switch (IntNo) {
2553  default: return SDValue(); // Don't custom lower most intrinsics.
2554  case Intrinsic::thread_pointer: {
2555  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2556  return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2557  }
2558  case Intrinsic::aarch64_neon_abs:
2559  return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
2560  Op.getOperand(1));
2561  case Intrinsic::aarch64_neon_smax:
2562  return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2563  Op.getOperand(1), Op.getOperand(2));
2564  case Intrinsic::aarch64_neon_umax:
2565  return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2566  Op.getOperand(1), Op.getOperand(2));
2567  case Intrinsic::aarch64_neon_smin:
2568  return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2569  Op.getOperand(1), Op.getOperand(2));
2570  case Intrinsic::aarch64_neon_umin:
2571  return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2572  Op.getOperand(1), Op.getOperand(2));
2573  }
2574 }
2575 
2576 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2577  SelectionDAG &DAG) const {
2578  DEBUG(dbgs() << "Custom lowering: ");
2579  DEBUG(Op.dump());
2580 
2581  switch (Op.getOpcode()) {
2582  default:
2583  llvm_unreachable("unimplemented operand");
2584  return SDValue();
2585  case ISD::BITCAST:
2586  return LowerBITCAST(Op, DAG);
2587  case ISD::GlobalAddress:
2588  return LowerGlobalAddress(Op, DAG);
2589  case ISD::GlobalTLSAddress:
2590  return LowerGlobalTLSAddress(Op, DAG);
2591  case ISD::SETCC:
2592  return LowerSETCC(Op, DAG);
2593  case ISD::BR_CC:
2594  return LowerBR_CC(Op, DAG);
2595  case ISD::SELECT:
2596  return LowerSELECT(Op, DAG);
2597  case ISD::SELECT_CC:
2598  return LowerSELECT_CC(Op, DAG);
2599  case ISD::JumpTable:
2600  return LowerJumpTable(Op, DAG);
2601  case ISD::ConstantPool:
2602  return LowerConstantPool(Op, DAG);
2603  case ISD::BlockAddress:
2604  return LowerBlockAddress(Op, DAG);
2605  case ISD::VASTART:
2606  return LowerVASTART(Op, DAG);
2607  case ISD::VACOPY:
2608  return LowerVACOPY(Op, DAG);
2609  case ISD::VAARG:
2610  return LowerVAARG(Op, DAG);
2611  case ISD::ADDC:
2612  case ISD::ADDE:
2613  case ISD::SUBC:
2614  case ISD::SUBE:
2615  return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2616  case ISD::SADDO:
2617  case ISD::UADDO:
2618  case ISD::SSUBO:
2619  case ISD::USUBO:
2620  case ISD::SMULO:
2621  case ISD::UMULO:
2622  return LowerXALUO(Op, DAG);
2623  case ISD::FADD:
2624  return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2625  case ISD::FSUB:
2626  return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2627  case ISD::FMUL:
2628  return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2629  case ISD::FDIV:
2630  return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2631  case ISD::FP_ROUND:
2632  return LowerFP_ROUND(Op, DAG);
2633  case ISD::FP_EXTEND:
2634  return LowerFP_EXTEND(Op, DAG);
2635  case ISD::FRAMEADDR:
2636  return LowerFRAMEADDR(Op, DAG);
2637  case ISD::RETURNADDR:
2638  return LowerRETURNADDR(Op, DAG);
2639  case ISD::INSERT_VECTOR_ELT:
2640  return LowerINSERT_VECTOR_ELT(Op, DAG);
2641  case ISD::EXTRACT_VECTOR_ELT:
2642  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2643  case ISD::BUILD_VECTOR:
2644  return LowerBUILD_VECTOR(Op, DAG);
2645  case ISD::VECTOR_SHUFFLE:
2646  return LowerVECTOR_SHUFFLE(Op, DAG);
2647  case ISD::EXTRACT_SUBVECTOR:
2648  return LowerEXTRACT_SUBVECTOR(Op, DAG);
2649  case ISD::SRA:
2650  case ISD::SRL:
2651  case ISD::SHL:
2652  return LowerVectorSRA_SRL_SHL(Op, DAG);
2653  case ISD::SHL_PARTS:
2654  return LowerShiftLeftParts(Op, DAG);
2655  case ISD::SRL_PARTS:
2656  case ISD::SRA_PARTS:
2657  return LowerShiftRightParts(Op, DAG);
2658  case ISD::CTPOP:
2659  return LowerCTPOP(Op, DAG);
2660  case ISD::FCOPYSIGN:
2661  return LowerFCOPYSIGN(Op, DAG);
2662  case ISD::AND:
2663  return LowerVectorAND(Op, DAG);
2664  case ISD::OR:
2665  return LowerVectorOR(Op, DAG);
2666  case ISD::XOR:
2667  return LowerXOR(Op, DAG);
2668  case ISD::PREFETCH:
2669  return LowerPREFETCH(Op, DAG);
2670  case ISD::SINT_TO_FP:
2671  case ISD::UINT_TO_FP:
2672  return LowerINT_TO_FP(Op, DAG);
2673  case ISD::FP_TO_SINT:
2674  case ISD::FP_TO_UINT:
2675  return LowerFP_TO_INT(Op, DAG);
2676  case ISD::FSINCOS:
2677  return LowerFSINCOS(Op, DAG);
2678  case ISD::MUL:
2679  return LowerMUL(Op, DAG);
2680  case ISD::INTRINSIC_WO_CHAIN:
2681  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2682  case ISD::VECREDUCE_ADD:
2683  case ISD::VECREDUCE_SMAX:
2684  case ISD::VECREDUCE_SMIN:
2685  case ISD::VECREDUCE_UMAX:
2686  case ISD::VECREDUCE_UMIN:
2687  case ISD::VECREDUCE_FMAX:
2688  case ISD::VECREDUCE_FMIN:
2689  return LowerVECREDUCE(Op, DAG);
2690  case ISD::ATOMIC_LOAD_SUB:
2691  return LowerATOMIC_LOAD_SUB(Op, DAG);
2692  case ISD::ATOMIC_LOAD_AND:
2693  return LowerATOMIC_LOAD_AND(Op, DAG);
2694  case ISD::DYNAMIC_STACKALLOC:
2695  return LowerDYNAMIC_STACKALLOC(Op, DAG);
2696  }
2697 }
2698 
2699 //===----------------------------------------------------------------------===//
2700 // Calling Convention Implementation
2701 //===----------------------------------------------------------------------===//
2702 
2703 #include "AArch64GenCallingConv.inc"
2704 
2705 /// Selects the correct CCAssignFn for a given CallingConvention value.
2706 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2707  bool IsVarArg) const {
2708  switch (CC) {
2709  default:
2710  report_fatal_error("Unsupported calling convention.");
2711  case CallingConv::WebKit_JS:
2712  return CC_AArch64_WebKit_JS;
2713  case CallingConv::GHC:
2714  return CC_AArch64_GHC;
2715  case CallingConv::C:
2716  case CallingConv::Fast:
2717  case CallingConv::PreserveMost:
2718  case CallingConv::CXX_FAST_TLS:
2719  case CallingConv::Swift:
2720  if (Subtarget->isTargetWindows() && IsVarArg)
2721  return CC_AArch64_Win64_VarArg;
2722  if (!Subtarget->isTargetDarwin())
2723  return CC_AArch64_AAPCS;
2724  return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
2725  case CallingConv::Win64:
2726  return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
2727  }
2728 }
2729 
2730 CCAssignFn *
2731 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
2732  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
2733  : RetCC_AArch64_AAPCS;
2734 }
2735 
2736 SDValue AArch64TargetLowering::LowerFormalArguments(
2737  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2738  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2739  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2740  MachineFunction &MF = DAG.getMachineFunction();
2741  MachineFrameInfo &MFI = MF.getFrameInfo();
2742  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2743 
2744  // Assign locations to all of the incoming arguments.
2745  SmallVector<CCValAssign, 16> ArgLocs;
2746  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2747  *DAG.getContext());
2748 
2749  // At this point, Ins[].VT may already be promoted to i32. To correctly
2750  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2751  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2752  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
2753  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
2754  // LocVT.
2755  unsigned NumArgs = Ins.size();
2756  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
2757  unsigned CurArgIdx = 0;
2758  for (unsigned i = 0; i != NumArgs; ++i) {
2759  MVT ValVT = Ins[i].VT;
2760  if (Ins[i].isOrigArg()) {
2761  std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
2762  CurArgIdx = Ins[i].getOrigArgIndex();
2763 
2764  // Get type of the original argument.
2765  EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
2766  /*AllowUnknown*/ true);
2767  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
2768  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2769  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2770  ValVT = MVT::i8;
2771  else if (ActualMVT == MVT::i16)
2772  ValVT = MVT::i16;
2773  }
2774  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2775  bool Res =
2776  AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
2777  assert(!Res && "Call operand has unhandled type");
2778  (void)Res;
2779  }
2780  assert(ArgLocs.size() == Ins.size());
2781  SmallVector<SDValue, 16> ArgValues;
2782  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2783  CCValAssign &VA = ArgLocs[i];
2784 
2785  if (Ins[i].Flags.isByVal()) {
2786  // Byval is used for HFAs in the PCS, but the system should work in a
2787  // non-compliant manner for larger structs.
2788  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2789  int Size = Ins[i].Flags.getByValSize();
2790  unsigned NumRegs = (Size + 7) / 8;
2791 
2792  // FIXME: This works on big-endian for composite byvals, which are the common
2793  // case. It should work for fundamental types too.
2794  unsigned FrameIdx =
2795  MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
2796  SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
2797  InVals.push_back(FrameIdxN);
2798 
2799  continue;
2800  }
2801 
2802  if (VA.isRegLoc()) {
2803  // Arguments stored in registers.
2804  EVT RegVT = VA.getLocVT();
2805 
2806  SDValue ArgValue;
2807  const TargetRegisterClass *RC;
2808 
2809  if (RegVT == MVT::i32)
2810  RC = &AArch64::GPR32RegClass;
2811  else if (RegVT == MVT::i64)
2812  RC = &AArch64::GPR64RegClass;
2813  else if (RegVT == MVT::f16)
2814  RC = &AArch64::FPR16RegClass;
2815  else if (RegVT == MVT::f32)
2816  RC = &AArch64::FPR32RegClass;
2817  else if (RegVT == MVT::f64 || RegVT.is64BitVector())
2818  RC = &AArch64::FPR64RegClass;
2819  else if (RegVT == MVT::f128 || RegVT.is128BitVector())
2820  RC = &AArch64::FPR128RegClass;
2821  else
2822  llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2823 
2824  // Transform the arguments in physical registers into virtual ones.
2825  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2826  ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
2827 
2828  // If this is an 8, 16 or 32-bit value, it is really passed promoted
2829  // to 64 bits. Insert an assert[sz]ext to capture this, then
2830  // truncate to the right size.
2831  switch (VA.getLocInfo()) {
2832  default:
2833  llvm_unreachable("Unknown loc info!");
2834  case CCValAssign::Full:
2835  break;
2836  case CCValAssign::BCvt:
2837  ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
2838  break;
2839  case CCValAssign::AExt:
2840  case CCValAssign::SExt:
2841  case CCValAssign::ZExt:
2842  // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
2843  // nodes after our lowering.
2844  assert(RegVT == Ins[i].VT && "incorrect register location selected");
2845  break;
2846  }
2847 
2848  InVals.push_back(ArgValue);
2849 
2850  } else { // VA.isRegLoc()
2851  assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
2852  unsigned ArgOffset = VA.getLocMemOffset();
2853  unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
2854 
2855  uint32_t BEAlign = 0;
2856  if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
2857  !Ins[i].Flags.isInConsecutiveRegs())
2858  BEAlign = 8 - ArgSize;
2859 
2860  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
2861 
2862  // Create load nodes to retrieve arguments from the stack.
2863  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2864  SDValue ArgValue;
2865 
2866  // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2867  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2868  MVT MemVT = VA.getValVT();
2869 
2870  switch (VA.getLocInfo()) {
2871  default:
2872  break;
2873  case CCValAssign::BCvt:
2874  MemVT = VA.getLocVT();
2875  break;
2876  case CCValAssign::SExt:
2877  ExtType = ISD::SEXTLOAD;
2878  break;
2879  case CCValAssign::ZExt:
2880  ExtType = ISD::ZEXTLOAD;
2881  break;
2882  case CCValAssign::AExt:
2883  ExtType = ISD::EXTLOAD;
2884  break;
2885  }
2886 
2887  ArgValue = DAG.getExtLoad(
2888  ExtType, DL, VA.getLocVT(), Chain, FIN,
2889  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2890  MemVT);
2891 
2892  InVals.push_back(ArgValue);
2893  }
2894  }
2895 
2896  // varargs
2897  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2898  if (isVarArg) {
2899  if (!Subtarget->isTargetDarwin() || IsWin64) {
2900  // The AAPCS variadic function ABI is identical to the non-variadic
2901  // one. As a result there may be more arguments in registers and we should
2902  // save them for future reference.
2903  // Win64 variadic functions also pass arguments in registers, but all float
2904  // arguments are passed in integer registers.
2905  saveVarArgRegisters(CCInfo, DAG, DL, Chain);
2906  }
2907 
2908  // This will point to the next argument passed via stack.
2909  unsigned StackOffset = CCInfo.getNextStackOffset();
2910  // We currently pass all varargs at 8-byte alignment.
2911  StackOffset = ((StackOffset + 7) & ~7);
2912  FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
2913  }
2914 
2915  unsigned StackArgSize = CCInfo.getNextStackOffset();
2916  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2917  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
2918  // This is a non-standard ABI so by fiat I say we're allowed to make full
2919  // use of the stack area to be popped, which must be aligned to 16 bytes in
2920  // any case:
2921  StackArgSize = alignTo(StackArgSize, 16);
2922 
2923  // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
2924  // a multiple of 16.
2925  FuncInfo->setArgumentStackToRestore(StackArgSize);
2926 
2927  // This realignment carries over to the available bytes below. Our own
2928  // callers will guarantee the space is free by giving an aligned value to
2929  // CALLSEQ_START.
2930  }
2931  // Even if we're not expected to free up the space, it's useful to know how
2932  // much is there while considering tail calls (because we can reuse it).
2933  FuncInfo->setBytesInStackArgArea(StackArgSize);
2934 
2935  return Chain;
2936 }
2937 
2938 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
2939  SelectionDAG &DAG,
2940  const SDLoc &DL,
2941  SDValue &Chain) const {
2942  MachineFunction &MF = DAG.getMachineFunction();
2943  MachineFrameInfo &MFI = MF.getFrameInfo();
2944  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2945  auto PtrVT = getPointerTy(DAG.getDataLayout());
2946  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2947 
2948  SmallVector<SDValue, 8> MemOps;
2949 
2950  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
2951  AArch64::X3, AArch64::X4, AArch64::X5,
2952  AArch64::X6, AArch64::X7 };
2953  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
2954  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
2955 
2956  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
2957  int GPRIdx = 0;
2958  if (GPRSaveSize != 0) {
2959  if (IsWin64) {
2960  GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
2961  if (GPRSaveSize & 15)
2962  // The extra size here, if triggered, will always be 8.
2963  MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
2964  } else
2965  GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
2966 
2967  SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
2968 
2969  for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
2970  unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
2971  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
2972  SDValue Store = DAG.getStore(
2973  Val.getValue(1), DL, Val, FIN,
2974  IsWin64
2975  ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
2976  GPRIdx,
2977  (i - FirstVariadicGPR) * 8)
2978  : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
2979  MemOps.push_back(Store);
2980  FIN =
2981  DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
2982  }
2983  }
2984  FuncInfo->setVarArgsGPRIndex(GPRIdx);
2985  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
2986 
2987  if (Subtarget->hasFPARMv8() && !IsWin64) {
2988  static const MCPhysReg FPRArgRegs[] = {
2989  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
2990  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
2991  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
2992  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
2993 
2994  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
2995  int FPRIdx = 0;
2996  if (FPRSaveSize != 0) {
2997  FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
2998 
2999  SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3000 
3001  for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3002  unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3003  SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3004 
3005  SDValue Store = DAG.getStore(
3006  Val.getValue(1), DL, Val, FIN,
3007  MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3008  MemOps.push_back(Store);
3009  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3010  DAG.getConstant(16, DL, PtrVT));
3011  }
3012  }
3013  FuncInfo->setVarArgsFPRIndex(FPRIdx);
3014  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3015  }
3016 
3017  if (!MemOps.empty()) {
3018  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3019  }
3020 }
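// Worked example (illustrative): for a variadic callee such as
// "int sum(int n, ...)" on an AAPCS target, one GPR (x0) is taken by the
// fixed argument, so FirstVariadicGPR = 1 and GPRSaveSize = 8 * (8 - 1) = 56
// bytes for x1-x7; with no fixed FP arguments, FPRSaveSize = 16 * 8 = 128
// bytes for q0-q7. va_arg later walks these save areas through the indices
// recorded in AArch64FunctionInfo above.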
3021 
3022 /// LowerCallResult - Lower the result values of a call into the
3023 /// appropriate copies out of appropriate physical registers.
3024 SDValue AArch64TargetLowering::LowerCallResult(
3025  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3026  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3027  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3028  SDValue ThisVal) const {
3029  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3030  ? RetCC_AArch64_WebKit_JS
3031  : RetCC_AArch64_AAPCS;
3032  // Assign locations to each value returned by this call.
3033  SmallVector<CCValAssign, 16> RVLocs;
3034  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3035  *DAG.getContext());
3036  CCInfo.AnalyzeCallResult(Ins, RetCC);
3037 
3038  // Copy all of the result registers out of their specified physreg.
3039  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3040  CCValAssign VA = RVLocs[i];
3041 
3042  // Pass 'this' value directly from the argument to return value, to avoid
3043  // reg unit interference
3044  if (i == 0 && isThisReturn) {
3045  assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3046  "unexpected return calling convention register assignment");
3047  InVals.push_back(ThisVal);
3048  continue;
3049  }
3050 
3051  SDValue Val =
3052  DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3053  Chain = Val.getValue(1);
3054  InFlag = Val.getValue(2);
3055 
3056  switch (VA.getLocInfo()) {
3057  default:
3058  llvm_unreachable("Unknown loc info!");
3059  case CCValAssign::Full:
3060  break;
3061  case CCValAssign::BCvt:
3062  Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3063  break;
3064  }
3065 
3066  InVals.push_back(Val);
3067  }
3068 
3069  return Chain;
3070 }
3071 
3072 /// Return true if the calling convention is one that we can guarantee TCO for.
3073 static bool canGuaranteeTCO(CallingConv::ID CC) {
3074  return CC == CallingConv::Fast;
3075 }
3076 
3077 /// Return true if we might ever do TCO for calls with this calling convention.
3078 static bool mayTailCallThisCC(CallingConv::ID CC) {
3079  switch (CC) {
3080  case CallingConv::C:
3081  case CallingConv::PreserveMost:
3082  case CallingConv::Swift:
3083  return true;
3084  default:
3085  return canGuaranteeTCO(CC);
3086  }
3087 }
3088 
3089 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3090  SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3091  const SmallVectorImpl<ISD::OutputArg> &Outs,
3092  const SmallVectorImpl<SDValue> &OutVals,
3093  const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3094  if (!mayTailCallThisCC(CalleeCC))
3095  return false;
3096 
3097  MachineFunction &MF = DAG.getMachineFunction();
3098  const Function &CallerF = MF.getFunction();
3099  CallingConv::ID CallerCC = CallerF.getCallingConv();
3100  bool CCMatch = CallerCC == CalleeCC;
3101 
3102  // Byval parameters hand the function a pointer directly into the stack area
3103  // we want to reuse during a tail call. Working around this *is* possible (see
3104  // X86) but less efficient and uglier in LowerCall.
3105  for (Function::const_arg_iterator i = CallerF.arg_begin(),
3106  e = CallerF.arg_end();
3107  i != e; ++i)
3108  if (i->hasByValAttr())
3109  return false;
3110 
3111  if (getTargetMachine().Options.GuaranteedTailCallOpt)
3112  return canGuaranteeTCO(CalleeCC) && CCMatch;
3113 
3114  // Externally-defined functions with weak linkage should not be
3115  // tail-called on AArch64 when the OS does not support dynamic
3116  // pre-emption of symbols, as the AAELF spec requires normal calls
3117  // to undefined weak functions to be replaced with a NOP or jump to the
3118  // next instruction. The behaviour of branch instructions in this
3119  // situation (as used for tail calls) is implementation-defined, so we
3120  // cannot rely on the linker replacing the tail call with a return.
3121  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3122  const GlobalValue *GV = G->getGlobal();
3123  const Triple &TT = getTargetMachine().getTargetTriple();
3124  if (GV->hasExternalWeakLinkage() &&
3125  (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3126  return false;
3127  }
3128 
3129  // Now we search for cases where we can use a tail call without changing the
3130  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3131  // concept.
3132 
3133  // I want anyone implementing a new calling convention to think long and hard
3134  // about this assert.
3135  assert((!isVarArg || CalleeCC == CallingConv::C) &&
3136  "Unexpected variadic calling convention");
3137 
3138  LLVMContext &C = *DAG.getContext();
3139  if (isVarArg && !Outs.empty()) {
3140  // At least two cases here: if caller is fastcc then we can't have any
3141  // memory arguments (we'd be expected to clean up the stack afterwards). If
3142  // caller is C then we could potentially use its argument area.
3143 
3144  // FIXME: for now we take the most conservative of these in both cases:
3145  // disallow all variadic memory operands.
3146  SmallVector<CCValAssign, 16> ArgLocs;
3147  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3148 
3149  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3150  for (const CCValAssign &ArgLoc : ArgLocs)
3151  if (!ArgLoc.isRegLoc())
3152  return false;
3153  }
3154 
3155  // Check that the call results are passed in the same way.
3156  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3157  CCAssignFnForCall(CalleeCC, isVarArg),
3158  CCAssignFnForCall(CallerCC, isVarArg)))
3159  return false;
3160  // The callee has to preserve all registers the caller needs to preserve.
3161  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3162  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3163  if (!CCMatch) {
3164  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3165  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3166  return false;
3167  }
3168 
3169  // Nothing more to check if the callee is taking no arguments
3170  if (Outs.empty())
3171  return true;
3172 
3173  SmallVector<CCValAssign, 16> ArgLocs;
3174  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3175 
3176  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3177 
3178  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3179 
3180  // If the stack arguments for this call do not fit into our own save area then
3181  // the call cannot be made tail.
3182  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3183  return false;
3184 
3185  const MachineRegisterInfo &MRI = MF.getRegInfo();
3186  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3187  return false;
3188 
3189  return true;
3190 }
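// Illustrative examples (assumptions): a plain C-convention caller that just
// forwards its arguments, e.g.
//
//   int callee(int, int);
//   int caller(int a, int b) { return callee(a, b); }
//
// passes every check above (no byval parameters in the caller, matching
// conventions, no extra stack-argument space needed) and can be emitted as a
// sibcall, i.e. a bare "b callee". The same call would be rejected if the
// caller itself took a byval struct parameter, or if the callee needed more
// stack-argument space than the caller's incoming argument area provides.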
3191 
3192 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3193  SelectionDAG &DAG,
3194  MachineFrameInfo &MFI,
3195  int ClobberedFI) const {
3196  SmallVector<SDValue, 8> ArgChains;
3197  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3198  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3199 
3200  // Include the original chain at the beginning of the list. When this is
3201  // used by target LowerCall hooks, this helps legalize find the
3202  // CALLSEQ_BEGIN node.
3203  ArgChains.push_back(Chain);
3204 
3205  // Add a chain value for each stack argument load that overlaps the clobbered slot.
3206  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3207  UE = DAG.getEntryNode().getNode()->use_end();
3208  U != UE; ++U)
3209  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3210  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3211  if (FI->getIndex() < 0) {
3212  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3213  int64_t InLastByte = InFirstByte;
3214  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3215 
3216  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3217  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3218  ArgChains.push_back(SDValue(L, 1));
3219  }
3220 
3221  // Build a tokenfactor for all the chains.
3222  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3223 }
3224 
3225 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3226  bool TailCallOpt) const {
3227  return CallCC == CallingConv::Fast && TailCallOpt;
3228 }
3229 
3230 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3231 /// and add input and output parameter nodes.
3232 SDValue
3233 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3234  SmallVectorImpl<SDValue> &InVals) const {
3235  SelectionDAG &DAG = CLI.DAG;
3236  SDLoc &DL = CLI.DL;
3237  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3238  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3239  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3240  SDValue Chain = CLI.Chain;
3241  SDValue Callee = CLI.Callee;
3242  bool &IsTailCall = CLI.IsTailCall;
3243  CallingConv::ID CallConv = CLI.CallConv;
3244  bool IsVarArg = CLI.IsVarArg;
3245 
3246  MachineFunction &MF = DAG.getMachineFunction();
3247  bool IsThisReturn = false;
3248 
3249  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3250  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3251  bool IsSibCall = false;
3252 
3253  if (IsTailCall) {
3254  // Check if it's really possible to do a tail call.
3255  IsTailCall = isEligibleForTailCallOptimization(
3256  Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3257  if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3258  report_fatal_error("failed to perform tail call elimination on a call "
3259  "site marked musttail");
3260 
3261  // A sibling call is one where we're under the usual C ABI and not planning
3262  // to change that but can still do a tail call:
3263  if (!TailCallOpt && IsTailCall)
3264  IsSibCall = true;
3265 
3266  if (IsTailCall)
3267  ++NumTailCalls;
3268  }
3269 
3270  // Analyze operands of the call, assigning locations to each operand.
3271  SmallVector<CCValAssign, 16> ArgLocs;
3272  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3273  *DAG.getContext());
3274 
3275  if (IsVarArg) {
3276  // Handle fixed and variable vector arguments differently.
3277  // Variable vector arguments always go into memory.
3278  unsigned NumArgs = Outs.size();
3279 
3280  for (unsigned i = 0; i != NumArgs; ++i) {
3281  MVT ArgVT = Outs[i].VT;
3282  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3283  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3284  /*IsVarArg=*/ !Outs[i].IsFixed);
3285  bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3286  assert(!Res && "Call operand has unhandled type");
3287  (void)Res;
3288  }
3289  } else {
3290  // At this point, Outs[].VT may already be promoted to i32. To correctly
3291  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3292  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3293  // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3294  // we use a special version of AnalyzeCallOperands to pass in ValVT and
3295  // LocVT.
3296  unsigned NumArgs = Outs.size();
3297  for (unsigned i = 0; i != NumArgs; ++i) {
3298  MVT ValVT = Outs[i].VT;
3299  // Get type of the original argument.
3300  EVT ActualVT = getValueType(DAG.getDataLayout(),
3301  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3302  /*AllowUnknown*/ true);
3303  MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3304  ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3305  // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3306  if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3307  ValVT = MVT::i8;
3308  else if (ActualMVT == MVT::i16)
3309  ValVT = MVT::i16;
3310 
3311  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3312  bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3313  assert(!Res && "Call operand has unhandled type");
3314  (void)Res;
3315  }
3316  }
3317 
3318  // Get a count of how many bytes are to be pushed on the stack.
3319  unsigned NumBytes = CCInfo.getNextStackOffset();
3320 
3321  if (IsSibCall) {
3322  // Since we're not changing the ABI to make this a tail call, the memory
3323  // operands are already available in the caller's incoming argument space.
3324  NumBytes = 0;
3325  }
3326 
3327  // FPDiff is the byte offset of the call's argument area from the callee's.
3328  // Stores to callee stack arguments will be placed in FixedStackSlots offset
3329  // by this amount for a tail call. In a sibling call it must be 0 because the
3330  // caller will deallocate the entire stack and the callee still expects its
3331  // arguments to begin at SP+0. Completely unused for non-tail calls.
3332  int FPDiff = 0;
3333 
3334  if (IsTailCall && !IsSibCall) {
3335  unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3336 
3337  // Since callee will pop argument stack as a tail call, we must keep the
3338  // popped size 16-byte aligned.
3339  NumBytes = alignTo(NumBytes, 16);
3340 
3341  // FPDiff will be negative if this tail call requires more space than we
3342  // would automatically have in our incoming argument space. Positive if we
3343  // can actually shrink the stack.
3344  FPDiff = NumReusableBytes - NumBytes;
3345 
3346  // The stack pointer must be 16-byte aligned at all times it's used for a
3347  // memory operation, which in practice means at *all* times and in
3348  // particular across call boundaries. Therefore our own arguments started at
3349  // a 16-byte aligned SP and the delta applied for the tail call should
3350  // satisfy the same constraint.
3351  assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3352  }
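// Worked example (illustrative): if the caller was itself given 32 bytes of
// stack arguments (NumReusableBytes = 32) and this tail call needs 48 bytes
// once aligned, FPDiff = 32 - 48 = -16: the callee's stack arguments are
// written 16 bytes below the caller's incoming argument area, and the value
// travels with the TC_RETURN node (see below) so the epilogue can make the
// matching SP adjustment.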
3353 
3354  // Adjust the stack pointer for the new arguments...
3355  // These operations are automatically eliminated by the prolog/epilog pass
3356  if (!IsSibCall)
3357  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3358 
3359  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3360  getPointerTy(DAG.getDataLayout()));
3361 
3362  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3363  SmallVector<SDValue, 8> MemOpChains;
3364  auto PtrVT = getPointerTy(DAG.getDataLayout());
3365 
3366  // Walk the register/memloc assignments, inserting copies/loads.
3367  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3368  ++i, ++realArgIdx) {
3369  CCValAssign &VA = ArgLocs[i];
3370  SDValue Arg = OutVals[realArgIdx];
3371  ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3372 
3373  // Promote the value if needed.
3374  switch (VA.getLocInfo()) {
3375  default:
3376  llvm_unreachable("Unknown loc info!");
3377  case CCValAssign::Full:
3378  break;
3379  case CCValAssign::SExt:
3380  Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3381  break;
3382  case CCValAssign::ZExt:
3383  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3384  break;
3385  case CCValAssign::AExt:
3386  if (Outs[realArgIdx].ArgVT == MVT::i1) {
3387  // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3388  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3389  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3390  }
3391  Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3392  break;
3393  case CCValAssign::BCvt:
3394  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3395  break;
3396  case CCValAssign::FPExt:
3397  Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3398  break;
3399  }
3400 
3401  if (VA.isRegLoc()) {
3402  if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3403  Outs[0].VT == MVT::i64) {
3404  assert(VA.getLocVT() == MVT::i64 &&
3405  "unexpected calling convention register assignment");
3406  assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3407  "unexpected use of 'returned'");
3408  IsThisReturn = true;
3409  }
3410  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3411  } else {
3412  assert(VA.isMemLoc());
3413 
3414  SDValue DstAddr;
3415  MachinePointerInfo DstInfo;
3416 
3417  // FIXME: This works on big-endian for composite byvals, which are the
3418  // common case. It should work for fundamental types too.
3419  uint32_t BEAlign = 0;
3420  unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3421  : VA.getValVT().getSizeInBits();
3422  OpSize = (OpSize + 7) / 8;
3423  if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3424  !Flags.isInConsecutiveRegs()) {
3425  if (OpSize < 8)
3426  BEAlign = 8 - OpSize;
3427  }
3428  unsigned LocMemOffset = VA.getLocMemOffset();
3429  int32_t Offset = LocMemOffset + BEAlign;
3430  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3431  PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3432 
3433  if (IsTailCall) {
3434  Offset = Offset + FPDiff;
3435  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3436 
3437  DstAddr = DAG.getFrameIndex(FI, PtrVT);
3438  DstInfo =
3439  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3440 
3441  // Make sure any stack arguments overlapping with where we're storing
3442  // are loaded before this eventual operation. Otherwise they'll be
3443  // clobbered.
3444  Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3445  } else {
3446  SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3447 
3448  DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3449  DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3450  LocMemOffset);
3451  }
3452 
3453  if (Outs[i].Flags.isByVal()) {
3454  SDValue SizeNode =
3455  DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3456  SDValue Cpy = DAG.getMemcpy(
3457  Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3458  /*isVol = */ false, /*AlwaysInline = */ false,
3459  /*isTailCall = */ false,
3460  DstInfo, MachinePointerInfo());
3461 
3462  MemOpChains.push_back(Cpy);
3463  } else {
3464  // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3465  // promoted to a legal register type i32, we should truncate Arg back to
3466  // i1/i8/i16.
3467  if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3468  VA.getValVT() == MVT::i16)
3469  Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3470 
3471  SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3472  MemOpChains.push_back(Store);
3473  }
3474  }
3475  }
3476 
3477  if (!MemOpChains.empty())
3478  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3479 
3480  // Build a sequence of copy-to-reg nodes chained together with token chain
3481  // and flag operands which copy the outgoing args into the appropriate regs.
3482  SDValue InFlag;
3483  for (auto &RegToPass : RegsToPass) {
3484  Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3485  RegToPass.second, InFlag);
3486  InFlag = Chain.getValue(1);
3487  }
3488 
3489  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3490  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3491  // node so that legalize doesn't hack it.
3492  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3493  auto GV = G->getGlobal();
3494  if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3495  AArch64II::MO_GOT) {
3496  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3497  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3498  } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3499  assert(Subtarget->isTargetWindows() &&
3500  "Windows is the only supported COFF target");
3501  Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3502  } else {
3503  const GlobalValue *GV = G->getGlobal();
3504  Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3505  }
3506  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3507  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3508  Subtarget->isTargetMachO()) {
3509  const char *Sym = S->getSymbol();
3510  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3511  Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3512  } else {
3513  const char *Sym = S->getSymbol();
3514  Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3515  }
3516  }
3517 
3518  // We don't usually want to end the call-sequence here because we would tidy
3519  // the frame up *after* the call, however in the ABI-changing tail-call case
3520  // we've carefully laid out the parameters so that when sp is reset they'll be
3521  // in the correct location.
3522  if (IsTailCall && !IsSibCall) {
3523  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3524  DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3525  InFlag = Chain.getValue(1);
3526  }
3527 
3528  std::vector<SDValue> Ops;
3529  Ops.push_back(Chain);
3530  Ops.push_back(Callee);
3531 
3532  if (IsTailCall) {
3533  // Each tail call may have to adjust the stack by a different amount, so
3534  // this information must travel along with the operation for eventual
3535  // consumption by emitEpilogue.
3536  Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3537  }
3538 
3539  // Add argument registers to the end of the list so that they are known live
3540  // into the call.
3541  for (auto &RegToPass : RegsToPass)
3542  Ops.push_back(DAG.getRegister(RegToPass.first,
3543  RegToPass.second.getValueType()));
3544 
3545  // Add a register mask operand representing the call-preserved registers.
3546  const uint32_t *Mask;
3547  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3548  if (IsThisReturn) {
3549  // For 'this' returns, use the X0-preserving mask if applicable
3550  Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3551  if (!Mask) {
3552  IsThisReturn = false;
3553  Mask = TRI->getCallPreservedMask(MF, CallConv);
3554  }
3555  } else
3556  Mask = TRI->getCallPreservedMask(MF, CallConv);
3557 
3558  assert(Mask && "Missing call preserved mask for calling convention");
3559  Ops.push_back(DAG.getRegisterMask(Mask));
3560 
3561  if (InFlag.getNode())
3562  Ops.push_back(InFlag);
3563 
3564  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3565 
3566  // If we're doing a tail call, use a TC_RETURN here rather than an
3567  // actual call instruction.
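 // Illustrative difference (assumption): a tail call ultimately becomes a
 // plain branch to the callee, e.g. "b callee", with the epilogue emitted
 // before it, whereas the non-tail path below produces "bl callee" followed
 // by the usual CALLSEQ_END bookkeeping.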
3568  if (IsTailCall) {
3569  MF.getFrameInfo().setHasTailCall();
3570  return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3571  }
3572 
3573  // Returns a chain and a flag for retval copy to use.
3574  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3575  InFlag = Chain.getValue(1);
3576 
3577  uint64_t CalleePopBytes =
3578  DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3579 
3580  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3581  DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3582  InFlag, DL);
3583  if (!Ins.empty())
3584  InFlag = Chain.getValue(1);
3585 
3586  // Handle result values, copying them out of physregs into vregs that we
3587  // return.
3588  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3589  InVals, IsThisReturn,
3590  IsThisReturn ? OutVals[0] : SDValue());
3591 }
3592 
3593 bool AArch64TargetLowering::CanLowerReturn(
3594  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3595  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3596  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3597  ? RetCC_AArch64_WebKit_JS
3598  : RetCC_AArch64_AAPCS;
3599  SmallVector<CCValAssign, 16> RVLocs;
3600  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3601  return CCInfo.CheckReturn(Outs, RetCC);
3602 }
3603 
3604 SDValue
3605 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3606  bool isVarArg,
3607  const SmallVectorImpl<ISD::OutputArg> &Outs,
3608  const SmallVectorImpl<SDValue> &OutVals,
3609  const SDLoc &DL, SelectionDAG &DAG) const {
3610  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3611  ? RetCC_AArch64_WebKit_JS
3612  : RetCC_AArch64_AAPCS;
3613  SmallVector<CCValAssign, 16> RVLocs;
3614  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3615  *DAG.getContext());
3616  CCInfo.AnalyzeReturn(Outs, RetCC);
3617 
3618  // Copy the result values into the output registers.
3619  SDValue Flag;
3620  SmallVector<SDValue, 4> RetOps(1, Chain);
3621  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3622  ++i, ++realRVLocIdx) {
3623  CCValAssign &VA = RVLocs[i];
3624  assert(VA.isRegLoc() && "Can only return in registers!");
3625  SDValue Arg = OutVals[realRVLocIdx];
3626 
3627  switch (VA.getLocInfo()) {
3628  default:
3629  llvm_unreachable("Unknown loc info!");
3630  case CCValAssign::Full:
3631  if (Outs[i].ArgVT == MVT::i1) {
3632  // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3633  // value. This is strictly redundant on Darwin (which uses "zeroext
3634  // i1"), but will be optimised out before ISel.
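 // Illustrative consequence (assumption): a function returning i1 "true"
 // hands back 0x01 in the low byte of w0, so a caller may rely on bits
 // [7:1] of that byte being zero rather than re-masking the value itself.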
3635  Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3636  Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3637  }
3638  break;
3639  case CCValAssign::BCvt:
3640  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3641  break;
3642  }
3643 
3644  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3645  Flag = Chain.getValue(1);
3646  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3647  }
3648  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3649  const MCPhysReg *I =
3650  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3651  if (I) {
3652  for (; *I; ++I) {
3653  if (AArch64::GPR64RegClass.contains(*I))
3654  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3655  else if (AArch64::FPR64RegClass.contains(*I))
3656  RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3657  else
3658  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3659  }
3660  }
3661 
3662  RetOps[0] = Chain; // Update chain.
3663 
3664  // Add the flag if we have it.
3665  if (Flag.getNode())
3666  RetOps.push_back(Flag);
3667 
3668  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3669 }
3670 
3671 //===----------------------------------------------------------------------===//
3672 // Other Lowering Code
3673 //===----------------------------------------------------------------------===//
3674 
3675 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3676  SelectionDAG &DAG,
3677  unsigned Flag) const {
3678  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
3679 }
3680 
3681 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
3682  SelectionDAG &DAG,
3683  unsigned Flag) const {
3684  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
3685 }
3686 
3687 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
3688  SelectionDAG &DAG,
3689  unsigned Flag) const {
3690  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
3691  N->getOffset(), Flag);
3692 }
3693 
3694 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
3695  SelectionDAG &DAG,
3696  unsigned Flag) const {
3697  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
3698 }
3699 
3700 // (loadGOT sym)
3701 template <class NodeTy>
3702 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
3703  unsigned Flags) const {
3704  DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
3705  SDLoc DL(N);
3706  EVT Ty = getPointerTy(DAG.getDataLayout());
3707  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
3708  // FIXME: Once remat is capable of dealing with instructions with register
3709  // operands, expand this into two nodes instead of using a wrapper node.
3710  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
3711 }
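 // Illustrative result (assumption): for a GOT-indirect symbol this typically
 // selects to
 //   adrp x0, :got:sym
 //   ldr  x0, [x0, :got_lo12:sym]
 // i.e. a page address plus a load of the GOT slot.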
3712 
3713 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
3714 template <class NodeTy>
3715 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
3716  unsigned Flags) const {
3717  DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
3718  SDLoc DL(N);
3719  EVT Ty = getPointerTy(DAG.getDataLayout());
3720  const unsigned char MO_NC = AArch64II::MO_NC;
3721  return DAG.getNode(
3722  AArch64ISD::WrapperLarge, DL, Ty,
3723  getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
3724  getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
3725  getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
3726  getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
3727 }
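 // Illustrative result (assumption): in the large code model the wrapper is
 // eventually selected to one movz plus three movk instructions carrying the
 // :abs_g0:..:abs_g3: relocations for sym, materialising the full 64-bit
 // address with no PC-relative range limit.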
3728 
3729 // (addlow (adrp %hi(sym)) %lo(sym))
3730 template <class NodeTy>
3731 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
3732  unsigned Flags) const {
3733  DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
3734  SDLoc DL(N);
3735  EVT Ty = getPointerTy(DAG.getDataLayout());
3736  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
3737  SDValue Lo = getTargetNode(N, Ty, DAG,
3738  AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
3739  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
3740  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
3741 }
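 // Illustrative result (assumption): the ADRP/ADDlow pair corresponds to
 //   adrp x0, sym
 //   add  x0, x0, :lo12:sym
 // which reaches anything within +/-4GiB of the PC in the small code model.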
3742 
3743 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3744  SelectionDAG &DAG) const {
3745  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3746  const GlobalValue *GV = GN->getGlobal();
3747  const AArch64II::TOF TargetFlags =
3748  (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
3749  : AArch64II::MO_NO_FLAG);
3750  unsigned char OpFlags =
3751  Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3752 
3753  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3754  "unexpected offset in global node");
3755 
3756  // This also catches the large code model case for Darwin.
3757  if ((OpFlags & AArch64II::MO_GOT) != 0) {
3758  return getGOT(GN, DAG, TargetFlags);
3759  }
3760 
3761  SDValue Result;
3762  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3763  Result = getAddrLarge(GN, DAG, TargetFlags);
3764  } else {
3765  Result = getAddr(GN, DAG, TargetFlags);
3766  }
3767  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3768  SDLoc DL(GN);
3769  if (GV->hasDLLImportStorageClass())
3770  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3771  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3772  return Result;
3773 }
3774 
3775 /// \brief Convert a TLS address reference into the correct sequence of loads
3776 /// and calls to compute the variable's address (for Darwin, currently) and
3777 /// return an SDValue containing the final node.
3778 
3779 /// Darwin only has one TLS scheme which must be capable of dealing with the
3780 /// fully general situation, in the worst case. This means:
3781 /// + "extern __thread" declaration.
3782 /// + Defined in a possibly unknown dynamic library.
3783 ///
3784 /// The general system is that each __thread variable has a [3 x i64] descriptor
3785 /// which contains information used by the runtime to calculate the address. The
3786 /// only part of this the compiler needs to know about is the first xword, which
3787 /// contains a function pointer that must be called with the address of the
3788 /// entire descriptor in "x0".
3789 ///
3790 /// Since this descriptor may be in a different unit, in general even the
3791 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
3792 /// is:
3793 /// adrp x0, _var@TLVPPAGE
3794 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
3795 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
3796 /// ; the function pointer
3797 /// blr x1 ; Uses descriptor address in x0
3798 /// ; Address of _var is now in x0.
3799 ///
3800 /// If the address of _var's descriptor *is* known to the linker, then it can
3801 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
3802 /// a slight efficiency gain.
3803 SDValue
3804 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
3805  SelectionDAG &DAG) const {
3806  assert(Subtarget->isTargetDarwin() &&
3807  "This function expects a Darwin target");
3808 
3809  SDLoc DL(Op);
3810  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3811  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3812 
3813  SDValue TLVPAddr =
3814  DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3815  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
3816 
3817  // The first entry in the descriptor is a function pointer that we must call
3818  // to obtain the address of the variable.
3819  SDValue Chain = DAG.getEntryNode();
3820  SDValue FuncTLVGet = DAG.getLoad(
3821  MVT::i64, DL, Chain, DescAddr,
3822  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3823  /* Alignment = */ 8,
3824  MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
3825  MachineMemOperand::MODereferenceable);
3826  Chain = FuncTLVGet.getValue(1);
3827 
3828  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
3829  MFI.setAdjustsStack(true);
3830 
3831  // TLS calls preserve all registers except those that absolutely must be
3832  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3833  // silly).
3834  const uint32_t *Mask =
3835  Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
3836 
3837  // Finally, we can make the call. This is just a degenerate version of a
3838  // normal AArch64 call node: x0 takes the address of the descriptor, and
3839  // returns the address of the variable in this thread.
3840  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
3841  Chain =
3842  DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3843  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
3844  DAG.getRegisterMask(Mask), Chain.getValue(1));
3845  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
3846 }
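 // Illustrative trigger (assumption): C code such as
 //   extern __thread int tls_var;
 //   int get(void) { return tls_var; }
 // reaches this lowering on Darwin via a GlobalTLSAddress node for tls_var,
 // producing the adrp/ldr/blr sequence described in the comment above.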
3847 
3848 /// When accessing thread-local variables under either the general-dynamic or
3849 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
3850 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
3851 /// is a function pointer to carry out the resolution.
3852 ///
3853 /// The sequence is:
3854 /// adrp x0, :tlsdesc:var
3855 /// ldr x1, [x0, #:tlsdesc_lo12:var]
3856 /// add x0, x0, #:tlsdesc_lo12:var
3857 /// .tlsdesccall var
3858 /// blr x1
3859 /// (TPIDR_EL0 offset now in x0)
3860 ///
3861 /// The above sequence must be produced unscheduled, to enable the linker to
3862 /// optimize/relax this sequence.
3863 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
3864 /// above sequence, and expanded really late in the compilation flow, to ensure
3865 /// the sequence is produced as per above.
3866 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
3867  const SDLoc &DL,
3868  SelectionDAG &DAG) const {
3869  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3870 
3871  SDValue Chain = DAG.getEntryNode();
3872  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3873 
3874  Chain =
3875  DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
3876  SDValue Glue = Chain.getValue(1);
3877 
3878  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
3879 }
3880 
3881 SDValue
3882 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
3883  SelectionDAG &DAG) const {
3884  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
3885  assert(Subtarget->useSmallAddressing() &&
3886  "ELF TLS only supported in small memory model");
3887  // Different choices can be made for the maximum size of the TLS area for a
3888  // module. For the small address model, the default TLS size is 16MiB and the
3889  // maximum TLS size is 4GiB.
3890  // FIXME: add -mtls-size command line option and make it control the 16MiB
3891  // vs. 4GiB code sequence generation.
3892  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3893 
3894  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
3895 
3896  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
3897  if (Model == TLSModel::LocalDynamic)
3898  Model = TLSModel::GeneralDynamic;
3899  }
3900 
3901  SDValue TPOff;
3902  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3903  SDLoc DL(Op);
3904  const GlobalValue *GV = GA->getGlobal();
3905 
3906  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
3907 
3908  if (Model == TLSModel::LocalExec) {
3909  SDValue HiVar = DAG.getTargetGlobalAddress(
3910  GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3911  SDValue LoVar = DAG.getTargetGlobalAddress(
3912  GV, DL, PtrVT, 0,
3913  AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3914 
3915  SDValue TPWithOff_lo =
3916  SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
3917  HiVar,
3918  DAG.getTargetConstant(0, DL, MVT::i32)),
3919  0);
3920  SDValue TPWithOff =
3921  SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
3922  LoVar,
3923  DAG.getTargetConstant(0, DL, MVT::i32)),
3924  0);
3925  return TPWithOff;
3926  } else if (Model == TLSModel::InitialExec) {
3927  TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3928  TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
3929  } else if (Model == TLSModel::LocalDynamic) {
3930  // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
3931  // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
3932  // the beginning of the module's TLS region, followed by a DTPREL offset
3933  // calculation.
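 // Illustrative shape of the result (assumption): the TLSDESC call below
 // yields the offset of _TLS_MODULE_BASE_ from TPIDR_EL0 in x0, and the two
 // ADDXri nodes then apply :dtprel_hi12: / :dtprel_lo12_nc: adjustments for
 // the specific variable before the final add of the thread pointer.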
3934 
3935  // These accesses will need deduplicating if there's more than one.
3936  AArch64FunctionInfo *MFI =
3937  DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
3938  MFI->incNumLocalDynamicTLSAccesses();
3939 
3940  // The call needs a relocation too for linker relaxation. It doesn't make
3941  // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
3942  // the address.
3943  SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
3944  AArch64II::MO_TLS);
3945 
3946  // Now we can calculate the offset from TPIDR_EL0 to this module's
3947  // thread-local area.
3948  TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
3949 
3950  // Now use :dtprel_whatever: operations to calculate this variable's offset
3951  // in its thread-storage area.
3952  SDValue HiVar = DAG.getTargetGlobalAddress(
3953  GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3954  SDValue LoVar = DAG.getTargetGlobalAddress(
3955  GV, DL, MVT::i64, 0,
3956  AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3957 
3958  TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
3959  DAG.getTargetConstant(0, DL, MVT::i32)),
3960  0);
3961  TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
3962  DAG.getTargetConstant(0, DL, MVT::i32)),
3963  0);
3964  } else if (Model == TLSModel::GeneralDynamic) {
3965  // The call needs a relocation too for linker relaxation. It doesn't make
3966  // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
3967  // the address.
3968  SDValue SymAddr =
3969  DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3970 
3971  // Finally we can make a call to calculate the offset from tpidr_el0.
3972  TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
3973  } else
3974  llvm_unreachable("Unsupported ELF TLS access model");
3975 
3976  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
3977 }
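 // Illustrative example (assumption): for TLSModel::LocalExec the nodes built
 // above typically become
 //   mrs  x0, TPIDR_EL0
 //   add  x0, x0, #:tprel_hi12:var, lsl #12
 //   add  x0, x0, #:tprel_lo12_nc:var
 // i.e. a thread-pointer read plus two immediate adds.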
3978 
3979 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
3980  SelectionDAG &DAG) const {
3981  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3982  if (DAG.getTarget().Options.EmulatedTLS)
3983  return LowerToTLSEmulatedModel(GA, DAG);
3984 
3985  if (Subtarget->isTargetDarwin())
3986  return LowerDarwinGlobalTLSAddress(Op, DAG);
3987  if (Subtarget->isTargetELF())
3988  return LowerELFGlobalTLSAddress(Op, DAG);
3989 
3990  llvm_unreachable("Unexpected platform trying to use TLS");
3991 }
3992 
3993 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3994  SDValue Chain = Op.getOperand(0);
3995  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3996  SDValue LHS = Op.getOperand(2);
3997  SDValue RHS = Op.getOperand(3);
3998  SDValue Dest = Op.getOperand(4);
3999  SDLoc dl(Op);
4000 
4001  // Handle f128 first, since lowering it will result in comparing the return
4002  // value of a libcall against zero, which is just what the rest of LowerBR_CC
4003  // is expecting to deal with.
4004  if (LHS.getValueType() == MVT::f128) {
4005  softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
4006 
4007  // If softenSetCCOperands returned a scalar, we need to compare the result
4008  // against zero to select between true and false values.
4009  if (!RHS.getNode()) {
4010  RHS = DAG.getConstant(0, dl, LHS.getValueType());
4011  CC = ISD::SETNE;
4012  }
4013  }
4014 
4015  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
4016  // instruction.
4017  if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
4018  (CC == ISD::SETEQ || CC == ISD::SETNE)) {
4019  // Only lower legal XALUO ops.
4020  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
4021  return SDValue();
4022 
4023  // The actual operation with overflow check.
4024  AArch64CC::CondCode OFCC;
4025  SDValue Value, Overflow;
4026  std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
4027 
4028  if (CC == ISD::SETNE)
4029  OFCC = getInvertedCondCode(OFCC);
4030  SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
4031 
4032  return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
4033  Overflow);
4034  }
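 // Illustrative trigger (assumption): IR along the lines of
 //   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
 //   %o = extractvalue { i32, i1 } %r, 1
 //   br i1 %o, label %ovf, label %cont
 // takes this path, so the compare/branch collapses into an adds followed by
 // a single b.vs instead of a separately materialised boolean.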
4035 
4036  if (LHS.getValueType().isInteger()) {
4037  assert((LHS.getValueType() == RHS.getValueType()) &&
4038  (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
4039 
4040  // If the RHS of the comparison is zero, we can potentially fold this
4041  // to a specialized branch.
4042  const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
4043  if (RHSC && RHSC->getZExtValue() == 0) {
4044  if (CC == ISD::SETEQ) {
4045  // See if we can use a TBZ to fold in an AND as well.
4046  // TBZ has a smaller branch displacement than CBZ. If the offset is
4047  // out of bounds, a late MI-layer pass rewrites branches.
4048  // 403.gcc is an example that hits this case.
4049  if (LHS.getOpcode() == ISD::AND &&
4050  isa<ConstantSDNode>(LHS.getOperand(1)) &&
4051  isPowerOf2_64(LHS.getConstantOperandVal(1))) {
4052  SDValue Test = LHS.getOperand(0);
4053  uint64_t Mask = LHS.getConstantOperandVal(1);
4054  return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
4055  DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
4056  Dest);
4057  }
4058 
4059  return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
4060  } else if (CC == ISD::SETNE) {
4061  // See if we can use a TBZ to fold in an AND as well.
4062  // TBZ has a smaller branch displacement than CBZ. If the offset is
4063  // out of bounds, a late MI-layer pass rewrites branches.
4064  // 403.gcc is an example that hits this case.
4065  if (LHS.getOpcode() == ISD::AND &&
4066  isa<ConstantSDNode>(LHS.getOperand(1)) &&
4067  isPowerOf2_64(LHS.getConstantOperandVal(1))) {
4068  SDValue Test = LHS.getOperand(0);
4069  uint64_t Mask = LHS.getConstantOperandVal(1);
4070  return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
4071  DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
4072  Dest);
4073  }
4074 
4075  return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
4076  } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
4077  // Don't combine AND since emitComparison converts the AND to an ANDS
4078  // (a.k.a. TST) and the test in the test bit and branch instruction
4079  // becomes redundant. This would also increase register pressure.
4080  uint64_t Mask = LHS.getValueSizeInBits() - 1;
4081  return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
4082  DAG.getConstant(Mask, dl, MVT::i64), Dest);
4083  }
4084  }
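 // Illustrative example (assumption): "if ((x & 8) == 0) goto L" reaches the
 // SETEQ case above with a power-of-two mask and is emitted as
 //   tbz w0, #3, L
 // saving the explicit and/cbz pair.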
4085  if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
4086  LHS.getOpcode() != ISD::AND) {
4087  // Don't combine AND since emitComparison converts the AND to an ANDS
4088  // (a.k.a. TST) and the test in the test bit and branch instruction
4089  // becomes redundant. This would also increase register pressure.
4090  uint64_t Mask = LHS.getValueSizeInBits() - 1;
4091  return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
4092  DAG.getConstant(Mask, dl, MVT::i64), Dest);
4093  }
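 // Illustrative example (assumption): a signed test against zero such as
 // "if (x < 0) goto L" (SETLT) becomes "tbnz w0, #31, L", and the mirrored
 // SETGT against -1 form above becomes "tbz w0, #31, L", both testing the
 // sign bit directly.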
4094 
4095  SDValue CCVal;