1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
34 #include "llvm/CodeGen/Analysis.h"
50 #include "llvm/IR/Attributes.h"
51 #include "llvm/IR/Constants.h"
52 #include "llvm/IR/DataLayout.h"
53 #include "llvm/IR/DebugLoc.h"
54 #include "llvm/IR/DerivedTypes.h"
55 #include "llvm/IR/Function.h"
57 #include "llvm/IR/GlobalValue.h"
58 #include "llvm/IR/IRBuilder.h"
59 #include "llvm/IR/Instruction.h"
60 #include "llvm/IR/Instructions.h"
61 #include "llvm/IR/IntrinsicInst.h"
62 #include "llvm/IR/Intrinsics.h"
63 #include "llvm/IR/IntrinsicsAArch64.h"
64 #include "llvm/IR/Module.h"
65 #include "llvm/IR/OperandTraits.h"
66 #include "llvm/IR/PatternMatch.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/IR/Use.h"
69 #include "llvm/IR/Value.h"
70 #include "llvm/MC/MCRegisterInfo.h"
71 #include "llvm/Support/Casting.h"
72 #include "llvm/Support/CodeGen.h"
74 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
83 #include <algorithm>
84 #include <bitset>
85 #include <cassert>
86 #include <cctype>
87 #include <cstdint>
88 #include <cstdlib>
89 #include <iterator>
90 #include <limits>
91 #include <tuple>
92 #include <utility>
93 #include <vector>
94 
95 using namespace llvm;
96 using namespace llvm::PatternMatch;
97 
98 #define DEBUG_TYPE "aarch64-lower"
99 
100 STATISTIC(NumTailCalls, "Number of tail calls");
101 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
102 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
103 
104 // FIXME: The necessary dtprel relocations don't seem to be supported
105 // well in the GNU bfd and gold linkers at the moment. Therefore, by
106 // default, for now, fall back to GeneralDynamic code generation.
108  "aarch64-elf-ldtls-generation", cl::Hidden,
109  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
110  cl::init(false));
111 
112 static cl::opt<bool>
113 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
114  cl::desc("Enable AArch64 logical imm instruction "
115  "optimization"),
116  cl::init(true));
117 
118 // Temporary option added for the purpose of testing functionality added
119 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
120 // in the future, once both implementations are based on MGATHER rather
121 // than the GLD1 nodes added for the SVE gather load intrinsics.
122 static cl::opt<bool>
123 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
124  cl::desc("Combine extends of AArch64 masked "
125  "gather intrinsics"),
126  cl::init(true));
127 
128 /// Value type used for condition codes.
129 static const MVT MVT_CC = MVT::i32;
130 
131 static inline EVT getPackedSVEVectorVT(EVT VT) {
132  switch (VT.getSimpleVT().SimpleTy) {
133  default:
134  llvm_unreachable("unexpected element type for vector");
135  case MVT::i8:
136  return MVT::nxv16i8;
137  case MVT::i16:
138  return MVT::nxv8i16;
139  case MVT::i32:
140  return MVT::nxv4i32;
141  case MVT::i64:
142  return MVT::nxv2i64;
143  case MVT::f16:
144  return MVT::nxv8f16;
145  case MVT::f32:
146  return MVT::nxv4f32;
147  case MVT::f64:
148  return MVT::nxv2f64;
149  case MVT::bf16:
150  return MVT::nxv8bf16;
151  }
152 }
153 
154 // NOTE: Currently there's only a need to return integer vector types. If this
155 // changes then just add an extra "type" parameter.
156 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
157  switch (EC.getKnownMinValue()) {
158  default:
159  llvm_unreachable("unexpected element count for vector");
160  case 16:
161  return MVT::nxv16i8;
162  case 8:
163  return MVT::nxv8i16;
164  case 4:
165  return MVT::nxv4i32;
166  case 2:
167  return MVT::nxv2i64;
168  }
169 }
170 
171 static inline EVT getPromotedVTForPredicate(EVT VT) {
173  "Expected scalable predicate vector type!");
174  switch (VT.getVectorMinNumElements()) {
175  default:
176  llvm_unreachable("unexpected element count for vector");
177  case 2:
178  return MVT::nxv2i64;
179  case 4:
180  return MVT::nxv4i32;
181  case 8:
182  return MVT::nxv8i16;
183  case 16:
184  return MVT::nxv16i8;
185  }
186 }
187 
188 /// Returns true if VT's elements occupy the lowest bit positions of its
189 /// associated register class without any intervening space.
190 ///
191 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
192 /// same register class, but only nxv8f16 can be treated as a packed vector.
193 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
194  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
195  "Expected legal vector type!");
196  return VT.isFixedLengthVector() ||
198 }
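// For illustration (layout sketch, not from this file): with the minimum
// 128-bit SVE granule, nxv8f16 is the packed form, one f16 in every 16-bit
// lane, whereas the equally legal nxv4f16 is unpacked, holding each f16 in
// the low half of a 32-bit container:
//   nxv8f16 (packed):   | h7 | h6 | h5 | h4 | h3 | h2 | h1 | h0 |
//   nxv4f16 (unpacked): | -- | h3 | -- | h2 | -- | h1 | -- | h0 |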
199 
200 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
201 // predicate and end with a passthru value matching the result type.
202 static bool isMergePassthruOpcode(unsigned Opc) {
203  switch (Opc) {
204  default:
205  return false;
234  return true;
235  }
236 }
237 
238 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
239  const AArch64Subtarget &STI)
240  : TargetLowering(TM), Subtarget(&STI) {
241  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
242  // we have to make something up. Arbitrarily, choose ZeroOrOne.
244  // When comparing vectors the result sets the different elements in the
245  // vector to all-one or all-zero.
247 
248  // Set up the register classes.
249  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
250  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
251 
252  if (Subtarget->hasLS64()) {
253  addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
256  }
257 
258  if (Subtarget->hasFPARMv8()) {
259  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
260  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
261  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
262  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
263  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
264  }
265 
266  if (Subtarget->hasNEON()) {
267  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
268  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
269  // Someone set us up the NEON.
270  addDRTypeForNEON(MVT::v2f32);
271  addDRTypeForNEON(MVT::v8i8);
272  addDRTypeForNEON(MVT::v4i16);
273  addDRTypeForNEON(MVT::v2i32);
274  addDRTypeForNEON(MVT::v1i64);
275  addDRTypeForNEON(MVT::v1f64);
276  addDRTypeForNEON(MVT::v4f16);
277  if (Subtarget->hasBF16())
278  addDRTypeForNEON(MVT::v4bf16);
279 
280  addQRTypeForNEON(MVT::v4f32);
281  addQRTypeForNEON(MVT::v2f64);
282  addQRTypeForNEON(MVT::v16i8);
283  addQRTypeForNEON(MVT::v8i16);
284  addQRTypeForNEON(MVT::v4i32);
285  addQRTypeForNEON(MVT::v2i64);
286  addQRTypeForNEON(MVT::v8f16);
287  if (Subtarget->hasBF16())
288  addQRTypeForNEON(MVT::v8bf16);
289  }
290 
291  if (Subtarget->hasSVE()) {
292  // Add legal SVE predicate types.
293  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
294  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
295  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
296  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
297 
298  // Add legal SVE data types.
299  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
300  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
301  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
302  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
303 
304  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
305  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
306  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
307  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
308  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
309  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
310 
311  if (Subtarget->hasBF16()) {
312  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
313  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
314  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
315  }
316 
317  if (Subtarget->useSVEForFixedLengthVectors()) {
318  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
319  if (useSVEForFixedLengthVectorVT(VT))
320  addRegisterClass(VT, &AArch64::ZPRRegClass);
321 
322  for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
323  if (useSVEForFixedLengthVectorVT(VT))
324  addRegisterClass(VT, &AArch64::ZPRRegClass);
325  }
326 
327  for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
336  }
337 
338  for (auto VT :
342 
343  for (auto VT :
345  MVT::nxv2f64 }) {
357 
369  }
370  }
371 
372  // Compute derived properties from the register classes
374 
375  // Provide all sorts of operation actions
409 
413 
417 
419 
420  // Custom lowering hooks are needed for XOR
421  // to fold it into CSINC/CSINV.
424 
425  // Virtually no operation on f128 is legal, but LLVM can't expand them when
426  // there's a valid register class, so we need custom operations in most cases.
450 
451  // Lowering for many of the conversions is actually specified by the non-f128
452  // type. The LowerXXX function will be trivial when f128 isn't involved.
483 
488 
489  // Variable arguments.
494 
495  // Variable-sized objects.
498 
499  if (Subtarget->isTargetWindows())
500  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
501  else
502  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
503 
504  // Constant pool entries
506 
507  // BlockAddress
509 
510  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
519 
520  // AArch64 lacks both left-rotate and popcount instructions.
523  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
526  }
527 
528  // AArch64 doesn't have i32 MULH{S|U}.
531 
532  // AArch64 doesn't have {U|S}MUL_LOHI.
535 
539 
542 
545  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
548  }
555 
556  // Custom lower Add/Sub/Mul with overflow.
569 
578  if (Subtarget->hasFullFP16())
580  else
582 
616 
617  if (!Subtarget->hasFullFP16()) {
641 
642  // promote v4f16 to v4f32 when that is known to be safe.
651 
668 
690  }
691 
692  // AArch64 has implementations of a lot of rounding-like FP operations.
693  for (MVT Ty : {MVT::f32, MVT::f64}) {
709  }
710 
711  if (Subtarget->hasFullFP16()) {
723  }
724 
726 
729 
735 
736  // Generate outline atomics library calls only if LSE was not specified for
737  // the subtarget.
738  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
764 #define LCALLNAMES(A, B, N) \
765  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
766  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
767  setLibcallName(A##N##_REL, #B #N "_rel"); \
768  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
769 #define LCALLNAME4(A, B) \
770  LCALLNAMES(A, B, 1) \
771  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
772 #define LCALLNAME5(A, B) \
773  LCALLNAMES(A, B, 1) \
774  LCALLNAMES(A, B, 2) \
775  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
776  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
777  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
778  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
779  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
780  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
781  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
782 #undef LCALLNAMES
783 #undef LCALLNAME4
784 #undef LCALLNAME5
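  // As a rough guide to what the macros above registered: each LCALLNAMES
  // expansion fills in four memory-order variants per access size, e.g.
  // LCALLNAMES(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd, 4) maps
  // OUTLINE_ATOMIC_LDADD4_ACQ_REL to the runtime helper
  // "__aarch64_ldadd4_acq_rel".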
785  }
786 
787  // 128-bit loads and stores can be done without expanding
790 
791  // Aligned 128-bit loads and stores are single-copy atomic according to the
792  // v8.4a spec.
793  if (Subtarget->hasLSE2()) {
796  }
797 
798  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
799  // custom lowering, as there are no un-paired non-temporal stores and
800  // legalization will break up 256 bit inputs.
808 
809  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
810  // This requires the Performance Monitors extension.
811  if (Subtarget->hasPerfMon())
812  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
813 
814  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
815  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
816  // Issue __sincos_stret if available.
819  } else {
822  }
823 
824  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
825  // MSVCRT doesn't have powi; fall back to pow
826  setLibcallName(RTLIB::POWI_F32, nullptr);
827  setLibcallName(RTLIB::POWI_F64, nullptr);
828  }
829 
830  // Make floating-point constants legal for the large code model, so they don't
831  // become loads from the constant pool.
832  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
835  }
836 
837  // AArch64 does not have floating-point extending loads, i1 sign-extending
838  // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
839  for (MVT VT : MVT::fp_valuetypes()) {
844  }
845  for (MVT VT : MVT::integer_valuetypes())
847 
855 
859 
860  // Indexed loads and stores are supported.
861  for (unsigned im = (unsigned)ISD::PRE_INC;
862  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
879  }
880 
881  // Trap.
885 
886  // We combine OR nodes for bitfield operations.
888  // Try to create BICs for vector ANDs.
890 
891  // Vector add and sub nodes may conceal a high-half opportunity.
892  // Also, try to fold ADD into CSINC/CSINV.
899 
905 
906  // Try and combine setcc with csel
908 
910 
920  if (Subtarget->supportsAddressTopByteIgnored())
921  setTargetDAGCombine(ISD::LOAD);
922 
924 
927 
934 
936 
938 
939  // In case of strict alignment, avoid an excessive number of byte wide stores.
943 
948 
950 
954 
956 
958 
959  EnableExtLdPromotion = true;
960 
961  // Set required alignment.
963  // Set preferred alignments.
967 
968  // Only change the limit for entries in a jump table if specified by
969  // the subtarget, but not at the command line.
970  unsigned MaxJT = STI.getMaximumJumpTableSize();
971  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
972  setMaximumJumpTableSize(MaxJT);
973 
974  setHasExtractBitsInsn(true);
975 
977 
978  if (Subtarget->hasNEON()) {
979  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
980  // silliness like this:
1007 
1013 
1016 
1018 
1019  // AArch64 doesn't have direct vector->f32 conversion instructions for
1020  // elements smaller than i32, so promote the input to i32 first.
1023 
1024  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1029  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1030  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1033 
1034  if (Subtarget->hasFullFP16()) {
1043  } else {
1044  // When AArch64 doesn't have full fp16 support, promote the input
1045  // to i32 first.
1054  }
1055 
1064  for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1069  }
1070 
1071  // AArch64 doesn't have MUL.2d:
1073  // Custom handling for some quad-vector types to detect MULL.
1077 
1078  // Saturates
1079  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1085  }
1086 
1088  MVT::v4i32}) {
1091  }
1092 
1093  // Vector reductions
1094  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1096  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1099 
1101  }
1102  }
1103  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1110  }
1112 
1115  // Likewise, narrowing and extending vector loads/stores aren't handled
1116  // directly.
1117  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1119 
1120  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1123  } else {
1126  }
1129 
1132 
1133  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1134  setTruncStoreAction(VT, InnerVT, Expand);
1135  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1136  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1137  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1138  }
1139  }
1140 
1141  // AArch64 has implementations of a lot of rounding-like FP operations.
1142  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1150  }
1151 
1152  if (Subtarget->hasFullFP16()) {
1153  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1161  }
1162  }
1163 
1164  if (Subtarget->hasSVE())
1165  setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1166 
1168 
1175  }
1176 
1177  if (Subtarget->hasSVE()) {
1178  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1217 
1223  }
1224 
1225  // Illegal unpacked integer vector types.
1226  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1229  }
1230 
1231  // Legalize unpacked bitcasts to REINTERPRET_CAST.
1235 
1236  for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1245 
1249 
1250  // There are no legal MVT::nxv16f## based types.
1251  if (VT != MVT::nxv16i1) {
1254  }
1255  }
1256 
1257  // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1265  }
1266 
1268  for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
1269  // Avoid marking truncating FP stores as legal to prevent the
1270  // DAGCombiner from creating unsupported truncating stores.
1271  setTruncStoreAction(VT, InnerVT, Expand);
1272  // SVE does not have floating-point extending loads.
1273  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1274  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1275  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1276  }
1277  }
1278 
1279  // SVE supports truncating stores of 64 and 128-bit vectors
1285 
1322 
1324  }
1325 
1326  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1332  }
1333 
1335 
1338 
1339  // NOTE: Currently this has to happen after computeRegisterProperties rather
1340  // than the preferred option of combining it with the addRegisterClass call.
1341  if (Subtarget->useSVEForFixedLengthVectors()) {
1342  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1343  if (useSVEForFixedLengthVectorVT(VT))
1344  addTypeForFixedLengthSVE(VT);
1345  for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1346  if (useSVEForFixedLengthVectorVT(VT))
1347  addTypeForFixedLengthSVE(VT);
1348 
1349  // 64-bit results can mean a bigger-than-NEON input.
1350  for (auto VT : {MVT::v8i8, MVT::v4i16})
1353 
1354  // 128-bit results imply a bigger-than-NEON input.
1355  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1357  for (auto VT : {MVT::v8f16, MVT::v4f32})
1359 
1360  // These operations are not supported on NEON but SVE can do them.
1399 
1400  // Int operations with no NEON support.
1401  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1408  }
1409 
1410  // FP operations with no NEON support.
1411  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1414 
1415  // Use SVE for vectors with more than 2 elements.
1416  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1418  }
1419 
1424  }
1425 
1427 }
1428 
1429 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1430  assert(VT.isVector() && "VT should be a vector type");
1431 
1432  if (VT.isFloatingPoint()) {
1434  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1435  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1436  }
1437 
1438  // Mark vector float intrinsics as expand.
1439  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1448  }
1449 
1450  // But we do support custom-lowering for FCOPYSIGN.
1451  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1452  ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1454 
1466 
1470  for (MVT InnerVT : MVT::all_valuetypes())
1471  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1472 
1473  // CNT supports only B element sizes, then use UADDLP to widen.
1474  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1475  setOperationAction(ISD::CTPOP, VT, Custom);
1476 
1482 
1487 
1488  if (!VT.isFloatingPoint())
1489  setOperationAction(ISD::ABS, VT, Legal);
1490 
1491  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1492  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1493  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1494  setOperationAction(Opcode, VT, Legal);
1495 
1496  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1497  if (VT.isFloatingPoint() &&
1498  VT.getVectorElementType() != MVT::bf16 &&
1499  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1500  for (unsigned Opcode :
1501  {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
1502  setOperationAction(Opcode, VT, Legal);
1503 
1504  if (Subtarget->isLittleEndian()) {
1505  for (unsigned im = (unsigned)ISD::PRE_INC;
1506  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1507  setIndexedLoadAction(im, VT, Legal);
1508  setIndexedStoreAction(im, VT, Legal);
1509  }
1510  }
1511 }
1512 
1513 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1514  EVT OpVT) const {
1515  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1516  if (!Subtarget->hasSVE())
1517  return true;
1518 
1519  // We can only support legal predicate result types.
1520  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1521  ResVT != MVT::nxv16i1)
1522  return true;
1523 
1524  // The whilelo instruction only works with i32 or i64 scalar inputs.
1525  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1526  return true;
1527 
1528  return false;
1529 }
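// For example (assumed mapping, shown as a sketch): a call such as
// @llvm.get.active.lane.mask.nxv4i1.i64 with i64 bounds is left alone here
// and is later matched to a single "whilelo p0.s, x0, x1", whereas a result
// type outside the legal predicate set (say nxv32i1) is expanded generically.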
1530 
1531 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1532  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1533 
1534  // By default everything must be expanded.
1535  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1537 
1538  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1540 
1541  if (VT.isFloatingPoint()) {
1553  }
1554 
1555  // Mark integer truncating stores/extending loads as having custom lowering
1556  if (VT.isInteger()) {
1557  MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1558  while (InnerVT != VT) {
1559  setTruncStoreAction(VT, InnerVT, Custom);
1560  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1561  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1562  InnerVT = InnerVT.changeVectorElementType(
1563  MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1564  }
1565  }
1566 
1567  // Mark floating-point truncating stores/extending loads as having custom
1568  // lowering
1569  if (VT.isFloatingPoint()) {
1570  MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1571  while (InnerVT != VT) {
1572  setTruncStoreAction(VT, InnerVT, Custom);
1573  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1574  InnerVT = InnerVT.changeVectorElementType(
1576  }
1577  }
1578 
1579  // Lower fixed length vector operations to scalable equivalents.
1661 }
1662 
1663 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1664  addRegisterClass(VT, &AArch64::FPR64RegClass);
1665  addTypeForNEON(VT);
1666 }
1667 
1668 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1669  addRegisterClass(VT, &AArch64::FPR128RegClass);
1670  addTypeForNEON(VT);
1671 }
1672 
1673 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1674  LLVMContext &C, EVT VT) const {
1675  if (!VT.isVector())
1676  return MVT::i32;
1677  if (VT.isScalableVector())
1678  return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1679  return VT.changeVectorElementTypeToInteger();
1680 }
1681 
1682 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1683  const APInt &Demanded,
1684  TargetLoweringOpt &TLO,
1685  unsigned NewOpc) {
1686  uint64_t OldImm = Imm, NewImm, Enc;
1687  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1688 
1689  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1690  // bimm64.
1691  if (Imm == 0 || Imm == Mask ||
1692  AArch64_AM::isLogicalImmediate(Imm, Size))
1693  return false;
1694 
1695  unsigned EltSize = Size;
1696  uint64_t DemandedBits = Demanded.getZExtValue();
1697 
1698  // Clear bits that are not demanded.
1699  Imm &= DemandedBits;
1700 
1701  while (true) {
1702  // The goal here is to set the non-demanded bits in a way that minimizes
1703  // the number of switching between 0 and 1. In order to achieve this goal,
1704  // we set the non-demanded bits to the value of the preceding demanded bits.
1705  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1706  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1707  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1708  // The final result is 0b11000011.
1709  uint64_t NonDemandedBits = ~DemandedBits;
1710  uint64_t InvertedImm = ~Imm & DemandedBits;
1711  uint64_t RotatedImm =
1712  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1713  NonDemandedBits;
1714  uint64_t Sum = RotatedImm + NonDemandedBits;
1715  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1716  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1717  NewImm = (Imm | Ones) & Mask;
1718 
1719  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1720  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1721  // we halve the element size and continue the search.
1722  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1723  break;
1724 
1725  // We cannot shrink the element size any further if it is 2 bits.
1726  if (EltSize == 2)
1727  return false;
1728 
1729  EltSize /= 2;
1730  Mask >>= EltSize;
1731  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1732 
1733  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1734  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1735  return false;
1736 
1737  // Merge the upper and lower halves of Imm and DemandedBits.
1738  Imm |= Hi;
1739  DemandedBits |= DemandedBitsHi;
1740  }
1741 
1742  ++NumOptimizedImms;
1743 
1744  // Replicate the element across the register width.
1745  while (EltSize < Size) {
1746  NewImm |= NewImm << EltSize;
1747  EltSize *= 2;
1748  }
1749 
1750  (void)OldImm;
1751  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1752  "demanded bits should never be altered");
1753  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1754 
1755  // Create the new constant immediate node.
1756  EVT VT = Op.getValueType();
1757  SDLoc DL(Op);
1758  SDValue New;
1759 
1760  // If the new constant immediate is all-zeros or all-ones, let the target
1761  // independent DAG combine optimize this node.
1762  if (NewImm == 0 || NewImm == OrigMask) {
1763  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1764  TLO.DAG.getConstant(NewImm, DL, VT));
1765  // Otherwise, create a machine node so that target independent DAG combine
1766  // doesn't undo this optimization.
1767  } else {
1768  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1769  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1770  New = SDValue(
1771  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1772  }
1773 
1774  return TLO.CombineTo(Op, New);
1775 }
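// A worked example of the rewrite above (hypothetical values, for
// illustration only): a 32-bit AND with Imm = 0x00F000FF is not encodable as
// a logical immediate. If only the low byte is demanded (Demanded = 0xFF),
// the highest demanded bit is 1, so every non-demanded bit above it is filled
// with ones, giving NewImm = 0xFFFFFFFF. Being all-ones, the constant is
// rebuilt as a plain ISD::AND node and the generic combiner then deletes the
// AND altogether.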
1776 
1777 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1778  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1779  TargetLoweringOpt &TLO) const {
1780  // Delay this optimization to as late as possible.
1781  if (!TLO.LegalOps)
1782  return false;
1783 
1784  if (!EnableOptimizeLogicalImm)
1785  return false;
1786 
1787  EVT VT = Op.getValueType();
1788  if (VT.isVector())
1789  return false;
1790 
1791  unsigned Size = VT.getSizeInBits();
1792  assert((Size == 32 || Size == 64) &&
1793  "i32 or i64 is expected after legalization.");
1794 
1795  // Exit early if we demand all bits.
1796  if (DemandedBits.countPopulation() == Size)
1797  return false;
1798 
1799  unsigned NewOpc;
1800  switch (Op.getOpcode()) {
1801  default:
1802  return false;
1803  case ISD::AND:
1804  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1805  break;
1806  case ISD::OR:
1807  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1808  break;
1809  case ISD::XOR:
1810  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1811  break;
1812  }
1813  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1814  if (!C)
1815  return false;
1816  uint64_t Imm = C->getZExtValue();
1817  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1818 }
1819 
1820 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1821 /// Mask are known to be either zero or one and return them in Known.
1822 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1823  const SDValue Op, KnownBits &Known,
1824  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1825  switch (Op.getOpcode()) {
1826  default:
1827  break;
1828  case AArch64ISD::CSEL: {
1829  KnownBits Known2;
1830  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1831  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1832  Known = KnownBits::commonBits(Known, Known2);
1833  break;
1834  }
1835  case AArch64ISD::LOADgot:
1836  case AArch64ISD::ADDlow: {
1837  if (!Subtarget->isTargetILP32())
1838  break;
1839  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1840  Known.Zero = APInt::getHighBitsSet(64, 32);
1841  break;
1842  }
1844  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1845  Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1846  break;
1847  }
1848  case ISD::INTRINSIC_W_CHAIN: {
1849  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1850  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1851  switch (IntID) {
1852  default: return;
1853  case Intrinsic::aarch64_ldaxr:
1854  case Intrinsic::aarch64_ldxr: {
1855  unsigned BitWidth = Known.getBitWidth();
1856  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1857  unsigned MemBits = VT.getScalarSizeInBits();
1858  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1859  return;
1860  }
1861  }
1862  break;
1863  }
1865  case ISD::INTRINSIC_VOID: {
1866  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1867  switch (IntNo) {
1868  default:
1869  break;
1870  case Intrinsic::aarch64_neon_umaxv:
1871  case Intrinsic::aarch64_neon_uminv: {
1872  // Figure out the datatype of the vector operand. The UMINV instruction
1873  // will zero extend the result, so we can mark as known zero all the
1874  // bits larger than the element datatype. 32-bit or larger doesn't need
1875  // this as those are legal types and will be handled by isel directly.
1876  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1877  unsigned BitWidth = Known.getBitWidth();
1878  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1879  assert(BitWidth >= 8 && "Unexpected width!");
1880  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1881  Known.Zero |= Mask;
1882  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1883  assert(BitWidth >= 16 && "Unexpected width!");
1884  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1885  Known.Zero |= Mask;
1886  }
1887  break;
1888  } break;
1889  }
1890  }
1891  }
1892 }
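// For example, for an i8 @llvm.aarch64.ldxr the memory type is i8, so the
// intrinsic case above marks the upper BitWidth-8 bits of the zero-extended
// result as known zero, which lets later combines drop redundant masking
// such as "and x0, x0, #0xff".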
1893 
1894 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1895  EVT) const {
1896  return MVT::i64;
1897 }
1898 
1899 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1900  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1901  bool *Fast) const {
1902  if (Subtarget->requiresStrictAlign())
1903  return false;
1904 
1905  if (Fast) {
1906  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1907  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1908  // See comments in performSTORECombine() for more details about
1909  // these conditions.
1910 
1911  // Code that uses clang vector extensions can mark that it
1912  // wants unaligned accesses to be treated as fast by
1913  // underspecifying alignment to be 1 or 2.
1914  Alignment <= 2 ||
1915 
1916  // Disregard v2i64. Memcpy lowering produces those and splitting
1917  // them regresses performance on micro-benchmarks and olden/bh.
1918  VT == MVT::v2i64;
1919  }
1920  return true;
1921 }
1922 
1923 // Same as above but handling LLTs instead.
1924 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1925  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1926  bool *Fast) const {
1927  if (Subtarget->requiresStrictAlign())
1928  return false;
1929 
1930  if (Fast) {
1931  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1932  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1933  Ty.getSizeInBytes() != 16 ||
1934  // See comments in performSTORECombine() for more details about
1935  // these conditions.
1936 
1937  // Code that uses clang vector extensions can mark that it
1938  // wants unaligned accesses to be treated as fast by
1939  // underspecifying alignment to be 1 or 2.
1940  Alignment <= 2 ||
1941 
1942  // Disregard v2i64. Memcpy lowering produces those and splitting
1943  // them regresses performance on micro-benchmarks and olden/bh.
1944  Ty == LLT::fixed_vector(2, 64);
1945  }
1946  return true;
1947 }
1948 
1949 FastISel *
1950 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1951  const TargetLibraryInfo *libInfo) const {
1952  return AArch64::createFastISel(funcInfo, libInfo);
1953 }
1954 
1955 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1956 #define MAKE_CASE(V) \
1957  case V: \
1958  return #V;
1959  switch ((AArch64ISD::NodeType)Opcode) {
1961  break;
2243  }
2244 #undef MAKE_CASE
2245  return nullptr;
2246 }
2247 
2248 MachineBasicBlock *
2249 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2250  MachineBasicBlock *MBB) const {
2251  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2252  // phi node:
2253 
2254  // OrigBB:
2255  // [... previous instrs leading to comparison ...]
2256  // b.ne TrueBB
2257  // b EndBB
2258  // TrueBB:
2259  // ; Fallthrough
2260  // EndBB:
2261  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2262 
2263  MachineFunction *MF = MBB->getParent();
2264  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2265  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2266  DebugLoc DL = MI.getDebugLoc();
2267  MachineFunction::iterator It = ++MBB->getIterator();
2268 
2269  Register DestReg = MI.getOperand(0).getReg();
2270  Register IfTrueReg = MI.getOperand(1).getReg();
2271  Register IfFalseReg = MI.getOperand(2).getReg();
2272  unsigned CondCode = MI.getOperand(3).getImm();
2273  bool NZCVKilled = MI.getOperand(4).isKill();
2274 
2275  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2276  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2277  MF->insert(It, TrueBB);
2278  MF->insert(It, EndBB);
2279 
2280  // Transfer the rest of the current basic block to EndBB.
2281  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2282  MBB->end());
2283  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2284 
2285  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2286  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2287  MBB->addSuccessor(TrueBB);
2288  MBB->addSuccessor(EndBB);
2289 
2290  // TrueBB falls through to the end.
2291  TrueBB->addSuccessor(EndBB);
2292 
2293  if (!NZCVKilled) {
2294  TrueBB->addLiveIn(AArch64::NZCV);
2295  EndBB->addLiveIn(AArch64::NZCV);
2296  }
2297 
2298  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2299  .addReg(IfTrueReg)
2300  .addMBB(TrueBB)
2301  .addReg(IfFalseReg)
2302  .addMBB(MBB);
2303 
2304  MI.eraseFromParent();
2305  return EndBB;
2306 }
2307 
2308 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2309  MachineInstr &MI, MachineBasicBlock *BB) const {
2310  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2311  BB->getParent()->getFunction().getPersonalityFn())) &&
2312  "SEH does not use catchret!");
2313  return BB;
2314 }
2315 
2316 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2317  MachineInstr &MI, MachineBasicBlock *BB) const {
2318  switch (MI.getOpcode()) {
2319  default:
2320 #ifndef NDEBUG
2321  MI.dump();
2322 #endif
2323  llvm_unreachable("Unexpected instruction for custom inserter!");
2324 
2325  case AArch64::F128CSEL:
2326  return EmitF128CSEL(MI, BB);
2327 
2328  case TargetOpcode::STATEPOINT:
2329  // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2330  // while the bl call instruction (where the statepoint will be lowered at the
2331  // end) has an implicit def. Add this implicit dead def here as a workaround.
2332  MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true,
2333  true, false, true));
2334  LLVM_FALLTHROUGH;
2335  case TargetOpcode::STACKMAP:
2336  case TargetOpcode::PATCHPOINT:
2337  return emitPatchPoint(MI, BB);
2338 
2339  case AArch64::CATCHRET:
2340  return EmitLoweredCatchRet(MI, BB);
2341  }
2342 }
2343 
2344 //===----------------------------------------------------------------------===//
2345 // AArch64 Lowering private implementation.
2346 //===----------------------------------------------------------------------===//
2347 
2348 //===----------------------------------------------------------------------===//
2349 // Lowering Code
2350 //===----------------------------------------------------------------------===//
2351 
2352 // Forward declarations of SVE fixed length lowering helpers
2357  SelectionDAG &DAG);
2359  EVT VT);
2360 
2361 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2362 static bool isZerosVector(const SDNode *N) {
2363  // Look through a bit convert.
2364  while (N->getOpcode() == ISD::BITCAST)
2365  N = N->getOperand(0).getNode();
2366 
2367  if (ISD::isConstantSplatVectorAllZeros(N))
2368  return true;
2369 
2370  if (N->getOpcode() != AArch64ISD::DUP)
2371  return false;
2372 
2373  auto Opnd0 = N->getOperand(0);
2374  auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2375  auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2376  return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
2377 }
2378 
2379 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2380 /// CC
2381 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2382  switch (CC) {
2383  default:
2384  llvm_unreachable("Unknown condition code!");
2385  case ISD::SETNE:
2386  return AArch64CC::NE;
2387  case ISD::SETEQ:
2388  return AArch64CC::EQ;
2389  case ISD::SETGT:
2390  return AArch64CC::GT;
2391  case ISD::SETGE:
2392  return AArch64CC::GE;
2393  case ISD::SETLT:
2394  return AArch64CC::LT;
2395  case ISD::SETLE:
2396  return AArch64CC::LE;
2397  case ISD::SETUGT:
2398  return AArch64CC::HI;
2399  case ISD::SETUGE:
2400  return AArch64CC::HS;
2401  case ISD::SETULT:
2402  return AArch64CC::LO;
2403  case ISD::SETULE:
2404  return AArch64CC::LS;
2405  }
2406 }
2407 
2408 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2409 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2410  AArch64CC::CondCode &CondCode,
2411  AArch64CC::CondCode &CondCode2) {
2412  CondCode2 = AArch64CC::AL;
2413  switch (CC) {
2414  default:
2415  llvm_unreachable("Unknown FP condition!");
2416  case ISD::SETEQ:
2417  case ISD::SETOEQ:
2418  CondCode = AArch64CC::EQ;
2419  break;
2420  case ISD::SETGT:
2421  case ISD::SETOGT:
2422  CondCode = AArch64CC::GT;
2423  break;
2424  case ISD::SETGE:
2425  case ISD::SETOGE:
2426  CondCode = AArch64CC::GE;
2427  break;
2428  case ISD::SETOLT:
2429  CondCode = AArch64CC::MI;
2430  break;
2431  case ISD::SETOLE:
2432  CondCode = AArch64CC::LS;
2433  break;
2434  case ISD::SETONE:
2435  CondCode = AArch64CC::MI;
2436  CondCode2 = AArch64CC::GT;
2437  break;
2438  case ISD::SETO:
2439  CondCode = AArch64CC::VC;
2440  break;
2441  case ISD::SETUO:
2442  CondCode = AArch64CC::VS;
2443  break;
2444  case ISD::SETUEQ:
2445  CondCode = AArch64CC::EQ;
2446  CondCode2 = AArch64CC::VS;
2447  break;
2448  case ISD::SETUGT:
2449  CondCode = AArch64CC::HI;
2450  break;
2451  case ISD::SETUGE:
2452  CondCode = AArch64CC::PL;
2453  break;
2454  case ISD::SETLT:
2455  case ISD::SETULT:
2456  CondCode = AArch64CC::LT;
2457  break;
2458  case ISD::SETLE:
2459  case ISD::SETULE:
2460  CondCode = AArch64CC::LE;
2461  break;
2462  case ISD::SETNE:
2463  case ISD::SETUNE:
2464  CondCode = AArch64CC::NE;
2465  break;
2466  }
2467 }
2468 
2469 /// Convert a DAG fp condition code to an AArch64 CC.
2470 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2471 /// should be AND'ed instead of OR'ed.
2472 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2473  AArch64CC::CondCode &CondCode,
2474  AArch64CC::CondCode &CondCode2) {
2475  CondCode2 = AArch64CC::AL;
2476  switch (CC) {
2477  default:
2478  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2479  assert(CondCode2 == AArch64CC::AL);
2480  break;
2481  case ISD::SETONE:
2482  // (a one b)
2483  // == ((a olt b) || (a ogt b))
2484  // == ((a ord b) && (a une b))
2485  CondCode = AArch64CC::VC;
2486  CondCode2 = AArch64CC::NE;
2487  break;
2488  case ISD::SETUEQ:
2489  // (a ueq b)
2490  // == ((a uno b) || (a oeq b))
2491  // == ((a ule b) && (a uge b))
2492  CondCode = AArch64CC::PL;
2493  CondCode2 = AArch64CC::LE;
2494  break;
2495  }
2496 }
2497 
2498 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2499 /// CC usable with the vector instructions. Fewer operations are available
2500 /// without a real NZCV register, so we have to use less efficient combinations
2501 /// to get the same effect.
2502 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2503  AArch64CC::CondCode &CondCode,
2504  AArch64CC::CondCode &CondCode2,
2505  bool &Invert) {
2506  Invert = false;
2507  switch (CC) {
2508  default:
2509  // Mostly the scalar mappings work fine.
2510  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2511  break;
2512  case ISD::SETUO:
2513  Invert = true;
2514  LLVM_FALLTHROUGH;
2515  case ISD::SETO:
2516  CondCode = AArch64CC::MI;
2517  CondCode2 = AArch64CC::GE;
2518  break;
2519  case ISD::SETUEQ:
2520  case ISD::SETULT:
2521  case ISD::SETULE:
2522  case ISD::SETUGT:
2523  case ISD::SETUGE:
2524  // All of the compare-mask comparisons are ordered, but we can switch
2525  // between the two by a double inversion. E.g. ULE == !OGT.
2526  Invert = true;
2527  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2528  CondCode, CondCode2);
2529  break;
2530  }
2531 }
2532 
2533 static bool isLegalArithImmed(uint64_t C) {
2534  // Matches AArch64DAGToDAGISel::SelectArithImmed().
2535  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2536  LLVM_DEBUG(dbgs() << "Is imm " << C
2537  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2538  return IsLegal;
2539 }
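// In other words (sketch of the encoding rule): a legal arithmetic immediate
// is either a 12-bit value (0..4095) or such a value shifted left by 12, so
// e.g. 4095 and 0x1000 are accepted while 4097 is not and has to be
// materialised into a register first.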
2540 
2541 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2542 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2543 // can be set differently by this operation. It comes down to whether
2544 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2545 // everything is fine. If not then the optimization is wrong. Thus general
2546 // comparisons are only valid if op2 != 0.
2547 //
2548 // So, finally, the only LLVM-native comparisons that don't mention C and V
2549 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2550 // the absence of information about op2.
2551 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2552  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2553  (CC == ISD::SETEQ || CC == ISD::SETNE);
2554 }
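// For example, an equality test written as "icmp eq i32 %x, (sub 0, %y)"
// passes the check above and can end up as "cmn w0, w1" (an ADDS whose result
// is discarded) instead of a negate followed by a cmp. This is only a sketch
// of the intent; the actual rewrite is done in emitComparison()/getAArch64Cmp()
// below.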
2555 
2556 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2557  SelectionDAG &DAG, SDValue Chain,
2558  bool IsSignaling) {
2559  EVT VT = LHS.getValueType();
2560  assert(VT != MVT::f128);
2561  assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2562  unsigned Opcode =
2563  IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2564  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2565 }
2566 
2567 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2568  const SDLoc &dl, SelectionDAG &DAG) {
2569  EVT VT = LHS.getValueType();
2570  const bool FullFP16 =
2571  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2572 
2573  if (VT.isFloatingPoint()) {
2574  assert(VT != MVT::f128);
2575  if (VT == MVT::f16 && !FullFP16) {
2576  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2577  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2578  VT = MVT::f32;
2579  }
2580  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2581  }
2582 
2583  // The CMP instruction is just an alias for SUBS, and representing it as
2584  // SUBS means that it's possible to get CSE with subtract operations.
2585  // A later phase can perform the optimization of setting the destination
2586  // register to WZR/XZR if it ends up being unused.
2587  unsigned Opcode = AArch64ISD::SUBS;
2588 
2589  if (isCMN(RHS, CC)) {
2590  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2591  Opcode = AArch64ISD::ADDS;
2592  RHS = RHS.getOperand(1);
2593  } else if (isCMN(LHS, CC)) {
2594  // As we are looking for EQ/NE compares, the operands can be commuted; can
2595  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2596  Opcode = AArch64ISD::ADDS;
2597  LHS = LHS.getOperand(1);
2598  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2599  if (LHS.getOpcode() == ISD::AND) {
2600  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2601  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2602  // of the signed comparisons.
2603  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2604  DAG.getVTList(VT, MVT_CC),
2605  LHS.getOperand(0),
2606  LHS.getOperand(1));
2607  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2608  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2609  return ANDSNode.getValue(1);
2610  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2611  // Use result of ANDS
2612  return LHS.getValue(1);
2613  }
2614  }
2615 
2616  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2617  .getValue(1);
2618 }
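// As a small example of the ANDS reuse above (assumed lowering, sketch only):
// for "if ((x & y) == 0)" the AND feeding the compare is rewritten into
//   ands wzr, w0, w1    // i.e. "tst w0, w1"
//   b.eq ...
// so no separate "and" plus "cmp #0" pair is emitted.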
2619 
2620 /// \defgroup AArch64CCMP CMP;CCMP matching
2621 ///
2622 /// These functions deal with the formation of CMP;CCMP;... sequences.
2623 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2624 /// a comparison. They set the NZCV flags to a predefined value if their
2625 /// predicate is false. This allows us to express arbitrary conjunctions, for
2626 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2627 /// expressed as:
2628 /// cmp A
2629 /// ccmp B, inv(CB), CA
2630 /// check for CB flags
2631 ///
2632 /// This naturally lets us implement chains of AND operations with SETCC
2633 /// operands. And we can even implement some other situations by transforming
2634 /// them:
2635 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2636 /// negating the flags used in a CCMP/FCCMP operations.
2637 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2638 /// by negating the flags we test for afterwards. i.e.
2639 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2640 /// - Note that we can only ever negate all previously processed results.
2641 /// What we can not implement by flipping the flags to test is a negation
2642 /// of two sub-trees (because the negation affects all sub-trees emitted so
2643 /// far, so the 2nd sub-tree we emit would also affect the first).
2644 /// With those tools we can implement some OR operations:
2645 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2646 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2647 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2648 /// elimination rules from earlier to implement the whole thing as a
2649 /// CCMP/FCCMP chain.
2650 ///
2651 /// As complete example:
2652 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2653 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2654 /// can be reassociated to:
2655 /// or (and (setCC (cmp C)) (setCD (cmp D)))
2656 /// (or (setCA (cmp A)) (setCB (cmp B)))
2657 /// can be transformed to:
2658 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2659 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2660 /// which can be implemented as:
2661 /// cmp C
2662 /// ccmp D, inv(CD), CC
2663 /// ccmp A, CA, inv(CD)
2664 /// ccmp B, CB, inv(CA)
2665 /// check for CB flags
2666 ///
2667 /// A counterexample is "or (and A B) (and C D)" which translates to
2668 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2669 /// can only implement 1 of the inner (not) operations, but not both!
2670 /// @{
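// A concrete flavour of the sequences described above (assumed codegen, shown
// only as a sketch): a condition like "a == 0 && b > 5" can be evaluated
// without branches as
//   cmp  w0, #0              // flags from the first compare
//   ccmp w1, #5, #4, eq      // if eq held, compare b with 5; else force Z=1
//   cset w8, gt              // 1 iff both conditions held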
2671 
2672 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2673 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2674  ISD::CondCode CC, SDValue CCOp,
2675  AArch64CC::CondCode Predicate,
2676  AArch64CC::CondCode OutCC,
2677  const SDLoc &DL, SelectionDAG &DAG) {
2678  unsigned Opcode = 0;
2679  const bool FullFP16 =
2680  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2681 
2682  if (LHS.getValueType().isFloatingPoint()) {
2683  assert(LHS.getValueType() != MVT::f128);
2684  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2685  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2686  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2687  }
2688  Opcode = AArch64ISD::FCCMP;
2689  } else if (RHS.getOpcode() == ISD::SUB) {
2690  SDValue SubOp0 = RHS.getOperand(0);
2691  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2692  // See emitComparison() on why we can only do this for SETEQ and SETNE.
2693  Opcode = AArch64ISD::CCMN;
2694  RHS = RHS.getOperand(1);
2695  }
2696  }
2697  if (Opcode == 0)
2698  Opcode = AArch64ISD::CCMP;
2699 
2700  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2701  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2702  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2703  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2704  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2705 }
2706 
2707 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2708 /// expressed as a conjunction. See \ref AArch64CCMP.
2709 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2710 /// changing the conditions on the SETCC tests.
2711 /// (this means we can call emitConjunctionRec() with
2712 /// Negate==true on this sub-tree)
2713 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2714 /// cannot do the negation naturally. We are required to
2715 /// emit the subtree first in this case.
2716 /// \param WillNegate Is true if we are called when the result of this
2717 /// subexpression must be negated. This happens when the
2718 /// outer expression is an OR. We can use this fact to know
2719 /// that we have a double negation (or (or ...) ...) that
2720 /// can be implemented for free.
2721 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2722  bool &MustBeFirst, bool WillNegate,
2723  unsigned Depth = 0) {
2724  if (!Val.hasOneUse())
2725  return false;
2726  unsigned Opcode = Val->getOpcode();
2727  if (Opcode == ISD::SETCC) {
2728  if (Val->getOperand(0).getValueType() == MVT::f128)
2729  return false;
2730  CanNegate = true;
2731  MustBeFirst = false;
2732  return true;
2733  }
2734  // Protect against exponential runtime and stack overflow.
2735  if (Depth > 6)
2736  return false;
2737  if (Opcode == ISD::AND || Opcode == ISD::OR) {
2738  bool IsOR = Opcode == ISD::OR;
2739  SDValue O0 = Val->getOperand(0);
2740  SDValue O1 = Val->getOperand(1);
2741  bool CanNegateL;
2742  bool MustBeFirstL;
2743  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2744  return false;
2745  bool CanNegateR;
2746  bool MustBeFirstR;
2747  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2748  return false;
2749 
2750  if (MustBeFirstL && MustBeFirstR)
2751  return false;
2752 
2753  if (IsOR) {
2754  // For an OR expression we need to be able to naturally negate at least
2755  // one side or we cannot do the transformation at all.
2756  if (!CanNegateL && !CanNegateR)
2757  return false;
2758  // If the result of the OR will be negated and we can naturally negate
2759  // the leaves, then this sub-tree as a whole negates naturally.
2760  CanNegate = WillNegate && CanNegateL && CanNegateR;
2761  // If we cannot naturally negate the whole sub-tree, then this must be
2762  // emitted first.
2763  MustBeFirst = !CanNegate;
2764  } else {
2765  assert(Opcode == ISD::AND && "Must be OR or AND");
2766  // We cannot naturally negate an AND operation.
2767  CanNegate = false;
2768  MustBeFirst = MustBeFirstL || MustBeFirstR;
2769  }
2770  return true;
2771  }
2772  return false;
2773 }
2774 
2775 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2776 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2777 /// Tries to transform the given i1 producing node @p Val to a series of
2778 /// compare and conditional compare operations. @returns an NZCV flags
2779 /// producing node and sets @p OutCC to the flags that should be tested, or
2780 /// returns SDValue() if the transformation was not possible.
2781 /// \p Negate is true if we want this sub-tree being negated just by changing
2782 /// SETCC conditions.
2783 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2784  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2785  AArch64CC::CondCode Predicate) {
2786  // We're at a tree leaf, produce a conditional comparison operation.
2787  unsigned Opcode = Val->getOpcode();
2788  if (Opcode == ISD::SETCC) {
2789  SDValue LHS = Val->getOperand(0);
2790  SDValue RHS = Val->getOperand(1);
2791  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2792  bool isInteger = LHS.getValueType().isInteger();
2793  if (Negate)
2794  CC = getSetCCInverse(CC, LHS.getValueType());
2795  SDLoc DL(Val);
2796  // Determine OutCC and handle FP special case.
2797  if (isInteger) {
2798  OutCC = changeIntCCToAArch64CC(CC);
2799  } else {
2800  assert(LHS.getValueType().isFloatingPoint());
2801  AArch64CC::CondCode ExtraCC;
2802  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2803  // Some floating point conditions can't be tested with a single condition
2804  // code. Construct an additional comparison in this case.
2805  if (ExtraCC != AArch64CC::AL) {
2806  SDValue ExtraCmp;
2807  if (!CCOp.getNode())
2808  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2809  else
2810  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2811  ExtraCC, DL, DAG);
2812  CCOp = ExtraCmp;
2813  Predicate = ExtraCC;
2814  }
2815  }
2816 
2817  // Produce a normal comparison if we are first in the chain
2818  if (!CCOp)
2819  return emitComparison(LHS, RHS, CC, DL, DAG);
2820  // Otherwise produce a ccmp.
2821  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2822  DAG);
2823  }
2824  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2825 
2826  bool IsOR = Opcode == ISD::OR;
2827 
2828  SDValue LHS = Val->getOperand(0);
2829  bool CanNegateL;
2830  bool MustBeFirstL;
2831  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2832  assert(ValidL && "Valid conjunction/disjunction tree");
2833  (void)ValidL;
2834 
2835  SDValue RHS = Val->getOperand(1);
2836  bool CanNegateR;
2837  bool MustBeFirstR;
2838  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2839  assert(ValidR && "Valid conjunction/disjunction tree");
2840  (void)ValidR;
2841 
2842  // Swap sub-tree that must come first to the right side.
2843  if (MustBeFirstL) {
2844  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2845  std::swap(LHS, RHS);
2846  std::swap(CanNegateL, CanNegateR);
2847  std::swap(MustBeFirstL, MustBeFirstR);
2848  }
2849 
2850  bool NegateR;
2851  bool NegateAfterR;
2852  bool NegateL;
2853  bool NegateAfterAll;
2854  if (Opcode == ISD::OR) {
2855  // Swap the sub-tree that we can negate naturally to the left.
2856  if (!CanNegateL) {
2857  assert(CanNegateR && "at least one side must be negatable");
2858  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2859  assert(!Negate);
2860  std::swap(LHS, RHS);
2861  NegateR = false;
2862  NegateAfterR = true;
2863  } else {
2864  // Negate the left sub-tree if possible, otherwise negate the result.
2865  NegateR = CanNegateR;
2866  NegateAfterR = !CanNegateR;
2867  }
2868  NegateL = true;
2869  NegateAfterAll = !Negate;
2870  } else {
2871  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2872  assert(!Negate && "Valid conjunction/disjunction tree");
2873 
2874  NegateL = false;
2875  NegateR = false;
2876  NegateAfterR = false;
2877  NegateAfterAll = false;
2878  }
2879 
2880  // Emit sub-trees.
2881  AArch64CC::CondCode RHSCC;
2882  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2883  if (NegateAfterR)
2884  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2885  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2886  if (NegateAfterAll)
2887  OutCC = AArch64CC::getInvertedCondCode(OutCC);
2888  return CmpL;
2889 }
2890 
2891 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2892 /// In some cases this is even possible with OR operations in the expression.
2893 /// See \ref AArch64CCMP.
2894 /// \see emitConjunctionRec().
2895 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2896  AArch64CC::CondCode &OutCC) {
2897  bool DummyCanNegate;
2898  bool DummyMustBeFirst;
2899  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2900  return SDValue();
2901 
2902  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2903 }
2904 
2905 /// @}
2906 
2907 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2908 /// extension operations.
2909 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
2910  auto isSupportedExtend = [&](SDValue V) {
2911  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2912  return true;
2913 
2914  if (V.getOpcode() == ISD::AND)
2915  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2916  uint64_t Mask = MaskCst->getZExtValue();
2917  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2918  }
2919 
2920  return false;
2921  };
2922 
2923  if (!Op.hasOneUse())
2924  return 0;
2925 
2926  if (isSupportedExtend(Op))
2927  return 1;
2928 
2929  unsigned Opc = Op.getOpcode();
2930  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2931  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2932  uint64_t Shift = ShiftCst->getZExtValue();
2933  if (isSupportedExtend(Op.getOperand(0)))
2934  return (Shift <= 4) ? 2 : 1;
2935  EVT VT = Op.getValueType();
2936  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2937  return 1;
2938  }
2939 
2940  return 0;
2941 }
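// For instance (assembly shown as an assumed example): given "cmp w13, w12"
// where w13 = (w11 & 0xff) << 2, the zero-extend and the shift can be folded
// into the second operand as "cmp w12, w11, uxtb #2" once the operands are
// swapped; the scores returned above are what getAArch64Cmp() compares to
// decide whether that swap pays off.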
2942 
2943 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2944  SDValue &AArch64cc, SelectionDAG &DAG,
2945  const SDLoc &dl) {
2946  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2947  EVT VT = RHS.getValueType();
2948  uint64_t C = RHSC->getZExtValue();
2949  if (!isLegalArithImmed(C)) {
2950  // Constant does not fit, try adjusting it by one?
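 // For example, (setlt x, 0x1001) uses a constant that is not encodable as a
 // 12-bit immediate (shifted or not), but it is equivalent to
 // (setle x, 0x1000), and 0x1000 is encodable as a shifted immediate.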
2951  switch (CC) {
2952  default:
2953  break;
2954  case ISD::SETLT:
2955  case ISD::SETGE:
2956  if ((VT == MVT::i32 && C != 0x80000000 &&
2957  isLegalArithImmed((uint32_t)(C - 1))) ||
2958  (VT == MVT::i64 && C != 0x80000000ULL &&
2959  isLegalArithImmed(C - 1ULL))) {
2960  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2961  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2962  RHS = DAG.getConstant(C, dl, VT);
2963  }
2964  break;
2965  case ISD::SETULT:
2966  case ISD::SETUGE:
2967  if ((VT == MVT::i32 && C != 0 &&
2968  isLegalArithImmed((uint32_t)(C - 1))) ||
2969  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2970  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2971  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2972  RHS = DAG.getConstant(C, dl, VT);
2973  }
2974  break;
2975  case ISD::SETLE:
2976  case ISD::SETGT:
2977  if ((VT == MVT::i32 && C != INT32_MAX &&
2978  isLegalArithImmed((uint32_t)(C + 1))) ||
2979  (VT == MVT::i64 && C != INT64_MAX &&
2980  isLegalArithImmed(C + 1ULL))) {
2981  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2982  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2983  RHS = DAG.getConstant(C, dl, VT);
2984  }
2985  break;
2986  case ISD::SETULE:
2987  case ISD::SETUGT:
2988  if ((VT == MVT::i32 && C != UINT32_MAX &&
2989  isLegalArithImmed((uint32_t)(C + 1))) ||
2990  (VT == MVT::i64 && C != UINT64_MAX &&
2991  isLegalArithImmed(C + 1ULL))) {
2992  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2993  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2994  RHS = DAG.getConstant(C, dl, VT);
2995  }
2996  break;
2997  }
2998  }
2999  }
3000 
3001  // Comparisons are canonicalized so that the RHS operand is simpler than the
3002  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3003  // can fold some shift+extend operations on the RHS operand, so swap the
3004  // operands if that can be done.
3005  //
3006  // For example:
3007  // lsl w13, w11, #1
3008  // cmp w13, w12
3009  // can be turned into:
3010  // cmp w12, w11, lsl #1
3011  if (!isa<ConstantSDNode>(RHS) ||
3012  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3013  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3014 
3015  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3016  std::swap(LHS, RHS);
3017  CC = ISD::getSetCCSwappedOperands(CC);
3018  }
3019  }
3020 
3021  SDValue Cmp;
3022  AArch64CC::CondCode AArch64CC;
3023  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3024  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3025 
3026  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3027  // For the i8 operand, the largest immediate is 255, so this can be easily
3028  // encoded in the compare instruction. For the i16 operand, however, the
3029  // largest immediate cannot be encoded in the compare.
3030  // Therefore, use a sign extending load and cmn to avoid materializing the
3031  // -1 constant. For example,
3032  // movz w1, #65535
3033  // ldrh w0, [x0, #0]
3034  // cmp w0, w1
3035  // becomes:
3036  // ldrsh w0, [x0, #0]
3037  // cmn w0, #1
3038  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3039  // if and only if (sext LHS) == (sext RHS). The checks are in place to
3040  // ensure both the LHS and RHS are truly zero extended and to make sure the
3041  // transformation is profitable.
3042  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3043  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3044  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3045  LHS.getNode()->hasNUsesOfValue(1, 0)) {
3046  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3047  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3048  SDValue SExt =
3049  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3050  DAG.getValueType(MVT::i16));
3051  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3052  RHS.getValueType()),
3053  CC, dl, DAG);
3054  AArch64CC = changeIntCCToAArch64CC(CC);
3055  }
3056  }
3057 
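 // If the LHS is itself a conjunction/disjunction of comparisons and the RHS
 // is 0 or 1, try to emit the whole expression as a single CCMP chain and, if
 // that succeeds, simply invert the resulting condition code when we are in
 // fact testing for the opposite value.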
3058  if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3059  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3060  if ((CC == ISD::SETNE) ^ RHSC->isZero())
3061  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3062  }
3063  }
3064  }
3065 
3066  if (!Cmp) {
3067  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3068  AArch64CC = changeIntCCToAArch64CC(CC);
3069  }
3070  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3071  return Cmp;
3072 }
3073 
3074 static std::pair<SDValue, SDValue>
3075 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3076  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3077  "Unsupported value type");
3078  SDValue Value, Overflow;
3079  SDLoc DL(Op);
3080  SDValue LHS = Op.getOperand(0);
3081  SDValue RHS = Op.getOperand(1);
3082  unsigned Opc = 0;
3083  switch (Op.getOpcode()) {
3084  default:
3085  llvm_unreachable("Unknown overflow instruction!");
3086  case ISD::SADDO:
3087  Opc = AArch64ISD::ADDS;
3088  CC = AArch64CC::VS;
3089  break;
3090  case ISD::UADDO:
3091  Opc = AArch64ISD::ADDS;
3092  CC = AArch64CC::HS;
3093  break;
3094  case ISD::SSUBO:
3095  Opc = AArch64ISD::SUBS;
3096  CC = AArch64CC::VS;
3097  break;
3098  case ISD::USUBO:
3099  Opc = AArch64ISD::SUBS;
3100  CC = AArch64CC::LO;
3101  break;
3102  // Multiply needs a little bit of extra work.
3103  case ISD::SMULO:
3104  case ISD::UMULO: {
3105  CC = AArch64CC::NE;
3106  bool IsSigned = Op.getOpcode() == ISD::SMULO;
3107  if (Op.getValueType() == MVT::i32) {
3108  // Extend to 64-bits, then perform a 64-bit multiply.
3109  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3110  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3111  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3112  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3113  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3114 
3115  // Check that the result fits into a 32-bit integer.
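 // For the signed case the product overflows iff it differs from the
 // sign-extension of its low 32 bits; for the unsigned case iff any of the
 // high 32 bits are set.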
3116  SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3117  if (IsSigned) {
3118  // cmp xreg, wreg, sxtw
3119  SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3120  Overflow =
3121  DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3122  } else {
3123  // tst xreg, #0xffffffff00000000
3124  SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3125  Overflow =
3126  DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3127  }
3128  break;
3129  }
3130  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3131  // For the 64-bit multiply, check whether the full product overflows 64 bits.
3132  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3133  if (IsSigned) {
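 // The full 128-bit product fits in a signed 64-bit value iff its high 64
 // bits (MULHS) equal the low 64 bits arithmetically shifted right by 63,
 // i.e. a broadcast of the low half's sign bit.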
3134  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3135  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3136  DAG.getConstant(63, DL, MVT::i64));
3137  // It is important that LowerBits is last, otherwise the arithmetic
3138  // shift will not be folded into the compare (SUBS).
3139  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3140  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3141  .getValue(1);
3142  } else {
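 // For the unsigned case the product overflows iff the high 64 bits
 // (MULHU) are non-zero, checked with a SUBS against zero.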
3143  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3144  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3145  Overflow =
3146  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3147  DAG.getConstant(0, DL, MVT::i64),
3148  UpperBits).getValue(1);
3149  }
3150  break;
3151  }
3152  } // switch (...)
3153 
3154  if (Opc) {
3155  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3156 
3157  // Emit the AArch64 operation with overflow check.
3158  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3159  Overflow = Value.getValue(1);
3160  }
3161  return std::make_pair(Value, Overflow);
3162 }
3163 
3164 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3165  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3166  return LowerToScalableOp(Op, DAG);
3167 
3168  SDValue Sel = Op.getOperand(0);
3169  SDValue Other = Op.getOperand(1);
3170  SDLoc dl(Sel);
3171 
3172  // If the operand is an overflow checking operation, invert the condition
3173  // code and kill the Not operation. I.e., transform:
3174  // (xor (overflow_op_bool, 1))
3175  // -->
3176  // (csel 1, 0, invert(cc), overflow_op_bool)
3177  // ... which later gets transformed to just a cset instruction with an
3178  // inverted condition code, rather than a cset + eor sequence.
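 // For example, (xor (uaddo.overflow x, y), 1) can then be selected as
 //   cmn w0, w1
 //   cset w0, lo
 // rather than a cset of the carry flag followed by an eor with #1.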
3179  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3180  // Only lower legal XALUO ops.
3181  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3182  return SDValue();
3183 
3184  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3185  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3186  AArch64CC::CondCode CC;
3187  SDValue Value, Overflow;
3188  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3189  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3190  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3191  CCVal, Overflow);
3192  }
3193  // If neither operand is a SELECT_CC, give up.
3194  if (Sel.getOpcode() != ISD::SELECT_CC)
3195  std::swap(Sel, Other);
3196  if (Sel.getOpcode() != ISD::SELECT_CC)
3197  return Op;
3198 
3199  // The folding we want to perform is:
3200  // (xor x, (select_cc a, b, cc, 0, -1) )
3201  // -->
3202  // (csel x, (xor x, -1), cc ...)
3203  //
3204  // The latter will get matched to a CSINV instruction.
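 // For an i32 compare this typically selects to something like:
 //   cmp   w1, w2
 //   csinv w0, w0, w0, ge
 // which leaves x untouched when the condition holds and inverts it otherwise.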
3205 
3206  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3207  SDValue LHS = Sel.getOperand(0);
3208  SDValue RHS = Sel.getOperand(1);
3209  SDValue TVal = Sel.getOperand(2);
3210  SDValue FVal = Sel.getOperand(3);
3211 
3212  // FIXME: This could be generalized to non-integer comparisons.
3213  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3214  return Op;
3215 
3216  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3217  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3218 
3219  // The values aren't constants, this isn't the pattern we're looking for.
3220  if (!CFVal || !CTVal)
3221  return Op;
3222 
3223  // We can commute the SELECT_CC by inverting the condition. This
3224  // might be needed to make this fit into a CSINV pattern.
3225  if (CTVal->isAllOnes() && CFVal->isZero()) {
3226  std::swap(TVal, FVal);
3227  std::swap(CTVal, CFVal);
3228  CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3229  }
3230 
3231  // If the constants line up, perform the transform!
3232  if (CTVal->isZero() && CFVal->isAllOnes()) {
3233  SDValue CCVal;
3234  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3235 
3236  FVal = Other;
3237  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3238  DAG.getConstant(-1ULL, dl, Other.getValueType()));
3239 
3240  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3241  CCVal, Cmp);
3242  }
3243 
3244  return Op;
3245 }
3246 
3247 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3248  EVT VT = Op.getValueType();
3249 
3250  // Let legalize expand this if it isn't a legal type yet.
3251  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3252  return SDValue();
3253 
3254  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3255 
3256  unsigned Opc;
3257  bool ExtraOp = false;
3258  switch (Op.getOpcode()) {
3259  default:
3260  llvm_unreachable("Invalid code");
3261  case ISD::ADDC:
3262  Opc = AArch64ISD::ADDS;
3263  break;
3264  case ISD::SUBC:
3265  Opc = AArch64ISD::SUBS;
3266  break;
3267  case ISD::ADDE:
3268  Opc = AArch64ISD::ADCS;
3269  ExtraOp = true;
3270  break;
3271  case ISD::SUBE:
3272  Opc = AArch64ISD::SBCS;
3273  ExtraOp = true;
3274  break;
3275  }
3276 
3277  if (!ExtraOp)
3278  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3279  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3280  Op.getOperand(2));
3281 }
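// E.g. ISD::ADDE with operands (a, b, carry-in) is emitted as AArch64ISD::ADCS,
// which computes a + b + carry-in and exposes the outgoing carry in its flags
// result; ISD::SUBE maps to SBCS in the same way.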